diff --git a/packages/llama-cpp-opencl/0001-fix-compile-options.patch b/packages/llama-cpp-opencl/0001-fix-compile-options.patch new file mode 100644 index 000000000000000..760ab0e3b13e3d7 --- /dev/null +++ b/packages/llama-cpp-opencl/0001-fix-compile-options.patch @@ -0,0 +1,11 @@ +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -434,7 +434,7 @@ + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") + # Raspberry Pi 2 +- add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations) ++ add_compile_options(-mno-unaligned-access -funsafe-math-optimizations) + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") + # Raspberry Pi 3, 4, Zero 2 (32-bit) diff --git a/packages/llama-cpp-opencl/0002-impl-missing-functions.patch b/packages/llama-cpp-opencl/0002-impl-missing-functions.patch new file mode 100644 index 000000000000000..46522f8be057de1 --- /dev/null +++ b/packages/llama-cpp-opencl/0002-impl-missing-functions.patch @@ -0,0 +1,92 @@ +--- a/k_quants.c ++++ b/k_quants.c +@@ -43,6 +43,89 @@ + // 2-6 bit quantization in super-blocks + // + ++#if defined(__ARM_NEON) ++ ++#if !defined(__aarch64__) ++ ++inline static uint16_t vaddvq_u8(uint8x16_t v) { ++ return ++ (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) + ++ (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) + ++ (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) + ++ (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) + ++ (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) + ++ (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) + ++ (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) + ++ (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15); ++} ++ ++inline static int16_t vaddvq_s8(int8x16_t v) { ++ return ++ (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) + ++ (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) + ++ (int16_t)vgetq_lane_s8(v, 4) + 
(int16_t)vgetq_lane_s8(v, 5) + ++ (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) + ++ (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) + ++ (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) + ++ (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) + ++ (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15); ++} ++ ++inline static int32_t vaddvq_s16(int16x8_t v) { ++ return ++ (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + ++ (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + ++ (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + ++ (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); ++} ++ ++inline static uint32_t vaddvq_u16(uint16x8_t v) { ++ return ++ (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) + ++ (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) + ++ (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) + ++ (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7); ++} ++ ++inline static int32_t vaddvq_s32(int32x4_t v) { ++ return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); ++} ++ ++inline static float vaddvq_f32(float32x4_t v) { ++ return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); ++} ++ ++inline static float vminvq_f32(float32x4_t v) { ++ return ++ MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), ++ MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); ++} ++ ++inline static float vmaxvq_f32(float32x4_t v) { ++ return ++ MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), ++ MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); ++} ++ ++inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { ++ int32x4_t res; ++ ++ res[0] = roundf(vgetq_lane_f32(v, 0)); ++ res[1] = roundf(vgetq_lane_f32(v, 1)); ++ res[2] = roundf(vgetq_lane_f32(v, 2)); ++ res[3] = roundf(vgetq_lane_f32(v, 3)); ++ ++ return res; ++} ++ ++inline 
static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { ++    const int16x4_t c = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); ++    const int16x4_t d = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); ++    return vcombine_s16(c, d); ++} ++ ++#endif ++#endif + + // + // ===================== Helper functions diff --git a/packages/llama-cpp-opencl/build.sh b/packages/llama-cpp-opencl/build.sh new file mode 100644 index 000000000000000..591317120ce240b --- /dev/null +++ b/packages/llama-cpp-opencl/build.sh @@ -0,0 +1,39 @@ +TERMUX_PKG_HOMEPAGE=https://github.com/ggerganov/llama.cpp +TERMUX_PKG_DESCRIPTION="Port of Facebook's LLaMA model in C/C++ (OpenCL)" +TERMUX_PKG_LICENSE=GPL-3.0 +TERMUX_PKG_MAINTAINER=@termux +TERMUX_PKG_VERSION=0.0.0-b1094 +TERMUX_PKG_SRCURL=https://github.com/ggerganov/llama.cpp/archive/refs/tags/${TERMUX_PKG_VERSION#*-}.tar.gz +TERMUX_PKG_SHA256=315071e1034846e8ed448008cda35da481f056d6495696cb862ef8b94aaae0f6 +TERMUX_PKG_AUTO_UPDATE=true +TERMUX_PKG_CONFLICTS="llama-cpp" +TERMUX_PKG_PROVIDES="llama-cpp" +TERMUX_PKG_DEPENDS="libc++, clblast, openmpi" +TERMUX_PKG_RECOMMENDS="python-numpy, python-sentencepiece" +TERMUX_PKG_EXTRA_CONFIGURE_ARGS=" +-DLLAMA_MPI=ON +-DBUILD_SHARED_LIBS=ON +-DLLAMA_CLBLAST=ON +" + +# XXX: llama.cpp uses `int64_t`, but on 32-bit Android `size_t` is only 32 bits (`uint32_t`). +# XXX: I don't think it will work if we simply cast it. +TERMUX_PKG_BLACKLISTED_ARCHES="arm, i686" + +termux_pkg_auto_update() { + local latest_tag + latest_tag="$( + termux_github_api_get_tag "${TERMUX_PKG_SRCURL}" "${TERMUX_PKG_UPDATE_TAG_TYPE}" + )" + + if [[ -z "${latest_tag}" ]]; then + termux_error_exit "ERROR: Unable to get tag from ${TERMUX_PKG_SRCURL}" + fi + termux_pkg_upgrade_version "0.0.0-${latest_tag}" +} + +termux_step_post_make_install() { + cd "$TERMUX_PREFIX/bin" || exit 1 + mv main llama + mv server llama-server +}