From 24a388e8a6ac04d21dd189bc7a909624cdea20cf Mon Sep 17 00:00:00 2001
From: Wu Zhenyu <wuzhenyu@ustc.edu>
Date: Thu, 20 Jul 2023 06:26:48 +0800
Subject: [PATCH] new package: llama-cpp-opencl

---
 .../0001-fix-compile-options.patch            | 11 +++
 .../0002-impl-missing-functions.patch         | 92 +++++++++++++++++++
 packages/llama-cpp-opencl/build.sh            | 39 ++++++++
 3 files changed, 142 insertions(+)
 create mode 100644 packages/llama-cpp-opencl/0001-fix-compile-options.patch
 create mode 100644 packages/llama-cpp-opencl/0002-impl-missing-functions.patch
 create mode 100644 packages/llama-cpp-opencl/build.sh

diff --git a/packages/llama-cpp-opencl/0001-fix-compile-options.patch b/packages/llama-cpp-opencl/0001-fix-compile-options.patch
new file mode 100644
index 000000000000000..760ab0e3b13e3d7
--- /dev/null
+++ b/packages/llama-cpp-opencl/0001-fix-compile-options.patch
@@ -0,0 +1,11 @@
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -434,7 +434,7 @@
+         endif()
+         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
+             # Raspberry Pi 2
+-            add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations)
++            add_compile_options(-mno-unaligned-access -funsafe-math-optimizations)
+         endif()
+         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
+             # Raspberry Pi 3, 4, Zero 2 (32-bit)
diff --git a/packages/llama-cpp-opencl/0002-impl-missing-functions.patch b/packages/llama-cpp-opencl/0002-impl-missing-functions.patch
new file mode 100644
index 000000000000000..46522f8be057de1
--- /dev/null
+++ b/packages/llama-cpp-opencl/0002-impl-missing-functions.patch
@@ -0,0 +1,92 @@
+--- a/k_quants.c
++++ b/k_quants.c
+@@ -43,6 +43,89 @@
+ // 2-6 bit quantization in super-blocks
+ //
+ 
++#if defined(__ARM_NEON)
++
++#if !defined(__aarch64__)
++
++inline static uint16_t vaddvq_u8(uint8x16_t v) { // non-AArch64 NEON fallback: sum of all 16 unsigned 8-bit lanes
++    return // every lane is widened to uint16_t before adding; the total is at most 16 * 255, so it cannot wrap
++        (uint16_t)vgetq_lane_u8(v, 0)  + (uint16_t)vgetq_lane_u8(v, 1)  +
++        (uint16_t)vgetq_lane_u8(v, 2)  + (uint16_t)vgetq_lane_u8(v, 3)  +
++        (uint16_t)vgetq_lane_u8(v, 4)  + (uint16_t)vgetq_lane_u8(v, 5)  +
++        (uint16_t)vgetq_lane_u8(v, 6)  + (uint16_t)vgetq_lane_u8(v, 7)  +
++        (uint16_t)vgetq_lane_u8(v, 8)  + (uint16_t)vgetq_lane_u8(v, 9)  +
++        (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
++        (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
++        (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
++}
++
++inline static int16_t vaddvq_s8(int8x16_t v) { // non-AArch64 NEON fallback: sum of all 16 signed 8-bit lanes
++    return // every lane is widened to int16_t before adding; the total lies in [-2048, 2032], so it fits the return type
++        (int16_t)vgetq_lane_s8(v, 0)  + (int16_t)vgetq_lane_s8(v, 1)  +
++        (int16_t)vgetq_lane_s8(v, 2)  + (int16_t)vgetq_lane_s8(v, 3)  +
++        (int16_t)vgetq_lane_s8(v, 4)  + (int16_t)vgetq_lane_s8(v, 5)  +
++        (int16_t)vgetq_lane_s8(v, 6)  + (int16_t)vgetq_lane_s8(v, 7)  +
++        (int16_t)vgetq_lane_s8(v, 8)  + (int16_t)vgetq_lane_s8(v, 9)  +
++        (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
++        (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
++        (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
++}
++
++inline static int32_t vaddvq_s16(int16x8_t v) { // non-AArch64 NEON fallback: sum of all 8 signed 16-bit lanes
++    return // every lane is widened to int32_t before adding, so the 8-lane total cannot overflow
++        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
++        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
++        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
++        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
++}
++
++inline static uint32_t vaddvq_u16(uint16x8_t v) { // non-AArch64 NEON fallback: sum of all 8 unsigned 16-bit lanes
++    return // every lane is widened to uint32_t before adding, so the 8-lane total cannot wrap
++        (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
++        (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
++        (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
++        (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
++}
++
++inline static int32_t vaddvq_s32(int32x4_t v) { // non-AArch64 NEON fallback: sum of the 4 signed 32-bit lanes (no widening)
++    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
++}
++
++inline static float vaddvq_f32(float32x4_t v) { // non-AArch64 NEON fallback: sum of the 4 float lanes, added in lane order 0..3
++    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
++}
++
++inline static float vminvq_f32(float32x4_t v) { // non-AArch64 NEON fallback: smallest of the 4 float lanes
++    return // relies on a MIN macro that is not visible in this hunk; presumably defined earlier in k_quants.c
++        MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
++            MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
++}
++
++inline static float vmaxvq_f32(float32x4_t v) { // non-AArch64 NEON fallback: largest of the 4 float lanes
++    return // relies on a MAX macro that is not visible in this hunk; presumably defined earlier in k_quants.c
++        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
++            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
++}
++
++inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { // non-AArch64 NEON fallback: per-lane float -> int32, round to nearest
++    int32x4_t res; // lanes are filled through the compiler's vector-subscript extension
++
++    res[0] = (int32_t)rintf(vgetq_lane_f32(v, 0)); // rintf rounds ties to even in the default rounding mode, matching
++    res[1] = (int32_t)rintf(vgetq_lane_f32(v, 1)); // the AArch64 intrinsic; roundf would round halfway cases away from
++    res[2] = (int32_t)rintf(vgetq_lane_f32(v, 2)); // zero (2.5f -> 3 instead of 2), so the 32-bit and 64-bit builds
++    res[3] = (int32_t)rintf(vgetq_lane_f32(v, 3)); // could disagree on exact .5 inputs
++
++    return res;
++}
++
++inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { // non-AArch64 NEON fallback: pairwise add of adjacent lanes
++    const int16x4_t c = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); // adjacent-pair sums of a -> low half of the result
++    const int16x4_t d = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); // adjacent-pair sums of b -> high half of the result
++    return vcombine_s16(c, d);
++}
++
++#endif
++#endif
+ 
+ //
+ // ===================== Helper functions
diff --git a/packages/llama-cpp-opencl/build.sh b/packages/llama-cpp-opencl/build.sh
new file mode 100644
index 000000000000000..591317120ce240b
--- /dev/null
+++ b/packages/llama-cpp-opencl/build.sh
@@ -0,0 +1,39 @@
+TERMUX_PKG_HOMEPAGE=https://github.com/ggerganov/llama.cpp
+TERMUX_PKG_DESCRIPTION="Port of Facebook's LLaMA model in C/C++ (OpenCL)"
+TERMUX_PKG_LICENSE="MIT" # upstream llama.cpp is released under the MIT license, not GPL-3.0
+TERMUX_PKG_MAINTAINER=@termux
+TERMUX_PKG_VERSION=0.0.0-b1094
+TERMUX_PKG_SRCURL=https://github.com/ggerganov/llama.cpp/archive/refs/tags/${TERMUX_PKG_VERSION#*-}.tar.gz
+TERMUX_PKG_SHA256=315071e1034846e8ed448008cda35da481f056d6495696cb862ef8b94aaae0f6
+TERMUX_PKG_AUTO_UPDATE=true
+TERMUX_PKG_CONFLICTS="llama-cpp"
+TERMUX_PKG_PROVIDES="llama-cpp"
+TERMUX_PKG_DEPENDS="libc++, clblast, openmpi"
+TERMUX_PKG_RECOMMENDS="python-numpy, python-sentencepiece"
+TERMUX_PKG_EXTRA_CONFIGURE_ARGS="
+-DLLAMA_MPI=ON
+-DBUILD_SHARED_LIBS=ON
+-DLLAMA_CLBLAST=ON
+"
+
+# XXX: llama.cpp uses `int64_t`, but on 32-bit Android `size_t` is only 32 bits wide (and unsigned, not `int32_t`).
+# XXX: Simply casting between the two is unlikely to work, so the 32-bit arches are excluded.
+TERMUX_PKG_BLACKLISTED_ARCHES="arm, i686"
+
+termux_pkg_auto_update() {
+	# Upstream tags look like "b1094"; the package version prepends a dummy "0.0.0-".
+	local newest_build
+	newest_build="$(termux_github_api_get_tag "${TERMUX_PKG_SRCURL}" "${TERMUX_PKG_UPDATE_TAG_TYPE}")"
+
+	if [[ "${newest_build}" == "" ]]; then
+		termux_error_exit "ERROR: Unable to get tag from ${TERMUX_PKG_SRCURL}"
+	fi
+	# Pass the prefixed version on to termux_pkg_upgrade_version.
+	termux_pkg_upgrade_version "0.0.0-${newest_build}"
+}
+
+termux_step_post_make_install() {
+	# Rename the generically named "main" and "server" binaries to llama-specific names.
+	cd "${TERMUX_PREFIX}/bin" || exit 1
+	mv main llama; mv server llama-server
+}