Skip to content

Commit

Permalink
new package: llama-cpp
Browse files Browse the repository at this point in the history
  • Loading branch information
Freed-Wu authored and TomJo2000 committed Apr 4, 2024
1 parent 6cae8f2 commit 5a842a6
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 0 deletions.
11 changes: 11 additions & 0 deletions packages/llama-cpp/0001-fix-compile-options.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -434,7 +434,7 @@
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
# Raspberry Pi 2
- add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations)
+ add_compile_options(-mno-unaligned-access -funsafe-math-optimizations)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
# Raspberry Pi 3, 4, Zero 2 (32-bit)
92 changes: 92 additions & 0 deletions packages/llama-cpp/0002-impl-missing-functions.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
--- a/k_quants.c
+++ b/k_quants.c
@@ -43,6 +43,89 @@
// 2-6 bit quantization in super-blocks
//

+#if defined(__ARM_NEON)
+
+#if !defined(__aarch64__)
+
+inline static uint16_t vaddvq_u8(uint8x16_t v) {
+ return
+ (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) +
+ (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) +
+ (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) +
+ (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) +
+ (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) +
+ (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
+ (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
+ (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
+}
+
+inline static int16_t vaddvq_s8(int8x16_t v) {
+ return
+ (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) +
+ (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) +
+ (int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) +
+ (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) +
+ (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) +
+ (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
+ (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
+ (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
+}
+
+inline static int32_t vaddvq_s16(int16x8_t v) {
+ return
+ (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+ (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+ (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+ (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static uint32_t vaddvq_u16(uint16x8_t v) {
+ return
+ (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
+ (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
+ (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
+ (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+ return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+
+inline static float vaddvq_f32(float32x4_t v) {
+ return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
+}
+
+inline static float vminvq_f32(float32x4_t v) {
+ return
+ MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+ MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+}
+
+inline static float vmaxvq_f32(float32x4_t v) {
+ return
+ MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+ MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+}
+
+inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+ int32x4_t res;
+
+ res[0] = roundf(vgetq_lane_f32(v, 0));
+ res[1] = roundf(vgetq_lane_f32(v, 1));
+ res[2] = roundf(vgetq_lane_f32(v, 2));
+ res[3] = roundf(vgetq_lane_f32(v, 3));
+
+ return res;
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+ const int16x4_t c = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+ const int16x4_t d = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+ return vcombine_s16(c, d);
+}
+
+#endif
+#endif

//
// ===================== Helper functions
38 changes: 38 additions & 0 deletions packages/llama-cpp/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
TERMUX_PKG_HOMEPAGE=https://github.com/ggerganov/llama.cpp
TERMUX_PKG_DESCRIPTION="Port of Facebook's LLaMA model in C/C++"
TERMUX_PKG_LICENSE=GPL-3.0
TERMUX_PKG_MAINTAINER=@termux
TERMUX_PKG_VERSION=0.0.0-b1094
TERMUX_PKG_SRCURL=https://github.com/ggerganov/llama.cpp/archive/refs/tags/${TERMUX_PKG_VERSION#*-}.tar.gz
TERMUX_PKG_SHA256=315071e1034846e8ed448008cda35da481f056d6495696cb862ef8b94aaae0f6
TERMUX_PKG_AUTO_UPDATE=true
TERMUX_PKG_DEPENDS="libc++, libopenblas, openmpi"
TERMUX_PKG_RECOMMENDS="python-numpy, python-sentencepiece"
TERMUX_PKG_EXTRA_CONFIGURE_ARGS="
-DLLAMA_MPI=ON
-DBUILD_SHARED_LIBS=ON
-DLLAMA_BLAS=ON
-DLLAMA_BLAS_VENDOR=OpenBLAS
"

# XXX: llama.cpp uses `int64_t`, but on 32-bit Android `size_t` is `int32_t`.
# XXX: I don't think it will work if we simply casting it.
TERMUX_PKG_BLACKLISTED_ARCHES="arm, i686"

termux_pkg_auto_update() {
local latest_tag
latest_tag="$(
termux_github_api_get_tag "${TERMUX_PKG_SRCURL}" "${TERMUX_PKG_UPDATE_TAG_TYPE}"
)"

if [[ -z "${latest_tag}" ]]; then
termux_error_exit "ERROR: Unable to get tag from ${TERMUX_PKG_SRCURL}"
fi
termux_pkg_upgrade_version "0.0.0-${latest_tag}"
}

termux_step_post_make_install() {
cd "$TERMUX_PREFIX/bin" || exit 1
mv main llama
mv server llama-server
}

0 comments on commit 5a842a6

Please sign in to comment.