Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

llama.cpp #17457

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions packages/llama-cpp/0001-fix-compile-options.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -434,7 +434,7 @@
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
# Raspberry Pi 2
- add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations)
+ add_compile_options(-mno-unaligned-access -funsafe-math-optimizations)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
# Raspberry Pi 3, 4, Zero 2 (32-bit)
92 changes: 92 additions & 0 deletions packages/llama-cpp/0002-impl-missing-functions.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
--- a/k_quants.c
+++ b/k_quants.c
@@ -43,6 +43,89 @@
// 2-6 bit quantization in super-blocks
//

+#if defined(__ARM_NEON)
+
+#if !defined(__aarch64__)
+
+inline static uint16_t vaddvq_u8(uint8x16_t v) {
+ return
+ (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) +
+ (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) +
+ (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) +
+ (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) +
+ (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) +
+ (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
+ (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
+ (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
+}
+
+inline static int16_t vaddvq_s8(int8x16_t v) {
+ return
+ (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) +
+ (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) +
+ (int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) +
+ (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) +
+ (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) +
+ (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
+ (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
+ (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
+}
+
+inline static int32_t vaddvq_s16(int16x8_t v) {
+ return
+ (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+ (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+ (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+ (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static uint32_t vaddvq_u16(uint16x8_t v) {
+ return
+ (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
+ (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
+ (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
+ (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+ return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+
+inline static float vaddvq_f32(float32x4_t v) {
+ return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
+}
+
+inline static float vminvq_f32(float32x4_t v) {
+ return
+ MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+ MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+}
+
+inline static float vmaxvq_f32(float32x4_t v) {
+ return
+ MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+ MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+}
+
+inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+ int32x4_t res;
+
+ res[0] = roundf(vgetq_lane_f32(v, 0));
+ res[1] = roundf(vgetq_lane_f32(v, 1));
+ res[2] = roundf(vgetq_lane_f32(v, 2));
+ res[3] = roundf(vgetq_lane_f32(v, 3));
+
+ return res;
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+ const int16x4_t c = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+ const int16x4_t d = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+ return vcombine_s16(c, d);
+}
+
+#endif
+#endif

//
// ===================== Helper functions
38 changes: 38 additions & 0 deletions packages/llama-cpp/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Termux build recipe metadata for the llama.cpp package.
TERMUX_PKG_HOMEPAGE=https://github.com/ggerganov/llama.cpp
TERMUX_PKG_DESCRIPTION="Port of Facebook's LLaMA model in C/C++"
TERMUX_PKG_LICENSE=GPL-3.0
TERMUX_PKG_MAINTAINER=@termux
# Upstream tags builds as "b<number>" (e.g. b1094); the "0.0.0-" prefix makes a
# monotonically comparable package version out of that tag scheme.
TERMUX_PKG_VERSION=0.0.0-b1094
# ${TERMUX_PKG_VERSION#*-} strips the leading "0.0.0-" to recover the upstream tag.
TERMUX_PKG_SRCURL=https://github.com/ggerganov/llama.cpp/archive/refs/tags/${TERMUX_PKG_VERSION#*-}.tar.gz
TERMUX_PKG_SHA256=315071e1034846e8ed448008cda35da481f056d6495696cb862ef8b94aaae0f6
TERMUX_PKG_AUTO_UPDATE=true
# NOTE: the two lines captured here ("truboxl marked this conversation as
# resolved." / "Show resolved  Hide resolved") are GitHub review-UI residue
# from the PR page capture, not part of build.sh.
TERMUX_PKG_DEPENDS="libc++, libopenblas, openmpi"
# Runtime conveniences for the bundled conversion scripts — not hard deps.
TERMUX_PKG_RECOMMENDS="python-numpy, python-sentencepiece"
# Build shared libraries and enable the MPI and OpenBLAS backends.
TERMUX_PKG_EXTRA_CONFIGURE_ARGS="
-DLLAMA_MPI=ON
-DBUILD_SHARED_LIBS=ON
-DLLAMA_BLAS=ON
-DLLAMA_BLAS_VENDOR=OpenBLAS
"

# XXX: llama.cpp uses `int64_t` sizes, but on 32-bit Android `size_t` is only
# 32 bits wide (an unsigned 32-bit type, not `int32_t` as such), so a plain
# cast would presumably truncate large values — hence 32-bit arches are skipped.
TERMUX_PKG_BLACKLISTED_ARCHES="arm, i686"

# Auto-update hook: look up the newest upstream tag and, if found, bump the
# package to "0.0.0-<tag>"; abort the update run when no tag can be resolved.
termux_pkg_auto_update() {
	local tag
	tag="$(termux_github_api_get_tag "${TERMUX_PKG_SRCURL}" "${TERMUX_PKG_UPDATE_TAG_TYPE}")"

	# Guard clause: an empty result means the GitHub API lookup failed.
	[[ -n "${tag}" ]] || termux_error_exit "ERROR: Unable to get tag from ${TERMUX_PKG_SRCURL}"

	termux_pkg_upgrade_version "0.0.0-${tag}"
}

# Post-install hook: rename upstream's generically named binaries ("main",
# "server") to package-prefixed names so they do not clash in $PREFIX/bin.
termux_step_post_make_install() {
	cd "$TERMUX_PREFIX/bin" || exit 1
	local pair
	for pair in "main:llama" "server:llama-server"; do
		mv "${pair%%:*}" "${pair#*:}"
	done
}
Loading