Merge pull request #400 from utilityai/update-llama-cpp-2024-07-09
Update llama cpp [MAC TESTING WANTED]
MarcusDunn authored Jul 26, 2024
2 parents 09088ec + 1788762, commit 8ea1482
Showing 3 changed files with 52 additions and 49 deletions.
llama-cpp-2/src/model.rs (9 changes: 6 additions & 3 deletions)
@@ -1,5 +1,6 @@
 //! A safe wrapper around `llama_model`.
 use std::ffi::CString;
+use std::num::NonZeroU16;
 use std::os::raw::c_int;
 use std::path::Path;
 use std::ptr::NonNull;
@@ -131,7 +132,7 @@ impl LlamaModel {
         token: LlamaToken,
         special: Special,
     ) -> Result<Vec<u8>, TokenToStringError> {
-        self.token_to_bytes_with_size(token, 32, special)
+        self.token_to_bytes_with_size(token, 32, special, None)
     }

     /// Convert a vector of tokens to a single string.
@@ -264,7 +265,7 @@ impl LlamaModel {
         buffer_size: usize,
         special: Special,
     ) -> Result<String, TokenToStringError> {
-        let bytes = self.token_to_bytes_with_size(token, buffer_size, special)?;
+        let bytes = self.token_to_bytes_with_size(token, buffer_size, special, None)?;
         Ok(String::from_utf8(bytes)?)
     }

@@ -287,6 +288,7 @@ impl LlamaModel {
         token: LlamaToken,
         buffer_size: usize,
         special: Special,
+        lstrip: Option<NonZeroU16>
     ) -> Result<Vec<u8>, TokenToStringError> {
         if token == self.token_nl() {
             return Ok(String::from("\n").into_bytes());
@@ -314,8 +316,9 @@ impl LlamaModel {
         let len = string.as_bytes().len();
         let len = c_int::try_from(len).expect("length fits into c_int");
         let buf = string.into_raw();
+        let lstrip = lstrip.map(|it| i32::from(it.get())).unwrap_or(0);
         let size = unsafe {
-            llama_cpp_sys_2::llama_token_to_piece(self.model.as_ptr(), token.0, buf, len, special)
+            llama_cpp_sys_2::llama_token_to_piece(self.model.as_ptr(), token.0, buf, len, lstrip, special)
         };

         match size {
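For reference, a minimal sketch of how a caller might exercise the new `lstrip` parameter of `token_to_bytes_with_size`; the import paths and the `Special::Tokenize` variant are assumptions about the crate's public API, not part of this diff. `NonZeroU16::new` conveniently already returns the `Option<NonZeroU16>` the method expects:

    use std::num::NonZeroU16;

    use llama_cpp_2::model::{LlamaModel, Special};
    use llama_cpp_2::token::LlamaToken;

    fn decode_piece(model: &LlamaModel, token: LlamaToken) {
        // Passing None keeps the previous behaviour: the C call receives lstrip = 0.
        let bytes = model.token_to_bytes_with_size(token, 32, Special::Tokenize, None);

        // A non-zero value asks llama_token_to_piece to strip up to that many
        // leading spaces from the decoded piece.
        let stripped =
            model.token_to_bytes_with_size(token, 32, Special::Tokenize, NonZeroU16::new(8));

        println!("{bytes:?} vs {stripped:?}");
    }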
llama-cpp-sys-2/build.rs (90 changes: 45 additions & 45 deletions)
@@ -86,11 +86,24 @@ compile_error!("feature \"vulkan\" cannot be enabled alongside other GPU based f

 static LLAMA_PATH: Lazy<PathBuf> = Lazy::new(|| PathBuf::from("./llama.cpp"));

-fn compile_bindings(out_path: &Path, llama_header_path: &Path) {
+fn compile_bindings(
+    out_path: &Path,
+    llama_header_path: &Path,
+) -> Result<(), Box<dyn std::error::Error + 'static>> {
     println!("Generating bindings..");
+
+    let includes = [
+        llama_header_path.join("ggml").join("include"),
+    ];
+
     let bindings = bindgen::Builder::default()
-        // .header(llama_header_path.join("ggml.h").to_string_lossy())
-        .header(llama_header_path.join("llama.h").to_string_lossy())
+        .clang_args(includes.map(|path| format!("-I{}", path.to_string_lossy())))
+        .header(
+            llama_header_path
+                .join("include")
+                .join("llama.h")
+                .to_string_lossy(),
+        )
         .derive_partialeq(true)
         .allowlist_function("ggml_.*")
         .allowlist_type("ggml_.*")
@@ -106,11 +119,11 @@ fn compile_bindings(out_path: &Path, llama_header_path: &Path) {
         bindings = bindings.parse_callbacks(Box::new(GGMLLinkRename {}));
     }

-    let bindings = bindings.generate().expect("Unable to generate bindings");
+    let bindings = bindings.generate()?;

-    bindings
-        .write_to_file(out_path.join("bindings.rs"))
-        .expect("Couldn't write bindings!");
+    bindings.write_to_file(out_path.join("bindings.rs"))?;
+
+    Ok(())
 }

 #[cfg(all(
@@ -324,26 +337,6 @@ fn push_feature_flags(cx: &mut Build, cxx: &mut Build) {
     }
 }

-fn compile_opencl(cx: &mut Build, cxx: &mut Build) {
-    println!("Compiling OpenCL GGML..");
-
-    // TODO
-    println!("cargo:warning=OpenCL compilation and execution has not been properly tested yet");
-
-    cx.define("GGML_USE_CLBLAST", None);
-    cxx.define("GGML_USE_CLBLAST", None);
-
-    if cfg!(target_os = "linux") {
-        println!("cargo:rustc-link-lib=OpenCL");
-        println!("cargo:rustc-link-lib=clblast");
-    } else if cfg!(target_os = "macos") {
-        println!("cargo:rustc-link-lib=framework=OpenCL");
-        println!("cargo:rustc-link-lib=clblast");
-    }
-
-    cxx.file(LLAMA_PATH.join("ggml-opencl.cpp"));
-}
-
 fn compile_openblas(cx: &mut Build) {
     println!("Compiling OpenBLAS GGML..");

@@ -462,7 +455,9 @@ fn compile_cuda(cx: &mut Build, cxx: &mut Build, featless_cxx: Build) -> &'stati
     }

     let lib_name = "ggml-cuda";
-    let cuda_path = LLAMA_PATH.join("ggml-cuda");
+    let ggml_path = LLAMA_PATH.join("ggml");
+    let ggml_src = ggml_path.join("src");
+    let cuda_path = ggml_src.join("ggml-cuda");
     let cuda_sources = read_dir(cuda_path.as_path())
         .unwrap()
         .map(|f| f.unwrap())
@@ -476,10 +471,11 @@
         .map(|entry| entry.path());

     nvcc.include(cuda_path.as_path())
-        .include(LLAMA_PATH.as_path())
+        .include(ggml_src.as_path())
+        .include(ggml_path.join("include").as_path())
         .files(cuda_sources)
         .files(template_instances)
-        .file(LLAMA_PATH.join("ggml-cuda.cu"))
+        .file(ggml_src.join("ggml-cuda.cu"))
         .compile(lib_name);

     lib_name
@@ -655,22 +651,28 @@ fn compile_vulkan(cx: &mut Build, cxx: &mut Build) -> &'static str {

 fn compile_ggml(mut cx: Build) {
     println!("Compiling GGML..");
+    let ggml_src = LLAMA_PATH.join("ggml").join("src");
+    let ggml_include = LLAMA_PATH.join("ggml").join("include");
     cx.std("c11")
-        .include(LLAMA_PATH.as_path())
-        .file(LLAMA_PATH.join("ggml.c"))
-        .file(LLAMA_PATH.join("ggml-alloc.c"))
-        .file(LLAMA_PATH.join("ggml-backend.c"))
-        .file(LLAMA_PATH.join("ggml-quants.c"))
+        .include(ggml_include)
+        .file(ggml_src.join("ggml.c"))
+        .file(ggml_src.join("ggml-alloc.c"))
+        .file(ggml_src.join("ggml-backend.c"))
+        .file(ggml_src.join("ggml-quants.c"))
         .compile("ggml");
 }

 fn compile_llama(mut cxx: Build, _out_path: impl AsRef<Path>) {
     println!("Compiling Llama.cpp..");
+    let llama_cpp_src = LLAMA_PATH.join("src");
+    let llama_include = LLAMA_PATH.join("include");
+    let ggml_include = LLAMA_PATH.join("ggml").join("include");
     cxx.std("c++11")
-        .include(LLAMA_PATH.as_path())
-        .file(LLAMA_PATH.join("unicode.cpp"))
-        .file(LLAMA_PATH.join("unicode-data.cpp"))
-        .file(LLAMA_PATH.join("llama.cpp"))
+        .include(llama_include)
+        .include(ggml_include)
+        .file(llama_cpp_src.join("unicode.cpp"))
+        .file(llama_cpp_src.join("unicode-data.cpp"))
+        .file(llama_cpp_src.join("llama.cpp"))
         .compile("llama");
 }

@@ -683,9 +685,10 @@ fn main() {

     let llama_header_path = std::env::var("LLAMA_HEADE");
     if let Ok(llama_header_path) = llama_header_path {
-        compile_bindings(&out_path, Path::new(&llama_header_path));
+        compile_bindings(&out_path, Path::new(&llama_header_path))
+            .expect("failed to generate bindings");
     } else {
-        compile_bindings(&out_path, &LLAMA_PATH);
+        compile_bindings(&out_path, &LLAMA_PATH).expect("failed to generate bindings");
     }

     if let Ok(llama_lib_path) = std::env::var("LLAMA_LIB") {
@@ -703,7 +706,7 @@

     println!("cargo:rerun-if-changed={}", LLAMA_PATH.display());

-    compile_bindings(&out_path, &LLAMA_PATH);
+    compile_bindings(&out_path, &LLAMA_PATH).expect("failed to generate bindings");

     let mut cx = Build::new();
     let mut cxx = Build::new();
@@ -720,9 +723,6 @@
         Some(compile_vulkan(&mut cx, &mut cxx))
     } else if cfg!(feature = "cuda") {
         Some(compile_cuda(&mut cx, &mut cxx, featless_cxx))
-    } else if cfg!(feature = "opencl") {
-        compile_opencl(&mut cx, &mut cxx);
-        None
     } else if cfg!(feature = "openblas") {
         compile_openblas(&mut cx);
         None
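The bindgen changes above reflect llama.cpp's reorganised source tree: llama.h now lives under include/ and the ggml headers under ggml/include, so clang needs an extra -I flag to resolve the headers llama.h pulls in. A condensed sketch of that pattern, under the assumption of a standalone helper taking the llama.cpp checkout and an output directory (the real build.rs wires this into main as shown above):

    use std::path::Path;

    fn generate_bindings(llama_root: &Path, out_dir: &Path) -> Result<(), Box<dyn std::error::Error>> {
        bindgen::Builder::default()
            // llama.h moved from the repository root into include/
            .header(llama_root.join("include").join("llama.h").to_string_lossy())
            // let clang find the ggml headers that llama.h includes
            .clang_arg(format!("-I{}", llama_root.join("ggml").join("include").display()))
            .generate()?
            .write_to_file(out_dir.join("bindings.rs"))?;
        Ok(())
    }

Propagating the errors with `?` instead of `expect` is what lets the new compile_bindings return `Result` and leave the panic decision to the caller in main.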
llama-cpp-sys-2/llama.cpp (2 changes: 1 addition & 1 deletion)
Submodule llama.cpp updated 542 files
