From 70ec39239e68d9eb9110140ed26e5fbdbd0e91ff Mon Sep 17 00:00:00 2001 From: marcus Date: Mon, 25 Mar 2024 17:04:31 -0700 Subject: [PATCH 1/8] use cmake instead of `cc` --- Cargo.lock | 24 ++-- Cargo.toml | 2 +- llama-cpp-2/src/lib.rs | 10 ++ llama-cpp-sys-2/Cargo.toml | 2 +- llama-cpp-sys-2/build.rs | 267 ++----------------------------------- llama-cpp-sys-2/src/lib.rs | 12 ++ 6 files changed, 47 insertions(+), 270 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a96488ef..17f31be0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -123,10 +123,6 @@ name = "cc" version = "1.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" -dependencies = [ - "jobserver", - "libc", -] [[package]] name = "cexpr" @@ -194,6 +190,15 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" +[[package]] +name = "cmake" +version = "0.1.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" +dependencies = [ + "cc", +] + [[package]] name = "colorchoice" version = "1.0.0" @@ -418,15 +423,6 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" -[[package]] -name = "jobserver" -version = "0.1.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab46a6e9526ddef3ae7f787c06f0f2600639ba80ea3eade3d8e670a2230f51d6" -dependencies = [ - "libc", -] - [[package]] name = "js-sys" version = "0.3.64" @@ -484,7 +480,7 @@ name = "llama-cpp-sys-2" version = "0.1.45" dependencies = [ "bindgen", - "cc", + "cmake", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 1c6eba10..e29e58d8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,9 +16,9 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.4" -cc = "1.0.90" anyhow = "1.0.81" clap = "4.5.3" +cmake = "0.1.50" [workspace.lints.rust] missing_docs = { level = "warn" } diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 49e333e0..2aedd4c5 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -242,3 +242,13 @@ pub fn ggml_time_us() -> i64 { pub fn llama_supports_mlock() -> bool { unsafe { llama_cpp_sys_2::llama_supports_mlock() } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn smoke_test() { + ggml_time_us(); + } +} \ No newline at end of file diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index fc908456..94577bec 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -42,7 +42,7 @@ include = [ [build-dependencies] bindgen = { workspace = true } -cc = { workspace = true, features = ["parallel"] } +cmake = { workspace = true } [features] cublas = [] diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 5d14cea5..6d131b3e 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -1,187 +1,21 @@ use std::env; -use std::path::Path; use std::path::PathBuf; +use cmake::Config; fn main() { println!("cargo:rerun-if-changed=llama.cpp"); - let cublas_enabled = env::var("CARGO_FEATURE_CUBLAS").is_ok(); - - let mut ggml_cuda = if cublas_enabled { - Some(cc::Build::new()) - } else { - None - }; - - if !Path::new("llama.cpp/ggml.c").exists() { - panic!("llama.cpp seems to not be populated, try running `git submodule 
update --init --recursive` to init.") - } - - let mut ggml = cc::Build::new(); - let mut llama_cpp = cc::Build::new(); - - ggml.cpp(false); - llama_cpp.cpp(true); - - // https://github.com/ggerganov/llama.cpp/blob/a836c8f534ab789b02da149fbdaf7735500bff74/Makefile#L364-L368 - if let Some(ggml_cuda) = &mut ggml_cuda { - for lib in [ - "cuda", "cublas", "culibos", "cudart", "cublasLt", "pthread", "dl", "rt", - ] { - println!("cargo:rustc-link-lib={}", lib); - } - if !ggml_cuda.get_compiler().is_like_msvc() { - for lib in ["culibos", "pthread", "dl", "rt"] { - println!("cargo:rustc-link-lib={}", lib); - } - } - - println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); - - if cfg!(target_arch = "aarch64") { - ggml_cuda - .flag_if_supported("-mfp16-format=ieee") - .flag_if_supported("-mno-unaligned-access"); - ggml.flag_if_supported("-mfp16-format=ieee") - .flag_if_supported("-mno-unaligned-access"); - llama_cpp - .flag_if_supported("-mfp16-format=ieee") - .flag_if_supported("-mno-unaligned-access"); - ggml.flag_if_supported("-mfp16-format=ieee") - .flag_if_supported("-mno-unaligned-access"); - } - - ggml_cuda - .cuda(true) - .flag("-arch=all") - .file("llama.cpp/ggml-cuda.cu") - .include("llama.cpp"); - - if ggml_cuda.get_compiler().is_like_msvc() { - ggml_cuda.std("c++14"); - } else { - ggml_cuda.flag("-std=c++11").std("c++11"); - } - - ggml.define("GGML_USE_CUBLAS", None); - ggml_cuda.define("GGML_USE_CUBLAS", None); - llama_cpp.define("GGML_USE_CUBLAS", None); - } - - for build in [&mut ggml, &mut llama_cpp] { - let compiler = build.get_compiler(); - - if cfg!(target_arch = "i686") || cfg!(target_arch = "x86_64") { - let features = x86::Features::get_target(); - if compiler.is_like_clang() || compiler.is_like_gnu() { - build.flag("-pthread"); - - if features.avx { - build.flag("-mavx"); - } - if features.avx2 { - build.flag("-mavx2"); - } - if features.fma { - build.flag("-mfma"); - } - if features.f16c { - build.flag("-mf16c"); - } - if features.sse3 { - build.flag("-msse3"); - } - } else if compiler.is_like_msvc() { - match (features.avx2, features.avx) { - (true, _) => { - build.flag("/arch:AVX2"); - } - (_, true) => { - build.flag("/arch:AVX"); - } - _ => {} - } - } - } else if cfg!(target_arch = "aarch64") - && (compiler.is_like_clang() || compiler.is_like_gnu()) - { - if cfg!(target_os = "macos") { - build.flag("-mcpu=apple-m1"); - } else if env::var("HOST") == env::var("TARGET") { - build.flag("-mcpu=native"); - } - build.flag("-pthread"); - } - } - - // https://github.com/ggerganov/llama.cpp/blob/191221178f51b6e81122c5bda0fd79620e547d07/Makefile#L133-L141 - if cfg!(target_os = "macos") { - assert!(!cublas_enabled, "CUBLAS is not supported on macOS"); - - println!("cargo:rustc-link-lib=framework=Metal"); - println!("cargo:rustc-link-lib=framework=Foundation"); - println!("cargo:rustc-link-lib=framework=MetalPerformanceShaders"); - println!("cargo:rustc-link-lib=framework=MetalKit"); - - llama_cpp.define("_DARWIN_C_SOURCE", None); - - // https://github.com/ggerganov/llama.cpp/blob/3c0d25c4756742ebf15ad44700fabc0700c638bd/Makefile#L340-L343 - llama_cpp.define("GGML_USE_METAL", None); - llama_cpp.define("GGML_USE_ACCELERATE", None); - llama_cpp.define("ACCELERATE_NEW_LAPACK", None); - llama_cpp.define("ACCELERATE_LAPACK_ILP64", None); - println!("cargo:rustc-link-lib=framework=Accelerate"); - - metal_hack(&mut ggml); - ggml.include("./llama.cpp/ggml-metal.h"); - } - - if cfg!(target_os = "dragonfly") { - llama_cpp.define("__BSD_VISIBLE", None); - } - - if cfg!(target_os = 
"linux") { - ggml.define("_GNU_SOURCE", None); - } - - ggml.std("c11") - .include("./llama.cpp") - .file("llama.cpp/ggml.c") - .file("llama.cpp/ggml-alloc.c") - .file("llama.cpp/ggml-backend.c") - .file("llama.cpp/ggml-quants.c") - .define("GGML_USE_K_QUANTS", None); - - llama_cpp - .define("_XOPEN_SOURCE", Some("600")) - .include("llama.cpp") - .std("c++11") - .file("llama.cpp/llama.cpp") - .file("llama.cpp/unicode.cpp"); - - // Remove debug log output from `llama.cpp` - let is_release = env::var("PROFILE").unwrap() == "release"; - if is_release { - ggml.define("NDEBUG", None); - llama_cpp.define("NDEBUG", None); - if let Some(cuda) = ggml_cuda.as_mut() { - cuda.define("NDEBUG", None); - } - } - - if let Some(ggml_cuda) = ggml_cuda { - println!("compiling ggml-cuda"); - ggml_cuda.compile("ggml-cuda"); - println!("compiled ggml-cuda"); - } - - println!("compiling ggml"); - ggml.compile("ggml"); - println!("compiled ggml"); - - println!("compiling llama"); - llama_cpp.compile("llama"); - println!("compiled llama"); + let build = Config::new("llama.cpp") + .define("LLAMA_CUBLAS", if cfg!(feature = "cublas") { "ON" } else { "OFF" }) + .define("BUILD_SHARED_LIBS", "ON") + .define("LLAMA_BUILD_EXAMPLES", "OFF") + .define("LLAMA_BUILD_TESTS", "OFF") + .define("LLAMA_BUILD_SERVER", "OFF") + .build(); + + let shared = build.join("lib"); + println!("cargo:rustc-link-search={}", shared.display()); + println!("cargo:rustc-link-lib=dylib=llama"); let header = "llama.cpp/llama.h"; @@ -200,79 +34,4 @@ fn main() { bindings .write_to_file(out_path.join("bindings.rs")) .expect("failed to write bindings to file"); - let llama_cpp_dir = PathBuf::from("llama.cpp").canonicalize().unwrap(); - println!("cargo:INCLUDE={}", llama_cpp_dir.to_str().unwrap()); - println!("cargo:OUT_DIR={}", out_path.to_str().unwrap()); -} - -// courtesy of https://github.com/rustformers/llm -fn metal_hack(build: &mut cc::Build) { - const GGML_METAL_METAL_PATH: &str = "llama.cpp/ggml-metal.metal"; - const GGML_METAL_PATH: &str = "llama.cpp/ggml-metal.m"; - - let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is not defined")); - - let ggml_metal_path = { - let ggml_metal_metal = std::fs::read_to_string(GGML_METAL_METAL_PATH) - .expect("Could not read ggml-metal.metal") - .replace('\\', "\\\\") - .replace('\n', "\\n") - .replace('\r', "\\r") - .replace('\"', "\\\""); - - let ggml_metal = - std::fs::read_to_string(GGML_METAL_PATH).expect("Could not read ggml-metal.m"); - - let needle = r#"NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];"#; - if !ggml_metal.contains(needle) { - panic!("ggml-metal.m does not contain the needle to be replaced; the patching logic needs to be reinvestigated. 
Contact a `llama-cpp-sys-2` developer!"); - } - - // Replace the runtime read of the file with a compile-time string - let ggml_metal = ggml_metal.replace( - needle, - &format!(r#"NSString * src = @"{ggml_metal_metal}";"#), - ); - - let patched_ggml_metal_path = out_dir.join("ggml-metal.m"); - std::fs::write(&patched_ggml_metal_path, ggml_metal) - .expect("Could not write temporary patched ggml-metal.m"); - - patched_ggml_metal_path - }; - - build.file(ggml_metal_path); -} - -// Courtesy of https://github.com/rustformers/llm -fn get_supported_target_features() -> std::collections::HashSet { - env::var("CARGO_CFG_TARGET_FEATURE") - .unwrap() - .split(',') - .map(ToString::to_string) - .collect() -} - -mod x86 { - #[allow(clippy::struct_excessive_bools)] - #[derive(Clone, Debug, PartialEq, Eq)] - pub struct Features { - pub fma: bool, - pub avx: bool, - pub avx2: bool, - pub f16c: bool, - pub sse3: bool, - } - impl Features { - pub fn get_target() -> Self { - let features = crate::get_supported_target_features(); - Self { - fma: features.contains("fma"), - avx: features.contains("avx"), - avx2: features.contains("avx2"), - f16c: features.contains("f16c"), - sse3: features.contains("sse3"), - } - } - } -} +} \ No newline at end of file diff --git a/llama-cpp-sys-2/src/lib.rs b/llama-cpp-sys-2/src/lib.rs index dbec4ab4..3e82bb63 100644 --- a/llama-cpp-sys-2/src/lib.rs +++ b/llama-cpp-sys-2/src/lib.rs @@ -28,3 +28,15 @@ impl Debug for llama_grammar_element { .finish() } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn smoke_test() { + unsafe { + llama_time_us(); + } + } +} \ No newline at end of file From bc5c0e49d2e46f14072f2137584c0b9171d09efe Mon Sep 17 00:00:00 2001 From: Marcus Dunn Date: Thu, 28 Mar 2024 19:01:49 -0700 Subject: [PATCH 2/8] formatting + metal hack backport --- llama-cpp-2/src/context/sample/sampler.rs | 2 +- llama-cpp-2/src/lib.rs | 4 +- llama-cpp-2/src/model.rs | 7 +- llama-cpp-sys-2/build.rs | 106 +++++++++++----------- llama-cpp-sys-2/src/lib.rs | 2 +- 5 files changed, 59 insertions(+), 62 deletions(-) diff --git a/llama-cpp-2/src/context/sample/sampler.rs b/llama-cpp-2/src/context/sample/sampler.rs index cfe90499..948a1aa5 100644 --- a/llama-cpp-2/src/context/sample/sampler.rs +++ b/llama-cpp-2/src/context/sample/sampler.rs @@ -3,7 +3,7 @@ //! like [`crate::context::LlamaContext`] or token history to the sampler. //! //! # Example -//! +//! //! **Llama.cpp default sampler** //! //! ```rust diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 2aedd4c5..4396be55 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -246,9 +246,9 @@ pub fn llama_supports_mlock() -> bool { #[cfg(test)] mod tests { use super::*; - + #[test] fn smoke_test() { ggml_time_us(); } -} \ No newline at end of file +} diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index 5f412c25..1c8e9386 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -280,17 +280,16 @@ impl LlamaModel { /// Get chat template from model. /// /// # Errors - /// + /// /// * If the model has no chat template /// * If the chat template is not a valid [`CString`]. 
#[allow(clippy::missing_panics_doc)] // we statically know this will not panic as pub fn get_chat_template(&self, buf_size: usize) -> Result { - // longest known template is about 1200 bytes from llama.cpp let chat_temp = CString::new(vec![b'*'; buf_size]).expect("no null"); let chat_ptr = chat_temp.into_raw(); let chat_name = CString::new("tokenizer.chat_template").expect("no null bytes"); - + let chat_template: String = unsafe { let ret = llama_cpp_sys_2::llama_model_meta_val_str( self.model.as_ptr(), @@ -305,7 +304,7 @@ impl LlamaModel { debug_assert_eq!(usize::try_from(ret).unwrap(), template.len(), "llama.cpp guarantees that the returned int {ret} is the length of the string {} but that was not the case", template.len()); template }; - + Ok(chat_template) } diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index ecd02666..ef3aa539 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -1,12 +1,27 @@ -use std::env; -use std::path::PathBuf; use cmake::Config; +use std::env; +use std::path::{Path, PathBuf}; fn main() { println!("cargo:rerun-if-changed=llama.cpp"); + if !Path::new("llama.cpp/ggml.c").exists() { + panic!("llama.cpp seems to not be populated, try running `git submodule update --init --recursive` to init.") + } + + if cfg!(target_os = "macos") { + metal_hack(); + } + let build = Config::new("llama.cpp") - .define("LLAMA_CUBLAS", if cfg!(feature = "cublas") { "ON" } else { "OFF" }) + .define( + "LLAMA_CUBLAS", + if cfg!(feature = "cublas") { + "ON" + } else { + "OFF" + }, + ) .define("BUILD_SHARED_LIBS", "ON") .define("LLAMA_BUILD_EXAMPLES", "OFF") .define("LLAMA_BUILD_TESTS", "OFF") @@ -17,14 +32,6 @@ fn main() { println!("cargo:rustc-link-search={}", shared.display()); println!("cargo:rustc-link-lib=dylib=llama"); - if !Path::new("llama.cpp/ggml.c").exists() { - panic!("llama.cpp seems to not be populated, try running `git submodule update --init --recursive` to init.") - } - - if cfg!(target_os = "macos") { - metal_hack(); - } - let header = "llama.cpp/llama.h"; println!("cargo:rerun-if-changed={header}"); @@ -53,46 +60,37 @@ fn metal_hack() { const GGML_METAL_PATH: &str = "llama.cpp/ggml-metal.m"; const GGML_COMMON_PATH: &str = "llama.cpp/ggml-common.h"; - let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is not defined")); - - let ggml_metal_path = { - let ggml_metal_metal = std::fs::read_to_string(GGML_METAL_METAL_PATH) - .expect("Could not read ggml-metal.metal") - .replace('\\', "\\\\") - .replace('\n', "\\n") - .replace('\r', "\\r") - .replace('\"', "\\\""); - - let ggml_common = std::fs::read_to_string(GGML_COMMON_PATH).expect("Could not read ggml-common.h") - .replace('\\', "\\\\") - .replace('\n', "\\n") - .replace('\r', "\\r") - .replace('\"', "\\\""); - - let includged_ggml_metal_metal = ggml_metal_metal.replace( - "#include \\\"ggml-common.h\\\"", - &format!("{ggml_common}") - ); - print!("{}", &includged_ggml_metal_metal); - - let ggml_metal = - std::fs::read_to_string(GGML_METAL_PATH).expect("Could not read ggml-metal.m"); - - let needle = r#"NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];"#; - if !ggml_metal.contains(needle) { - panic!("ggml-metal.m does not contain the needle to be replaced; the patching logic needs to be reinvestigated. 
Contact a `llama-cpp-sys-2` developer!"); - } - - // Replace the runtime read of the file with a compile-time string - let ggml_metal = ggml_metal.replace( - needle, - &format!(r#"NSString * src = @"{includged_ggml_metal_metal}";"#), - ); - - let patched_ggml_metal_path = out_dir.join("ggml-metal.m"); - std::fs::write(&patched_ggml_metal_path, ggml_metal) - .expect("Could not write temporary patched ggml-metal.m"); - - patched_ggml_metal_path - }; -} \ No newline at end of file + let ggml_metal_metal = std::fs::read_to_string(GGML_METAL_METAL_PATH) + .expect("Could not read ggml-metal.metal") + .replace('\\', "\\\\") + .replace('\n', "\\n") + .replace('\r', "\\r") + .replace('\"', "\\\""); + + let ggml_common = std::fs::read_to_string(GGML_COMMON_PATH) + .expect("Could not read ggml-common.h") + .replace('\\', "\\\\") + .replace('\n', "\\n") + .replace('\r', "\\r") + .replace('\"', "\\\""); + + let includged_ggml_metal_metal = + ggml_metal_metal.replace("#include \\\"ggml-common.h\\\"", &format!("{ggml_common}")); + print!("{}", &includged_ggml_metal_metal); + + let ggml_metal = std::fs::read_to_string(GGML_METAL_PATH).expect("Could not read ggml-metal.m"); + + let needle = r#"NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];"#; + if !ggml_metal.contains(needle) { + panic!("ggml-metal.m does not contain the needle to be replaced; the patching logic needs to be reinvestigated. Contact a `llama-cpp-sys-2` developer!"); + } + + // Replace the runtime read of the file with a compile-time string + let ggml_metal = ggml_metal.replace( + needle, + &format!(r#"NSString * src = @"{includged_ggml_metal_metal}";"#), + ); + + std::fs::write(&GGML_METAL_PATH, ggml_metal) + .expect("Could not write temporary patched ggml-metal.m"); +} diff --git a/llama-cpp-sys-2/src/lib.rs b/llama-cpp-sys-2/src/lib.rs index 3e82bb63..d166d738 100644 --- a/llama-cpp-sys-2/src/lib.rs +++ b/llama-cpp-sys-2/src/lib.rs @@ -39,4 +39,4 @@ mod tests { llama_time_us(); } } -} \ No newline at end of file +} From 7a20a387817249365266973cd2c4d00275c653fe Mon Sep 17 00:00:00 2001 From: Marcus Dunn Date: Thu, 28 Mar 2024 19:39:37 -0700 Subject: [PATCH 3/8] remove print --- llama-cpp-sys-2/build.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index ef3aa539..5fd29e89 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -76,7 +76,6 @@ fn metal_hack() { let includged_ggml_metal_metal = ggml_metal_metal.replace("#include \\\"ggml-common.h\\\"", &format!("{ggml_common}")); - print!("{}", &includged_ggml_metal_metal); let ggml_metal = std::fs::read_to_string(GGML_METAL_PATH).expect("Could not read ggml-metal.m"); From e81b0ec89797f640225688175d6494cd076ba2e9 Mon Sep 17 00:00:00 2001 From: Marcus Dunn Date: Thu, 28 Mar 2024 20:30:30 -0700 Subject: [PATCH 4/8] add cmake to test-build --- llama-cpp-sys-2/build.rs | 8 +------- test-build.Dockerfile | 2 +- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 5fd29e89..ed5ad953 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -80,15 +80,9 @@ fn metal_hack() { let ggml_metal = std::fs::read_to_string(GGML_METAL_PATH).expect("Could not read ggml-metal.m"); let needle = r#"NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];"#; - if !ggml_metal.contains(needle) { - panic!("ggml-metal.m does not contain the needle to be 
replaced; the patching logic needs to be reinvestigated. Contact a `llama-cpp-sys-2` developer!"); - } // Replace the runtime read of the file with a compile-time string - let ggml_metal = ggml_metal.replace( - needle, - &format!(r#"NSString * src = @"{includged_ggml_metal_metal}";"#), - ); + let ggml_metal = ggml_metal.replace(needle, &format!(r#"NSString * src = @"{includged_ggml_metal_metal}";"#), ); std::fs::write(&GGML_METAL_PATH, ggml_metal) .expect("Could not write temporary patched ggml-metal.m"); diff --git a/test-build.Dockerfile b/test-build.Dockerfile index da82218e..5b47e31f 100644 --- a/test-build.Dockerfile +++ b/test-build.Dockerfile @@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04 FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} as base-cuda # Install requirements for rustup install + bindgen: https://rust-lang.github.io/rust-bindgen/requirements.html -RUN DEBIAN_FRONTEND=noninteractive apt update -y && apt install -y curl llvm-dev libclang-dev clang pkg-config libssl-dev +RUN DEBIAN_FRONTEND=noninteractive apt update -y && apt install -y curl llvm-dev libclang-dev clang pkg-config libssl-dev cmake RUN curl https://sh.rustup.rs -sSf | bash -s -- -y ENV PATH=/root/.cargo/bin:$PATH From 938da11b190920e6d88b80f08eeda63633951f40 Mon Sep 17 00:00:00 2001 From: Marcus Dunn Date: Thu, 28 Mar 2024 20:33:55 -0700 Subject: [PATCH 5/8] added vulkan (I think?) --- llama-cpp-2/Cargo.toml | 1 + llama-cpp-sys-2/Cargo.toml | 1 + llama-cpp-sys-2/build.rs | 5 +++++ simple/Cargo.toml | 1 + 4 files changed, 8 insertions(+) diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 7e2d9f59..df63ebd6 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -15,6 +15,7 @@ tracing = { workspace = true } [features] cublas = ["llama-cpp-sys-2/cublas"] +vulkan = ["llama-cpp-sys-2/vulkan"] sampler = [] [lints] diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 73d3fb98..ffcdedfa 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -46,4 +46,5 @@ cmake = { workspace = true } [features] cublas = [] +vulkan = [] diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index ed5ad953..9a59cb47 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -22,6 +22,11 @@ fn main() { "OFF" }, ) + .define("LLAMA_VULKAN", if cfg!(feature = "vulkan") { + "ON" + } else { + "OFF" + }) .define("BUILD_SHARED_LIBS", "ON") .define("LLAMA_BUILD_EXAMPLES", "OFF") .define("LLAMA_BUILD_TESTS", "OFF") diff --git a/simple/Cargo.toml b/simple/Cargo.toml index 4e23632a..4d876102 100644 --- a/simple/Cargo.toml +++ b/simple/Cargo.toml @@ -13,6 +13,7 @@ anyhow = { workspace = true } [features] cublas = ["llama-cpp-2/cublas"] +vulkan = ["llama-cpp-2/vulkan"] [lints] workspace = true From 9ee4c0f05ff77c73f19e2e1d570549905f64fd13 Mon Sep 17 00:00:00 2001 From: Marcus Dunn Date: Thu, 28 Mar 2024 23:06:33 -0700 Subject: [PATCH 6/8] update llama.cpp --- llama-cpp-sys-2/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp index a32b77c4..bfe7dafc 160000 --- a/llama-cpp-sys-2/llama.cpp +++ b/llama-cpp-sys-2/llama.cpp @@ -1 +1 @@ -Subproject commit a32b77c4b2c1808654d0b952f26c37d73d2e746b +Subproject commit bfe7dafc9cf96b9a09ead347fed9a547930fc631 From 988de5595a3d1e406c20144acb80336cc65b58b3 Mon Sep 17 00:00:00 2001 From: marcus Date: Sun, 21 Apr 2024 11:30:00 -0700 Subject: [PATCH 7/8] updated to latest llama.cpp (seems to run llama-3) --- 
llama-cpp-2/src/model.rs | 29 +++++++++++++++++++++++------ llama-cpp-sys-2/llama.cpp | 2 +- simple/src/main.rs | 6 +++--- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index 1c8e9386..01b32811 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -34,6 +34,15 @@ pub enum AddBos { Never, } +/// How to determine if we should tokenize special tokens +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Special { + /// Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. Does not insert a leading space. + Tokenize, + /// Treat special and/or control tokens as plaintext. + Plaintext, +} + unsafe impl Send for LlamaModel {} unsafe impl Sync for LlamaModel {} @@ -54,10 +63,11 @@ impl LlamaModel { /// Get all tokens in the model. pub fn tokens( &self, + special: Special, ) -> impl Iterator)> + '_ { (0..self.n_vocab()) .map(LlamaToken::new) - .map(|llama_token| (llama_token, self.token_to_str(llama_token))) + .map(move |llama_token| (llama_token, self.token_to_str(llama_token, special))) } /// Get the beginning of stream token. @@ -86,8 +96,8 @@ impl LlamaModel { /// # Errors /// /// See [`TokenToStringError`] for more information. - pub fn token_to_str(&self, token: LlamaToken) -> Result { - self.token_to_str_with_size(token, 32) + pub fn token_to_str(&self, token: LlamaToken, special: Special) -> Result { + self.token_to_str_with_size(token, 32, special) } /// Convert a vector of tokens to a single string. @@ -95,9 +105,9 @@ impl LlamaModel { /// # Errors /// /// See [`TokenToStringError`] for more information. - pub fn tokens_to_str(&self, tokens: &[LlamaToken]) -> Result { + pub fn tokens_to_str(&self, tokens: &[LlamaToken], special: Special) -> Result { let mut builder = String::with_capacity(tokens.len() * 4); - for str in tokens.iter().copied().map(|t| self.token_to_str(t)) { + for str in tokens.iter().copied().map(|t| self.token_to_str(t, special)) { builder += &str?; } Ok(builder) @@ -210,11 +220,13 @@ impl LlamaModel { &self, token: LlamaToken, buffer_size: usize, + special: Special, ) -> Result { if token == self.token_nl() { return Ok(String::from("\n")); } + // unsure what to do with this in the face of the 'special' arg match self.token_type(token) { LlamaTokenType::Normal | LlamaTokenType::UserDefined => {} LlamaTokenType::Control => { @@ -230,12 +242,17 @@ impl LlamaModel { } } + let special = match special { + Special::Tokenize => true, + Special::Plaintext => false, + }; + let string = CString::new(vec![b'*'; buffer_size]).expect("no null"); let len = string.as_bytes().len(); let len = c_int::try_from(len).expect("length fits into c_int"); let buf = string.into_raw(); let size = unsafe { - llama_cpp_sys_2::llama_token_to_piece(self.model.as_ptr(), token.0, buf, len) + llama_cpp_sys_2::llama_token_to_piece(self.model.as_ptr(), token.0, buf, len, special) }; match size { diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp index bfe7dafc..5cf5e7d4 160000 --- a/llama-cpp-sys-2/llama.cpp +++ b/llama-cpp-sys-2/llama.cpp @@ -1 +1 @@ -Subproject commit bfe7dafc9cf96b9a09ead347fed9a547930fc631 +Subproject commit 5cf5e7d490dfdd2e70bface2d35dfd14aa44b4fb diff --git a/simple/src/main.rs b/simple/src/main.rs index 8f7451f7..9d13ed9d 100644 --- a/simple/src/main.rs +++ b/simple/src/main.rs @@ -15,7 +15,7 @@ use llama_cpp_2::llama_backend::LlamaBackend; use llama_cpp_2::llama_batch::LlamaBatch; use 
llama_cpp_2::model::params::kv_overrides::ParamOverrideValue; use llama_cpp_2::model::params::LlamaModelParams; -use llama_cpp_2::model::AddBos; +use llama_cpp_2::model::{AddBos, Special}; use llama_cpp_2::model::LlamaModel; use llama_cpp_2::token::data_array::LlamaTokenDataArray; use std::ffi::CString; @@ -214,7 +214,7 @@ either reduce n_len or increase n_ctx" eprintln!(); for token in &tokens_list { - eprint!("{}", model.token_to_str(*token)?); + eprint!("{}", model.token_to_str(*token, Special::Tokenize)?); } std::io::stderr().flush()?; @@ -256,7 +256,7 @@ either reduce n_len or increase n_ctx" break; } - print!("{}", model.token_to_str(new_token_id)?); + print!("{}", model.token_to_str(new_token_id, Special::Tokenize)?); std::io::stdout().flush()?; batch.clear(); From 29a6e26c0006b24559b8ff0eb6eb649475ad40d2 Mon Sep 17 00:00:00 2001 From: marcus Date: Sun, 21 Apr 2024 11:32:41 -0700 Subject: [PATCH 8/8] fixed embeddings compiler error --- embeddings/src/main.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/embeddings/src/main.rs b/embeddings/src/main.rs index bc0b578d..67c8988b 100644 --- a/embeddings/src/main.rs +++ b/embeddings/src/main.rs @@ -20,7 +20,7 @@ use llama_cpp_2::ggml_time_us; use llama_cpp_2::llama_backend::LlamaBackend; use llama_cpp_2::llama_batch::LlamaBatch; use llama_cpp_2::model::params::LlamaModelParams; -use llama_cpp_2::model::AddBos; +use llama_cpp_2::model::{AddBos, Special}; use llama_cpp_2::model::LlamaModel; #[derive(clap::Parser, Debug, Clone)] @@ -137,7 +137,7 @@ fn main() -> Result<()> { for (i, token_line) in tokens_lines_list.iter().enumerate() { eprintln!("Prompt {i}"); for token in token_line { - eprintln!(" {} --> {}", token, model.token_to_str(*token)?); + eprintln!(" {} --> {}", token, model.token_to_str(*token, Special::Tokenize)?); } eprintln!(); }
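
The series above changes the `token_to_str` family to take an explicit `Special` argument (patch 7) and adds a `vulkan` cargo feature (patch 5). As a hedged illustration of what downstream call sites look like after these patches, the sketch below mirrors the call pattern used in `simple/src/main.rs` and `embeddings/src/main.rs`; the helper name `print_tokens`, the `Box<dyn Error>` error handling, and the exact import path for `LlamaToken` are assumptions for the example, not part of the patches.

```rust
use llama_cpp_2::model::{LlamaModel, Special};
use llama_cpp_2::token::LlamaToken;

// Hypothetical helper: render a token sequence with the post-patch-7 API.
// `Special::Tokenize` renders special/control tokens as tokens;
// `Special::Plaintext` treats them as plain text.
fn print_tokens(
    model: &LlamaModel,
    tokens: &[LlamaToken],
) -> Result<(), Box<dyn std::error::Error>> {
    for &token in tokens {
        // Before patch 7 this call was `model.token_to_str(token)?`.
        print!("{}", model.token_to_str(token, Special::Tokenize)?);
    }
    Ok(())
}
```

With patch 5 applied, the examples can opt into the Vulkan backend through the usual feature flag, for example `cargo build --features vulkan` in the `simple` crate (assuming CMake and a Vulkan SDK are installed); the build script maps that feature to `-DLLAMA_VULKAN=ON` in the CMake invocation.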