From 70ec39239e68d9eb9110140ed26e5fbdbd0e91ff Mon Sep 17 00:00:00 2001 From: marcus Date: Mon, 25 Mar 2024 17:04:31 -0700 Subject: [PATCH 1/8] use cmake instead of `cc` --- Cargo.lock | 24 ++-- Cargo.toml | 2 +- llama-cpp-2/src/lib.rs | 10 ++ llama-cpp-sys-2/Cargo.toml | 2 +- llama-cpp-sys-2/build.rs | 267 ++----------------------------------- llama-cpp-sys-2/src/lib.rs | 12 ++ 6 files changed, 47 insertions(+), 270 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a96488ef..17f31be0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -123,10 +123,6 @@ name = "cc" version = "1.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" -dependencies = [ - "jobserver", - "libc", -] [[package]] name = "cexpr" @@ -194,6 +190,15 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" +[[package]] +name = "cmake" +version = "0.1.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" +dependencies = [ + "cc", +] + [[package]] name = "colorchoice" version = "1.0.0" @@ -418,15 +423,6 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" -[[package]] -name = "jobserver" -version = "0.1.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab46a6e9526ddef3ae7f787c06f0f2600639ba80ea3eade3d8e670a2230f51d6" -dependencies = [ - "libc", -] - [[package]] name = "js-sys" version = "0.3.64" @@ -484,7 +480,7 @@ name = "llama-cpp-sys-2" version = "0.1.45" dependencies = [ "bindgen", - "cc", + "cmake", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 1c6eba10..e29e58d8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,9 +16,9 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.4" -cc = "1.0.90" anyhow = "1.0.81" clap = "4.5.3" +cmake = "0.1.50" [workspace.lints.rust] missing_docs = { level = "warn" } diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 49e333e0..2aedd4c5 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -242,3 +242,13 @@ pub fn ggml_time_us() -> i64 { pub fn llama_supports_mlock() -> bool { unsafe { llama_cpp_sys_2::llama_supports_mlock() } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn smoke_test() { + ggml_time_us(); + } +} \ No newline at end of file diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index fc908456..94577bec 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -42,7 +42,7 @@ include = [ [build-dependencies] bindgen = { workspace = true } -cc = { workspace = true, features = ["parallel"] } +cmake = { workspace = true } [features] cublas = [] diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 5d14cea5..6d131b3e 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -1,187 +1,21 @@ use std::env; -use std::path::Path; use std::path::PathBuf; +use cmake::Config; fn main() { println!("cargo:rerun-if-changed=llama.cpp"); - let cublas_enabled = env::var("CARGO_FEATURE_CUBLAS").is_ok(); - - let mut ggml_cuda = if cublas_enabled { - Some(cc::Build::new()) - } else { - None - }; - - if !Path::new("llama.cpp/ggml.c").exists() { - panic!("llama.cpp seems to not be populated, try running `git submodule 
update --init --recursive` to init.") - } - - let mut ggml = cc::Build::new(); - let mut llama_cpp = cc::Build::new(); - - ggml.cpp(false); - llama_cpp.cpp(true); - - // https://github.com/ggerganov/llama.cpp/blob/a836c8f534ab789b02da149fbdaf7735500bff74/Makefile#L364-L368 - if let Some(ggml_cuda) = &mut ggml_cuda { - for lib in [ - "cuda", "cublas", "culibos", "cudart", "cublasLt", "pthread", "dl", "rt", - ] { - println!("cargo:rustc-link-lib={}", lib); - } - if !ggml_cuda.get_compiler().is_like_msvc() { - for lib in ["culibos", "pthread", "dl", "rt"] { - println!("cargo:rustc-link-lib={}", lib); - } - } - - println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); - - if cfg!(target_arch = "aarch64") { - ggml_cuda - .flag_if_supported("-mfp16-format=ieee") - .flag_if_supported("-mno-unaligned-access"); - ggml.flag_if_supported("-mfp16-format=ieee") - .flag_if_supported("-mno-unaligned-access"); - llama_cpp - .flag_if_supported("-mfp16-format=ieee") - .flag_if_supported("-mno-unaligned-access"); - ggml.flag_if_supported("-mfp16-format=ieee") - .flag_if_supported("-mno-unaligned-access"); - } - - ggml_cuda - .cuda(true) - .flag("-arch=all") - .file("llama.cpp/ggml-cuda.cu") - .include("llama.cpp"); - - if ggml_cuda.get_compiler().is_like_msvc() { - ggml_cuda.std("c++14"); - } else { - ggml_cuda.flag("-std=c++11").std("c++11"); - } - - ggml.define("GGML_USE_CUBLAS", None); - ggml_cuda.define("GGML_USE_CUBLAS", None); - llama_cpp.define("GGML_USE_CUBLAS", None); - } - - for build in [&mut ggml, &mut llama_cpp] { - let compiler = build.get_compiler(); - - if cfg!(target_arch = "i686") || cfg!(target_arch = "x86_64") { - let features = x86::Features::get_target(); - if compiler.is_like_clang() || compiler.is_like_gnu() { - build.flag("-pthread"); - - if features.avx { - build.flag("-mavx"); - } - if features.avx2 { - build.flag("-mavx2"); - } - if features.fma { - build.flag("-mfma"); - } - if features.f16c { - build.flag("-mf16c"); - } - if features.sse3 { - build.flag("-msse3"); - } - } else if compiler.is_like_msvc() { - match (features.avx2, features.avx) { - (true, _) => { - build.flag("/arch:AVX2"); - } - (_, true) => { - build.flag("/arch:AVX"); - } - _ => {} - } - } - } else if cfg!(target_arch = "aarch64") - && (compiler.is_like_clang() || compiler.is_like_gnu()) - { - if cfg!(target_os = "macos") { - build.flag("-mcpu=apple-m1"); - } else if env::var("HOST") == env::var("TARGET") { - build.flag("-mcpu=native"); - } - build.flag("-pthread"); - } - } - - // https://github.com/ggerganov/llama.cpp/blob/191221178f51b6e81122c5bda0fd79620e547d07/Makefile#L133-L141 - if cfg!(target_os = "macos") { - assert!(!cublas_enabled, "CUBLAS is not supported on macOS"); - - println!("cargo:rustc-link-lib=framework=Metal"); - println!("cargo:rustc-link-lib=framework=Foundation"); - println!("cargo:rustc-link-lib=framework=MetalPerformanceShaders"); - println!("cargo:rustc-link-lib=framework=MetalKit"); - - llama_cpp.define("_DARWIN_C_SOURCE", None); - - // https://github.com/ggerganov/llama.cpp/blob/3c0d25c4756742ebf15ad44700fabc0700c638bd/Makefile#L340-L343 - llama_cpp.define("GGML_USE_METAL", None); - llama_cpp.define("GGML_USE_ACCELERATE", None); - llama_cpp.define("ACCELERATE_NEW_LAPACK", None); - llama_cpp.define("ACCELERATE_LAPACK_ILP64", None); - println!("cargo:rustc-link-lib=framework=Accelerate"); - - metal_hack(&mut ggml); - ggml.include("./llama.cpp/ggml-metal.h"); - } - - if cfg!(target_os = "dragonfly") { - llama_cpp.define("__BSD_VISIBLE", None); - } - - if cfg!(target_os = 
"linux") { - ggml.define("_GNU_SOURCE", None); - } - - ggml.std("c11") - .include("./llama.cpp") - .file("llama.cpp/ggml.c") - .file("llama.cpp/ggml-alloc.c") - .file("llama.cpp/ggml-backend.c") - .file("llama.cpp/ggml-quants.c") - .define("GGML_USE_K_QUANTS", None); - - llama_cpp - .define("_XOPEN_SOURCE", Some("600")) - .include("llama.cpp") - .std("c++11") - .file("llama.cpp/llama.cpp") - .file("llama.cpp/unicode.cpp"); - - // Remove debug log output from `llama.cpp` - let is_release = env::var("PROFILE").unwrap() == "release"; - if is_release { - ggml.define("NDEBUG", None); - llama_cpp.define("NDEBUG", None); - if let Some(cuda) = ggml_cuda.as_mut() { - cuda.define("NDEBUG", None); - } - } - - if let Some(ggml_cuda) = ggml_cuda { - println!("compiling ggml-cuda"); - ggml_cuda.compile("ggml-cuda"); - println!("compiled ggml-cuda"); - } - - println!("compiling ggml"); - ggml.compile("ggml"); - println!("compiled ggml"); - - println!("compiling llama"); - llama_cpp.compile("llama"); - println!("compiled llama"); + let build = Config::new("llama.cpp") + .define("LLAMA_CUBLAS", if cfg!(feature = "cublas") { "ON" } else { "OFF" }) + .define("BUILD_SHARED_LIBS", "ON") + .define("LLAMA_BUILD_EXAMPLES", "OFF") + .define("LLAMA_BUILD_TESTS", "OFF") + .define("LLAMA_BUILD_SERVER", "OFF") + .build(); + + let shared = build.join("lib"); + println!("cargo:rustc-link-search={}", shared.display()); + println!("cargo:rustc-link-lib=dylib=llama"); let header = "llama.cpp/llama.h"; @@ -200,79 +34,4 @@ fn main() { bindings .write_to_file(out_path.join("bindings.rs")) .expect("failed to write bindings to file"); - let llama_cpp_dir = PathBuf::from("llama.cpp").canonicalize().unwrap(); - println!("cargo:INCLUDE={}", llama_cpp_dir.to_str().unwrap()); - println!("cargo:OUT_DIR={}", out_path.to_str().unwrap()); -} - -// courtesy of https://github.com/rustformers/llm -fn metal_hack(build: &mut cc::Build) { - const GGML_METAL_METAL_PATH: &str = "llama.cpp/ggml-metal.metal"; - const GGML_METAL_PATH: &str = "llama.cpp/ggml-metal.m"; - - let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is not defined")); - - let ggml_metal_path = { - let ggml_metal_metal = std::fs::read_to_string(GGML_METAL_METAL_PATH) - .expect("Could not read ggml-metal.metal") - .replace('\\', "\\\\") - .replace('\n', "\\n") - .replace('\r', "\\r") - .replace('\"', "\\\""); - - let ggml_metal = - std::fs::read_to_string(GGML_METAL_PATH).expect("Could not read ggml-metal.m"); - - let needle = r#"NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];"#; - if !ggml_metal.contains(needle) { - panic!("ggml-metal.m does not contain the needle to be replaced; the patching logic needs to be reinvestigated. 
Contact a `llama-cpp-sys-2` developer!"); - } - - // Replace the runtime read of the file with a compile-time string - let ggml_metal = ggml_metal.replace( - needle, - &format!(r#"NSString * src = @"{ggml_metal_metal}";"#), - ); - - let patched_ggml_metal_path = out_dir.join("ggml-metal.m"); - std::fs::write(&patched_ggml_metal_path, ggml_metal) - .expect("Could not write temporary patched ggml-metal.m"); - - patched_ggml_metal_path - }; - - build.file(ggml_metal_path); -} - -// Courtesy of https://github.com/rustformers/llm -fn get_supported_target_features() -> std::collections::HashSet { - env::var("CARGO_CFG_TARGET_FEATURE") - .unwrap() - .split(',') - .map(ToString::to_string) - .collect() -} - -mod x86 { - #[allow(clippy::struct_excessive_bools)] - #[derive(Clone, Debug, PartialEq, Eq)] - pub struct Features { - pub fma: bool, - pub avx: bool, - pub avx2: bool, - pub f16c: bool, - pub sse3: bool, - } - impl Features { - pub fn get_target() -> Self { - let features = crate::get_supported_target_features(); - Self { - fma: features.contains("fma"), - avx: features.contains("avx"), - avx2: features.contains("avx2"), - f16c: features.contains("f16c"), - sse3: features.contains("sse3"), - } - } - } -} +} \ No newline at end of file diff --git a/llama-cpp-sys-2/src/lib.rs b/llama-cpp-sys-2/src/lib.rs index dbec4ab4..3e82bb63 100644 --- a/llama-cpp-sys-2/src/lib.rs +++ b/llama-cpp-sys-2/src/lib.rs @@ -28,3 +28,15 @@ impl Debug for llama_grammar_element { .finish() } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn smoke_test() { + unsafe { + llama_time_us(); + } + } +} \ No newline at end of file From bc5c0e49d2e46f14072f2137584c0b9171d09efe Mon Sep 17 00:00:00 2001 From: Marcus Dunn Date: Thu, 28 Mar 2024 19:01:49 -0700 Subject: [PATCH 2/8] formatting + metal hack backport --- llama-cpp-2/src/context/sample/sampler.rs | 2 +- llama-cpp-2/src/lib.rs | 4 +- llama-cpp-2/src/model.rs | 7 +- llama-cpp-sys-2/build.rs | 106 +++++++++++----------- llama-cpp-sys-2/src/lib.rs | 2 +- 5 files changed, 59 insertions(+), 62 deletions(-) diff --git a/llama-cpp-2/src/context/sample/sampler.rs b/llama-cpp-2/src/context/sample/sampler.rs index cfe90499..948a1aa5 100644 --- a/llama-cpp-2/src/context/sample/sampler.rs +++ b/llama-cpp-2/src/context/sample/sampler.rs @@ -3,7 +3,7 @@ //! like [`crate::context::LlamaContext`] or token history to the sampler. //! //! # Example -//! +//! //! **Llama.cpp default sampler** //! //! ```rust diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 2aedd4c5..4396be55 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -246,9 +246,9 @@ pub fn llama_supports_mlock() -> bool { #[cfg(test)] mod tests { use super::*; - + #[test] fn smoke_test() { ggml_time_us(); } -} \ No newline at end of file +} diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index 5f412c25..1c8e9386 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -280,17 +280,16 @@ impl LlamaModel { /// Get chat template from model. /// /// # Errors - /// + /// /// * If the model has no chat template /// * If the chat template is not a valid [`CString`]. 
#[allow(clippy::missing_panics_doc)] // we statically know this will not panic as pub fn get_chat_template(&self, buf_size: usize) -> Result { - // longest known template is about 1200 bytes from llama.cpp let chat_temp = CString::new(vec![b'*'; buf_size]).expect("no null"); let chat_ptr = chat_temp.into_raw(); let chat_name = CString::new("tokenizer.chat_template").expect("no null bytes"); - + let chat_template: String = unsafe { let ret = llama_cpp_sys_2::llama_model_meta_val_str( self.model.as_ptr(), @@ -305,7 +304,7 @@ impl LlamaModel { debug_assert_eq!(usize::try_from(ret).unwrap(), template.len(), "llama.cpp guarantees that the returned int {ret} is the length of the string {} but that was not the case", template.len()); template }; - + Ok(chat_template) } diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index ecd02666..ef3aa539 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -1,12 +1,27 @@ -use std::env; -use std::path::PathBuf; use cmake::Config; +use std::env; +use std::path::{Path, PathBuf}; fn main() { println!("cargo:rerun-if-changed=llama.cpp"); + if !Path::new("llama.cpp/ggml.c").exists() { + panic!("llama.cpp seems to not be populated, try running `git submodule update --init --recursive` to init.") + } + + if cfg!(target_os = "macos") { + metal_hack(); + } + let build = Config::new("llama.cpp") - .define("LLAMA_CUBLAS", if cfg!(feature = "cublas") { "ON" } else { "OFF" }) + .define( + "LLAMA_CUBLAS", + if cfg!(feature = "cublas") { + "ON" + } else { + "OFF" + }, + ) .define("BUILD_SHARED_LIBS", "ON") .define("LLAMA_BUILD_EXAMPLES", "OFF") .define("LLAMA_BUILD_TESTS", "OFF") @@ -17,14 +32,6 @@ fn main() { println!("cargo:rustc-link-search={}", shared.display()); println!("cargo:rustc-link-lib=dylib=llama"); - if !Path::new("llama.cpp/ggml.c").exists() { - panic!("llama.cpp seems to not be populated, try running `git submodule update --init --recursive` to init.") - } - - if cfg!(target_os = "macos") { - metal_hack(); - } - let header = "llama.cpp/llama.h"; println!("cargo:rerun-if-changed={header}"); @@ -53,46 +60,37 @@ fn metal_hack() { const GGML_METAL_PATH: &str = "llama.cpp/ggml-metal.m"; const GGML_COMMON_PATH: &str = "llama.cpp/ggml-common.h"; - let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is not defined")); - - let ggml_metal_path = { - let ggml_metal_metal = std::fs::read_to_string(GGML_METAL_METAL_PATH) - .expect("Could not read ggml-metal.metal") - .replace('\\', "\\\\") - .replace('\n', "\\n") - .replace('\r', "\\r") - .replace('\"', "\\\""); - - let ggml_common = std::fs::read_to_string(GGML_COMMON_PATH).expect("Could not read ggml-common.h") - .replace('\\', "\\\\") - .replace('\n', "\\n") - .replace('\r', "\\r") - .replace('\"', "\\\""); - - let includged_ggml_metal_metal = ggml_metal_metal.replace( - "#include \\\"ggml-common.h\\\"", - &format!("{ggml_common}") - ); - print!("{}", &includged_ggml_metal_metal); - - let ggml_metal = - std::fs::read_to_string(GGML_METAL_PATH).expect("Could not read ggml-metal.m"); - - let needle = r#"NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];"#; - if !ggml_metal.contains(needle) { - panic!("ggml-metal.m does not contain the needle to be replaced; the patching logic needs to be reinvestigated. 
Contact a `llama-cpp-sys-2` developer!"); - } - - // Replace the runtime read of the file with a compile-time string - let ggml_metal = ggml_metal.replace( - needle, - &format!(r#"NSString * src = @"{includged_ggml_metal_metal}";"#), - ); - - let patched_ggml_metal_path = out_dir.join("ggml-metal.m"); - std::fs::write(&patched_ggml_metal_path, ggml_metal) - .expect("Could not write temporary patched ggml-metal.m"); - - patched_ggml_metal_path - }; -} \ No newline at end of file + let ggml_metal_metal = std::fs::read_to_string(GGML_METAL_METAL_PATH) + .expect("Could not read ggml-metal.metal") + .replace('\\', "\\\\") + .replace('\n', "\\n") + .replace('\r', "\\r") + .replace('\"', "\\\""); + + let ggml_common = std::fs::read_to_string(GGML_COMMON_PATH) + .expect("Could not read ggml-common.h") + .replace('\\', "\\\\") + .replace('\n', "\\n") + .replace('\r', "\\r") + .replace('\"', "\\\""); + + let includged_ggml_metal_metal = + ggml_metal_metal.replace("#include \\\"ggml-common.h\\\"", &format!("{ggml_common}")); + print!("{}", &includged_ggml_metal_metal); + + let ggml_metal = std::fs::read_to_string(GGML_METAL_PATH).expect("Could not read ggml-metal.m"); + + let needle = r#"NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];"#; + if !ggml_metal.contains(needle) { + panic!("ggml-metal.m does not contain the needle to be replaced; the patching logic needs to be reinvestigated. Contact a `llama-cpp-sys-2` developer!"); + } + + // Replace the runtime read of the file with a compile-time string + let ggml_metal = ggml_metal.replace( + needle, + &format!(r#"NSString * src = @"{includged_ggml_metal_metal}";"#), + ); + + std::fs::write(&GGML_METAL_PATH, ggml_metal) + .expect("Could not write temporary patched ggml-metal.m"); +} diff --git a/llama-cpp-sys-2/src/lib.rs b/llama-cpp-sys-2/src/lib.rs index 3e82bb63..d166d738 100644 --- a/llama-cpp-sys-2/src/lib.rs +++ b/llama-cpp-sys-2/src/lib.rs @@ -39,4 +39,4 @@ mod tests { llama_time_us(); } } -} \ No newline at end of file +} From 7a20a387817249365266973cd2c4d00275c653fe Mon Sep 17 00:00:00 2001 From: Marcus Dunn Date: Thu, 28 Mar 2024 19:39:37 -0700 Subject: [PATCH 3/8] remove print --- llama-cpp-sys-2/build.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index ef3aa539..5fd29e89 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -76,7 +76,6 @@ fn metal_hack() { let includged_ggml_metal_metal = ggml_metal_metal.replace("#include \\\"ggml-common.h\\\"", &format!("{ggml_common}")); - print!("{}", &includged_ggml_metal_metal); let ggml_metal = std::fs::read_to_string(GGML_METAL_PATH).expect("Could not read ggml-metal.m"); From e81b0ec89797f640225688175d6494cd076ba2e9 Mon Sep 17 00:00:00 2001 From: Marcus Dunn Date: Thu, 28 Mar 2024 20:30:30 -0700 Subject: [PATCH 4/8] add cmake to test-build --- llama-cpp-sys-2/build.rs | 8 +------- test-build.Dockerfile | 2 +- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 5fd29e89..ed5ad953 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -80,15 +80,9 @@ fn metal_hack() { let ggml_metal = std::fs::read_to_string(GGML_METAL_PATH).expect("Could not read ggml-metal.m"); let needle = r#"NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];"#; - if !ggml_metal.contains(needle) { - panic!("ggml-metal.m does not contain the needle to be 
replaced; the patching logic needs to be reinvestigated. Contact a `llama-cpp-sys-2` developer!"); - } // Replace the runtime read of the file with a compile-time string - let ggml_metal = ggml_metal.replace( - needle, - &format!(r#"NSString * src = @"{includged_ggml_metal_metal}";"#), - ); + let ggml_metal = ggml_metal.replace(needle, &format!(r#"NSString * src = @"{includged_ggml_metal_metal}";"#), ); std::fs::write(&GGML_METAL_PATH, ggml_metal) .expect("Could not write temporary patched ggml-metal.m"); diff --git a/test-build.Dockerfile b/test-build.Dockerfile index da82218e..5b47e31f 100644 --- a/test-build.Dockerfile +++ b/test-build.Dockerfile @@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04 FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} as base-cuda # Install requirements for rustup install + bindgen: https://rust-lang.github.io/rust-bindgen/requirements.html -RUN DEBIAN_FRONTEND=noninteractive apt update -y && apt install -y curl llvm-dev libclang-dev clang pkg-config libssl-dev +RUN DEBIAN_FRONTEND=noninteractive apt update -y && apt install -y curl llvm-dev libclang-dev clang pkg-config libssl-dev cmake RUN curl https://sh.rustup.rs -sSf | bash -s -- -y ENV PATH=/root/.cargo/bin:$PATH From 938da11b190920e6d88b80f08eeda63633951f40 Mon Sep 17 00:00:00 2001 From: Marcus Dunn Date: Thu, 28 Mar 2024 20:33:55 -0700 Subject: [PATCH 5/8] added vulkan (I think?) --- llama-cpp-2/Cargo.toml | 1 + llama-cpp-sys-2/Cargo.toml | 1 + llama-cpp-sys-2/build.rs | 5 +++++ simple/Cargo.toml | 1 + 4 files changed, 8 insertions(+) diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 7e2d9f59..df63ebd6 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -15,6 +15,7 @@ tracing = { workspace = true } [features] cublas = ["llama-cpp-sys-2/cublas"] +vulkan = ["llama-cpp-sys-2/vulkan"] sampler = [] [lints] diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 73d3fb98..ffcdedfa 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -46,4 +46,5 @@ cmake = { workspace = true } [features] cublas = [] +vulkan = [] diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index ed5ad953..9a59cb47 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -22,6 +22,11 @@ fn main() { "OFF" }, ) + .define("LLAMA_VULKAN", if cfg!(feature = "vulkan") { + "ON" + } else { + "OFF" + }) .define("BUILD_SHARED_LIBS", "ON") .define("LLAMA_BUILD_EXAMPLES", "OFF") .define("LLAMA_BUILD_TESTS", "OFF") diff --git a/simple/Cargo.toml b/simple/Cargo.toml index 4e23632a..4d876102 100644 --- a/simple/Cargo.toml +++ b/simple/Cargo.toml @@ -13,6 +13,7 @@ anyhow = { workspace = true } [features] cublas = ["llama-cpp-2/cublas"] +vulkan = ["llama-cpp-2/vulkan"] [lints] workspace = true From 9ee4c0f05ff77c73f19e2e1d570549905f64fd13 Mon Sep 17 00:00:00 2001 From: Marcus Dunn Date: Thu, 28 Mar 2024 23:06:33 -0700 Subject: [PATCH 6/8] update llama.cpp --- llama-cpp-sys-2/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp index a32b77c4..bfe7dafc 160000 --- a/llama-cpp-sys-2/llama.cpp +++ b/llama-cpp-sys-2/llama.cpp @@ -1 +1 @@ -Subproject commit a32b77c4b2c1808654d0b952f26c37d73d2e746b +Subproject commit bfe7dafc9cf96b9a09ead347fed9a547930fc631 From 988de5595a3d1e406c20144acb80336cc65b58b3 Mon Sep 17 00:00:00 2001 From: marcus Date: Sun, 21 Apr 2024 11:30:00 -0700 Subject: [PATCH 7/8] updated to latest llama.cpp (seems to run llama-3) --- 
llama-cpp-2/src/model.rs | 29 +++++++++++++++++++++++------ llama-cpp-sys-2/llama.cpp | 2 +- simple/src/main.rs | 6 +++--- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index 1c8e9386..01b32811 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -34,6 +34,15 @@ pub enum AddBos { Never, } +/// How to determine if we should tokenize special tokens +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Special { + /// Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. Does not insert a leading space. + Tokenize, + /// Treat special and/or control tokens as plaintext. + Plaintext, +} + unsafe impl Send for LlamaModel {} unsafe impl Sync for LlamaModel {} @@ -54,10 +63,11 @@ impl LlamaModel { /// Get all tokens in the model. pub fn tokens( &self, + special: Special, ) -> impl Iterator)> + '_ { (0..self.n_vocab()) .map(LlamaToken::new) - .map(|llama_token| (llama_token, self.token_to_str(llama_token))) + .map(move |llama_token| (llama_token, self.token_to_str(llama_token, special))) } /// Get the beginning of stream token. @@ -86,8 +96,8 @@ impl LlamaModel { /// # Errors /// /// See [`TokenToStringError`] for more information. - pub fn token_to_str(&self, token: LlamaToken) -> Result { - self.token_to_str_with_size(token, 32) + pub fn token_to_str(&self, token: LlamaToken, special: Special) -> Result { + self.token_to_str_with_size(token, 32, special) } /// Convert a vector of tokens to a single string. @@ -95,9 +105,9 @@ impl LlamaModel { /// # Errors /// /// See [`TokenToStringError`] for more information. - pub fn tokens_to_str(&self, tokens: &[LlamaToken]) -> Result { + pub fn tokens_to_str(&self, tokens: &[LlamaToken], special: Special) -> Result { let mut builder = String::with_capacity(tokens.len() * 4); - for str in tokens.iter().copied().map(|t| self.token_to_str(t)) { + for str in tokens.iter().copied().map(|t| self.token_to_str(t, special)) { builder += &str?; } Ok(builder) @@ -210,11 +220,13 @@ impl LlamaModel { &self, token: LlamaToken, buffer_size: usize, + special: Special, ) -> Result { if token == self.token_nl() { return Ok(String::from("\n")); } + // unsure what to do with this in the face of the 'special' arg match self.token_type(token) { LlamaTokenType::Normal | LlamaTokenType::UserDefined => {} LlamaTokenType::Control => { @@ -230,12 +242,17 @@ impl LlamaModel { } } + let special = match special { + Special::Tokenize => true, + Special::Plaintext => false, + }; + let string = CString::new(vec![b'*'; buffer_size]).expect("no null"); let len = string.as_bytes().len(); let len = c_int::try_from(len).expect("length fits into c_int"); let buf = string.into_raw(); let size = unsafe { - llama_cpp_sys_2::llama_token_to_piece(self.model.as_ptr(), token.0, buf, len) + llama_cpp_sys_2::llama_token_to_piece(self.model.as_ptr(), token.0, buf, len, special) }; match size { diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp index bfe7dafc..5cf5e7d4 160000 --- a/llama-cpp-sys-2/llama.cpp +++ b/llama-cpp-sys-2/llama.cpp @@ -1 +1 @@ -Subproject commit bfe7dafc9cf96b9a09ead347fed9a547930fc631 +Subproject commit 5cf5e7d490dfdd2e70bface2d35dfd14aa44b4fb diff --git a/simple/src/main.rs b/simple/src/main.rs index 8f7451f7..9d13ed9d 100644 --- a/simple/src/main.rs +++ b/simple/src/main.rs @@ -15,7 +15,7 @@ use llama_cpp_2::llama_backend::LlamaBackend; use llama_cpp_2::llama_batch::LlamaBatch; use 
llama_cpp_2::model::params::kv_overrides::ParamOverrideValue; use llama_cpp_2::model::params::LlamaModelParams; -use llama_cpp_2::model::AddBos; +use llama_cpp_2::model::{AddBos, Special}; use llama_cpp_2::model::LlamaModel; use llama_cpp_2::token::data_array::LlamaTokenDataArray; use std::ffi::CString; @@ -214,7 +214,7 @@ either reduce n_len or increase n_ctx" eprintln!(); for token in &tokens_list { - eprint!("{}", model.token_to_str(*token)?); + eprint!("{}", model.token_to_str(*token, Special::Tokenize)?); } std::io::stderr().flush()?; @@ -256,7 +256,7 @@ either reduce n_len or increase n_ctx" break; } - print!("{}", model.token_to_str(new_token_id)?); + print!("{}", model.token_to_str(new_token_id, Special::Tokenize)?); std::io::stdout().flush()?; batch.clear(); From 29a6e26c0006b24559b8ff0eb6eb649475ad40d2 Mon Sep 17 00:00:00 2001 From: marcus Date: Sun, 21 Apr 2024 11:32:41 -0700 Subject: [PATCH 8/8] fixed embeddings compiler error --- embeddings/src/main.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/embeddings/src/main.rs b/embeddings/src/main.rs index bc0b578d..67c8988b 100644 --- a/embeddings/src/main.rs +++ b/embeddings/src/main.rs @@ -20,7 +20,7 @@ use llama_cpp_2::ggml_time_us; use llama_cpp_2::llama_backend::LlamaBackend; use llama_cpp_2::llama_batch::LlamaBatch; use llama_cpp_2::model::params::LlamaModelParams; -use llama_cpp_2::model::AddBos; +use llama_cpp_2::model::{AddBos, Special}; use llama_cpp_2::model::LlamaModel; #[derive(clap::Parser, Debug, Clone)] @@ -137,7 +137,7 @@ fn main() -> Result<()> { for (i, token_line) in tokens_lines_list.iter().enumerate() { eprintln!("Prompt {i}"); for token in token_line { - eprintln!(" {} --> {}", token, model.token_to_str(*token)?); + eprintln!(" {} --> {}", token, model.token_to_str(*token, Special::Tokenize)?); } eprintln!(); }
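
The series above changes the `token_to_str` family to take an explicit `Special` argument (patch 7) and adds a `vulkan` cargo feature (patch 5). As a hedged illustration of what downstream call sites look like after these patches, the sketch below mirrors the call pattern used in `simple/src/main.rs` and `embeddings/src/main.rs`; the helper name `print_tokens`, the `Box<dyn Error>` error handling, and the exact import path for `LlamaToken` are assumptions for the example, not part of the patches.

```rust
use llama_cpp_2::model::{LlamaModel, Special};
use llama_cpp_2::token::LlamaToken;

// Hypothetical helper: render a token sequence with the post-patch-7 API.
// `Special::Tokenize` renders special/control tokens as tokens;
// `Special::Plaintext` treats them as plain text.
fn print_tokens(
    model: &LlamaModel,
    tokens: &[LlamaToken],
) -> Result<(), Box<dyn std::error::Error>> {
    for &token in tokens {
        // Before patch 7 this call was `model.token_to_str(token)?`.
        print!("{}", model.token_to_str(token, Special::Tokenize)?);
    }
    Ok(())
}
```

With patch 5 applied, the examples can opt into the Vulkan backend through the usual feature flag, for example `cargo build --features vulkan` in the `simple` crate (assuming CMake and a Vulkan SDK are installed); the build script maps that feature to `-DLLAMA_VULKAN=ON` in the CMake invocation.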