From 1f69c90dd4822ebf34bfac6ae988f16232d71284 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Sat, 1 Jun 2024 10:36:21 -0700 Subject: [PATCH 1/3] Chnaged out the build file --- llama-cpp-sys-2/Cargo.toml | 1 + llama-cpp-sys-2/build.rs | 1227 ++++++++++++++++++++++++++++-------- llama-cpp-sys-2/src/lib.rs | 31 +- 3 files changed, 982 insertions(+), 277 deletions(-) diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 4d06e522..ed28515a 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -48,6 +48,7 @@ include = [ [build-dependencies] bindgen = { workspace = true } cc = { workspace = true, features = ["parallel"] } +once_cell = "1.19.0" [features] cublas = [] diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 471235b3..81e8029a 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -1,322 +1,1039 @@ use std::env; -use std::ffi::OsStr; -use std::path::Path; -use std::path::PathBuf; +use std::fs::{read_dir, File}; +use std::io::Write; +use std::path::{Path, PathBuf}; +use std::process::Command; -fn main() { - println!("cargo:rerun-if-changed=llama.cpp"); +use cc::Build; +use once_cell::sync::Lazy; - let cublas_enabled = env::var("CARGO_FEATURE_CUBLAS").is_ok(); +// This build file is based on: +// https://github.com/mdrokz/rust-llama.cpp/blob/master/build.rs +// License MIT +// 12-2-2024 - let mut ggml_cuda = if cublas_enabled { - Some(cc::Build::new()) - } else { - None - }; +#[cfg(all( + feature = "metal", + any( + feature = "cuda", + feature = "blas", + feature = "hipblas", + feature = "clblast", + feature = "vulkan" + ) +))] +compile_error!("feature \"metal\" cannot be enabled alongside other GPU based features"); + +#[cfg(all( + feature = "cuda", + any( + feature = "metal", + feature = "blas", + feature = "hipblas", + feature = "clblast", + feature = "vulkan" + ) +))] +compile_error!("feature \"cuda\" cannot be enabled alongside other GPU based features"); + +#[cfg(all( + feature = "blas", + any( + feature = "cuda", + feature = "metal", + feature = "hipblas", + feature = "clblast", + feature = "vulkan" + ) +))] +compile_error!("feature \"blas\" cannot be enabled alongside other GPU based features"); + +#[cfg(all( + feature = "hipblas", + any( + feature = "cuda", + feature = "blas", + feature = "metal", + feature = "clblast", + feature = "vulkan" + ) +))] +compile_error!("feature \"hipblas\" cannot be enabled alongside other GPU based features"); - if !Path::new("llama.cpp/ggml.c").exists() { - panic!("llama.cpp seems to not be populated, try running `git submodule update --init --recursive` to init.") +#[cfg(all( + feature = "clblast", + any( + feature = "cuda", + feature = "blas", + feature = "hipblas", + feature = "metal", + feature = "vulkan" + ) +))] +compile_error!("feature \"clblas\" cannot be enabled alongside other GPU based features"); + +#[cfg(all( + feature = "vulkan", + any( + feature = "cuda", + feature = "blas", + feature = "hipblas", + feature = "clblast", + feature = "metal" + ) +))] +compile_error!("feature \"vulkan\" cannot be enabled alongside other GPU based features"); + +static LLAMA_PATH: Lazy = Lazy::new(|| PathBuf::from("./llama.cpp")); + +fn compile_bindings(out_path: &Path) { + println!("Generating bindings.."); + let bindings = bindgen::Builder::default() + .header(LLAMA_PATH.join("ggml.h").to_string_lossy()) + .header(LLAMA_PATH.join("llama.h").to_string_lossy()) + .derive_partialeq(true) + .allowlist_function("ggml_.*") + 
.allowlist_type("ggml_.*") + .allowlist_function("llama_.*") + .allowlist_type("llama_.*") + .prepend_enum_name(false); + + #[cfg(all( + feature = "compat", + not(any(target_os = "macos", target_os = "ios", target_os = "dragonfly")) + ))] + { + bindings = bindings.parse_callbacks(Box::new(GGMLLinkRename {})); } - let mut ggml = cc::Build::new(); - let mut llama_cpp = cc::Build::new(); + let bindings = bindings.generate().expect("Unable to generate bindings"); + + bindings + .write_to_file(out_path.join("bindings.rs")) + .expect("Couldn't write bindings!"); +} - ggml.cpp(false); - llama_cpp.cpp(true); +#[cfg(all( + feature = "compat", + not(any(target_os = "macos", target_os = "ios", target_os = "dragonfly")) +))] +#[derive(Debug)] +struct GGMLLinkRename {} - // CMakeFiles.txt: set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeline parallelism") - // get LLAMA_SCHED_MAX_COPIES from env, default to 4 - let mut max_copies = "4".to_owned(); - if let Ok(env_max_copies) = env::var("LLAMA_SCHED_MAX_COPIES") { - if let Ok(v) = env_max_copies.parse::() { - if v > 0 { - max_copies = env_max_copies; - } - } - } - ggml.define("GGML_SCHED_MAX_COPIES", Some(max_copies.as_str())); - - // https://github.com/ggerganov/llama.cpp/blob/a836c8f534ab789b02da149fbdaf7735500bff74/Makefile#L364-L368 - if let Some(ggml_cuda) = &mut ggml_cuda { - for lib in [ - "cuda", "cublas", "culibos", "cudart", "cublasLt", "pthread", "dl", "rt", - ] { - println!("cargo:rustc-link-lib={}", lib); - } - if !ggml_cuda.get_compiler().is_like_msvc() { - for lib in ["culibos", "pthread", "dl", "rt"] { - println!("cargo:rustc-link-lib={}", lib); +#[cfg(all( + feature = "compat", + not(any(target_os = "macos", target_os = "ios", target_os = "dragonfly")) +))] +impl ParseCallbacks for GGMLLinkRename { + fn generated_link_name_override(&self, item_info: ItemInfo<'_>) -> Option { + match item_info.kind { + ItemKind::Function => { + if item_info.name.starts_with("ggml_") { + Some(format!("{PREFIX}{}", item_info.name)) + } else { + None + } } + _ => None, } + } +} + +/// Add platform appropriate flags and definitions present in all compilation configurations. 
+fn push_common_flags(cx: &mut Build, cxx: &mut Build) { + cx.static_flag(true) + .cpp(false) + .define("GGML_SCHED_MAX_COPIES", "4"); + cxx.static_flag(true) + .cpp(true) + .define("GGML_SCHED_MAX_COPIES", "4"); + + if !cfg!(debug_assertions) { + cx.define("NDEBUG", None); + cxx.define("NDEBUG", None); + } else { + cx.define("GGML_DEBUG", "100"); + cxx.define("GGML_DEBUG", "100"); - println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); - - if cfg!(target_arch = "aarch64") { - ggml_cuda - .flag_if_supported("-mfp16-format=ieee") - .flag_if_supported("-mno-unaligned-access"); - ggml.flag_if_supported("-mfp16-format=ieee") - .flag_if_supported("-mno-unaligned-access"); - llama_cpp - .flag_if_supported("-mfp16-format=ieee") - .flag_if_supported("-mno-unaligned-access"); - ggml.flag_if_supported("-mfp16-format=ieee") - .flag_if_supported("-mno-unaligned-access"); + if cfg!(target_os = "linux") { + cx.define("_GLIBCXX_ASSERTIONS", None); + cxx.define("_GLIBCXX_ASSERTIONS", None); + } else if cfg!(target_os = "windows") { + cx.define("_CRT_SECURE_NO_WARNINGS", None); + cxx.define("_CRT_SECURE_NO_WARNINGS", None); } + } - ggml_cuda - .cuda(true) - .flag("-arch=all") - .file("llama.cpp/ggml-cuda.cu") - .files(std::fs::read_dir("llama.cpp/ggml-cuda") - .expect("failed to read 'llama.cpp/ggml-cuda'") - .map(|e| e.expect("failed to ready entry").path()) - .filter(|p| p.extension().is_some_and(|it| it == OsStr::new("cu"))) - ) - .include("llama.cpp/ggml-cuda") - .include("llama.cpp"); - - if ggml_cuda.get_compiler().is_like_msvc() { - ggml_cuda.std("c++14"); - } else { - ggml_cuda.flag("-std=c++11").std("c++11"); + if cfg!(target_os = "openbsd") { + cx.define("_XOPEN_SOURCE", "700"); + cxx.define("_XOPEN_SOURCE", "700"); + } else { + cx.define("_XOPEN_SOURCE", "600"); + cxx.define("_XOPEN_SOURCE", "600"); + } + + if cfg!(target_os = "linux") { + cx.define("_GNU_SOURCE", None); + cxx.define("_GNU_SOURCE", None); + } else if cfg!(any( + target_os = "macos", + target_os = "ios", + target_os = "dragonfly" + )) { + cx.define("_DARWIN_C_SOURCE", None); + cxx.define("_DARWIN_C_SOURCE", None); + } else if cfg!(target_os = "openbsd") { + cx.define("_BSD_SOURCE", None); + cxx.define("_BSD_SOURCE", None); + } else if cfg!(target_os = "freebsd") { + cx.define("__BSD_VISIBLE", None); + cxx.define("__BSD_VISIBLE", None); + } else if cfg!(target_os = "netbsd") { + cx.define("_NETBSD_SOURCE", None); + cxx.define("_NETBSD_SOURCE", None); + } + + if cfg!(any(target_arch = "arm", target_arch = "aarch64")) { + if cfg!(target_family = "unix") { + // cx.flag("-mavx512vnni").flag("-mfp16-format=ieee"); + // cxx.flag("-mavx512vnni").flag("-mfp16-format=ieee"); + } else if cfg!(target_family = "windows") { + cx.define("__ARM_NEON", None) + .define("__ARM_FEATURE_FMA", None) + .define("__ARM_FEATURE_DOTPROD", None) + .define("__aarch64__", None); + cxx.define("__ARM_NEON", None) + .define("__ARM_FEATURE_FMA", None) + .define("__ARM_FEATURE_DOTPROD", None) + .define("__aarch64__", None); } + } +} - ggml.define("GGML_USE_CUDA", None); - ggml_cuda.define("GGML_USE_CUDA", None); - llama_cpp.define("GGML_USE_CUDA", None); +/// Add platform appropriate flags and definitions for compilation warnings. 
+fn push_warn_flags(cx: &mut Build, cxx: &mut Build) { + if cfg!(target_family = "unix") { + cx.flag("-pthread") + .flag("-Wall") + .flag("-Wextra") + .flag("-Wpedantic") + .flag("-Wcast-qual") + .flag("-Wdouble-promotion") + .flag("-Wshadow") + .flag("-Wstrict-prototypes") + .flag("-Wpointer-arith"); + cxx.flag("-fPIC") + .flag("-pthread") + .flag("-Wall") + .flag("-Wdeprecated-declarations") + .flag("-Wextra") + .flag("-Wpedantic") + .flag("-Wcast-qual") + .flag("-Wno-unused-function") + .flag("-Wno-multichar"); + } else if cfg!(target_family = "windows") { + cx.flag("/W4") + .flag("/Wall") + .flag("/wd4820") + .flag("/wd4710") + .flag("/wd4711") + .flag("/wd4820") + .flag("/wd4514"); + cxx.flag("/W4") + .flag("/Wall") + .flag("/wd4820") + .flag("/wd4710") + .flag("/wd4711") + .flag("/wd4820") + .flag("/wd4514"); } +} - for build in [&mut ggml, &mut llama_cpp] { - let compiler = build.get_compiler(); +/// Add platform appropriate flags and definitions based on enabled features. +fn push_feature_flags(cx: &mut Build, cxx: &mut Build) { + // TODO in llama.cpp's cmake (https://github.com/ggerganov/llama.cpp/blob/9ecdd12e95aee20d6dfaf5f5a0f0ce5ac1fb2747/CMakeLists.txt#L659), they include SIMD instructions manually, however it doesn't seem to be necessary for VS2022's MSVC, check when it is needed - if cfg!(target_arch = "i686") || cfg!(target_arch = "x86_64") { - let features = x86::Features::get_target(); - if compiler.is_like_clang() || compiler.is_like_gnu() { - build.flag("-pthread"); + if cfg!(any(target_arch = "x86", target_arch = "x86_64")) { + if cfg!(feature = "native") && cfg!(target_os = "linux") { + cx.flag("-march=native"); + cxx.flag("-march=native"); + } - if features.avx { - build.flag("-mavx"); - } - if features.avx2 { - build.flag("-mavx2"); - } - if features.fma { - build.flag("-mfma"); + if cfg!(feature = "fma") && cfg!(target_family = "unix") { + cx.flag("-mfma"); + cxx.flag("-mfma"); + } + + if cfg!(feature = "f16c") && cfg!(target_family = "unix") { + cx.flag("-mf16c"); + cxx.flag("-mf16c"); + } + + if cfg!(target_family = "unix") { + if cfg!(feature = "avx512") { + cx.flag("-mavx512f").flag("-mavx512bw"); + cxx.flag("-mavx512f").flag("-mavx512bw"); + + if cfg!(feature = "avx512_vmbi") { + cx.flag("-mavx512vbmi"); + cxx.flag("-mavx512vbmi"); } - if features.f16c { - build.flag("-mf16c"); + + if cfg!(feature = "avx512_vnni") { + cx.flag("-mavx512vnni"); + cxx.flag("-mavx512vnni"); } - if features.sse3 { - build.flag("-msse3"); + } + + if cfg!(feature = "avx2") { + cx.flag("-mavx2"); + cxx.flag("-mavx2"); + } + + if cfg!(feature = "avx") { + cx.flag("-mavx"); + cxx.flag("-mavx"); + } + } else if cfg!(target_family = "windows") { + if cfg!(feature = "avx512") { + cx.flag("/arch:AVX512"); + cxx.flag("/arch:AVX512"); + + if cfg!(feature = "avx512_vmbi") { + cx.define("__AVX512VBMI__", None); + cxx.define("__AVX512VBMI__", None); } - } else if compiler.is_like_msvc() { - match (features.avx2, features.avx) { - (true, _) => { - build.flag("/arch:AVX2"); - } - (_, true) => { - build.flag("/arch:AVX"); - } - _ => {} + + if cfg!(feature = "avx512_vnni") { + cx.define("__AVX512VNNI__", None); + cxx.define("__AVX512VNNI__", None); } + } else if cfg!(feature = "avx2") { + cx.flag("/arch:AVX2"); + cxx.flag("/arch:AVX2"); + } else if cfg!(feature = "avx") { + cx.flag("/arch:AVX"); + cxx.flag("/arch:AVX"); } - } else if cfg!(target_arch = "aarch64") - && (compiler.is_like_clang() || compiler.is_like_gnu()) - { - if cfg!(target_os = "macos") { - build.flag("-mcpu=apple-m1"); - } else 
if env::var("HOST") == env::var("TARGET") { - build.flag("-mcpu=native"); - } - build.flag("-pthread"); } } +} - // https://github.com/ggerganov/llama.cpp/blob/191221178f51b6e81122c5bda0fd79620e547d07/Makefile#L133-L141 - if cfg!(target_os = "macos") { - assert!(!cublas_enabled, "CUBLAS is not supported on macOS"); +fn compile_opencl(cx: &mut Build, cxx: &mut Build) { + println!("Compiling OpenCL GGML.."); - let metal_enabled = env::var("CARGO_FEATURE_METAL").is_ok(); + // TODO + println!("cargo:warning=OpenCL compilation and execution has not been properly tested yet"); - println!("cargo:rustc-link-lib=framework=Foundation"); - if metal_enabled { - println!("cargo:rustc-link-lib=framework=Metal"); - println!("cargo:rustc-link-lib=framework=MetalPerformanceShaders"); - println!("cargo:rustc-link-lib=framework=MetalKit"); - } + cx.define("GGML_USE_CLBLAST", None); + cxx.define("GGML_USE_CLBLAST", None); - llama_cpp.define("_DARWIN_C_SOURCE", None); + if cfg!(target_os = "linux") { + println!("cargo:rustc-link-lib=OpenCL"); + println!("cargo:rustc-link-lib=clblast"); + } else if cfg!(target_os = "macos") { + println!("cargo:rustc-link-lib=framework=OpenCL"); + println!("cargo:rustc-link-lib=clblast"); + } - // https://github.com/ggerganov/llama.cpp/blob/3c0d25c4756742ebf15ad44700fabc0700c638bd/Makefile#L340-L343 - if metal_enabled { - llama_cpp.define("GGML_USE_METAL", None); - } - llama_cpp.define("GGML_USE_ACCELERATE", None); - llama_cpp.define("ACCELERATE_NEW_LAPACK", None); - llama_cpp.define("ACCELERATE_LAPACK_ILP64", None); - println!("cargo:rustc-link-lib=framework=Accelerate"); - - if metal_enabled { - metal_hack(&mut ggml); - ggml.include("./llama.cpp/ggml-metal.h"); - } + cxx.file(LLAMA_PATH.join("ggml-opencl.cpp")); +} + +fn compile_openblas(cx: &mut Build) { + println!("Compiling OpenBLAS GGML.."); + + // TODO + println!("cargo:warning=OpenBlas compilation and execution has not been properly tested yet"); + + cx.define("GGML_USE_OPENBLAS", None) + .include("/usr/local/include/openblas") + .include("/usr/local/include/openblas"); + println!("cargo:rustc-link-lib=openblas"); +} + +fn compile_blis(cx: &mut Build) { + println!("Compiling BLIS GGML.."); + + // TODO + println!("cargo:warning=Blis compilation and execution has not been properly tested yet"); + + cx.define("GGML_USE_OPENBLAS", None) + .include("/usr/local/include/blis") + .include("/usr/local/include/blis"); + println!("cargo:rustc-link-search=native=/usr/local/lib"); + println!("cargo:rustc-link-lib=blis"); +} + +fn compile_hipblas(cx: &mut Build, cxx: &mut Build, mut hip: Build) -> &'static str { + const DEFAULT_ROCM_PATH_STR: &str = "/opt/rocm/"; + + let rocm_path_str = env::var("ROCM_PATH") + .map_err(|_| DEFAULT_ROCM_PATH_STR.to_string()) + .unwrap(); + println!("Compiling HIPBLAS GGML. 
Using ROCm from {rocm_path_str}"); + + let rocm_path = PathBuf::from(rocm_path_str); + let rocm_include = rocm_path.join("include"); + let rocm_lib = rocm_path.join("lib"); + let rocm_hip_bin = rocm_path.join("bin/hipcc"); + + let cuda_lib = "ggml-cuda"; + let cuda_file = cuda_lib.to_string() + ".cu"; + let cuda_header = cuda_lib.to_string() + ".h"; + + let defines = ["GGML_USE_HIPBLAS", "GGML_USE_CUBLAS"]; + for def in defines { + cx.define(def, None); + cxx.define(def, None); } - if cfg!(target_os = "dragonfly") { - llama_cpp.define("__BSD_VISIBLE", None); + cx.include(&rocm_include); + cxx.include(&rocm_include); + + hip.compiler(rocm_hip_bin) + .std("c++11") + .file(LLAMA_PATH.join(cuda_file)) + .include(LLAMA_PATH.join(cuda_header)) + .define("GGML_USE_HIPBLAS", None) + .compile(cuda_lib); + + println!( + "cargo:rustc-link-search=native={}", + rocm_lib.to_string_lossy() + ); + + let rocm_libs = ["hipblas", "rocblas", "amdhip64"]; + for lib in rocm_libs { + println!("cargo:rustc-link-lib={lib}"); } + cuda_lib +} + +fn compile_cuda(cx: &mut Build, cxx: &mut Build, featless_cxx: Build) -> &'static str { + println!("Compiling CUDA GGML.."); + + // CUDA gets linked through the cudarc crate. + + cx.define("GGML_USE_CUDA", None); + cxx.define("GGML_USE_CUDA", None); + + let mut nvcc = featless_cxx; + nvcc.cuda(true) + .flag("--forward-unknown-to-host-compiler") + .flag("-arch=all") + .define("K_QUANTS_PER_ITERATION", Some("2")) + .define("GGML_CUDA_PEER_MAX_BATCH_SIZE", Some("128")); + if cfg!(target_os = "linux") { - ggml.define("_GNU_SOURCE", None); + nvcc.flag("-Wno-pedantic"); + // TODO Are these links needed? + println!("cargo:rustc-link-lib=pthread"); + println!("cargo:rustc-link-lib=dl"); + println!("cargo:rustc-link-lib=rt"); } - ggml.std("c11") - .include("./llama.cpp") - .file("llama.cpp/ggml.c") - .file("llama.cpp/ggml-alloc.c") - .file("llama.cpp/ggml-backend.c") - .file("llama.cpp/ggml-quants.c") - .define("GGML_USE_K_QUANTS", None); - - llama_cpp - .define("_XOPEN_SOURCE", Some("600")) - .include("llama.cpp") - .std("c++11") - .file("llama.cpp/llama.cpp") - .file("llama.cpp/unicode.cpp") - .file("llama.cpp/unicode-data.cpp"); - - // Remove debug log output from `llama.cpp` - let is_release = env::var("PROFILE").unwrap() == "release"; - if is_release { - ggml.define("NDEBUG", None); - llama_cpp.define("NDEBUG", None); - if let Some(cuda) = ggml_cuda.as_mut() { - cuda.define("NDEBUG", None); - } + if cfg!(feature = "cuda_dmmv") { + nvcc.define("GGML_CUDA_FORCE_DMMV", None) + .define("GGML_CUDA_DMMV_X", Some("32")) + .define("GGML_CUDA_MMV_Y", Some("1")); + } - ggml.opt_level(3); - llama_cpp.opt_level(3); + if cfg!(feature = "cuda_mmq") { + nvcc.define("GGML_CUDA_FORCE_MMQ", None); } - if let Some(ggml_cuda) = ggml_cuda { - eprintln!("compiling ggml-cuda"); - ggml_cuda.compile("ggml-cuda"); - eprintln!("compiled ggml-cuda"); + let lib_name = "ggml-cuda"; + let cuda_path = LLAMA_PATH.join("ggml-cuda"); + let cuda_sources = read_dir(cuda_path.as_path()) + .unwrap() + .map(|f| f.unwrap()) + .filter(|entry| entry.file_name().to_string_lossy().ends_with(".cu")) + .map(|entry| entry.path()); + + nvcc.include(cuda_path.as_path()) + .include(LLAMA_PATH.as_path()) + .files(cuda_sources) + .file(LLAMA_PATH.join("ggml-cuda.cu")) + .compile(lib_name); + + lib_name +} + +fn compile_metal(cx: &mut Build, cxx: &mut Build) { + println!("Compiling Metal GGML.."); + + cx.define("GGML_USE_METAL", None); + cxx.define("GGML_USE_METAL", None); + + cx.define("GGML_METAL_EMBED_LIBRARY", None); + 
cxx.define("GGML_METAL_EMBED_LIBRARY", None); + + if !cfg!(debug_assertions) { + cx.define("GGML_METAL_NDEBUG", None); } - eprintln!("compiling ggml"); - ggml.compile("ggml"); - eprintln!("compiled ggml"); + // It's idiomatic to use OUT_DIR for intermediate c/c++ artifacts + let out_dir = env::var("OUT_DIR").unwrap(); - eprintln!("compiling llama"); - llama_cpp.compile("llama"); - eprintln!("compiled llama"); + let ggml_metal_shader_path = LLAMA_PATH.join("ggml-metal.metal"); - let header = "llama.cpp/llama.h"; + // Create a temporary assembly file that will allow for static linking to the metal shader. + let ggml_metal_embed_assembly_path = PathBuf::from(&out_dir).join("ggml-metal-embed.asm"); + let mut ggml_metal_embed_assembly_file = File::create(&ggml_metal_embed_assembly_path) + .expect("Failed to open ggml-metal-embed.asm file"); - println!("cargo:rerun-if-changed={header}"); + let ggml_metal_shader_out_path = PathBuf::from(&out_dir).join("ggml-metal.metal"); + let common = LLAMA_PATH.join("ggml-common.h"); - let bindings = bindgen::builder() - .header(header) - .derive_partialeq(true) - .no_debug("llama_grammar_element") - .prepend_enum_name(false) - .derive_eq(true) - .generate() - .expect("failed to generate bindings for llama.cpp"); + let input_file = File::open(ggml_metal_shader_path).expect("Failed to open input file"); + let mut output_file = + File::create(&ggml_metal_shader_out_path).expect("Failed to create output file"); - let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); - bindings - .write_to_file(out_path.join("bindings.rs")) - .expect("failed to write bindings to file"); - let llama_cpp_dir = PathBuf::from("llama.cpp").canonicalize().unwrap(); - println!("cargo:INCLUDE={}", llama_cpp_dir.to_str().unwrap()); - println!("cargo:OUT_DIR={}", out_path.to_str().unwrap()); + let output = Command::new("sed") + .arg("-e") + .arg(format!( + "/#include \"ggml-common.h\"/r {}", + common.to_string_lossy() + )) + .arg("-e") + .arg("/#include \"ggml-common.h\"/d") + .stdin(input_file) + .stdout(output_file) + .output() + .expect("Failed to execute command"); + if !output.status.success() { + panic!( + "An error has occurred while embedding common file ({}):\n{}", + output.status, + String::from_utf8_lossy(&output.stderr) + ); + } + + // The contents of this file is directly copied from the llama.cpp Makefile + let ggml_metal_embed_assembly_code = format!( + ".section __DATA, __ggml_metallib\n\ + .globl _ggml_metallib_start\n\ + _ggml_metallib_start:\n\ + .incbin \"{}\"\n\ + .globl _ggml_metallib_end\n\ + _ggml_metallib_end:\n", + ggml_metal_shader_out_path + .to_str() + .expect("Failed to convert path to string") + ); + + write!( + ggml_metal_embed_assembly_file, + "{}", + ggml_metal_embed_assembly_code + ) + .expect("Failed to write ggml metal embed assembly code"); + + // Assemble the ggml metal embed code. + let ggml_metal_embed_object_path = PathBuf::from(&out_dir).join("ggml-metal-embed.o"); + Command::new("as") + .arg(&ggml_metal_embed_assembly_path) + .arg("-o") + .arg(&ggml_metal_embed_object_path) + .status() + .expect("Failed to assemble ggml-metal-embed file"); + + // Create a static library for our metal embed code. 
+ let ggml_metal_embed_library_path = PathBuf::from(&out_dir).join("libggml-metal-embed.a"); + Command::new("ar") + .args(&[ + "crus", + ggml_metal_embed_library_path.to_str().unwrap(), + ggml_metal_embed_object_path.to_str().unwrap(), + ]) + .status() + .expect("Failed to create static library from ggml-metal-embed object file"); + + println!("cargo:rustc-link-lib=framework=Metal"); + println!("cargo:rustc-link-lib=framework=Foundation"); + println!("cargo:rustc-link-lib=framework=MetalPerformanceShaders"); + println!("cargo:rustc-link-lib=framework=MetalKit"); + + // Link to our new static library for our metal embed code. + println!("cargo:rustc-link-search=native={}", &out_dir); + println!("cargo:rustc-link-lib=static=ggml-metal-embed"); + + cx.include(LLAMA_PATH.join("ggml-metal.h")) + .file(LLAMA_PATH.join("ggml-metal.m")); } -// courtesy of https://github.com/rustformers/llm -fn metal_hack(build: &mut cc::Build) { - const GGML_METAL_METAL_PATH: &str = "llama.cpp/ggml-metal.metal"; - const GGML_METAL_PATH: &str = "llama.cpp/ggml-metal.m"; - const GGML_COMMON_PATH: &str = "llama.cpp/ggml-common.h"; - - let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is not defined")); - - let ggml_metal_path = { - let ggml_metal_metal = std::fs::read_to_string(GGML_METAL_METAL_PATH) - .expect("Could not read ggml-metal.metal") - .replace('\\', "\\\\") - .replace('\n', "\\n") - .replace('\r', "\\r") - .replace('\"', "\\\""); - - let ggml_common = std::fs::read_to_string(GGML_COMMON_PATH).expect("Could not read ggml-common.h") - .replace('\\', "\\\\") - .replace('\n', "\\n") - .replace('\r', "\\r") - .replace('\"', "\\\""); - - let includged_ggml_metal_metal = ggml_metal_metal.replace( - "#include \\\"ggml-common.h\\\"", - &format!("{ggml_common}") - ); - print!("{}", &includged_ggml_metal_metal); +fn compile_vulkan(cx: &mut Build, cxx: &mut Build) -> &'static str { + println!("Compiling Vulkan GGML.."); - let ggml_metal = - std::fs::read_to_string(GGML_METAL_PATH).expect("Could not read ggml-metal.m"); + // Vulkan gets linked through the ash crate. - let needle = r#"NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];"#; - if !ggml_metal.contains(needle) { - panic!("ggml-metal.m does not contain the needle to be replaced; the patching logic needs to be reinvestigated. 
Contact a `llama-cpp-sys-2` developer!"); - } + if cfg!(debug_assertions) { + cx.define("GGML_VULKAN_DEBUG", None) + .define("GGML_VULKAN_CHECK_RESULTS", None) + .define("GGML_VULKAN_VALIDATE", None); + cxx.define("GGML_VULKAN_DEBUG", None) + .define("GGML_VULKAN_CHECK_RESULTS", None) + .define("GGML_VULKAN_VALIDATE", None); + } + + cx.define("GGML_USE_VULKAN", None); + cxx.define("GGML_USE_VULKAN", None); + + let lib_name = "ggml-vulkan"; + + cxx.clone() + .include("./thirdparty/Vulkan-Headers/include/") + .include(LLAMA_PATH.as_path()) + .file(LLAMA_PATH.join("ggml-vulkan.cpp")) + .compile(lib_name); + + lib_name +} + +fn compile_ggml(mut cx: Build) { + println!("Compiling GGML.."); + cx.std("c11") + .include(LLAMA_PATH.as_path()) + .file(LLAMA_PATH.join("ggml.c")) + .file(LLAMA_PATH.join("ggml-alloc.c")) + .file(LLAMA_PATH.join("ggml-backend.c")) + .file(LLAMA_PATH.join("ggml-quants.c")) + .compile("ggml"); +} + +fn compile_llama(mut cxx: Build, _out_path: impl AsRef) { + println!("Compiling Llama.cpp.."); + cxx.std("c++11") + .include(LLAMA_PATH.as_path()) + .file(LLAMA_PATH.join("unicode.cpp")) + .file(LLAMA_PATH.join("unicode-data.cpp")) + .file(LLAMA_PATH.join("llama.cpp")) + .compile("llama"); +} - // Replace the runtime read of the file with a compile-time string - let ggml_metal = ggml_metal.replace( - needle, - &format!(r#"NSString * src = @"{includged_ggml_metal_metal}";"#), +fn main() { + if std::fs::read_dir(LLAMA_PATH.as_path()).is_err() { + panic!( + "Could not find {}. Did you forget to initialize submodules?", + LLAMA_PATH.display() ); + } + + let out_path = PathBuf::from(env::var("OUT_DIR").expect("No out dir found")); + + println!("cargo:rerun-if-changed={}", LLAMA_PATH.display()); - let patched_ggml_metal_path = out_dir.join("ggml-metal.m"); - std::fs::write(&patched_ggml_metal_path, ggml_metal) - .expect("Could not write temporary patched ggml-metal.m"); + compile_bindings(&out_path); - patched_ggml_metal_path + let mut cx = Build::new(); + let mut cxx = Build::new(); + + push_common_flags(&mut cx, &mut cxx); + + let featless_cxx = cxx.clone(); // mostly used for CUDA + + push_warn_flags(&mut cx, &mut cxx); + push_feature_flags(&mut cx, &mut cxx); + + let feat_lib = if cfg!(feature = "vulkan") { + Some(compile_vulkan(&mut cx, &mut cxx)) + } else if cfg!(feature = "cuda") { + Some(compile_cuda(&mut cx, &mut cxx, featless_cxx)) + } else if cfg!(feature = "opencl") { + compile_opencl(&mut cx, &mut cxx); + None + } else if cfg!(feature = "openblas") { + compile_openblas(&mut cx); + None + } else if cfg!(feature = "blis") { + compile_blis(&mut cx); + None + } else if cfg!(feature = "metal") && cfg!(target_os = "macos") { + compile_metal(&mut cx, &mut cxx); + None + } else if cfg!(feature = "hipblas") { + Some(compile_hipblas(&mut cx, &mut cxx, featless_cxx)) + } else { + None }; - build.file(ggml_metal_path); -} + compile_ggml(cx); + compile_llama(cxx, &out_path); -// Courtesy of https://github.com/rustformers/llm -fn get_supported_target_features() -> std::collections::HashSet { - env::var("CARGO_CFG_TARGET_FEATURE") - .unwrap() - .split(',') - .map(ToString::to_string) - .collect() + #[cfg(all( + feature = "compat", + not(any(target_os = "macos", target_os = "ios", target_os = "dragonfly")) + ))] + { + compat::redefine_symbols(out_path, feat_lib); + } } -mod x86 { - #[allow(clippy::struct_excessive_bools)] - #[derive(Clone, Debug, PartialEq, Eq)] - pub struct Features { - pub fma: bool, - pub avx: bool, - pub avx2: bool, - pub f16c: bool, - pub sse3: bool, +// MacOS will 
prefix all exported symbols with a leading underscore. +// Additionally, it seems that there are no collision issues when building with both llama and whisper crates, so the +// compat feature can be ignored. + +#[cfg(all( + feature = "compat", + not(any(target_os = "macos", target_os = "ios", target_os = "dragonfly")) +))] +mod compat { + use std::collections::HashSet; + use std::fmt::{Display, Formatter}; + use std::process::Command; + + use crate::*; + + pub fn redefine_symbols(out_path: impl AsRef, additional_lib: Option<&str>) { + let (ggml_lib_name, llama_lib_name) = lib_names(); + let (nm, objcopy) = tools(); + println!( + "Modifying {ggml_lib_name} and {llama_lib_name}, symbols acquired via \ + \"{nm}\" and modified via \"{objcopy}\"" + ); + + // Modifying symbols exposed by the ggml library + + let out_str = nm_symbols(&nm, ggml_lib_name, &out_path); + let symbols = get_symbols( + &out_str, + [ + Filter { + prefix: "ggml", + sym_type: 'T', + }, + Filter { + prefix: "ggml", + sym_type: 'U', + }, + Filter { + prefix: "ggml", + sym_type: 'B', + }, + Filter { + prefix: "gguf", + sym_type: 'T', + }, + Filter { + prefix: "quantize", + sym_type: 'T', + }, + Filter { + prefix: "dequantize", + sym_type: 'T', + }, + Filter { + prefix: "iq2xs", + sym_type: 'T', + }, + Filter { + prefix: "iq3xs", + sym_type: 'T', + }, + ], + ); + objcopy_redefine(&objcopy, ggml_lib_name, PREFIX, symbols, &out_path); + + // Modifying the symbols llama depends on from ggml + + let out_str = nm_symbols(&nm, llama_lib_name, &out_path); + let symbols = get_symbols( + &out_str, + [ + Filter { + prefix: "ggml", + sym_type: 'U', + }, + Filter { + prefix: "gguf", + sym_type: 'U', + }, + ], + ); + objcopy_redefine(&objcopy, llama_lib_name, PREFIX, symbols, &out_path); + + if let Some(gpu_lib_name) = additional_lib { + // Modifying the symbols of the GPU library + + let lib_name = if cfg!(target_family = "windows") { + format!("{gpu_lib_name}.lib") + } else if cfg!(target_family = "unix") { + format!("lib{gpu_lib_name}.a") + } else { + println!("cargo:warning=Unknown target family, defaulting to Unix lib names"); + format!("lib{gpu_lib_name}.a") + }; + + let out_str = nm_symbols(&nm, &lib_name, &out_path); + let symbols = get_symbols( + &out_str, + [ + Filter { + prefix: "ggml", + sym_type: 'U', + }, + Filter { + prefix: "ggml", + sym_type: 'T', + }, + ], + ); + objcopy_redefine(&objcopy, &lib_name, PREFIX, symbols, &out_path); + } + } + + /// Returns *GGML*'s and *Llama.cpp*'s compiled library names, based on the operating system. + fn lib_names() -> (&'static str, &'static str) { + let ggml_lib_name; + let llama_lib_name; + if cfg!(target_family = "windows") { + ggml_lib_name = "ggml.lib"; + llama_lib_name = "llama.lib"; + } else if cfg!(target_family = "unix") { + ggml_lib_name = "libggml.a"; + llama_lib_name = "libllama.a"; + } else { + println!("cargo:warning=Unknown target family, defaulting to Unix lib names"); + ggml_lib_name = "libggml.a"; + llama_lib_name = "libllama.a"; + }; + + (ggml_lib_name, llama_lib_name) } - impl Features { - pub fn get_target() -> Self { - let features = crate::get_supported_target_features(); - Self { - fma: features.contains("fma"), - avx: features.contains("avx"), - avx2: features.contains("avx2"), - f16c: features.contains("f16c"), - sse3: features.contains("sse3"), + + /// Returns [`Tool`]s equivalent to [nm][nm] and [objcopy][objcopy]. 
+ /// + /// [nm]: https://www.man7.org/linux/man-pages/man1/nm.1.html + /// [objcopy]: https://www.man7.org/linux/man-pages/man1/objcopy.1.html + fn tools() -> (Tool, Tool) { + let nm_names; + let objcopy_names; + let nm_help; + let objcopy_help; + if cfg!(target_os = "linux") { + nm_names = vec!["nm", "llvm-nm"]; + objcopy_names = vec!["objcopy", "llvm-objcopy"]; + nm_help = vec!["\"nm\" from GNU Binutils", "\"llvm-nm\" from LLVM"]; + objcopy_help = vec![ + "\"objcopy\" from GNU Binutils", + "\"llvm-objcopy\" from LLVM", + ]; + } else if cfg!(any( + target_os = "macos", + target_os = "ios", + target_os = "dragonfly" + )) { + nm_names = vec!["nm", "llvm-nm"]; + objcopy_names = vec!["llvm-objcopy"]; + nm_help = vec!["\"llvm-nm\" from LLVM 17"]; + objcopy_help = vec!["\"llvm-objcopy\" from LLVM 17"]; + } else { + nm_names = vec!["llvm-nm"]; + objcopy_names = vec!["llvm-objcopy"]; + nm_help = vec!["\"llvm-nm\" from LLVM 17"]; + objcopy_help = vec!["\"llvm-objcopy\" from LLVM 17"]; + } + + let nm_env = "NM_PATH"; + println!("cargo:rerun-if-env-changed={nm_env}"); + println!("Looking for \"nm\" or an equivalent tool"); + let nm_name = find_tool(&nm_names, nm_env).unwrap_or_else(move || { + panic_tool_help("nm", nm_env, &nm_help); + unreachable!("The function above should have panicked") + }); + + let objcopy_env = "OBJCOPY_PATH"; + println!("cargo:rerun-if-env-changed={objcopy_env}"); + println!("Looking for \"objcopy\" or an equivalent tool.."); + let objcopy_name = find_tool(&objcopy_names, objcopy_env).unwrap_or_else(move || { + panic_tool_help("objcopy", objcopy_env, &objcopy_help); + unreachable!("The function above should have panicked") + }); + + (nm_name, objcopy_name) + } + + /// A command line tool name present in `PATH` or its full [`Path`]. + enum Tool { + /// The name of a tool present in `PATH`. + Name(&'static str), + + /// The full [`Path`] to a tool. + FullPath(PathBuf), + } + + impl Display for Tool { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Tool::Name(name) => write!(f, "{}", name), + Tool::FullPath(path) => write!(f, "{}", path.display()), + } + } + } + + /// Returns the first [`Tool`] found in the system `PATH`, given a list of tool names, returning + /// the first one found and printing its version. + /// + /// If a value is present in the provided environment variable name, it will get checked + /// instead. + /// + /// ## Panic + /// Returns [`Option::None`] if no [`Tool`] is found. + fn find_tool(names: &[&'static str], env: &str) -> Option { + if let Ok(path_str) = env::var(env) { + let path_str = path_str.trim_matches([' ', '"', '\''].as_slice()); + println!("{env} is set, checking if \"{path_str}\" is a valid tool"); + let path = PathBuf::from(&path_str); + + if !path.is_file() { + panic!("\"{path_str}\" is not a file path.") } + + let output = Command::new(path_str) + .arg("--version") + .output() + .unwrap_or_else(|e| panic!("Failed to run \"{path_str} --version\". 
({e})")); + + if output.status.success() { + let out_str = String::from_utf8_lossy(&output.stdout); + println!("Valid tool found:\n{out_str}"); + } else { + println!("cargo:warning=Tool \"{path_str}\" found, but could not execute \"{path_str} --version\"") + } + + return Some(Tool::FullPath(path)); } + + println!("{env} not set, looking for {names:?} in PATH"); + for name in names { + if let Ok(output) = Command::new(name).arg("--version").output() { + if output.status.success() { + let out_str = String::from_utf8_lossy(&output.stdout); + println!("Valid tool found:\n{out_str}"); + return Some(Tool::Name(name)); + } + } + } + + None + } + + /// Always panics, printing suggestions for finding the specified tool. + fn panic_tool_help(name: &str, env: &str, suggestions: &[&str]) { + let suggestions_str = if suggestions.is_empty() { + String::new() + } else { + let mut suggestions_str = "For your Operating System we recommend:\n".to_string(); + for suggestion in &suggestions[..suggestions.len() - 1] { + suggestions_str.push_str(&format!("{suggestion}\nOR\n")); + } + suggestions_str.push_str(suggestions[suggestions.len() - 1]); + suggestions_str + }; + + panic!("No suitable tool equivalent to \"{name}\" has been found in PATH, if one is already installed, either add its directory to PATH or set {env} to its full path. {suggestions_str}") + } + + /// Executes [nm][nm] or an equivalent tool in portable mode and returns the output. + /// + /// ## Panic + /// Will panic on any errors. + /// + /// [nm]: https://www.man7.org/linux/man-pages/man1/nm.1.html + fn nm_symbols(tool: &Tool, target_lib: &str, out_path: impl AsRef) -> String { + let output = Command::new(tool.to_string()) + .current_dir(&out_path) + .arg(target_lib) + .args(["-p", "-P"]) + .output() + .unwrap_or_else(move |e| panic!("Failed to run \"{tool}\". ({e})")); + + if !output.status.success() { + panic!( + "An error has occurred while acquiring symbols from the compiled library \"{target_lib}\" ({}):\n{}", + output.status, + String::from_utf8_lossy(&output.stderr) + ); + } + + String::from_utf8_lossy(&output.stdout).to_string() + } + + /// Executes [objcopy][objcopy], adding a prefix to the specified symbols of the target library. + /// + /// ## Panic + /// Will panic on any errors. + /// + /// [objcopy]: https://www.man7.org/linux/man-pages/man1/objcopy.1.html + fn objcopy_redefine( + tool: &Tool, + target_lib: &str, + prefix: &str, + symbols: HashSet<&str>, + out_path: impl AsRef, + ) { + let mut cmd = Command::new(tool.to_string()); + cmd.current_dir(&out_path); + for symbol in symbols { + cmd.arg(format!("--redefine-sym={symbol}={prefix}{symbol}")); + } + + let output = cmd + .arg(target_lib) + .output() + .unwrap_or_else(move |e| panic!("Failed to run \"{tool}\". ({e})")); + + if !output.status.success() { + panic!( + "An error has occurred while redefining symbols from library file \"{target_lib}\" ({}):\n{}", + output.status, + String::from_utf8_lossy(&output.stderr) + ); + } + } + + /// A filter for a symbol in a library. + struct Filter<'a> { + prefix: &'a str, + sym_type: char, + } + + /// Turns **`nm`**'s output into an iterator of [`str`] symbols. + /// + /// This function expects **`nm`** to be called using the **`-p`** and **`-P`** flags. 
+ fn get_symbols<'a, const N: usize>( + nm_output: &'a str, + filters: [Filter<'a>; N], + ) -> HashSet<&'a str> { + let iter = nm_output + .lines() + .map(|symbol| { + // Strip irrelevant information + + let mut stripped = symbol; + while stripped.split(' ').count() > 2 { + // SAFETY: We just made sure ' ' is present above + let idx = unsafe { stripped.rfind(' ').unwrap_unchecked() }; + stripped = &stripped[..idx] + } + stripped + }) + .filter(move |symbol| { + // Filter matching symbols + + if symbol.split(' ').count() == 2 { + for filter in &filters { + if symbol.ends_with(filter.sym_type) && symbol.starts_with(filter.prefix) { + return true; + } + } + } + false + }) + .map(|symbol| &symbol[..symbol.len() - 2]); // Strip the type, so only the symbol remains + + // Filter duplicates + HashSet::from_iter(iter) } } diff --git a/llama-cpp-sys-2/src/lib.rs b/llama-cpp-sys-2/src/lib.rs index dbec4ab4..d1ef8abb 100644 --- a/llama-cpp-sys-2/src/lib.rs +++ b/llama-cpp-sys-2/src/lib.rs @@ -4,27 +4,14 @@ #![allow(non_camel_case_types)] #![allow(non_snake_case)] -use std::fmt::{Debug, Formatter}; -include!(concat!(env!("OUT_DIR"), "/bindings.rs")); +// [`ash`] is only included to link to the Vulkan SDK. +#[allow(unused)] +#[cfg(feature = "vulkan")] +use ash; -impl Debug for llama_grammar_element { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - fn type_to_str(r#type: llama_gretype) -> &'static str { - match r#type { - LLAMA_GRETYPE_END => "END", - LLAMA_GRETYPE_ALT => "ALT", - LLAMA_GRETYPE_RULE_REF => "RULE_REF", - LLAMA_GRETYPE_CHAR => "CHAR", - LLAMA_GRETYPE_CHAR_NOT => "CHAR_NOT", - LLAMA_GRETYPE_CHAR_RNG_UPPER => "CHAR_RNG_UPPER", - LLAMA_GRETYPE_CHAR_ALT => "CHAR_ALT", - _ => "Unknown", - } - } +// [`cudarc`] is only included to link to CUDA. 
+#[allow(unused)] +#[cfg(feature = "cuda")] +use cudarc; - f.debug_struct("llama_grammar_element") - .field("type", &type_to_str(self.type_)) - .field("value", &self.value) - .finish() - } -} +include!(concat!(env!("OUT_DIR"), "/bindings.rs")); From 6b0f923b2a0ba5befb07ee6b5fd337f967663b5f Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Tue, 4 Jun 2024 11:18:58 -0700 Subject: [PATCH 2/3] Change cublas to cuda --- Cargo.lock | 5 +++-- README.md | 2 +- embeddings/src/main.rs | 10 +++++----- llama-cpp-2/Cargo.toml | 4 ++-- llama-cpp-2/src/lib.rs | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- llama-cpp-sys-2/README.md | 4 ++-- llama-cpp-sys-2/llama.cpp | 2 +- simple/Cargo.toml | 2 +- simple/src/main.rs | 10 +++++----- test-build.Dockerfile | 4 ++-- 11 files changed, 24 insertions(+), 23 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 303c8736..6af8201e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -494,6 +494,7 @@ version = "0.1.54" dependencies = [ "bindgen", "cc", + "once_cell", ] [[package]] @@ -559,9 +560,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" diff --git a/README.md b/README.md index e39c5eae..a4366bdb 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ git clone --recursive https://github.com/utilityai/llama-cpp-rs cd llama-cpp-rs ``` -Run the simple example (add `--featues cublas` if you have a cuda gpu) +Run the simple example (add `--featues cuda` if you have a cuda gpu) ```bash cargo run --release --bin simple "The way to kill a linux process is" hf-model TheBloke/Llama-2-7B-GGUF llama-2-7b.Q4_K_M.gguf diff --git a/embeddings/src/main.rs b/embeddings/src/main.rs index f0514b1e..5f5b3fa8 100644 --- a/embeddings/src/main.rs +++ b/embeddings/src/main.rs @@ -20,8 +20,8 @@ use llama_cpp_2::ggml_time_us; use llama_cpp_2::llama_backend::LlamaBackend; use llama_cpp_2::llama_batch::LlamaBatch; use llama_cpp_2::model::params::LlamaModelParams; -use llama_cpp_2::model::{AddBos, Special}; use llama_cpp_2::model::LlamaModel; +use llama_cpp_2::model::{AddBos, Special}; #[derive(clap::Parser, Debug, Clone)] struct Args { @@ -35,7 +35,7 @@ struct Args { #[clap(short)] normalise: bool, /// Disable offloading layers to the gpu - #[cfg(feature = "cublas")] + #[cfg(feature = "cuda")] #[clap(long)] disable_gpu: bool, } @@ -78,7 +78,7 @@ fn main() -> Result<()> { model, prompt, normalise, - #[cfg(feature = "cublas")] + #[cfg(feature = "cuda")] disable_gpu, } = Args::parse(); @@ -87,13 +87,13 @@ fn main() -> Result<()> { // offload all layers to the gpu let model_params = { - #[cfg(feature = "cublas")] + #[cfg(feature = "cuda")] if !disable_gpu { LlamaModelParams::default().with_n_gpu_layers(1000) } else { LlamaModelParams::default() } - #[cfg(not(feature = "cublas"))] + #[cfg(not(feature = "cuda"))] LlamaModelParams::default() }; diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 9e66935c..24cd44fa 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -14,7 +14,7 @@ thiserror = { workspace = true } tracing = { workspace = true } [features] -cublas = ["llama-cpp-sys-2/cublas"] +cuda = ["llama-cpp-sys-2/cuda"] metal = ["llama-cpp-sys-2/metal"] sampler = [] @@ -25,4 
+25,4 @@ llama-cpp-sys-2 = { path = "../llama-cpp-sys-2", features=["metal"], version = " workspace = true [package.metadata.docs.rs] -features = ["sampler"] \ No newline at end of file +features = ["sampler"] diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 95384a93..4603bd7c 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -11,7 +11,7 @@ //! //! # Feature Flags //! -//! - `cublas` enables CUDA gpu support. +//! - `cuda` enables CUDA gpu support. //! - `sampler` adds the [`context::sample::sampler`] struct for a more rusty way of sampling. use std::ffi::NulError; use std::fmt::Debug; diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 58c2d541..41d335db 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -51,6 +51,6 @@ cc = { workspace = true, features = ["parallel"] } once_cell = "1.19.0" [features] -cublas = [] +cuda = [] metal = [] diff --git a/llama-cpp-sys-2/README.md b/llama-cpp-sys-2/README.md index 88981c50..69dd4733 100644 --- a/llama-cpp-sys-2/README.md +++ b/llama-cpp-sys-2/README.md @@ -1,5 +1,5 @@ # llama-cpp-sys -Raw bindings to llama.cpp with cublas support. +Raw bindings to llama.cpp with cuda support. -See [llama-cpp-2](https://crates.io/crates/llama-cpp-2) for a safe API. \ No newline at end of file +See [llama-cpp-2](https://crates.io/crates/llama-cpp-2) for a safe API. diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp index 0541f062..917dc8cf 160000 --- a/llama-cpp-sys-2/llama.cpp +++ b/llama-cpp-sys-2/llama.cpp @@ -1 +1 @@ -Subproject commit 0541f06296753dbc59a57379eb54cec865a4c9f9 +Subproject commit 917dc8cfa67a72fb7c8bf7392270da3bf4833af4 diff --git a/simple/Cargo.toml b/simple/Cargo.toml index 4fe31ce8..dae758de 100644 --- a/simple/Cargo.toml +++ b/simple/Cargo.toml @@ -13,7 +13,7 @@ anyhow = { workspace = true } encoding_rs = { workspace = true } [features] -cublas = ["llama-cpp-2/cublas"] +cuda = ["llama-cpp-2/cuda"] metal = ["llama-cpp-2/metal"] [lints] diff --git a/simple/src/main.rs b/simple/src/main.rs index 8dd17b2c..58e09b45 100644 --- a/simple/src/main.rs +++ b/simple/src/main.rs @@ -15,8 +15,8 @@ use llama_cpp_2::llama_backend::LlamaBackend; use llama_cpp_2::llama_batch::LlamaBatch; use llama_cpp_2::model::params::kv_overrides::ParamOverrideValue; use llama_cpp_2::model::params::LlamaModelParams; -use llama_cpp_2::model::{AddBos, Special}; use llama_cpp_2::model::LlamaModel; +use llama_cpp_2::model::{AddBos, Special}; use llama_cpp_2::token::data_array::LlamaTokenDataArray; use std::ffi::CString; use std::io::Write; @@ -44,7 +44,7 @@ struct Args { #[arg(short = 'o', value_parser = parse_key_val)] key_value_overrides: Vec<(String, ParamOverrideValue)>, /// Disable offloading layers to the gpu - #[cfg(feature = "cublas")] + #[cfg(feature = "cuda")] #[clap(long)] disable_gpu: bool, #[arg(short = 's', long, help = "RNG seed (default: 1234)")] @@ -123,7 +123,7 @@ fn main() -> Result<()> { model, prompt, file, - #[cfg(feature = "cublas")] + #[cfg(feature = "cuda")] disable_gpu, key_value_overrides, seed, @@ -137,13 +137,13 @@ fn main() -> Result<()> { // offload all layers to the gpu let model_params = { - #[cfg(feature = "cublas")] + #[cfg(feature = "cuda")] if !disable_gpu { LlamaModelParams::default().with_n_gpu_layers(1000) } else { LlamaModelParams::default() } - #[cfg(not(feature = "cublas"))] + #[cfg(not(feature = "cuda"))] LlamaModelParams::default() }; diff --git a/test-build.Dockerfile b/test-build.Dockerfile index da82218e..8540d2f9 100644 --- 
a/test-build.Dockerfile +++ b/test-build.Dockerfile @@ -8,10 +8,10 @@ RUN curl https://sh.rustup.rs -sSf | bash -s -- -y ENV PATH=/root/.cargo/bin:$PATH COPY . . -RUN cargo build --bin simple --features cublas +RUN cargo build --bin simple --features cuda FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} as base-cuda-runtime COPY --from=base-cuda /target/debug/simple /usr/local/bin/simple -ENTRYPOINT ["/usr/local/bin/simple"] \ No newline at end of file +ENTRYPOINT ["/usr/local/bin/simple"] From 09c7b612cf2945e47f27c6f57b6f005eac49000e Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Tue, 4 Jun 2024 19:00:15 -0700 Subject: [PATCH 3/3] Update build script to remove cudarc --- llama-cpp-sys-2/build.rs | 29 +++++++++++++++++++++-------- llama-cpp-sys-2/src/lib.rs | 10 ---------- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 81e8029a..7da89866 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -413,8 +413,6 @@ fn compile_hipblas(cx: &mut Build, cxx: &mut Build, mut hip: Build) -> &'static fn compile_cuda(cx: &mut Build, cxx: &mut Build, featless_cxx: Build) -> &'static str { println!("Compiling CUDA GGML.."); - // CUDA gets linked through the cudarc crate. - cx.define("GGML_USE_CUDA", None); cxx.define("GGML_USE_CUDA", None); @@ -425,12 +423,27 @@ fn compile_cuda(cx: &mut Build, cxx: &mut Build, featless_cxx: Build) -> &'stati .define("K_QUANTS_PER_ITERATION", Some("2")) .define("GGML_CUDA_PEER_MAX_BATCH_SIZE", Some("128")); - if cfg!(target_os = "linux") { - nvcc.flag("-Wno-pedantic"); - // TODO Are these links needed? - println!("cargo:rustc-link-lib=pthread"); - println!("cargo:rustc-link-lib=dl"); - println!("cargo:rustc-link-lib=rt"); + // if cfg!(target_os = "linux") { + // nvcc.flag("-Wno-pedantic"); + // } + + for lib in [ + "cuda", "cublas", "culibos", "cudart", "cublasLt", "pthread", "dl", "rt", + ] { + println!("cargo:rustc-link-lib={}", lib); + } + if !nvcc.get_compiler().is_like_msvc() { + for lib in ["culibos", "pthread", "dl", "rt"] { + println!("cargo:rustc-link-lib={}", lib); + } + } + + println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); + + if nvcc.get_compiler().is_like_msvc() { + nvcc.std("c++14"); + } else { + nvcc.flag("-std=c++11").std("c++11"); } if cfg!(feature = "cuda_dmmv") { diff --git a/llama-cpp-sys-2/src/lib.rs b/llama-cpp-sys-2/src/lib.rs index d1ef8abb..f91bb3a7 100644 --- a/llama-cpp-sys-2/src/lib.rs +++ b/llama-cpp-sys-2/src/lib.rs @@ -4,14 +4,4 @@ #![allow(non_camel_case_types)] #![allow(non_snake_case)] -// [`ash`] is only included to link to the Vulkan SDK. -#[allow(unused)] -#[cfg(feature = "vulkan")] -use ash; - -// [`cudarc`] is only included to link to CUDA. -#[allow(unused)] -#[cfg(feature = "cuda")] -use cudarc; - include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
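
Usage note for the series: after these patches the GPU feature flag is `cuda` (previously `cublas`), and the CUDA libraries (`cuda`, `cublas`, `cudart`, `cublasLt`, and on non-MSVC toolchains `culibos`, `pthread`, `dl`, `rt`) are linked directly from `build.rs` via `cargo:rustc-link-lib` directives with `/usr/local/cuda/lib64` on the native search path, rather than through the `cudarc` crate. A downstream binary is built with e.g. `cargo build --release --bin simple --features cuda`. The sketch below is a minimal illustration of how such a binary can gate GPU offload on the renamed feature; it mirrors the `simple` and `embeddings` examples touched above, and the layer count of 1000 simply means "offload all layers" rather than being required by the patch.

```rust
use llama_cpp_2::model::params::LlamaModelParams;

fn main() {
    // Offload all layers to the GPU when built with `--features cuda`,
    // otherwise fall back to a CPU-only configuration.
    let model_params = {
        #[cfg(feature = "cuda")]
        {
            LlamaModelParams::default().with_n_gpu_layers(1000)
        }
        #[cfg(not(feature = "cuda"))]
        LlamaModelParams::default()
    };

    // ... pass `model_params` to the model-loading call, as the examples above do ...
    let _ = model_params;
}
```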