From 1f69c90dd4822ebf34bfac6ae988f16232d71284 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Sat, 1 Jun 2024 10:36:21 -0700 Subject: [PATCH 1/3] Chnaged out the build file --- llama-cpp-sys-2/Cargo.toml | 1 + llama-cpp-sys-2/build.rs | 1227 ++++++++++++++++++++++++++++-------- llama-cpp-sys-2/src/lib.rs | 31 +- 3 files changed, 982 insertions(+), 277 deletions(-) diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 4d06e522..ed28515a 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -48,6 +48,7 @@ include = [ [build-dependencies] bindgen = { workspace = true } cc = { workspace = true, features = ["parallel"] } +once_cell = "1.19.0" [features] cublas = [] diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 471235b3..81e8029a 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -1,322 +1,1039 @@ use std::env; -use std::ffi::OsStr; -use std::path::Path; -use std::path::PathBuf; +use std::fs::{read_dir, File}; +use std::io::Write; +use std::path::{Path, PathBuf}; +use std::process::Command; -fn main() { - println!("cargo:rerun-if-changed=llama.cpp"); +use cc::Build; +use once_cell::sync::Lazy; - let cublas_enabled = env::var("CARGO_FEATURE_CUBLAS").is_ok(); +// This build file is based on: +// https://github.com/mdrokz/rust-llama.cpp/blob/master/build.rs +// License MIT +// 12-2-2024 - let mut ggml_cuda = if cublas_enabled { - Some(cc::Build::new()) - } else { - None - }; +#[cfg(all( + feature = "metal", + any( + feature = "cuda", + feature = "blas", + feature = "hipblas", + feature = "clblast", + feature = "vulkan" + ) +))] +compile_error!("feature \"metal\" cannot be enabled alongside other GPU based features"); + +#[cfg(all( + feature = "cuda", + any( + feature = "metal", + feature = "blas", + feature = "hipblas", + feature = "clblast", + feature = "vulkan" + ) +))] +compile_error!("feature \"cuda\" cannot be enabled alongside other GPU based features"); + +#[cfg(all( + feature = "blas", + any( + feature = "cuda", + feature = "metal", + feature = "hipblas", + feature = "clblast", + feature = "vulkan" + ) +))] +compile_error!("feature \"blas\" cannot be enabled alongside other GPU based features"); + +#[cfg(all( + feature = "hipblas", + any( + feature = "cuda", + feature = "blas", + feature = "metal", + feature = "clblast", + feature = "vulkan" + ) +))] +compile_error!("feature \"hipblas\" cannot be enabled alongside other GPU based features"); - if !Path::new("llama.cpp/ggml.c").exists() { - panic!("llama.cpp seems to not be populated, try running `git submodule update --init --recursive` to init.") +#[cfg(all( + feature = "clblast", + any( + feature = "cuda", + feature = "blas", + feature = "hipblas", + feature = "metal", + feature = "vulkan" + ) +))] +compile_error!("feature \"clblas\" cannot be enabled alongside other GPU based features"); + +#[cfg(all( + feature = "vulkan", + any( + feature = "cuda", + feature = "blas", + feature = "hipblas", + feature = "clblast", + feature = "metal" + ) +))] +compile_error!("feature \"vulkan\" cannot be enabled alongside other GPU based features"); + +static LLAMA_PATH: Lazy = Lazy::new(|| PathBuf::from("./llama.cpp")); + +fn compile_bindings(out_path: &Path) { + println!("Generating bindings.."); + let bindings = bindgen::Builder::default() + .header(LLAMA_PATH.join("ggml.h").to_string_lossy()) + .header(LLAMA_PATH.join("llama.h").to_string_lossy()) + .derive_partialeq(true) + .allowlist_function("ggml_.*") + 
.allowlist_type("ggml_.*") + .allowlist_function("llama_.*") + .allowlist_type("llama_.*") + .prepend_enum_name(false); + + #[cfg(all( + feature = "compat", + not(any(target_os = "macos", target_os = "ios", target_os = "dragonfly")) + ))] + { + bindings = bindings.parse_callbacks(Box::new(GGMLLinkRename {})); } - let mut ggml = cc::Build::new(); - let mut llama_cpp = cc::Build::new(); + let bindings = bindings.generate().expect("Unable to generate bindings"); + + bindings + .write_to_file(out_path.join("bindings.rs")) + .expect("Couldn't write bindings!"); +} - ggml.cpp(false); - llama_cpp.cpp(true); +#[cfg(all( + feature = "compat", + not(any(target_os = "macos", target_os = "ios", target_os = "dragonfly")) +))] +#[derive(Debug)] +struct GGMLLinkRename {} - // CMakeFiles.txt: set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeline parallelism") - // get LLAMA_SCHED_MAX_COPIES from env, default to 4 - let mut max_copies = "4".to_owned(); - if let Ok(env_max_copies) = env::var("LLAMA_SCHED_MAX_COPIES") { - if let Ok(v) = env_max_copies.parse::() { - if v > 0 { - max_copies = env_max_copies; - } - } - } - ggml.define("GGML_SCHED_MAX_COPIES", Some(max_copies.as_str())); - - // https://github.com/ggerganov/llama.cpp/blob/a836c8f534ab789b02da149fbdaf7735500bff74/Makefile#L364-L368 - if let Some(ggml_cuda) = &mut ggml_cuda { - for lib in [ - "cuda", "cublas", "culibos", "cudart", "cublasLt", "pthread", "dl", "rt", - ] { - println!("cargo:rustc-link-lib={}", lib); - } - if !ggml_cuda.get_compiler().is_like_msvc() { - for lib in ["culibos", "pthread", "dl", "rt"] { - println!("cargo:rustc-link-lib={}", lib); +#[cfg(all( + feature = "compat", + not(any(target_os = "macos", target_os = "ios", target_os = "dragonfly")) +))] +impl ParseCallbacks for GGMLLinkRename { + fn generated_link_name_override(&self, item_info: ItemInfo<'_>) -> Option { + match item_info.kind { + ItemKind::Function => { + if item_info.name.starts_with("ggml_") { + Some(format!("{PREFIX}{}", item_info.name)) + } else { + None + } } + _ => None, } + } +} + +/// Add platform appropriate flags and definitions present in all compilation configurations. 
+fn push_common_flags(cx: &mut Build, cxx: &mut Build) { + cx.static_flag(true) + .cpp(false) + .define("GGML_SCHED_MAX_COPIES", "4"); + cxx.static_flag(true) + .cpp(true) + .define("GGML_SCHED_MAX_COPIES", "4"); + + if !cfg!(debug_assertions) { + cx.define("NDEBUG", None); + cxx.define("NDEBUG", None); + } else { + cx.define("GGML_DEBUG", "100"); + cxx.define("GGML_DEBUG", "100"); - println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); - - if cfg!(target_arch = "aarch64") { - ggml_cuda - .flag_if_supported("-mfp16-format=ieee") - .flag_if_supported("-mno-unaligned-access"); - ggml.flag_if_supported("-mfp16-format=ieee") - .flag_if_supported("-mno-unaligned-access"); - llama_cpp - .flag_if_supported("-mfp16-format=ieee") - .flag_if_supported("-mno-unaligned-access"); - ggml.flag_if_supported("-mfp16-format=ieee") - .flag_if_supported("-mno-unaligned-access"); + if cfg!(target_os = "linux") { + cx.define("_GLIBCXX_ASSERTIONS", None); + cxx.define("_GLIBCXX_ASSERTIONS", None); + } else if cfg!(target_os = "windows") { + cx.define("_CRT_SECURE_NO_WARNINGS", None); + cxx.define("_CRT_SECURE_NO_WARNINGS", None); } + } - ggml_cuda - .cuda(true) - .flag("-arch=all") - .file("llama.cpp/ggml-cuda.cu") - .files(std::fs::read_dir("llama.cpp/ggml-cuda") - .expect("failed to read 'llama.cpp/ggml-cuda'") - .map(|e| e.expect("failed to ready entry").path()) - .filter(|p| p.extension().is_some_and(|it| it == OsStr::new("cu"))) - ) - .include("llama.cpp/ggml-cuda") - .include("llama.cpp"); - - if ggml_cuda.get_compiler().is_like_msvc() { - ggml_cuda.std("c++14"); - } else { - ggml_cuda.flag("-std=c++11").std("c++11"); + if cfg!(target_os = "openbsd") { + cx.define("_XOPEN_SOURCE", "700"); + cxx.define("_XOPEN_SOURCE", "700"); + } else { + cx.define("_XOPEN_SOURCE", "600"); + cxx.define("_XOPEN_SOURCE", "600"); + } + + if cfg!(target_os = "linux") { + cx.define("_GNU_SOURCE", None); + cxx.define("_GNU_SOURCE", None); + } else if cfg!(any( + target_os = "macos", + target_os = "ios", + target_os = "dragonfly" + )) { + cx.define("_DARWIN_C_SOURCE", None); + cxx.define("_DARWIN_C_SOURCE", None); + } else if cfg!(target_os = "openbsd") { + cx.define("_BSD_SOURCE", None); + cxx.define("_BSD_SOURCE", None); + } else if cfg!(target_os = "freebsd") { + cx.define("__BSD_VISIBLE", None); + cxx.define("__BSD_VISIBLE", None); + } else if cfg!(target_os = "netbsd") { + cx.define("_NETBSD_SOURCE", None); + cxx.define("_NETBSD_SOURCE", None); + } + + if cfg!(any(target_arch = "arm", target_arch = "aarch64")) { + if cfg!(target_family = "unix") { + // cx.flag("-mavx512vnni").flag("-mfp16-format=ieee"); + // cxx.flag("-mavx512vnni").flag("-mfp16-format=ieee"); + } else if cfg!(target_family = "windows") { + cx.define("__ARM_NEON", None) + .define("__ARM_FEATURE_FMA", None) + .define("__ARM_FEATURE_DOTPROD", None) + .define("__aarch64__", None); + cxx.define("__ARM_NEON", None) + .define("__ARM_FEATURE_FMA", None) + .define("__ARM_FEATURE_DOTPROD", None) + .define("__aarch64__", None); } + } +} - ggml.define("GGML_USE_CUDA", None); - ggml_cuda.define("GGML_USE_CUDA", None); - llama_cpp.define("GGML_USE_CUDA", None); +/// Add platform appropriate flags and definitions for compilation warnings. 
+fn push_warn_flags(cx: &mut Build, cxx: &mut Build) { + if cfg!(target_family = "unix") { + cx.flag("-pthread") + .flag("-Wall") + .flag("-Wextra") + .flag("-Wpedantic") + .flag("-Wcast-qual") + .flag("-Wdouble-promotion") + .flag("-Wshadow") + .flag("-Wstrict-prototypes") + .flag("-Wpointer-arith"); + cxx.flag("-fPIC") + .flag("-pthread") + .flag("-Wall") + .flag("-Wdeprecated-declarations") + .flag("-Wextra") + .flag("-Wpedantic") + .flag("-Wcast-qual") + .flag("-Wno-unused-function") + .flag("-Wno-multichar"); + } else if cfg!(target_family = "windows") { + cx.flag("/W4") + .flag("/Wall") + .flag("/wd4820") + .flag("/wd4710") + .flag("/wd4711") + .flag("/wd4820") + .flag("/wd4514"); + cxx.flag("/W4") + .flag("/Wall") + .flag("/wd4820") + .flag("/wd4710") + .flag("/wd4711") + .flag("/wd4820") + .flag("/wd4514"); } +} - for build in [&mut ggml, &mut llama_cpp] { - let compiler = build.get_compiler(); +/// Add platform appropriate flags and definitions based on enabled features. +fn push_feature_flags(cx: &mut Build, cxx: &mut Build) { + // TODO in llama.cpp's cmake (https://github.com/ggerganov/llama.cpp/blob/9ecdd12e95aee20d6dfaf5f5a0f0ce5ac1fb2747/CMakeLists.txt#L659), they include SIMD instructions manually, however it doesn't seem to be necessary for VS2022's MSVC, check when it is needed - if cfg!(target_arch = "i686") || cfg!(target_arch = "x86_64") { - let features = x86::Features::get_target(); - if compiler.is_like_clang() || compiler.is_like_gnu() { - build.flag("-pthread"); + if cfg!(any(target_arch = "x86", target_arch = "x86_64")) { + if cfg!(feature = "native") && cfg!(target_os = "linux") { + cx.flag("-march=native"); + cxx.flag("-march=native"); + } - if features.avx { - build.flag("-mavx"); - } - if features.avx2 { - build.flag("-mavx2"); - } - if features.fma { - build.flag("-mfma"); + if cfg!(feature = "fma") && cfg!(target_family = "unix") { + cx.flag("-mfma"); + cxx.flag("-mfma"); + } + + if cfg!(feature = "f16c") && cfg!(target_family = "unix") { + cx.flag("-mf16c"); + cxx.flag("-mf16c"); + } + + if cfg!(target_family = "unix") { + if cfg!(feature = "avx512") { + cx.flag("-mavx512f").flag("-mavx512bw"); + cxx.flag("-mavx512f").flag("-mavx512bw"); + + if cfg!(feature = "avx512_vmbi") { + cx.flag("-mavx512vbmi"); + cxx.flag("-mavx512vbmi"); } - if features.f16c { - build.flag("-mf16c"); + + if cfg!(feature = "avx512_vnni") { + cx.flag("-mavx512vnni"); + cxx.flag("-mavx512vnni"); } - if features.sse3 { - build.flag("-msse3"); + } + + if cfg!(feature = "avx2") { + cx.flag("-mavx2"); + cxx.flag("-mavx2"); + } + + if cfg!(feature = "avx") { + cx.flag("-mavx"); + cxx.flag("-mavx"); + } + } else if cfg!(target_family = "windows") { + if cfg!(feature = "avx512") { + cx.flag("/arch:AVX512"); + cxx.flag("/arch:AVX512"); + + if cfg!(feature = "avx512_vmbi") { + cx.define("__AVX512VBMI__", None); + cxx.define("__AVX512VBMI__", None); } - } else if compiler.is_like_msvc() { - match (features.avx2, features.avx) { - (true, _) => { - build.flag("/arch:AVX2"); - } - (_, true) => { - build.flag("/arch:AVX"); - } - _ => {} + + if cfg!(feature = "avx512_vnni") { + cx.define("__AVX512VNNI__", None); + cxx.define("__AVX512VNNI__", None); } + } else if cfg!(feature = "avx2") { + cx.flag("/arch:AVX2"); + cxx.flag("/arch:AVX2"); + } else if cfg!(feature = "avx") { + cx.flag("/arch:AVX"); + cxx.flag("/arch:AVX"); } - } else if cfg!(target_arch = "aarch64") - && (compiler.is_like_clang() || compiler.is_like_gnu()) - { - if cfg!(target_os = "macos") { - build.flag("-mcpu=apple-m1"); - } else 
if env::var("HOST") == env::var("TARGET") { - build.flag("-mcpu=native"); - } - build.flag("-pthread"); } } +} - // https://github.com/ggerganov/llama.cpp/blob/191221178f51b6e81122c5bda0fd79620e547d07/Makefile#L133-L141 - if cfg!(target_os = "macos") { - assert!(!cublas_enabled, "CUBLAS is not supported on macOS"); +fn compile_opencl(cx: &mut Build, cxx: &mut Build) { + println!("Compiling OpenCL GGML.."); - let metal_enabled = env::var("CARGO_FEATURE_METAL").is_ok(); + // TODO + println!("cargo:warning=OpenCL compilation and execution has not been properly tested yet"); - println!("cargo:rustc-link-lib=framework=Foundation"); - if metal_enabled { - println!("cargo:rustc-link-lib=framework=Metal"); - println!("cargo:rustc-link-lib=framework=MetalPerformanceShaders"); - println!("cargo:rustc-link-lib=framework=MetalKit"); - } + cx.define("GGML_USE_CLBLAST", None); + cxx.define("GGML_USE_CLBLAST", None); - llama_cpp.define("_DARWIN_C_SOURCE", None); + if cfg!(target_os = "linux") { + println!("cargo:rustc-link-lib=OpenCL"); + println!("cargo:rustc-link-lib=clblast"); + } else if cfg!(target_os = "macos") { + println!("cargo:rustc-link-lib=framework=OpenCL"); + println!("cargo:rustc-link-lib=clblast"); + } - // https://github.com/ggerganov/llama.cpp/blob/3c0d25c4756742ebf15ad44700fabc0700c638bd/Makefile#L340-L343 - if metal_enabled { - llama_cpp.define("GGML_USE_METAL", None); - } - llama_cpp.define("GGML_USE_ACCELERATE", None); - llama_cpp.define("ACCELERATE_NEW_LAPACK", None); - llama_cpp.define("ACCELERATE_LAPACK_ILP64", None); - println!("cargo:rustc-link-lib=framework=Accelerate"); - - if metal_enabled { - metal_hack(&mut ggml); - ggml.include("./llama.cpp/ggml-metal.h"); - } + cxx.file(LLAMA_PATH.join("ggml-opencl.cpp")); +} + +fn compile_openblas(cx: &mut Build) { + println!("Compiling OpenBLAS GGML.."); + + // TODO + println!("cargo:warning=OpenBlas compilation and execution has not been properly tested yet"); + + cx.define("GGML_USE_OPENBLAS", None) + .include("/usr/local/include/openblas") + .include("/usr/local/include/openblas"); + println!("cargo:rustc-link-lib=openblas"); +} + +fn compile_blis(cx: &mut Build) { + println!("Compiling BLIS GGML.."); + + // TODO + println!("cargo:warning=Blis compilation and execution has not been properly tested yet"); + + cx.define("GGML_USE_OPENBLAS", None) + .include("/usr/local/include/blis") + .include("/usr/local/include/blis"); + println!("cargo:rustc-link-search=native=/usr/local/lib"); + println!("cargo:rustc-link-lib=blis"); +} + +fn compile_hipblas(cx: &mut Build, cxx: &mut Build, mut hip: Build) -> &'static str { + const DEFAULT_ROCM_PATH_STR: &str = "/opt/rocm/"; + + let rocm_path_str = env::var("ROCM_PATH") + .map_err(|_| DEFAULT_ROCM_PATH_STR.to_string()) + .unwrap(); + println!("Compiling HIPBLAS GGML. 
Using ROCm from {rocm_path_str}"); + + let rocm_path = PathBuf::from(rocm_path_str); + let rocm_include = rocm_path.join("include"); + let rocm_lib = rocm_path.join("lib"); + let rocm_hip_bin = rocm_path.join("bin/hipcc"); + + let cuda_lib = "ggml-cuda"; + let cuda_file = cuda_lib.to_string() + ".cu"; + let cuda_header = cuda_lib.to_string() + ".h"; + + let defines = ["GGML_USE_HIPBLAS", "GGML_USE_CUBLAS"]; + for def in defines { + cx.define(def, None); + cxx.define(def, None); } - if cfg!(target_os = "dragonfly") { - llama_cpp.define("__BSD_VISIBLE", None); + cx.include(&rocm_include); + cxx.include(&rocm_include); + + hip.compiler(rocm_hip_bin) + .std("c++11") + .file(LLAMA_PATH.join(cuda_file)) + .include(LLAMA_PATH.join(cuda_header)) + .define("GGML_USE_HIPBLAS", None) + .compile(cuda_lib); + + println!( + "cargo:rustc-link-search=native={}", + rocm_lib.to_string_lossy() + ); + + let rocm_libs = ["hipblas", "rocblas", "amdhip64"]; + for lib in rocm_libs { + println!("cargo:rustc-link-lib={lib}"); } + cuda_lib +} + +fn compile_cuda(cx: &mut Build, cxx: &mut Build, featless_cxx: Build) -> &'static str { + println!("Compiling CUDA GGML.."); + + // CUDA gets linked through the cudarc crate. + + cx.define("GGML_USE_CUDA", None); + cxx.define("GGML_USE_CUDA", None); + + let mut nvcc = featless_cxx; + nvcc.cuda(true) + .flag("--forward-unknown-to-host-compiler") + .flag("-arch=all") + .define("K_QUANTS_PER_ITERATION", Some("2")) + .define("GGML_CUDA_PEER_MAX_BATCH_SIZE", Some("128")); + if cfg!(target_os = "linux") { - ggml.define("_GNU_SOURCE", None); + nvcc.flag("-Wno-pedantic"); + // TODO Are these links needed? + println!("cargo:rustc-link-lib=pthread"); + println!("cargo:rustc-link-lib=dl"); + println!("cargo:rustc-link-lib=rt"); } - ggml.std("c11") - .include("./llama.cpp") - .file("llama.cpp/ggml.c") - .file("llama.cpp/ggml-alloc.c") - .file("llama.cpp/ggml-backend.c") - .file("llama.cpp/ggml-quants.c") - .define("GGML_USE_K_QUANTS", None); - - llama_cpp - .define("_XOPEN_SOURCE", Some("600")) - .include("llama.cpp") - .std("c++11") - .file("llama.cpp/llama.cpp") - .file("llama.cpp/unicode.cpp") - .file("llama.cpp/unicode-data.cpp"); - - // Remove debug log output from `llama.cpp` - let is_release = env::var("PROFILE").unwrap() == "release"; - if is_release { - ggml.define("NDEBUG", None); - llama_cpp.define("NDEBUG", None); - if let Some(cuda) = ggml_cuda.as_mut() { - cuda.define("NDEBUG", None); - } + if cfg!(feature = "cuda_dmmv") { + nvcc.define("GGML_CUDA_FORCE_DMMV", None) + .define("GGML_CUDA_DMMV_X", Some("32")) + .define("GGML_CUDA_MMV_Y", Some("1")); + } - ggml.opt_level(3); - llama_cpp.opt_level(3); + if cfg!(feature = "cuda_mmq") { + nvcc.define("GGML_CUDA_FORCE_MMQ", None); } - if let Some(ggml_cuda) = ggml_cuda { - eprintln!("compiling ggml-cuda"); - ggml_cuda.compile("ggml-cuda"); - eprintln!("compiled ggml-cuda"); + let lib_name = "ggml-cuda"; + let cuda_path = LLAMA_PATH.join("ggml-cuda"); + let cuda_sources = read_dir(cuda_path.as_path()) + .unwrap() + .map(|f| f.unwrap()) + .filter(|entry| entry.file_name().to_string_lossy().ends_with(".cu")) + .map(|entry| entry.path()); + + nvcc.include(cuda_path.as_path()) + .include(LLAMA_PATH.as_path()) + .files(cuda_sources) + .file(LLAMA_PATH.join("ggml-cuda.cu")) + .compile(lib_name); + + lib_name +} + +fn compile_metal(cx: &mut Build, cxx: &mut Build) { + println!("Compiling Metal GGML.."); + + cx.define("GGML_USE_METAL", None); + cxx.define("GGML_USE_METAL", None); + + cx.define("GGML_METAL_EMBED_LIBRARY", None); + 
cxx.define("GGML_METAL_EMBED_LIBRARY", None); + + if !cfg!(debug_assertions) { + cx.define("GGML_METAL_NDEBUG", None); } - eprintln!("compiling ggml"); - ggml.compile("ggml"); - eprintln!("compiled ggml"); + // It's idiomatic to use OUT_DIR for intermediate c/c++ artifacts + let out_dir = env::var("OUT_DIR").unwrap(); - eprintln!("compiling llama"); - llama_cpp.compile("llama"); - eprintln!("compiled llama"); + let ggml_metal_shader_path = LLAMA_PATH.join("ggml-metal.metal"); - let header = "llama.cpp/llama.h"; + // Create a temporary assembly file that will allow for static linking to the metal shader. + let ggml_metal_embed_assembly_path = PathBuf::from(&out_dir).join("ggml-metal-embed.asm"); + let mut ggml_metal_embed_assembly_file = File::create(&ggml_metal_embed_assembly_path) + .expect("Failed to open ggml-metal-embed.asm file"); - println!("cargo:rerun-if-changed={header}"); + let ggml_metal_shader_out_path = PathBuf::from(&out_dir).join("ggml-metal.metal"); + let common = LLAMA_PATH.join("ggml-common.h"); - let bindings = bindgen::builder() - .header(header) - .derive_partialeq(true) - .no_debug("llama_grammar_element") - .prepend_enum_name(false) - .derive_eq(true) - .generate() - .expect("failed to generate bindings for llama.cpp"); + let input_file = File::open(ggml_metal_shader_path).expect("Failed to open input file"); + let mut output_file = + File::create(&ggml_metal_shader_out_path).expect("Failed to create output file"); - let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); - bindings - .write_to_file(out_path.join("bindings.rs")) - .expect("failed to write bindings to file"); - let llama_cpp_dir = PathBuf::from("llama.cpp").canonicalize().unwrap(); - println!("cargo:INCLUDE={}", llama_cpp_dir.to_str().unwrap()); - println!("cargo:OUT_DIR={}", out_path.to_str().unwrap()); + let output = Command::new("sed") + .arg("-e") + .arg(format!( + "/#include \"ggml-common.h\"/r {}", + common.to_string_lossy() + )) + .arg("-e") + .arg("/#include \"ggml-common.h\"/d") + .stdin(input_file) + .stdout(output_file) + .output() + .expect("Failed to execute command"); + if !output.status.success() { + panic!( + "An error has occurred while embedding common file ({}):\n{}", + output.status, + String::from_utf8_lossy(&output.stderr) + ); + } + + // The contents of this file is directly copied from the llama.cpp Makefile + let ggml_metal_embed_assembly_code = format!( + ".section __DATA, __ggml_metallib\n\ + .globl _ggml_metallib_start\n\ + _ggml_metallib_start:\n\ + .incbin \"{}\"\n\ + .globl _ggml_metallib_end\n\ + _ggml_metallib_end:\n", + ggml_metal_shader_out_path + .to_str() + .expect("Failed to convert path to string") + ); + + write!( + ggml_metal_embed_assembly_file, + "{}", + ggml_metal_embed_assembly_code + ) + .expect("Failed to write ggml metal embed assembly code"); + + // Assemble the ggml metal embed code. + let ggml_metal_embed_object_path = PathBuf::from(&out_dir).join("ggml-metal-embed.o"); + Command::new("as") + .arg(&ggml_metal_embed_assembly_path) + .arg("-o") + .arg(&ggml_metal_embed_object_path) + .status() + .expect("Failed to assemble ggml-metal-embed file"); + + // Create a static library for our metal embed code. 
+ let ggml_metal_embed_library_path = PathBuf::from(&out_dir).join("libggml-metal-embed.a"); + Command::new("ar") + .args(&[ + "crus", + ggml_metal_embed_library_path.to_str().unwrap(), + ggml_metal_embed_object_path.to_str().unwrap(), + ]) + .status() + .expect("Failed to create static library from ggml-metal-embed object file"); + + println!("cargo:rustc-link-lib=framework=Metal"); + println!("cargo:rustc-link-lib=framework=Foundation"); + println!("cargo:rustc-link-lib=framework=MetalPerformanceShaders"); + println!("cargo:rustc-link-lib=framework=MetalKit"); + + // Link to our new static library for our metal embed code. + println!("cargo:rustc-link-search=native={}", &out_dir); + println!("cargo:rustc-link-lib=static=ggml-metal-embed"); + + cx.include(LLAMA_PATH.join("ggml-metal.h")) + .file(LLAMA_PATH.join("ggml-metal.m")); } -// courtesy of https://github.com/rustformers/llm -fn metal_hack(build: &mut cc::Build) { - const GGML_METAL_METAL_PATH: &str = "llama.cpp/ggml-metal.metal"; - const GGML_METAL_PATH: &str = "llama.cpp/ggml-metal.m"; - const GGML_COMMON_PATH: &str = "llama.cpp/ggml-common.h"; - - let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is not defined")); - - let ggml_metal_path = { - let ggml_metal_metal = std::fs::read_to_string(GGML_METAL_METAL_PATH) - .expect("Could not read ggml-metal.metal") - .replace('\\', "\\\\") - .replace('\n', "\\n") - .replace('\r', "\\r") - .replace('\"', "\\\""); - - let ggml_common = std::fs::read_to_string(GGML_COMMON_PATH).expect("Could not read ggml-common.h") - .replace('\\', "\\\\") - .replace('\n', "\\n") - .replace('\r', "\\r") - .replace('\"', "\\\""); - - let includged_ggml_metal_metal = ggml_metal_metal.replace( - "#include \\\"ggml-common.h\\\"", - &format!("{ggml_common}") - ); - print!("{}", &includged_ggml_metal_metal); +fn compile_vulkan(cx: &mut Build, cxx: &mut Build) -> &'static str { + println!("Compiling Vulkan GGML.."); - let ggml_metal = - std::fs::read_to_string(GGML_METAL_PATH).expect("Could not read ggml-metal.m"); + // Vulkan gets linked through the ash crate. - let needle = r#"NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];"#; - if !ggml_metal.contains(needle) { - panic!("ggml-metal.m does not contain the needle to be replaced; the patching logic needs to be reinvestigated. 
Contact a `llama-cpp-sys-2` developer!"); - } + if cfg!(debug_assertions) { + cx.define("GGML_VULKAN_DEBUG", None) + .define("GGML_VULKAN_CHECK_RESULTS", None) + .define("GGML_VULKAN_VALIDATE", None); + cxx.define("GGML_VULKAN_DEBUG", None) + .define("GGML_VULKAN_CHECK_RESULTS", None) + .define("GGML_VULKAN_VALIDATE", None); + } + + cx.define("GGML_USE_VULKAN", None); + cxx.define("GGML_USE_VULKAN", None); + + let lib_name = "ggml-vulkan"; + + cxx.clone() + .include("./thirdparty/Vulkan-Headers/include/") + .include(LLAMA_PATH.as_path()) + .file(LLAMA_PATH.join("ggml-vulkan.cpp")) + .compile(lib_name); + + lib_name +} + +fn compile_ggml(mut cx: Build) { + println!("Compiling GGML.."); + cx.std("c11") + .include(LLAMA_PATH.as_path()) + .file(LLAMA_PATH.join("ggml.c")) + .file(LLAMA_PATH.join("ggml-alloc.c")) + .file(LLAMA_PATH.join("ggml-backend.c")) + .file(LLAMA_PATH.join("ggml-quants.c")) + .compile("ggml"); +} + +fn compile_llama(mut cxx: Build, _out_path: impl AsRef) { + println!("Compiling Llama.cpp.."); + cxx.std("c++11") + .include(LLAMA_PATH.as_path()) + .file(LLAMA_PATH.join("unicode.cpp")) + .file(LLAMA_PATH.join("unicode-data.cpp")) + .file(LLAMA_PATH.join("llama.cpp")) + .compile("llama"); +} - // Replace the runtime read of the file with a compile-time string - let ggml_metal = ggml_metal.replace( - needle, - &format!(r#"NSString * src = @"{includged_ggml_metal_metal}";"#), +fn main() { + if std::fs::read_dir(LLAMA_PATH.as_path()).is_err() { + panic!( + "Could not find {}. Did you forget to initialize submodules?", + LLAMA_PATH.display() ); + } + + let out_path = PathBuf::from(env::var("OUT_DIR").expect("No out dir found")); + + println!("cargo:rerun-if-changed={}", LLAMA_PATH.display()); - let patched_ggml_metal_path = out_dir.join("ggml-metal.m"); - std::fs::write(&patched_ggml_metal_path, ggml_metal) - .expect("Could not write temporary patched ggml-metal.m"); + compile_bindings(&out_path); - patched_ggml_metal_path + let mut cx = Build::new(); + let mut cxx = Build::new(); + + push_common_flags(&mut cx, &mut cxx); + + let featless_cxx = cxx.clone(); // mostly used for CUDA + + push_warn_flags(&mut cx, &mut cxx); + push_feature_flags(&mut cx, &mut cxx); + + let feat_lib = if cfg!(feature = "vulkan") { + Some(compile_vulkan(&mut cx, &mut cxx)) + } else if cfg!(feature = "cuda") { + Some(compile_cuda(&mut cx, &mut cxx, featless_cxx)) + } else if cfg!(feature = "opencl") { + compile_opencl(&mut cx, &mut cxx); + None + } else if cfg!(feature = "openblas") { + compile_openblas(&mut cx); + None + } else if cfg!(feature = "blis") { + compile_blis(&mut cx); + None + } else if cfg!(feature = "metal") && cfg!(target_os = "macos") { + compile_metal(&mut cx, &mut cxx); + None + } else if cfg!(feature = "hipblas") { + Some(compile_hipblas(&mut cx, &mut cxx, featless_cxx)) + } else { + None }; - build.file(ggml_metal_path); -} + compile_ggml(cx); + compile_llama(cxx, &out_path); -// Courtesy of https://github.com/rustformers/llm -fn get_supported_target_features() -> std::collections::HashSet { - env::var("CARGO_CFG_TARGET_FEATURE") - .unwrap() - .split(',') - .map(ToString::to_string) - .collect() + #[cfg(all( + feature = "compat", + not(any(target_os = "macos", target_os = "ios", target_os = "dragonfly")) + ))] + { + compat::redefine_symbols(out_path, feat_lib); + } } -mod x86 { - #[allow(clippy::struct_excessive_bools)] - #[derive(Clone, Debug, PartialEq, Eq)] - pub struct Features { - pub fma: bool, - pub avx: bool, - pub avx2: bool, - pub f16c: bool, - pub sse3: bool, +// MacOS will 
prefix all exported symbols with a leading underscore. +// Additionally, it seems that there are no collision issues when building with both llama and whisper crates, so the +// compat feature can be ignored. + +#[cfg(all( + feature = "compat", + not(any(target_os = "macos", target_os = "ios", target_os = "dragonfly")) +))] +mod compat { + use std::collections::HashSet; + use std::fmt::{Display, Formatter}; + use std::process::Command; + + use crate::*; + + pub fn redefine_symbols(out_path: impl AsRef, additional_lib: Option<&str>) { + let (ggml_lib_name, llama_lib_name) = lib_names(); + let (nm, objcopy) = tools(); + println!( + "Modifying {ggml_lib_name} and {llama_lib_name}, symbols acquired via \ + \"{nm}\" and modified via \"{objcopy}\"" + ); + + // Modifying symbols exposed by the ggml library + + let out_str = nm_symbols(&nm, ggml_lib_name, &out_path); + let symbols = get_symbols( + &out_str, + [ + Filter { + prefix: "ggml", + sym_type: 'T', + }, + Filter { + prefix: "ggml", + sym_type: 'U', + }, + Filter { + prefix: "ggml", + sym_type: 'B', + }, + Filter { + prefix: "gguf", + sym_type: 'T', + }, + Filter { + prefix: "quantize", + sym_type: 'T', + }, + Filter { + prefix: "dequantize", + sym_type: 'T', + }, + Filter { + prefix: "iq2xs", + sym_type: 'T', + }, + Filter { + prefix: "iq3xs", + sym_type: 'T', + }, + ], + ); + objcopy_redefine(&objcopy, ggml_lib_name, PREFIX, symbols, &out_path); + + // Modifying the symbols llama depends on from ggml + + let out_str = nm_symbols(&nm, llama_lib_name, &out_path); + let symbols = get_symbols( + &out_str, + [ + Filter { + prefix: "ggml", + sym_type: 'U', + }, + Filter { + prefix: "gguf", + sym_type: 'U', + }, + ], + ); + objcopy_redefine(&objcopy, llama_lib_name, PREFIX, symbols, &out_path); + + if let Some(gpu_lib_name) = additional_lib { + // Modifying the symbols of the GPU library + + let lib_name = if cfg!(target_family = "windows") { + format!("{gpu_lib_name}.lib") + } else if cfg!(target_family = "unix") { + format!("lib{gpu_lib_name}.a") + } else { + println!("cargo:warning=Unknown target family, defaulting to Unix lib names"); + format!("lib{gpu_lib_name}.a") + }; + + let out_str = nm_symbols(&nm, &lib_name, &out_path); + let symbols = get_symbols( + &out_str, + [ + Filter { + prefix: "ggml", + sym_type: 'U', + }, + Filter { + prefix: "ggml", + sym_type: 'T', + }, + ], + ); + objcopy_redefine(&objcopy, &lib_name, PREFIX, symbols, &out_path); + } + } + + /// Returns *GGML*'s and *Llama.cpp*'s compiled library names, based on the operating system. + fn lib_names() -> (&'static str, &'static str) { + let ggml_lib_name; + let llama_lib_name; + if cfg!(target_family = "windows") { + ggml_lib_name = "ggml.lib"; + llama_lib_name = "llama.lib"; + } else if cfg!(target_family = "unix") { + ggml_lib_name = "libggml.a"; + llama_lib_name = "libllama.a"; + } else { + println!("cargo:warning=Unknown target family, defaulting to Unix lib names"); + ggml_lib_name = "libggml.a"; + llama_lib_name = "libllama.a"; + }; + + (ggml_lib_name, llama_lib_name) } - impl Features { - pub fn get_target() -> Self { - let features = crate::get_supported_target_features(); - Self { - fma: features.contains("fma"), - avx: features.contains("avx"), - avx2: features.contains("avx2"), - f16c: features.contains("f16c"), - sse3: features.contains("sse3"), + + /// Returns [`Tool`]s equivalent to [nm][nm] and [objcopy][objcopy]. 
+ /// + /// [nm]: https://www.man7.org/linux/man-pages/man1/nm.1.html + /// [objcopy]: https://www.man7.org/linux/man-pages/man1/objcopy.1.html + fn tools() -> (Tool, Tool) { + let nm_names; + let objcopy_names; + let nm_help; + let objcopy_help; + if cfg!(target_os = "linux") { + nm_names = vec!["nm", "llvm-nm"]; + objcopy_names = vec!["objcopy", "llvm-objcopy"]; + nm_help = vec!["\"nm\" from GNU Binutils", "\"llvm-nm\" from LLVM"]; + objcopy_help = vec![ + "\"objcopy\" from GNU Binutils", + "\"llvm-objcopy\" from LLVM", + ]; + } else if cfg!(any( + target_os = "macos", + target_os = "ios", + target_os = "dragonfly" + )) { + nm_names = vec!["nm", "llvm-nm"]; + objcopy_names = vec!["llvm-objcopy"]; + nm_help = vec!["\"llvm-nm\" from LLVM 17"]; + objcopy_help = vec!["\"llvm-objcopy\" from LLVM 17"]; + } else { + nm_names = vec!["llvm-nm"]; + objcopy_names = vec!["llvm-objcopy"]; + nm_help = vec!["\"llvm-nm\" from LLVM 17"]; + objcopy_help = vec!["\"llvm-objcopy\" from LLVM 17"]; + } + + let nm_env = "NM_PATH"; + println!("cargo:rerun-if-env-changed={nm_env}"); + println!("Looking for \"nm\" or an equivalent tool"); + let nm_name = find_tool(&nm_names, nm_env).unwrap_or_else(move || { + panic_tool_help("nm", nm_env, &nm_help); + unreachable!("The function above should have panicked") + }); + + let objcopy_env = "OBJCOPY_PATH"; + println!("cargo:rerun-if-env-changed={objcopy_env}"); + println!("Looking for \"objcopy\" or an equivalent tool.."); + let objcopy_name = find_tool(&objcopy_names, objcopy_env).unwrap_or_else(move || { + panic_tool_help("objcopy", objcopy_env, &objcopy_help); + unreachable!("The function above should have panicked") + }); + + (nm_name, objcopy_name) + } + + /// A command line tool name present in `PATH` or its full [`Path`]. + enum Tool { + /// The name of a tool present in `PATH`. + Name(&'static str), + + /// The full [`Path`] to a tool. + FullPath(PathBuf), + } + + impl Display for Tool { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Tool::Name(name) => write!(f, "{}", name), + Tool::FullPath(path) => write!(f, "{}", path.display()), + } + } + } + + /// Returns the first [`Tool`] found in the system `PATH`, given a list of tool names, returning + /// the first one found and printing its version. + /// + /// If a value is present in the provided environment variable name, it will get checked + /// instead. + /// + /// ## Panic + /// Returns [`Option::None`] if no [`Tool`] is found. + fn find_tool(names: &[&'static str], env: &str) -> Option { + if let Ok(path_str) = env::var(env) { + let path_str = path_str.trim_matches([' ', '"', '\''].as_slice()); + println!("{env} is set, checking if \"{path_str}\" is a valid tool"); + let path = PathBuf::from(&path_str); + + if !path.is_file() { + panic!("\"{path_str}\" is not a file path.") } + + let output = Command::new(path_str) + .arg("--version") + .output() + .unwrap_or_else(|e| panic!("Failed to run \"{path_str} --version\". 
({e})")); + + if output.status.success() { + let out_str = String::from_utf8_lossy(&output.stdout); + println!("Valid tool found:\n{out_str}"); + } else { + println!("cargo:warning=Tool \"{path_str}\" found, but could not execute \"{path_str} --version\"") + } + + return Some(Tool::FullPath(path)); } + + println!("{env} not set, looking for {names:?} in PATH"); + for name in names { + if let Ok(output) = Command::new(name).arg("--version").output() { + if output.status.success() { + let out_str = String::from_utf8_lossy(&output.stdout); + println!("Valid tool found:\n{out_str}"); + return Some(Tool::Name(name)); + } + } + } + + None + } + + /// Always panics, printing suggestions for finding the specified tool. + fn panic_tool_help(name: &str, env: &str, suggestions: &[&str]) { + let suggestions_str = if suggestions.is_empty() { + String::new() + } else { + let mut suggestions_str = "For your Operating System we recommend:\n".to_string(); + for suggestion in &suggestions[..suggestions.len() - 1] { + suggestions_str.push_str(&format!("{suggestion}\nOR\n")); + } + suggestions_str.push_str(suggestions[suggestions.len() - 1]); + suggestions_str + }; + + panic!("No suitable tool equivalent to \"{name}\" has been found in PATH, if one is already installed, either add its directory to PATH or set {env} to its full path. {suggestions_str}") + } + + /// Executes [nm][nm] or an equivalent tool in portable mode and returns the output. + /// + /// ## Panic + /// Will panic on any errors. + /// + /// [nm]: https://www.man7.org/linux/man-pages/man1/nm.1.html + fn nm_symbols(tool: &Tool, target_lib: &str, out_path: impl AsRef) -> String { + let output = Command::new(tool.to_string()) + .current_dir(&out_path) + .arg(target_lib) + .args(["-p", "-P"]) + .output() + .unwrap_or_else(move |e| panic!("Failed to run \"{tool}\". ({e})")); + + if !output.status.success() { + panic!( + "An error has occurred while acquiring symbols from the compiled library \"{target_lib}\" ({}):\n{}", + output.status, + String::from_utf8_lossy(&output.stderr) + ); + } + + String::from_utf8_lossy(&output.stdout).to_string() + } + + /// Executes [objcopy][objcopy], adding a prefix to the specified symbols of the target library. + /// + /// ## Panic + /// Will panic on any errors. + /// + /// [objcopy]: https://www.man7.org/linux/man-pages/man1/objcopy.1.html + fn objcopy_redefine( + tool: &Tool, + target_lib: &str, + prefix: &str, + symbols: HashSet<&str>, + out_path: impl AsRef, + ) { + let mut cmd = Command::new(tool.to_string()); + cmd.current_dir(&out_path); + for symbol in symbols { + cmd.arg(format!("--redefine-sym={symbol}={prefix}{symbol}")); + } + + let output = cmd + .arg(target_lib) + .output() + .unwrap_or_else(move |e| panic!("Failed to run \"{tool}\". ({e})")); + + if !output.status.success() { + panic!( + "An error has occurred while redefining symbols from library file \"{target_lib}\" ({}):\n{}", + output.status, + String::from_utf8_lossy(&output.stderr) + ); + } + } + + /// A filter for a symbol in a library. + struct Filter<'a> { + prefix: &'a str, + sym_type: char, + } + + /// Turns **`nm`**'s output into an iterator of [`str`] symbols. + /// + /// This function expects **`nm`** to be called using the **`-p`** and **`-P`** flags. 
+ fn get_symbols<'a, const N: usize>( + nm_output: &'a str, + filters: [Filter<'a>; N], + ) -> HashSet<&'a str> { + let iter = nm_output + .lines() + .map(|symbol| { + // Strip irrelevant information + + let mut stripped = symbol; + while stripped.split(' ').count() > 2 { + // SAFETY: We just made sure ' ' is present above + let idx = unsafe { stripped.rfind(' ').unwrap_unchecked() }; + stripped = &stripped[..idx] + } + stripped + }) + .filter(move |symbol| { + // Filter matching symbols + + if symbol.split(' ').count() == 2 { + for filter in &filters { + if symbol.ends_with(filter.sym_type) && symbol.starts_with(filter.prefix) { + return true; + } + } + } + false + }) + .map(|symbol| &symbol[..symbol.len() - 2]); // Strip the type, so only the symbol remains + + // Filter duplicates + HashSet::from_iter(iter) } } diff --git a/llama-cpp-sys-2/src/lib.rs b/llama-cpp-sys-2/src/lib.rs index dbec4ab4..d1ef8abb 100644 --- a/llama-cpp-sys-2/src/lib.rs +++ b/llama-cpp-sys-2/src/lib.rs @@ -4,27 +4,14 @@ #![allow(non_camel_case_types)] #![allow(non_snake_case)] -use std::fmt::{Debug, Formatter}; -include!(concat!(env!("OUT_DIR"), "/bindings.rs")); +// [`ash`] is only included to link to the Vulkan SDK. +#[allow(unused)] +#[cfg(feature = "vulkan")] +use ash; -impl Debug for llama_grammar_element { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - fn type_to_str(r#type: llama_gretype) -> &'static str { - match r#type { - LLAMA_GRETYPE_END => "END", - LLAMA_GRETYPE_ALT => "ALT", - LLAMA_GRETYPE_RULE_REF => "RULE_REF", - LLAMA_GRETYPE_CHAR => "CHAR", - LLAMA_GRETYPE_CHAR_NOT => "CHAR_NOT", - LLAMA_GRETYPE_CHAR_RNG_UPPER => "CHAR_RNG_UPPER", - LLAMA_GRETYPE_CHAR_ALT => "CHAR_ALT", - _ => "Unknown", - } - } +// [`cudarc`] is only included to link to CUDA. 
+#[allow(unused)] +#[cfg(feature = "cuda")] +use cudarc; - f.debug_struct("llama_grammar_element") - .field("type", &type_to_str(self.type_)) - .field("value", &self.value) - .finish() - } -} +include!(concat!(env!("OUT_DIR"), "/bindings.rs")); From 6b0f923b2a0ba5befb07ee6b5fd337f967663b5f Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Tue, 4 Jun 2024 11:18:58 -0700 Subject: [PATCH 2/3] Change cublas to cuda --- Cargo.lock | 5 +++-- README.md | 2 +- embeddings/src/main.rs | 10 +++++----- llama-cpp-2/Cargo.toml | 4 ++-- llama-cpp-2/src/lib.rs | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- llama-cpp-sys-2/README.md | 4 ++-- llama-cpp-sys-2/llama.cpp | 2 +- simple/Cargo.toml | 2 +- simple/src/main.rs | 10 +++++----- test-build.Dockerfile | 4 ++-- 11 files changed, 24 insertions(+), 23 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 303c8736..6af8201e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -494,6 +494,7 @@ version = "0.1.54" dependencies = [ "bindgen", "cc", + "once_cell", ] [[package]] @@ -559,9 +560,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" diff --git a/README.md b/README.md index e39c5eae..a4366bdb 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ git clone --recursive https://github.com/utilityai/llama-cpp-rs cd llama-cpp-rs ``` -Run the simple example (add `--featues cublas` if you have a cuda gpu) +Run the simple example (add `--featues cuda` if you have a cuda gpu) ```bash cargo run --release --bin simple "The way to kill a linux process is" hf-model TheBloke/Llama-2-7B-GGUF llama-2-7b.Q4_K_M.gguf diff --git a/embeddings/src/main.rs b/embeddings/src/main.rs index f0514b1e..5f5b3fa8 100644 --- a/embeddings/src/main.rs +++ b/embeddings/src/main.rs @@ -20,8 +20,8 @@ use llama_cpp_2::ggml_time_us; use llama_cpp_2::llama_backend::LlamaBackend; use llama_cpp_2::llama_batch::LlamaBatch; use llama_cpp_2::model::params::LlamaModelParams; -use llama_cpp_2::model::{AddBos, Special}; use llama_cpp_2::model::LlamaModel; +use llama_cpp_2::model::{AddBos, Special}; #[derive(clap::Parser, Debug, Clone)] struct Args { @@ -35,7 +35,7 @@ struct Args { #[clap(short)] normalise: bool, /// Disable offloading layers to the gpu - #[cfg(feature = "cublas")] + #[cfg(feature = "cuda")] #[clap(long)] disable_gpu: bool, } @@ -78,7 +78,7 @@ fn main() -> Result<()> { model, prompt, normalise, - #[cfg(feature = "cublas")] + #[cfg(feature = "cuda")] disable_gpu, } = Args::parse(); @@ -87,13 +87,13 @@ fn main() -> Result<()> { // offload all layers to the gpu let model_params = { - #[cfg(feature = "cublas")] + #[cfg(feature = "cuda")] if !disable_gpu { LlamaModelParams::default().with_n_gpu_layers(1000) } else { LlamaModelParams::default() } - #[cfg(not(feature = "cublas"))] + #[cfg(not(feature = "cuda"))] LlamaModelParams::default() }; diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 9e66935c..24cd44fa 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -14,7 +14,7 @@ thiserror = { workspace = true } tracing = { workspace = true } [features] -cublas = ["llama-cpp-sys-2/cublas"] +cuda = ["llama-cpp-sys-2/cuda"] metal = ["llama-cpp-sys-2/metal"] sampler = [] @@ -25,4 
+25,4 @@ llama-cpp-sys-2 = { path = "../llama-cpp-sys-2", features=["metal"], version = " workspace = true [package.metadata.docs.rs] -features = ["sampler"] \ No newline at end of file +features = ["sampler"] diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 95384a93..4603bd7c 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -11,7 +11,7 @@ //! //! # Feature Flags //! -//! - `cublas` enables CUDA gpu support. +//! - `cuda` enables CUDA gpu support. //! - `sampler` adds the [`context::sample::sampler`] struct for a more rusty way of sampling. use std::ffi::NulError; use std::fmt::Debug; diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 58c2d541..41d335db 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -51,6 +51,6 @@ cc = { workspace = true, features = ["parallel"] } once_cell = "1.19.0" [features] -cublas = [] +cuda = [] metal = [] diff --git a/llama-cpp-sys-2/README.md b/llama-cpp-sys-2/README.md index 88981c50..69dd4733 100644 --- a/llama-cpp-sys-2/README.md +++ b/llama-cpp-sys-2/README.md @@ -1,5 +1,5 @@ # llama-cpp-sys -Raw bindings to llama.cpp with cublas support. +Raw bindings to llama.cpp with cuda support. -See [llama-cpp-2](https://crates.io/crates/llama-cpp-2) for a safe API. \ No newline at end of file +See [llama-cpp-2](https://crates.io/crates/llama-cpp-2) for a safe API. diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp index 0541f062..917dc8cf 160000 --- a/llama-cpp-sys-2/llama.cpp +++ b/llama-cpp-sys-2/llama.cpp @@ -1 +1 @@ -Subproject commit 0541f06296753dbc59a57379eb54cec865a4c9f9 +Subproject commit 917dc8cfa67a72fb7c8bf7392270da3bf4833af4 diff --git a/simple/Cargo.toml b/simple/Cargo.toml index 4fe31ce8..dae758de 100644 --- a/simple/Cargo.toml +++ b/simple/Cargo.toml @@ -13,7 +13,7 @@ anyhow = { workspace = true } encoding_rs = { workspace = true } [features] -cublas = ["llama-cpp-2/cublas"] +cuda = ["llama-cpp-2/cuda"] metal = ["llama-cpp-2/metal"] [lints] diff --git a/simple/src/main.rs b/simple/src/main.rs index 8dd17b2c..58e09b45 100644 --- a/simple/src/main.rs +++ b/simple/src/main.rs @@ -15,8 +15,8 @@ use llama_cpp_2::llama_backend::LlamaBackend; use llama_cpp_2::llama_batch::LlamaBatch; use llama_cpp_2::model::params::kv_overrides::ParamOverrideValue; use llama_cpp_2::model::params::LlamaModelParams; -use llama_cpp_2::model::{AddBos, Special}; use llama_cpp_2::model::LlamaModel; +use llama_cpp_2::model::{AddBos, Special}; use llama_cpp_2::token::data_array::LlamaTokenDataArray; use std::ffi::CString; use std::io::Write; @@ -44,7 +44,7 @@ struct Args { #[arg(short = 'o', value_parser = parse_key_val)] key_value_overrides: Vec<(String, ParamOverrideValue)>, /// Disable offloading layers to the gpu - #[cfg(feature = "cublas")] + #[cfg(feature = "cuda")] #[clap(long)] disable_gpu: bool, #[arg(short = 's', long, help = "RNG seed (default: 1234)")] @@ -123,7 +123,7 @@ fn main() -> Result<()> { model, prompt, file, - #[cfg(feature = "cublas")] + #[cfg(feature = "cuda")] disable_gpu, key_value_overrides, seed, @@ -137,13 +137,13 @@ fn main() -> Result<()> { // offload all layers to the gpu let model_params = { - #[cfg(feature = "cublas")] + #[cfg(feature = "cuda")] if !disable_gpu { LlamaModelParams::default().with_n_gpu_layers(1000) } else { LlamaModelParams::default() } - #[cfg(not(feature = "cublas"))] + #[cfg(not(feature = "cuda"))] LlamaModelParams::default() }; diff --git a/test-build.Dockerfile b/test-build.Dockerfile index da82218e..8540d2f9 100644 --- 
a/test-build.Dockerfile +++ b/test-build.Dockerfile @@ -8,10 +8,10 @@ RUN curl https://sh.rustup.rs -sSf | bash -s -- -y ENV PATH=/root/.cargo/bin:$PATH COPY . . -RUN cargo build --bin simple --features cublas +RUN cargo build --bin simple --features cuda FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} as base-cuda-runtime COPY --from=base-cuda /target/debug/simple /usr/local/bin/simple -ENTRYPOINT ["/usr/local/bin/simple"] \ No newline at end of file +ENTRYPOINT ["/usr/local/bin/simple"] From 09c7b612cf2945e47f27c6f57b6f005eac49000e Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Tue, 4 Jun 2024 19:00:15 -0700 Subject: [PATCH 3/3] Update build script to remove cudarc --- llama-cpp-sys-2/build.rs | 29 +++++++++++++++++++++-------- llama-cpp-sys-2/src/lib.rs | 10 ---------- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 81e8029a..7da89866 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -413,8 +413,6 @@ fn compile_hipblas(cx: &mut Build, cxx: &mut Build, mut hip: Build) -> &'static fn compile_cuda(cx: &mut Build, cxx: &mut Build, featless_cxx: Build) -> &'static str { println!("Compiling CUDA GGML.."); - // CUDA gets linked through the cudarc crate. - cx.define("GGML_USE_CUDA", None); cxx.define("GGML_USE_CUDA", None); @@ -425,12 +423,27 @@ fn compile_cuda(cx: &mut Build, cxx: &mut Build, featless_cxx: Build) -> &'stati .define("K_QUANTS_PER_ITERATION", Some("2")) .define("GGML_CUDA_PEER_MAX_BATCH_SIZE", Some("128")); - if cfg!(target_os = "linux") { - nvcc.flag("-Wno-pedantic"); - // TODO Are these links needed? - println!("cargo:rustc-link-lib=pthread"); - println!("cargo:rustc-link-lib=dl"); - println!("cargo:rustc-link-lib=rt"); + // if cfg!(target_os = "linux") { + // nvcc.flag("-Wno-pedantic"); + // } + + for lib in [ + "cuda", "cublas", "culibos", "cudart", "cublasLt", "pthread", "dl", "rt", + ] { + println!("cargo:rustc-link-lib={}", lib); + } + if !nvcc.get_compiler().is_like_msvc() { + for lib in ["culibos", "pthread", "dl", "rt"] { + println!("cargo:rustc-link-lib={}", lib); + } + } + + println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); + + if nvcc.get_compiler().is_like_msvc() { + nvcc.std("c++14"); + } else { + nvcc.flag("-std=c++11").std("c++11"); } if cfg!(feature = "cuda_dmmv") { diff --git a/llama-cpp-sys-2/src/lib.rs b/llama-cpp-sys-2/src/lib.rs index d1ef8abb..f91bb3a7 100644 --- a/llama-cpp-sys-2/src/lib.rs +++ b/llama-cpp-sys-2/src/lib.rs @@ -4,14 +4,4 @@ #![allow(non_camel_case_types)] #![allow(non_snake_case)] -// [`ash`] is only included to link to the Vulkan SDK. -#[allow(unused)] -#[cfg(feature = "vulkan")] -use ash; - -// [`cudarc`] is only included to link to CUDA. -#[allow(unused)] -#[cfg(feature = "cuda")] -use cudarc; - include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
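
Usage note for the series: after these patches the GPU feature flag is `cuda` (previously `cublas`), and the CUDA libraries (`cuda`, `cublas`, `cudart`, `cublasLt`, and on non-MSVC toolchains `culibos`, `pthread`, `dl`, `rt`) are linked directly from `build.rs` via `cargo:rustc-link-lib` directives with `/usr/local/cuda/lib64` on the native search path, rather than through the `cudarc` crate. A downstream binary is built with e.g. `cargo build --release --bin simple --features cuda`. The sketch below is a minimal illustration of how such a binary can gate GPU offload on the renamed feature; it mirrors the `simple` and `embeddings` examples touched above, and the layer count of 1000 simply means "offload all layers" rather than being required by the patch.

```rust
use llama_cpp_2::model::params::LlamaModelParams;

fn main() {
    // Offload all layers to the GPU when built with `--features cuda`,
    // otherwise fall back to a CPU-only configuration.
    let model_params = {
        #[cfg(feature = "cuda")]
        {
            LlamaModelParams::default().with_n_gpu_layers(1000)
        }
        #[cfg(not(feature = "cuda"))]
        LlamaModelParams::default()
    };

    // ... pass `model_params` to the model-loading call, as the examples above do ...
    let _ = model_params;
}
```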