diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml
index ff4144fe..0c416061 100644
--- a/llama-cpp-2/Cargo.toml
+++ b/llama-cpp-2/Cargo.toml
@@ -15,8 +15,12 @@ tracing = { workspace = true }
 
 [features]
 cublas = ["llama-cpp-sys-2/cublas"]
+metal = ["llama-cpp-sys-2/metal"]
 sampler = []
 
+[target.'cfg(all(target_os = "macos", any(target_arch = "aarch64", target_arch = "arm64")))'.dependencies]
+llama-cpp-sys-2 = { path = "../llama-cpp-sys-2", features=["metal"], version = "0.1.48" }
+
 [lints]
 workspace = true
diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml
index bf5b1afe..e940c172 100644
--- a/llama-cpp-sys-2/Cargo.toml
+++ b/llama-cpp-sys-2/Cargo.toml
@@ -46,4 +46,5 @@ cc = { workspace = true, features = ["parallel"] }
 
 [features]
 cublas = []
+metal = []
diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs
index a7dcdc5c..dd2dfe39 100644
--- a/llama-cpp-sys-2/build.rs
+++ b/llama-cpp-sys-2/build.rs
@@ -23,6 +23,18 @@ fn main() {
     ggml.cpp(false);
     llama_cpp.cpp(true);
 
+    // CMakeLists.txt: set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeline parallelism")
+    // get LLAMA_SCHED_MAX_COPIES from env, default to 4
+    let mut max_copies = "4".to_owned();
+    if let Ok(env_max_copies) = env::var("LLAMA_SCHED_MAX_COPIES") {
+        if let Ok(v) = env_max_copies.parse::<u32>() {
+            if v > 0 {
+                max_copies = env_max_copies;
+            }
+        }
+    }
+    ggml.define("GGML_SCHED_MAX_COPIES", Some(max_copies.as_str()));
+
     // https://github.com/ggerganov/llama.cpp/blob/a836c8f534ab789b02da149fbdaf7735500bff74/Makefile#L364-L368
     if let Some(ggml_cuda) = &mut ggml_cuda {
         for lib in [
@@ -118,22 +130,30 @@ fn main() {
     if cfg!(target_os = "macos") {
         assert!(!cublas_enabled, "CUBLAS is not supported on macOS");
 
-        println!("cargo:rustc-link-lib=framework=Metal");
+        let metal_enabled = env::var("CARGO_FEATURE_METAL").is_ok();
+
         println!("cargo:rustc-link-lib=framework=Foundation");
-        println!("cargo:rustc-link-lib=framework=MetalPerformanceShaders");
-        println!("cargo:rustc-link-lib=framework=MetalKit");
+        if metal_enabled {
+            println!("cargo:rustc-link-lib=framework=Metal");
+            println!("cargo:rustc-link-lib=framework=MetalPerformanceShaders");
+            println!("cargo:rustc-link-lib=framework=MetalKit");
+        }
 
         llama_cpp.define("_DARWIN_C_SOURCE", None);
 
         // https://github.com/ggerganov/llama.cpp/blob/3c0d25c4756742ebf15ad44700fabc0700c638bd/Makefile#L340-L343
-        llama_cpp.define("GGML_USE_METAL", None);
+        if metal_enabled {
+            llama_cpp.define("GGML_USE_METAL", None);
+        }
         llama_cpp.define("GGML_USE_ACCELERATE", None);
         llama_cpp.define("ACCELERATE_NEW_LAPACK", None);
         llama_cpp.define("ACCELERATE_LAPACK_ILP64", None);
         println!("cargo:rustc-link-lib=framework=Accelerate");
 
-        metal_hack(&mut ggml);
-        ggml.include("./llama.cpp/ggml-metal.h");
+        if metal_enabled {
+            metal_hack(&mut ggml);
+            ggml.include("./llama.cpp/ggml-metal.h");
+        }
     }
 
     if cfg!(target_os = "dragonfly") {
@@ -167,6 +187,9 @@ fn main() {
         if let Some(cuda) = ggml_cuda.as_mut() {
             cuda.define("NDEBUG", None);
         }
+
+        ggml.opt_level(3);
+        llama_cpp.opt_level(3);
     }
 
     if let Some(ggml_cuda) = ggml_cuda {
diff --git a/simple/Cargo.toml b/simple/Cargo.toml
index 0dad7266..c6e76d8d 100644
--- a/simple/Cargo.toml
+++ b/simple/Cargo.toml
@@ -14,6 +14,7 @@ encoding_rs = { workspace = true }
 
 [features]
 cublas = ["llama-cpp-2/cublas"]
+metal = ["llama-cpp-2/metal"]
 
 [lints]
 workspace = true
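
A note on usage, as a sketch rather than part of the patch: the metal cargo feature added above propagates from a consuming crate through llama-cpp-2 to llama-cpp-sys-2, where build.rs defines GGML_USE_METAL and links the Metal frameworks. The helper below is hypothetical (its name and the layer counts are assumptions, not from this diff); it only illustrates gating runtime behavior on the feature, which a consumer would forward in its own Cargo.toml exactly as simple/Cargo.toml does above.

    // Hypothetical consumer code, assuming the crate declares
    // `metal = ["llama-cpp-2/metal"]` in its own [features] table.

    /// Pick a GPU layer-offload count appropriate for the compiled backend.
    fn default_gpu_layers() -> u32 {
        if cfg!(feature = "metal") {
            1_000 // Metal build: effectively offload all layers to the Apple GPU
        } else {
            0 // CPU/Accelerate-only build: keep every layer on the CPU
        }
    }

    fn main() {
        println!("offloading {} layers", default_gpu_layers());
    }

Building with the feature, optionally overriding the new scheduler knob, looks like LLAMA_SCHED_MAX_COPIES=8 cargo build --features metal; a value that does not parse as a positive integer is ignored and build.rs falls back to 4.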