utilityai · MarcusDunn · Mar 26, 2024 · Mar 29, 2024 · Mar 29, 2024 · Mar 29, 2024
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -16,9 +16,9 @@ hf-hub = { version = "0.3.2" }
 criterion = "0.5.1"
 pprof = "0.13.0"
 bindgen = "0.69.4"
-cc = "1.0.90"
 anyhow = "1.0.81"
 clap = "4.5.3"
+cmake = "0.1.50"
 
 [workspace.lints.rust]
 missing_docs = { level = "warn" }

diff --git a/embeddings/src/main.rs b/embeddings/src/main.rs
@@ -20,7 +20,7 @@ use llama_cpp_2::ggml_time_us;
 use llama_cpp_2::llama_backend::LlamaBackend;
 use llama_cpp_2::llama_batch::LlamaBatch;
 use llama_cpp_2::model::params::LlamaModelParams;
-use llama_cpp_2::model::AddBos;
+use llama_cpp_2::model::{AddBos, Special};
 use llama_cpp_2::model::LlamaModel;
 
 #[derive(clap::Parser, Debug, Clone)]
@@ -137,7 +137,7 @@ fn main() -> Result<()> {
     for (i, token_line) in tokens_lines_list.iter().enumerate() {
         eprintln!("Prompt {i}");
         for token in token_line {
-            eprintln!(" {} --> {}", token, model.token_to_str(*token)?);
+            eprintln!(" {} --> {}", token, model.token_to_str(*token, Special::Tokenize)?);
         }
         eprintln!();
     }

diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml
@@ -15,6 +15,7 @@ tracing = { workspace = true }
 
 [features]
 cublas = ["llama-cpp-sys-2/cublas"]
+vulkan = ["llama-cpp-sys-2/vulkan"]
 sampler = []
 
 [lints]

diff --git a/llama-cpp-2/src/context/sample/sampler.rs b/llama-cpp-2/src/context/sample/sampler.rs
@@ -3,7 +3,7 @@
 //! like [`crate::context::LlamaContext`] or token history to the sampler.
 //!
 //! # Example
-//! 
+//!
 //! **Llama.cpp default sampler**
 //!
 //! ```rust

diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs
@@ -242,3 +242,13 @@ pub fn ggml_time_us() -> i64 {
 pub fn llama_supports_mlock() -> bool {
     unsafe { llama_cpp_sys_2::llama_supports_mlock() }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn smoke_test() {
+        ggml_time_us(); // smoke test: the result is discarded, this only checks that the call completes
+    }
+}
diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs
@@ -34,6 +34,15 @@ pub enum AddBos {
     Never,
 }
 
+/// Whether special and/or control tokens are handled as special tokens or as plaintext.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Special {
+    /// Allow tokenizing special and/or control tokens, which are otherwise not exposed and are treated as plaintext (passed to `llama_token_to_piece` as `true`). Does not insert a leading space.
+    Tokenize,
+    /// Treat special and/or control tokens as plaintext (passed to `llama_token_to_piece` as `false`).
+    Plaintext,
+}
+
 unsafe impl Send for LlamaModel {}
 
 unsafe impl Sync for LlamaModel {}
@@ -54,10 +63,11 @@ impl LlamaModel {
     /// Get all tokens in the model.
     pub fn tokens(
         &self,
+        special: Special,
     ) -> impl Iterator<Item = (LlamaToken, Result<String, TokenToStringError>)> + '_ {
         (0..self.n_vocab())
             .map(LlamaToken::new)
-            .map(|llama_token| (llama_token, self.token_to_str(llama_token)))
+            .map(move |llama_token| (llama_token, self.token_to_str(llama_token, special)))
     }
 
     /// Get the beginning of stream token.
@@ -86,18 +96,18 @@ impl LlamaModel {
     /// # Errors
     ///
     /// See [`TokenToStringError`] for more information.
-    pub fn token_to_str(&self, token: LlamaToken) -> Result<String, TokenToStringError> {
-        self.token_to_str_with_size(token, 32)
+    pub fn token_to_str(&self, token: LlamaToken, special: Special) -> Result<String, TokenToStringError> {
+        self.token_to_str_with_size(token, 32, special)
     }
 
     /// Convert a vector of tokens to a single string.
     ///
     /// # Errors
     ///
     /// See [`TokenToStringError`] for more information.
-    pub fn tokens_to_str(&self, tokens: &[LlamaToken]) -> Result<String, TokenToStringError> {
+    pub fn tokens_to_str(&self, tokens: &[LlamaToken], special: Special) -> Result<String, TokenToStringError> {
         let mut builder = String::with_capacity(tokens.len() * 4);
-        for str in tokens.iter().copied().map(|t| self.token_to_str(t)) {
+        for str in tokens.iter().copied().map(|t| self.token_to_str(t, special)) {
             builder += &str?;
         }
         Ok(builder)
@@ -210,11 +220,13 @@ impl LlamaModel {
         &self,
         token: LlamaToken,
         buffer_size: usize,
+        special: Special,
     ) -> Result<String, TokenToStringError> {
         if token == self.token_nl() {
             return Ok(String::from("\n"));
         }
 
+        // TODO: unclear how this token-type check should interact with the `special` argument.
         match self.token_type(token) {
             LlamaTokenType::Normal | LlamaTokenType::UserDefined => {}
             LlamaTokenType::Control => {
@@ -230,12 +242,17 @@ impl LlamaModel {
             }
         }
 
+        let special = match special {
+            Special::Tokenize => true,
+            Special::Plaintext => false,
+        };
+
         let string = CString::new(vec![b'*'; buffer_size]).expect("no null");
         let len = string.as_bytes().len();
         let len = c_int::try_from(len).expect("length fits into c_int");
         let buf = string.into_raw();
         let size = unsafe {
-            llama_cpp_sys_2::llama_token_to_piece(self.model.as_ptr(), token.0, buf, len)
+            llama_cpp_sys_2::llama_token_to_piece(self.model.as_ptr(), token.0, buf, len, special)
         };
 
         match size {
@@ -280,17 +297,16 @@ impl LlamaModel {
     /// Get chat template from model.
     ///
     /// # Errors
-    /// 
+    ///
     /// * If the model has no chat template
     /// * If the chat template is not a valid [`CString`].
     #[allow(clippy::missing_panics_doc)] // we statically know this will not panic as
     pub fn get_chat_template(&self, buf_size: usize) -> Result<String, ChatTemplateError> {
-
         // longest known template is about 1200 bytes from llama.cpp
         let chat_temp = CString::new(vec![b'*'; buf_size]).expect("no null");
         let chat_ptr = chat_temp.into_raw();
         let chat_name = CString::new("tokenizer.chat_template").expect("no null bytes");
-        
+
         let chat_template: String = unsafe {
             let ret = llama_cpp_sys_2::llama_model_meta_val_str(
                 self.model.as_ptr(),
@@ -305,7 +321,7 @@ impl LlamaModel {
             debug_assert_eq!(usize::try_from(ret).unwrap(), template.len(), "llama.cpp guarantees that the returned int {ret} is the length of the string {} but that was not the case", template.len());
             template
         };
-        
+
         Ok(chat_template)
     }
 

diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml
@@ -42,8 +42,9 @@ include = [
 
 [build-dependencies]
 bindgen = { workspace = true }
-cc = { workspace = true, features = ["parallel"] }
+cmake = { workspace = true }
 
 [features]
 cublas = []
+vulkan = []