-
-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Aisuko <[email protected]>
- Loading branch information
Showing
4 changed files
with
156 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
use crate::models::{Llama, LlamaConfig}; | ||
|
||
use burn::{ | ||
backend::tch::{TchBackend, TchDevice}, | ||
config::Config, | ||
module::Module, | ||
tensor::backend::Backend, | ||
}; | ||
|
||
use burn::record::{BinFileRecorder, HalfPrecisionSettings, Recorder, RecorderError}; | ||
|
||
use super::Loader; | ||
|
||
pub struct Convertion {} | ||
|
||
impl Convertion { | ||
pub fn convert_llama_dump_to_model<B: Backend>( | ||
dump_path: &str, | ||
model_name: &str, | ||
device: &B::Device, | ||
) -> Result<(), Box<dyn std::error::Error>> { | ||
let (llama, llama_conifg): (Llama<B>, LlamaConfig) = | ||
Loader::load_llama_dmp(dump_path, device)?; | ||
Convertion::save_llama_model_file(llama, model_name)?; | ||
llama_conifg.save(&format!("{model_name}.cfg"))?; | ||
Ok(()) | ||
} | ||
|
||
pub fn save_llama_model_file<B: Backend>( | ||
llama: Llama<B>, | ||
name: &str, | ||
) -> Result<(), RecorderError> { | ||
BinFileRecorder::<HalfPrecisionSettings>::new().record(llama.into_record(), name.into()) | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
|
||
use super::*; | ||
|
||
#[test] | ||
fn test_convertion() { | ||
type Backend = TchBackend<f32>; | ||
let device = TchDevice::Mps; | ||
|
||
// get home env | ||
let home = std::env::var("HOME").unwrap(); | ||
|
||
let dump_path = &format!("{}/Downloads/workspace/llama/tokenizer.model", home); | ||
let model_name = "llama2-7b-chat"; | ||
|
||
let option = | ||
Convertion::convert_llama_dump_to_model::<Backend>(dump_path, model_name, &device); | ||
|
||
match option { | ||
Ok(_) => println!("ok"), | ||
Err(e) => println!("error: {}", e), | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,7 @@ | ||
pub mod convert; | ||
pub mod loader; | ||
pub mod tokenizer; | ||
|
||
pub use convert::*; | ||
pub use loader::*; | ||
pub use tokenizer::*; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
//! LLM tokenization tools crate. | ||
//! Adapted from the https://github.com/Gadersd/llama2-burn/blob/main/src/token.rs | ||
//! Adapted by Aisuko | ||
use rust_tokenizers::{ | ||
error::TokenizerError, | ||
tokenizer::{SentencePieceBpeTokenizer, Tokenizer, TruncationStrategy}, | ||
vocab::Vocab, | ||
}; | ||
|
||
use std::{result, vec}; | ||
|
||
const BOS_TOKEN_ID: i64 = 1; | ||
const EOS_TOKEN_ID: i64 = 2; | ||
|
||
pub type Result<T> = result::Result<T, TokenizerError>; | ||
|
||
pub struct LlamaTokenizer { | ||
spm: SentencePieceBpeTokenizer, | ||
} | ||
|
||
impl LlamaTokenizer { | ||
pub fn new(tokenizer_path: &str) -> Result<Self> { | ||
let lower_case = false; | ||
SentencePieceBpeTokenizer::from_file(tokenizer_path, lower_case).map(|spm| Self { spm }) | ||
} | ||
|
||
pub fn encode(&self, text: &str, inlcude_bos: bool, include_eos: bool) -> Vec<i64> { | ||
let pre = if inlcude_bos { | ||
vec![BOS_TOKEN_ID] | ||
} else { | ||
vec![] | ||
}; | ||
|
||
let post = if include_eos { | ||
vec![EOS_TOKEN_ID] | ||
} else { | ||
vec![] | ||
}; | ||
|
||
let token_ids = self | ||
.spm | ||
.encode( | ||
text, | ||
None, | ||
std::usize::MAX, | ||
&TruncationStrategy::LongestFirst, | ||
0, | ||
) | ||
.token_ids; | ||
|
||
[pre, token_ids, post] | ||
.into_iter() | ||
.flat_map(|v| v.into_iter()) | ||
.collect() | ||
} | ||
|
||
pub fn decode(&self, tokens: &[i64], skip_special_tokens: bool) -> String { | ||
let clean_spaces = false; | ||
self.spm.decode(tokens, skip_special_tokens, clean_spaces) | ||
} | ||
|
||
pub fn vocab_size(&self, include_special_tokens: bool) -> usize { | ||
let vocab = self.spm.vocab(); | ||
if include_special_tokens { | ||
vocab.values().len() + vocab.special_values().len() | ||
} else { | ||
vocab.values().len() | ||
} | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
use super::*; | ||
|
||
#[test] | ||
fn test_tokenizer() { | ||
let home = std::env::var("HOME").unwrap(); | ||
let tm_path = &format!("{}/Downloads/workspace/llama/tokenizer.model", home); | ||
let tokenizer = LlamaTokenizer::new(tm_path).unwrap(); | ||
// tokenizer.vocab_size(fale) should be >0 | ||
assert!(tokenizer.vocab_size(false) > 0); | ||
|
||
let test_str = "Hello, I am Llama2!"; | ||
let encoded = tokenizer.encode(test_str, true, true); | ||
let decoded = tokenizer.decode(&encoded, false); | ||
|
||
assert_eq!(test_str, decoded); | ||
} | ||
} |