main.rs (forked from EricLBuehler/mistral.rs)

use either::Either;
use indexmap::IndexMap;
use std::{fs::File, sync::Arc};
use tokio::sync::mpsc::channel;

use mistralrs::{
    Constraint, DefaultSchedulerMethod, Device, DeviceMapMetadata, MistralRs, MistralRsBuilder,
    ModelDType, NormalLoaderBuilder, NormalLoaderType, NormalRequest, NormalSpecificConfig,
    Request, RequestMessage, Response, Result, SamplingParams, SchedulerConfig, TokenSource,
};
/// Gets the best device: Metal if compiled with the `metal` feature,
/// otherwise CUDA if available (and compiled in), otherwise CPU.
pub(crate) fn best_device() -> Result<Device> {
    #[cfg(not(feature = "metal"))]
    {
        Device::cuda_if_available(0)
    }
    #[cfg(feature = "metal")]
    {
        Device::new_metal(0)
    }
}
fn setup() -> anyhow::Result<Arc<MistralRs>> {
    // Select a Mistral-architecture model and attach the LoRA adapters from lamm-mit/x-lora
    let loader = NormalLoaderBuilder::new(
        NormalSpecificConfig {
            use_flash_attn: false,
            repeat_last_n: 64,
        },
        None,
        None,
        None, // Base model id will be detected from the ordering file
    )
    .with_lora(
        "lamm-mit/x-lora".to_string(),
        serde_json::from_reader(File::open("my-ordering-file.json").unwrap_or_else(|_| {
            panic!("Could not load ordering file at my-ordering-file.json")
        }))?,
    )
    .build(NormalLoaderType::Mistral);

    // Load the model from the Hugging Face Hub into a Pipeline
    let pipeline = loader.load_model_from_hf(
        None,
        TokenSource::CacheToken,
        &ModelDType::Auto,
        &best_device()?,
        false,
        DeviceMapMetadata::dummy(),
        None,
        None, // No PagedAttention.
    )?;

    // Create the MistralRs runner; the fixed scheduler runs at most 5 sequences concurrently
    Ok(MistralRsBuilder::new(
        pipeline,
        SchedulerConfig::DefaultScheduler {
            method: DefaultSchedulerMethod::Fixed(5.try_into().unwrap()),
        },
    )
    .build())
}
fn main() -> anyhow::Result<()> {
    let mistralrs = setup()?;

    // Channel on which the runner will send its response back to us
    let (tx, mut rx) = channel(10_000);
    let request = Request::Normal(NormalRequest {
        messages: RequestMessage::Chat(vec![IndexMap::from([
            ("role".to_string(), Either::Left("user".to_string())),
            ("content".to_string(), Either::Left("Hello!".to_string())),
        ])]),
        sampling_params: SamplingParams::default(),
        response: tx,
        return_logprobs: false,
        is_streaming: false,
        id: 0,
        constraint: Constraint::None,
        suffix: None,
        adapters: None,
    });

    // Example: make adapter_3 the active adapter before sending the chat request
    mistralrs
        .get_sender()?
        .blocking_send(Request::ActivateAdapters(vec!["adapter_3".to_string()]))?;
    mistralrs.get_sender()?.blocking_send(request)?;

    // Blocks until the runner sends back the single (non-streamed) response
    let response = rx.blocking_recv().unwrap();
    match response {
        Response::Done(c) => println!(
            "Text: {}, Prompt T/s: {}, Completion T/s: {}",
            c.choices[0].message.content,
            c.usage.avg_prompt_tok_per_sec,
            c.usage.avg_compl_tok_per_sec
        ),
        Response::InternalError(e) => panic!("Internal error: {e}"),
        Response::ValidationError(e) => panic!("Validation error: {e}"),
        Response::ModelError(e, c) => panic!(
            "Model error: {e}. Response: Text: {}, Prompt T/s: {}, Completion T/s: {}",
            c.choices[0].message.content,
            c.usage.avg_prompt_tok_per_sec,
            c.usage.avg_compl_tok_per_sec
        ),
        _ => unreachable!(),
    }
    Ok(())
}
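
Since lamm-mit/x-lora is an X-LoRA adapter set, it may be worth noting that the same builder can load it as X-LoRA rather than plain LoRA. The following is a minimal sketch, not part of this file: it assumes a with_xlora builder method from the same mistral.rs API vintage, and the base model id (HuggingFaceH4/zephyr-7b-beta) as well as the meanings of the two trailing arguments are assumptions, not confirmed by this example.

    let loader = NormalLoaderBuilder::new(
        NormalSpecificConfig {
            use_flash_attn: false,
            repeat_last_n: 64,
        },
        None,
        None,
        // Assumed: an explicit base model id (X-LoRA needs the base weights)
        Some("HuggingFaceH4/zephyr-7b-beta".to_string()),
    )
    .with_xlora(
        "lamm-mit/x-lora".to_string(),
        serde_json::from_reader(File::open("my-ordering-file.json")?)?,
        false, // assumed parameter: no_kv_cache
        None,  // assumed parameter: tgt_non_granular_index
    )
    .build(NormalLoaderType::Mistral);

The rest of setup() and main() would stay the same, though with X-LoRA the adapter mix is computed per token by the scaling head, so the Request::ActivateAdapters call would likely not apply.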