Added sample notebook running vllm
matthewcoole committed Oct 9, 2024
1 parent 924c59f commit cf77ce8
Showing 2 changed files with 154 additions and 1 deletion.
153 changes: 153 additions & 0 deletions notebooks/vllm_test.ipynb
@@ -0,0 +1,153 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from vllm import LLM, SamplingParams"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"prompts = [\n",
" \"Tell me a joke.\"\n",
"]\n",
"params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1024)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"HF_TOKEN\"] = \"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\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING 09-27 11:20:50 config.py:319] bitsandbytes quantization is not fully optimized yet. The speed can be slower than non-quantized models.\n",
"INFO 09-27 11:20:50 llm_engine.py:226] Initializing an LLM engine (v0.6.1.dev238+ge2c6e0a82) with config: model='unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit', speculative_config=None, tokenizer='unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.BITSANDBYTES, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=bitsandbytes, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit, use_v2_block_manager=False, num_scheduler_steps=1, multi_step_stream_outputs=False, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=False, mm_processor_kwargs=None)\n",
"INFO 09-27 11:20:51 model_runner.py:1014] Starting to load model unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit...\n",
"INFO 09-27 11:20:51 loader.py:1014] Loading weights with BitsAndBytes quantization. May take a while ...\n",
"INFO 09-27 11:20:51 weight_utils.py:242] Using model weights format ['*.safetensors']\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "1591f28b46054d24890b33e117b5ddc4",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00<?, ?it/s]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "adb6870fac6f48c08b3af57649a2fe68",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00<?, ?it/s]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 09-27 11:20:55 model_runner.py:1025] Loading model weights took 8.0501 GB\n",
"INFO 09-27 11:20:57 gpu_executor.py:122] # GPU blocks: 382, # CPU blocks: 1638\n",
"INFO 09-27 11:21:01 model_runner.py:1329] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.\n",
"INFO 09-27 11:21:01 model_runner.py:1333] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n",
"INFO 09-27 11:21:23 model_runner.py:1456] Graph capturing finished in 22 secs.\n"
]
}
],
"source": [
"llm = LLM(model=\"unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit\", quantization=\"bitsandbytes\", load_format=\"bitsandbytes\", max_model_len=4096)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 1.31it/s, est. speed input: 7.88 toks/s, output: 36.77 toks/s]\n"
]
}
],
"source": [
"outputs = llm.generate(prompts, params)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[RequestOutput(request_id=0, prompt='Tell me a joke.', prompt_token_ids=[1, 69839, 1639, 1261, 53052, 1046], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text=' I’m not good at jokes, but I’ll try my best.\\n\\nWhat do you call a fake noodle? An impasta.', token_ids=(1362, 6135, 1605, 3683, 1513, 88916, 1044, 1809, 1362, 7372, 3352, 2036, 3560, 1338, 7493, 1653, 1636, 3690, 1261, 36840, 96572, 1282, 1063, 2048, 3918, 5693, 1046, 2), cumulative_logprob=None, logprobs=None, finish_reason=stop, stop_reason=None)], finished=True, metrics=RequestMetrics(arrival_time=1727432483.3084419, last_token_time=1727432483.3084419, first_scheduled_time=1727432483.311076, first_token_time=1727432483.4521985, time_in_queue=0.0026340484619140625, finished_time=1727432484.0507092, scheduler_time=0.001698089001365588, model_forward_time=None, model_execute_time=None), lora_request=None)]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"outputs"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
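
The notebook above loads unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit with bitsandbytes quantization, generates one completion, and then displays the raw RequestOutput objects. A minimal sketch of the same flow as a plain script, printing only the generated text, follows; the model name and sampling settings are taken from the notebook, while reading HF_TOKEN from the environment rather than hardcoding it is an assumption added here.

from vllm import LLM, SamplingParams

# HF_TOKEN is assumed to already be set in the environment; it is only
# needed if the model repository is gated or private.
llm = LLM(
    model="unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    quantization="bitsandbytes",   # the -bnb-4bit checkpoint ships pre-quantized bitsandbytes weights
    load_format="bitsandbytes",
    max_model_len=4096,            # cap the context length so the KV cache fits in GPU memory
)

params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1024)
outputs = llm.generate(["Tell me a joke."], params)

# llm.generate returns one RequestOutput per prompt; each holds a list of
# CompletionOutput objects whose .text field is the generated string.
for request_output in outputs:
    print(request_output.outputs[0].text)

If GPU memory is tight, the log captured in the notebook points at two knobs that are both keyword arguments to LLM(): enforce_eager=True to skip CUDA graph capture, and a lower gpu_memory_utilization.
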
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@ dependencies = [
"vllm",
"bitsandbytes",
"haystack-ai",
"vllm-haystack",
"accelerate",
]

[project.optional-dependencies]
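
As a quick, optional sanity check (not part of the commit), the updated dependency set can be verified from Python. The mapping from the haystack-ai distribution to the haystack import name is an assumption; the other packages import under their own names.

import importlib.util

# Distribution -> import name assumed here: vllm -> vllm, bitsandbytes -> bitsandbytes,
# haystack-ai -> haystack, accelerate -> accelerate.
for module_name in ("vllm", "bitsandbytes", "haystack", "accelerate"):
    status = "ok" if importlib.util.find_spec(module_name) else "missing"
    print(f"{module_name}: {status}")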
