Added sample notebook running vllm
matthewcoole committed Oct 9, 2024
1 parent 924c59f commit cf77ce8
Showing 2 changed files with 154 additions and 1 deletion.
153 changes: 153 additions & 0 deletions notebooks/vllm_test.ipynb
@@ -0,0 +1,153 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from vllm import LLM, SamplingParams"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"prompts = [\n",
" \"Tell me a joke.\"\n",
"]\n",
"params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1024)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"HF_TOKEN\"] = \"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\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING 09-27 11:20:50 config.py:319] bitsandbytes quantization is not fully optimized yet. The speed can be slower than non-quantized models.\n",
"INFO 09-27 11:20:50 llm_engine.py:226] Initializing an LLM engine (v0.6.1.dev238+ge2c6e0a82) with config: model='unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit', speculative_config=None, tokenizer='unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.BITSANDBYTES, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=bitsandbytes, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit, use_v2_block_manager=False, num_scheduler_steps=1, multi_step_stream_outputs=False, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=False, mm_processor_kwargs=None)\n",
"INFO 09-27 11:20:51 model_runner.py:1014] Starting to load model unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit...\n",
"INFO 09-27 11:20:51 loader.py:1014] Loading weights with BitsAndBytes quantization. May take a while ...\n",
"INFO 09-27 11:20:51 weight_utils.py:242] Using model weights format ['*.safetensors']\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "1591f28b46054d24890b33e117b5ddc4",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00<?, ?it/s]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "adb6870fac6f48c08b3af57649a2fe68",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00<?, ?it/s]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 09-27 11:20:55 model_runner.py:1025] Loading model weights took 8.0501 GB\n",
"INFO 09-27 11:20:57 gpu_executor.py:122] # GPU blocks: 382, # CPU blocks: 1638\n",
"INFO 09-27 11:21:01 model_runner.py:1329] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.\n",
"INFO 09-27 11:21:01 model_runner.py:1333] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n",
"INFO 09-27 11:21:23 model_runner.py:1456] Graph capturing finished in 22 secs.\n"
]
}
],
"source": [
"llm = LLM(model=\"unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit\", quantization=\"bitsandbytes\", load_format=\"bitsandbytes\", max_model_len=4096)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 1.31it/s, est. speed input: 7.88 toks/s, output: 36.77 toks/s]\n"
]
}
],
"source": [
"outputs = llm.generate(prompts, params)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[RequestOutput(request_id=0, prompt='Tell me a joke.', prompt_token_ids=[1, 69839, 1639, 1261, 53052, 1046], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text=' I’m not good at jokes, but I’ll try my best.\\n\\nWhat do you call a fake noodle? An impasta.', token_ids=(1362, 6135, 1605, 3683, 1513, 88916, 1044, 1809, 1362, 7372, 3352, 2036, 3560, 1338, 7493, 1653, 1636, 3690, 1261, 36840, 96572, 1282, 1063, 2048, 3918, 5693, 1046, 2), cumulative_logprob=None, logprobs=None, finish_reason=stop, stop_reason=None)], finished=True, metrics=RequestMetrics(arrival_time=1727432483.3084419, last_token_time=1727432483.3084419, first_scheduled_time=1727432483.311076, first_token_time=1727432483.4521985, time_in_queue=0.0026340484619140625, finished_time=1727432484.0507092, scheduler_time=0.001698089001365588, model_forward_time=None, model_execute_time=None), lora_request=None)]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"outputs"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
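
The notebook above loads unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit with bitsandbytes quantization, generates one completion, and then displays the raw RequestOutput objects. A minimal sketch of the same flow as a plain script, printing only the generated text, follows; the model name and sampling settings are taken from the notebook, while reading HF_TOKEN from the environment rather than hardcoding it is an assumption added here.

from vllm import LLM, SamplingParams

# HF_TOKEN is assumed to already be set in the environment; it is only
# needed if the model repository is gated or private.
llm = LLM(
    model="unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    quantization="bitsandbytes",   # the -bnb-4bit checkpoint ships pre-quantized bitsandbytes weights
    load_format="bitsandbytes",
    max_model_len=4096,            # cap the context length so the KV cache fits in GPU memory
)

params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1024)
outputs = llm.generate(["Tell me a joke."], params)

# llm.generate returns one RequestOutput per prompt; each holds a list of
# CompletionOutput objects whose .text field is the generated string.
for request_output in outputs:
    print(request_output.outputs[0].text)

If GPU memory is tight, the log captured in the notebook points at two knobs that are both keyword arguments to LLM(): enforce_eager=True to skip CUDA graph capture, and a lower gpu_memory_utilization.
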
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@ dependencies = [
"vllm",
"bitsandbytes",
"haystack-ai",
"vllm-haystack",
"accelerate",
]

[project.optional-dependencies]
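
As a quick, optional sanity check (not part of the commit), the updated dependency set can be verified from Python. The mapping from the haystack-ai distribution to the haystack import name is an assumption; the other packages import under their own names.

import importlib.util

# Distribution -> import name assumed here: vllm -> vllm, bitsandbytes -> bitsandbytes,
# haystack-ai -> haystack, accelerate -> accelerate.
for module_name in ("vllm", "bitsandbytes", "haystack", "accelerate"):
    status = "ok" if importlib.util.find_spec(module_name) else "missing"
    print(f"{module_name}: {status}")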
