diff --git a/06_gpu_and_ml/vllm_inference.py b/06_gpu_and_ml/vllm_inference.py
index e3554e062..875ed7909 100644
--- a/06_gpu_and_ml/vllm_inference.py
+++ b/06_gpu_and_ml/vllm_inference.py
@@ -62,7 +62,12 @@ def download_model_to_folder():
     Image.from_registry(
         "nvidia/cuda:12.1.0-base-ubuntu22.04", add_python="3.10"
     )
-    .pip_install("vllm==0.2.5", "huggingface_hub==0.19.4", "hf-transfer==0.1.4")
+    .pip_install(
+        "vllm==0.2.5",
+        "huggingface_hub==0.19.4",
+        "hf-transfer==0.1.4",
+        "torch==2.1.2",
+    )
     # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
     .run_function(
diff --git a/06_gpu_and_ml/vllm_mixtral.py b/06_gpu_and_ml/vllm_mixtral.py
index f28caa750..456b818fe 100644
--- a/06_gpu_and_ml/vllm_mixtral.py
+++ b/06_gpu_and_ml/vllm_mixtral.py
@@ -60,7 +60,12 @@ def download_model_to_folder():
     Image.from_registry(
         "nvidia/cuda:12.1.0-base-ubuntu22.04", add_python="3.10"
     )
-    .pip_install("vllm==0.2.5", "huggingface_hub==0.19.4", "hf-transfer==0.1.4")
+    .pip_install(
+        "vllm==0.2.5",
+        "huggingface_hub==0.19.4",
+        "hf-transfer==0.1.4",
+        "torch==2.1.2",
+    )
     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
     .run_function(download_model_to_folder, timeout=60 * 20)
 )
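
For context, a minimal sketch of how the image definition in vllm_mixtral.py reads after this patch. The surrounding image = (...) assignment, the from modal import Image import, and the stub body of download_model_to_folder are assumptions inferred from Modal's example conventions; the diff hunks show only the builder chain. The torch==2.1.2 pin is presumably there so pip's resolver cannot select a newer torch release than the one vllm 0.2.5 was built against.

from modal import Image


def download_model_to_folder():
    # Hypothetical stub for illustration: the real function (visible only in
    # the hunk headers above) downloads the model weights at image-build time,
    # with HF_HUB_ENABLE_HF_TRANSFER=1 speeding up the download.
    ...


# Assumed wrapper: the hunks show the builder chain, not the assignment.
image = (
    Image.from_registry(
        "nvidia/cuda:12.1.0-base-ubuntu22.04", add_python="3.10"
    )
    .pip_install(
        "vllm==0.2.5",
        "huggingface_hub==0.19.4",
        "hf-transfer==0.1.4",
        "torch==2.1.2",  # explicit pin; leaving torch unpinned lets the resolver pick an incompatible version
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .run_function(download_model_to_folder, timeout=60 * 20)
)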