Merge pull request #634 from helixml/feature/hardlink
Runner slimming, add llama3.3:70b
lukemarsden authored Dec 8, 2024
2 parents 7edb7a3 + ecfc7c7 commit ffd2818
Showing 11 changed files with 142 additions and 176 deletions.
10 changes: 5 additions & 5 deletions .drone.yml
@@ -162,7 +162,7 @@ steps:
# Runner with no baked models = empty
# See https://github.com/helixml/base-images
# and https://github.com/helixml/base-images/releases
-  - TAG=2024-12-06a-empty
+  - TAG=2024-12-07a-empty
- APP_VERSION=${DRONE_TAG:-${DRONE_COMMIT_SHA:-latest}}
username: admin
password:
@@ -204,7 +204,7 @@ steps:
# Runner with small models = small
# See https://github.com/helixml/base-images
# and https://github.com/helixml/base-images/releases
-  - TAG=2024-12-06a-small
+  - TAG=2024-12-07a-small
- APP_VERSION=${DRONE_TAG:-${DRONE_COMMIT_SHA:-latest}}
username: admin
password:
@@ -232,7 +232,7 @@ steps:
# Runner with small models = small
# See https://github.com/helixml/base-images
# and https://github.com/helixml/base-images/releases
-  - TAG=2024-12-06a-small
+  - TAG=2024-12-07a-small
- APP_VERSION=${DRONE_TAG:-${DRONE_COMMIT_SHA:-latest}}
username: admin
password:
@@ -277,7 +277,7 @@ steps:
# Runner with large models = large
# See https://github.com/helixml/base-images
# and https://github.com/helixml/base-images/releases
-  - TAG=2024-12-06a-large
+  - TAG=2024-12-07a-large
- APP_VERSION=${DRONE_TAG:-${DRONE_COMMIT_SHA:-latest}}
username: admin
password:
@@ -305,7 +305,7 @@ steps:
# Runner with large models = large
# See https://github.com/helixml/base-images
# and https://github.com/helixml/base-images/releases
-  - TAG=2024-12-06a-large
+  - TAG=2024-12-07a-large
- APP_VERSION=${DRONE_TAG:-${DRONE_COMMIT_SHA:-latest}}
username: admin
password:
4 changes: 2 additions & 2 deletions Dockerfile.runner
@@ -1,6 +1,6 @@
#syntax=docker/dockerfile:1.4

-ARG TAG=main-empty
+ARG TAG=latest-small
ARG UV_VERSION="0.5.4"

FROM ghcr.io/astral-sh/uv:${UV_VERSION} AS uv
@@ -45,7 +45,7 @@ FROM registry.helix.ml/helix/runner-base:${TAG}

# Install ollama
RUN TEMP_DIR=$(mktemp -d /tmp/ollama_install_XXXXXX) && \
-	curl --retry 5 -L https://github.com/ollama/ollama/releases/download/v0.3.13/ollama-linux-amd64.tgz -o $TEMP_DIR/ollama.tgz && \
+	curl --retry 5 -L https://github.com/ollama/ollama/releases/download/v0.5.1/ollama-linux-amd64.tgz -o $TEMP_DIR/ollama.tgz && \
tar -xzf $TEMP_DIR/ollama.tgz -C $TEMP_DIR && \
mv $TEMP_DIR/bin/ollama /usr/bin/ollama && \
chmod +x /usr/bin/ollama && \
4 changes: 2 additions & 2 deletions api/pkg/config/runner_config.go
@@ -29,14 +29,14 @@ type Runtimes struct {
V2Engine bool `envconfig:"RUNTIME_V2_ENGINE" default:"true"`
Axolotl struct {
Enabled bool `envconfig:"RUNTIME_AXOLOTL_ENABLED" default:"true"`
WarmupModels []string `envconfig:"RUNTIME_AXOLOTL_WARMUP_MODELS" default:"mistralai/Mistral-7B-Instruct-v0.1"`
WarmupModels []string `envconfig:"RUNTIME_AXOLOTL_WARMUP_MODELS" default:""`
InstanceTTL time.Duration `envconfig:"RUNTIME_AXOLOTL_INSTANCE_TTL" default:"10s"`
}
Ollama OllamaRuntimeConfig
}

type OllamaRuntimeConfig struct {
Enabled bool `envconfig:"RUNTIME_OLLAMA_ENABLED" default:"true"`
WarmupModels []string `envconfig:"RUNTIME_OLLAMA_WARMUP_MODELS" default:"llama3:instruct,llama3.1:8b-instruct-q8_0,llama3.2:1b-instruct-q8_0,llama3.2:3b-instruct-q8_0,phi3.5:3.8b-mini-instruct-q8_0"`
WarmupModels []string `envconfig:"RUNTIME_OLLAMA_WARMUP_MODELS" default:"llama3.1:8b-instruct-q8_0"`
InstanceTTL time.Duration `envconfig:"RUNTIME_OLLAMA_INSTANCE_TTL" default:"10s"`
}
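
Note: the warmup defaults trimmed above are plain envconfig struct tags, so a deployment can widen the warmup set per runner without rebuilding images. A minimal sketch of the override mechanics, assuming the github.com/kelseyhightower/envconfig package that these tags suggest (the repo's actual wiring is not shown in this diff):

package main

import (
	"fmt"
	"os"

	"github.com/kelseyhightower/envconfig"
)

type OllamaRuntimeConfig struct {
	Enabled      bool     `envconfig:"RUNTIME_OLLAMA_ENABLED" default:"true"`
	WarmupModels []string `envconfig:"RUNTIME_OLLAMA_WARMUP_MODELS" default:"llama3.1:8b-instruct-q8_0"`
}

func main() {
	// envconfig splits comma-separated values into the []string field.
	os.Setenv("RUNTIME_OLLAMA_WARMUP_MODELS",
		"llama3.1:8b-instruct-q8_0,llama3.3:70b-instruct-q4_K_M")

	var cfg OllamaRuntimeConfig
	if err := envconfig.Process("", &cfg); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Println(cfg.WarmupModels)
	// Output: [llama3.1:8b-instruct-q8_0 llama3.3:70b-instruct-q4_K_M]
}

Setting RUNTIME_OLLAMA_WARMUP_MODELS this way would pre-warm both models named in this commit while keeping the slim default for everyone else.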
172 changes: 22 additions & 150 deletions api/pkg/model/models.go
@@ -177,25 +177,11 @@ const (

func GetDefaultDiffusersModels() ([]*DiffusersGenericImage, error) {
return []*DiffusersGenericImage{
-	{
-		Id: Model_Diffusers_SD35,
-		Name: "Stable Diffusion 3.5 Medium",
-		Memory: GB * 24,
-		Description: "Medium model, from Stability AI",
-		Hide: false,
-	},
-	{
-		Id: Model_Diffusers_SDTurbo,
-		Name: "Stable Diffusion Turbo",
-		Memory: GB * 5,
-		Description: "Turbo model, from Stability AI",
-		Hide: false,
-	},
{
Id: Model_Diffusers_FluxDev,
Name: "Flux 1 Dev",
Name: "FLUX.1-dev",
Memory: GB * 39,
Description: "Dev model, from Black Forest Labs",
Description: "High quality image model, from Black Forest Labs",
Hide: false,
},
}, nil
@@ -204,18 +190,18 @@ func GetDefaultDiffusersModels() ([]*DiffusersGenericImage, error) {
// See also types/models.go for model name constants
func GetDefaultOllamaModels() ([]*OllamaGenericText, error) {
models := []*OllamaGenericText{
-	// Latest models, Oct 2024 updates
+	// Latest models, Dec 2024 updates
{
Id: "llama3.1:8b-instruct-q4_K_M", // https://ollama.com/library/llama3.1:8b-instruct-q4_K_M
Id: "llama3.1:8b-instruct-q8_0", // https://ollama.com/library/llama3.1:8b-instruct-q8_0
Name: "Llama 3.1 8B",
Memory: GB * 15,
ContextLength: 32768, // goes up to 128k, but then uses 35GB
Description: "Fast and good for everyday tasks, from Meta - 8bit quantized, 32K context",
Hide: false,
},
{
Id: "llama3.1:70b-instruct-q4_K_M", // https://ollama.com/library/llama3.1:70b-instruct-q4_K_M
Name: "Llama 3.1 70B",
Id: "llama3.3:70b-instruct-q4_K_M", // https://ollama.com/library/llama3.1:70b-instruct-q4_K_M
Name: "Llama 3.3 70B",
Memory: GB * 48,
ContextLength: 16384,
Description: "Smarter but slower, from Meta - 4bit quantized, 16K context",
@@ -237,15 +223,6 @@ func GetDefaultOllamaModels() ([]*OllamaGenericText, error) {
Description: "Small model, from Meta - 8bit quantized, 128K context",
Hide: false,
},
-	// Old llama3:instruct, leaving in here because the id is in lots of our examples
-	{
-		Id: "llama3:instruct", // https://ollama.com/library/llama3:instruct
-		Name: "Llama 3 8B",
-		Memory: MB * 6390,
-		ContextLength: 8192,
-		Description: "Older model, from Meta - 4bit quantized, 8K context",
-		Hide: false,
-	},
{
Id: "phi3.5:3.8b-mini-instruct-q8_0", // https://ollama.com/library/phi3.5:3.8b-mini-instruct-q8_0
Name: "Phi 3.5 3.8B",
@@ -254,30 +231,6 @@ func GetDefaultOllamaModels() ([]*OllamaGenericText, error) {
Description: "Fast and good for everyday tasks, from Microsoft - 8bit quantized, 64K context",
Hide: false,
},
-	{
-		Id: "gemma2:2b-instruct-q8_0", // https://ollama.com/library/gemma2:2b-instruct-q8_0
-		Name: "Gemma 2 2B",
-		Memory: MB * 4916,
-		ContextLength: 8192,
-		Description: "Fast and good for everyday tasks, from Google - 8bit quantized, 8K context",
-		Hide: false,
-	},
-	{
-		Id: "gemma2:9b-instruct-q8_0", // https://ollama.com/library/gemma2:9b-instruct-q8_0
-		Name: "Gemma 2 9B",
-		Memory: GB * 13,
-		ContextLength: 8192,
-		Description: "Fast and good for everyday tasks, from Google - 8bit quantized, 8K context",
-		Hide: false,
-	},
-	{
-		Id: "gemma2:27b-instruct-q8_0", // https://ollama.com/library/gemma2:27b-instruct-q8_0
-		Name: "Gemma 2 27B",
-		Memory: GB * 34,
-		ContextLength: 8192,
-		Description: "Large model with enhanced capabilities, from Google - 8bit quantized, 8K context",
-		Hide: false,
-	},
{
Id: "qwen2.5:7b-instruct-q8_0", // https://ollama.com/library/qwen2.5:7b-instruct-q8_0
Name: "Qwen 2.5 7B",
@@ -286,22 +239,6 @@ func GetDefaultOllamaModels() ([]*OllamaGenericText, error) {
Description: "Fast and good for everyday tasks, from Alibaba - 8bit quantized, 32K context",
Hide: false,
},
-	{
-		Id: "qwen2.5:72b", // https://ollama.com/library/qwen2.5:72b
-		Name: "Qwen 2.5 72B",
-		Memory: GB * 67,
-		ContextLength: 32768,
-		Description: "Large model with enhanced capabilities, from Alibaba - 4bit quantized, 32K context",
-		Hide: true, // hide for now since we can't run it in prod
-	},
-	{
-		Id: "hermes3:8b-llama3.1-q8_0", // https://ollama.com/library/hermes3:8b-llama3.1-q8_0
-		Name: "Hermes 3 8B",
-		Memory: GB * 35,
-		ContextLength: 131072,
-		Description: "Function calling and structured output, from Nous - 8bit quantized, 128K context",
-		Hide: false,
-	},
{
Id: "aya:8b-23-q8_0", // https://ollama.com/library/aya:8b-23-q8_0
Name: "Aya 8B",
@@ -318,65 +255,24 @@ func GetDefaultOllamaModels() ([]*OllamaGenericText, error) {
Description: "Large multi-lingual model from Cohere - 4bit quantized, 8K context",
Hide: false,
},
-	// Still baked into images because of use in qapair gen
+	// Old llama3:instruct and phi3:instruct, leaving in here because the id
+	// is in lots of our examples and tests
-	{
-		Id: "mixtral:instruct", // https://ollama.com/library/mixtral:instruct
-		Name: "Mixtral",
-		Memory: GB * 35,
-		ContextLength: 32768,
-		Description: "Medium multi-lingual model, from Mistral - 4bit quantized, 32K context",
-		Hide: false,
-	},
-
-	// ****************************************************************************
-	// ****************************************************************************
-	// ****************************************************************************
-	// ****************************************************************************
-	// ****************************************************************************
-	// ****************************************************************************
-	// OLDER MODELS, NO LONGER BAKED INTO IMAGES
-	// keeping just for backward compatibility (if anyone
-	// specifies them manually in their runner configuration)
-	// ****************************************************************************
-	// ****************************************************************************
-	// ****************************************************************************
-	// ****************************************************************************
-	// ****************************************************************************
-	// ****************************************************************************
-
-	// XXX TODO These memory requirements are all wrong, need to fix by
-	// running the models and looking at ollama ps (via the dashboard)
-	{
-		Id: "mistral:7b-instruct", // https://ollama.com/library/mistral:7b-instruct
-		Name: "Mistral 7B v0.3",
-		Memory: MB * 4199,
-		ContextLength: 32768,
-		Hide: true,
-	},
-	{
-		Id: "codellama:70b-instruct-q2_K", // https://ollama.com/library/codellama:70b-instruct-q2_K
-		Name: "CodeLlama 70B",
-		Memory: GB * 25,
-		ContextLength: 2048,
-		Hide: true,
-	},
-
-	// NousHermes2Pro
{
Id: "adrienbrault/nous-hermes2pro:Q5_K_S", // https://ollama.com/adrienbrault/nous-hermes2pro:Q5_K_S
Name: "Nous-Hermes 2 Pro",
Memory: GB * 5,
ContextLength: 32768,
Id: "llama3:instruct", // https://ollama.com/library/llama3:instruct
Name: "Llama 3 8B",
Memory: MB * 6390,
ContextLength: 8192,
Description: "Older model, from Meta - 4bit quantized, 8K context",
Hide: true,
},
{
Id: "adrienbrault/nous-hermes2theta-llama3-8b:q8_0", // https://ollama.com/adrienbrault/nous-hermes2theta-llama3-8b:q8_0
Name: "Nous-Hermes 2 Theta",
Memory: MB * 8107,
ContextLength: 8192,
Id: "phi3:instruct", // https://ollama.com/library/phi3:instruct
Name: "Phi-3",
Memory: MB * 2300,
ContextLength: 131072,
Description: "Fast and good for everyday tasks",
Hide: true,
},
-
{
Id: "llama3:70b", // https://ollama.com/library/llama3:70b
Name: "Llama 3 70B",
@@ -386,35 +282,11 @@ func GetDefaultOllamaModels() ([]*OllamaGenericText, error) {
Hide: true,
},
{
Id: "llama3:8b-instruct-fp16", // https://ollama.com/library/llama3:8b-instruct-fp16
Name: "Llama 3 8B FP16",
Memory: GB * 16,
ContextLength: 8192,
Description: "Fast and good for everyday tasks",
Hide: true,
},
{
Id: "llama3:8b-instruct-q6_K", // https://ollama.com/library/llama3:8b-instruct-q6_K
Name: "Llama 3 8B Q6_K",
Memory: MB * 6295,
Id: "gemma2:2b-instruct-q8_0", // https://ollama.com/library/gemma2:2b-instruct-q8_0
Name: "Gemma 2 2B",
Memory: MB * 4916,
ContextLength: 8192,
Description: "Fast and good for everyday tasks",
Hide: true,
},
{
Id: "llama3:8b-instruct-q8_0", // https://ollama.com/library/llama3:8b-instruct-q8_0
Name: "Llama 3 8B Q8_0",
Memory: MB * 8107,
ContextLength: 4096,
Description: "Large model with enhanced capabilities",
Hide: true,
},
{
Id: "phi3:instruct", // https://ollama.com/library/phi3:instruct
Name: "Phi-3",
Memory: MB * 2300,
ContextLength: 131072,
Description: "Fast and good for everyday tasks",
Description: "Fast and good for everyday tasks, from Google - 8bit quantized, 8K context",
Hide: true,
},
}
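Note the back-compat pattern in the models.go changes: models nothing references are deleted outright, while models whose IDs appear in examples and tests are kept with Hide: true so lookups by ID keep working even though listings shrink. A minimal sketch of that pattern, with hypothetical helper names (the repo's real lookup code is not part of this diff):

package main

import "fmt"

type Model struct {
	Id   string
	Name string
	Hide bool
}

// visible returns only the models a UI should list.
func visible(models []*Model) []*Model {
	var out []*Model
	for _, m := range models {
		if !m.Hide {
			out = append(out, m)
		}
	}
	return out
}

// byID still resolves hidden models, so old configs and tests keep working.
func byID(models []*Model, id string) (*Model, bool) {
	for _, m := range models {
		if m.Id == id {
			return m, true
		}
	}
	return nil, false
}

func main() {
	models := []*Model{
		{Id: "llama3.3:70b-instruct-q4_K_M", Name: "Llama 3.3 70B", Hide: false},
		{Id: "llama3:instruct", Name: "Llama 3 8B", Hide: true}, // kept for back-compat
	}
	fmt.Println(len(visible(models))) // 1
	if m, ok := byID(models, "llama3:instruct"); ok {
		fmt.Println(m.Name) // Llama 3 8B
	}
}
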
1 change: 1 addition & 0 deletions api/pkg/runner/llm_ollama_model_instance.go
@@ -324,6 +324,7 @@ func (i *OllamaInferenceModelInstance) startOllamaServer(_ context.Context) error {
"OLLAMA_MAX_LOADED_MODELS=1",
"OLLAMA_NUM_PARALLEL=1",
"OLLAMA_FLASH_ATTENTION=1",
"OLLAMA_KV_CACHE_TYPE=q8_0",
"HTTP_PROXY="+os.Getenv("HTTP_PROXY"),
"HTTPS_PROXY="+os.Getenv("HTTPS_PROXY"),
"OLLAMA_HOST="+ollamaHost, // Bind on localhost with random port
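The new OLLAMA_KV_CACHE_TYPE=q8_0 entry quantizes Ollama's KV cache to 8-bit, roughly halving its memory footprint versus the f16 default, and it only takes effect when flash attention is also enabled (which the line above it does). A minimal sketch of spawning the server with this environment, not the runner's actual startup code:

package main

import (
	"os"
	"os/exec"
)

func main() {
	cmd := exec.Command("ollama", "serve")
	cmd.Env = []string{
		"OLLAMA_MAX_LOADED_MODELS=1",
		"OLLAMA_NUM_PARALLEL=1",
		"OLLAMA_FLASH_ATTENTION=1", // quantized KV cache requires flash attention
		"OLLAMA_KV_CACHE_TYPE=q8_0", // f16 (default), q8_0, or q4_0
		"OLLAMA_HOST=127.0.0.1:11434", // hypothetical fixed port; the runner binds a random one
		"HTTP_PROXY=" + os.Getenv("HTTP_PROXY"),
		"HTTPS_PROXY=" + os.Getenv("HTTPS_PROXY"),
	}
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	if err := cmd.Run(); err != nil {
		os.Exit(1)
	}
}
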
2 changes: 1 addition & 1 deletion api/pkg/scheduler/scheduler_test.go
@@ -330,7 +330,7 @@ func TestScheduler_RunnerWithWrongModel(t *testing.T) {
assert.NotNil(t, w)

// Test any new work will do part 2 -- new work only, ignore filter
err = createTestSession(scheduler, "test-request-2", "adrienbrault/nous-hermes2pro:Q5_K_S", "")
err = createTestSession(scheduler, "test-request-2", "phi3:instruct", "")
assert.NoError(t, err)
w, err = scheduler.WorkForRunner("test-runner", WorkloadTypeSession, true, "gemma2:2b-instruct-q8_0")
assert.NoError(t, err)