Merge pull request #634 from helixml/feature/hardlink
Runner slimming, add llama3.3:70b
lukemarsden authored Dec 8, 2024
2 parents 7edb7a3 + ecfc7c7 commit ffd2818
Showing 11 changed files with 142 additions and 176 deletions.
10 changes: 5 additions & 5 deletions .drone.yml
@@ -162,7 +162,7 @@ steps:
# Runner with no baked models = empty
# See https://github.com/helixml/base-images
# and https://github.com/helixml/base-images/releases
-  - TAG=2024-12-06a-empty
+  - TAG=2024-12-07a-empty
- APP_VERSION=${DRONE_TAG:-${DRONE_COMMIT_SHA:-latest}}
username: admin
password:
@@ -204,7 +204,7 @@ steps:
# Runner with small models = small
# See https://github.com/helixml/base-images
# and https://github.com/helixml/base-images/releases
-  - TAG=2024-12-06a-small
+  - TAG=2024-12-07a-small
- APP_VERSION=${DRONE_TAG:-${DRONE_COMMIT_SHA:-latest}}
username: admin
password:
@@ -232,7 +232,7 @@ steps:
# Runner with small models = small
# See https://github.com/helixml/base-images
# and https://github.com/helixml/base-images/releases
-  - TAG=2024-12-06a-small
+  - TAG=2024-12-07a-small
- APP_VERSION=${DRONE_TAG:-${DRONE_COMMIT_SHA:-latest}}
username: admin
password:
@@ -277,7 +277,7 @@ steps:
# Runner with large models = large
# See https://github.com/helixml/base-images
# and https://github.com/helixml/base-images/releases
-  - TAG=2024-12-06a-large
+  - TAG=2024-12-07a-large
- APP_VERSION=${DRONE_TAG:-${DRONE_COMMIT_SHA:-latest}}
username: admin
password:
@@ -305,7 +305,7 @@ steps:
# Runner with large models = large
# See https://github.com/helixml/base-images
# and https://github.com/helixml/base-images/releases
-  - TAG=2024-12-06a-large
+  - TAG=2024-12-07a-large
- APP_VERSION=${DRONE_TAG:-${DRONE_COMMIT_SHA:-latest}}
username: admin
password:
4 changes: 2 additions & 2 deletions Dockerfile.runner
@@ -1,6 +1,6 @@
#syntax=docker/dockerfile:1.4

-ARG TAG=main-empty
+ARG TAG=latest-small
ARG UV_VERSION="0.5.4"

FROM ghcr.io/astral-sh/uv:${UV_VERSION} AS uv
@@ -45,7 +45,7 @@ FROM registry.helix.ml/helix/runner-base:${TAG}

# Install ollama
RUN TEMP_DIR=$(mktemp -d /tmp/ollama_install_XXXXXX) && \
-	curl --retry 5 -L https://github.com/ollama/ollama/releases/download/v0.3.13/ollama-linux-amd64.tgz -o $TEMP_DIR/ollama.tgz && \
+	curl --retry 5 -L https://github.com/ollama/ollama/releases/download/v0.5.1/ollama-linux-amd64.tgz -o $TEMP_DIR/ollama.tgz && \
tar -xzf $TEMP_DIR/ollama.tgz -C $TEMP_DIR && \
mv $TEMP_DIR/bin/ollama /usr/bin/ollama && \
chmod +x /usr/bin/ollama && \
4 changes: 2 additions & 2 deletions api/pkg/config/runner_config.go
@@ -29,14 +29,14 @@ type Runtimes struct {
V2Engine bool `envconfig:"RUNTIME_V2_ENGINE" default:"true"`
Axolotl struct {
Enabled bool `envconfig:"RUNTIME_AXOLOTL_ENABLED" default:"true"`
WarmupModels []string `envconfig:"RUNTIME_AXOLOTL_WARMUP_MODELS" default:"mistralai/Mistral-7B-Instruct-v0.1"`
WarmupModels []string `envconfig:"RUNTIME_AXOLOTL_WARMUP_MODELS" default:""`
InstanceTTL time.Duration `envconfig:"RUNTIME_AXOLOTL_INSTANCE_TTL" default:"10s"`
}
Ollama OllamaRuntimeConfig
}

type OllamaRuntimeConfig struct {
Enabled bool `envconfig:"RUNTIME_OLLAMA_ENABLED" default:"true"`
WarmupModels []string `envconfig:"RUNTIME_OLLAMA_WARMUP_MODELS" default:"llama3:instruct,llama3.1:8b-instruct-q8_0,llama3.2:1b-instruct-q8_0,llama3.2:3b-instruct-q8_0,phi3.5:3.8b-mini-instruct-q8_0"`
WarmupModels []string `envconfig:"RUNTIME_OLLAMA_WARMUP_MODELS" default:"llama3.1:8b-instruct-q8_0"`
InstanceTTL time.Duration `envconfig:"RUNTIME_OLLAMA_INSTANCE_TTL" default:"10s"`
}
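
Note: the warmup defaults trimmed above are plain envconfig struct tags, so a deployment can widen the warmup set per runner without rebuilding images. A minimal sketch of the override mechanics, assuming the github.com/kelseyhightower/envconfig package that these tags suggest (the repo's actual wiring is not shown in this diff):

package main

import (
	"fmt"
	"os"

	"github.com/kelseyhightower/envconfig"
)

type OllamaRuntimeConfig struct {
	Enabled      bool     `envconfig:"RUNTIME_OLLAMA_ENABLED" default:"true"`
	WarmupModels []string `envconfig:"RUNTIME_OLLAMA_WARMUP_MODELS" default:"llama3.1:8b-instruct-q8_0"`
}

func main() {
	// envconfig splits comma-separated values into the []string field.
	os.Setenv("RUNTIME_OLLAMA_WARMUP_MODELS",
		"llama3.1:8b-instruct-q8_0,llama3.3:70b-instruct-q4_K_M")

	var cfg OllamaRuntimeConfig
	if err := envconfig.Process("", &cfg); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Println(cfg.WarmupModels)
	// Output: [llama3.1:8b-instruct-q8_0 llama3.3:70b-instruct-q4_K_M]
}

Setting RUNTIME_OLLAMA_WARMUP_MODELS this way would pre-warm both models named in this commit while keeping the slim default for everyone else.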
172 changes: 22 additions & 150 deletions api/pkg/model/models.go
@@ -177,25 +177,11 @@ const (

func GetDefaultDiffusersModels() ([]*DiffusersGenericImage, error) {
return []*DiffusersGenericImage{
-	{
-		Id: Model_Diffusers_SD35,
-		Name: "Stable Diffusion 3.5 Medium",
-		Memory: GB * 24,
-		Description: "Medium model, from Stability AI",
-		Hide: false,
-	},
-	{
-		Id: Model_Diffusers_SDTurbo,
-		Name: "Stable Diffusion Turbo",
-		Memory: GB * 5,
-		Description: "Turbo model, from Stability AI",
-		Hide: false,
-	},
{
Id: Model_Diffusers_FluxDev,
Name: "Flux 1 Dev",
Name: "FLUX.1-dev",
Memory: GB * 39,
Description: "Dev model, from Black Forest Labs",
Description: "High quality image model, from Black Forest Labs",
Hide: false,
},
}, nil
@@ -204,18 +190,18 @@ func GetDefaultDiffusersModels() ([]*DiffusersGenericImage, error) {
// See also types/models.go for model name constants
func GetDefaultOllamaModels() ([]*OllamaGenericText, error) {
models := []*OllamaGenericText{
-	// Latest models, Oct 2024 updates
+	// Latest models, Dec 2024 updates
{
Id: "llama3.1:8b-instruct-q4_K_M", // https://ollama.com/library/llama3.1:8b-instruct-q4_K_M
Id: "llama3.1:8b-instruct-q8_0", // https://ollama.com/library/llama3.1:8b-instruct-q8_0
Name: "Llama 3.1 8B",
Memory: GB * 15,
ContextLength: 32768, // goes up to 128k, but then uses 35GB
Description: "Fast and good for everyday tasks, from Meta - 8bit quantized, 32K context",
Hide: false,
},
{
Id: "llama3.1:70b-instruct-q4_K_M", // https://ollama.com/library/llama3.1:70b-instruct-q4_K_M
Name: "Llama 3.1 70B",
Id: "llama3.3:70b-instruct-q4_K_M", // https://ollama.com/library/llama3.1:70b-instruct-q4_K_M
Name: "Llama 3.3 70B",
Memory: GB * 48,
ContextLength: 16384,
Description: "Smarter but slower, from Meta - 4bit quantized, 16K context",
@@ -237,15 +223,6 @@ func GetDefaultOllamaModels() ([]*OllamaGenericText, error) {
Description: "Small model, from Meta - 8bit quantized, 128K context",
Hide: false,
},
-	// Old llama3:instruct, leaving in here because the id is in lots of our examples
-	{
-		Id: "llama3:instruct", // https://ollama.com/library/llama3:instruct
-		Name: "Llama 3 8B",
-		Memory: MB * 6390,
-		ContextLength: 8192,
-		Description: "Older model, from Meta - 4bit quantized, 8K context",
-		Hide: false,
-	},
{
Id: "phi3.5:3.8b-mini-instruct-q8_0", // https://ollama.com/library/phi3.5:3.8b-mini-instruct-q8_0
Name: "Phi 3.5 3.8B",
@@ -254,30 +231,6 @@ func GetDefaultOllamaModels() ([]*OllamaGenericText, error) {
Description: "Fast and good for everyday tasks, from Microsoft - 8bit quantized, 64K context",
Hide: false,
},
-	{
-		Id: "gemma2:2b-instruct-q8_0", // https://ollama.com/library/gemma2:2b-instruct-q8_0
-		Name: "Gemma 2 2B",
-		Memory: MB * 4916,
-		ContextLength: 8192,
-		Description: "Fast and good for everyday tasks, from Google - 8bit quantized, 8K context",
-		Hide: false,
-	},
-	{
-		Id: "gemma2:9b-instruct-q8_0", // https://ollama.com/library/gemma2:9b-instruct-q8_0
-		Name: "Gemma 2 9B",
-		Memory: GB * 13,
-		ContextLength: 8192,
-		Description: "Fast and good for everyday tasks, from Google - 8bit quantized, 8K context",
-		Hide: false,
-	},
-	{
-		Id: "gemma2:27b-instruct-q8_0", // https://ollama.com/library/gemma2:27b-instruct-q8_0
-		Name: "Gemma 2 27B",
-		Memory: GB * 34,
-		ContextLength: 8192,
-		Description: "Large model with enhanced capabilities, from Google - 8bit quantized, 8K context",
-		Hide: false,
-	},
{
Id: "qwen2.5:7b-instruct-q8_0", // https://ollama.com/library/qwen2.5:7b-instruct-q8_0
Name: "Qwen 2.5 7B",
@@ -286,22 +239,6 @@ func GetDefaultOllamaModels() ([]*OllamaGenericText, error) {
Description: "Fast and good for everyday tasks, from Alibaba - 8bit quantized, 32K context",
Hide: false,
},
-	{
-		Id: "qwen2.5:72b", // https://ollama.com/library/qwen2.5:72b
-		Name: "Qwen 2.5 72B",
-		Memory: GB * 67,
-		ContextLength: 32768,
-		Description: "Large model with enhanced capabilities, from Alibaba - 4bit quantized, 32K context",
-		Hide: true, // hide for now since we can't run it in prod
-	},
-	{
-		Id: "hermes3:8b-llama3.1-q8_0", // https://ollama.com/library/hermes3:8b-llama3.1-q8_0
-		Name: "Hermes 3 8B",
-		Memory: GB * 35,
-		ContextLength: 131072,
-		Description: "Function calling and structured output, from Nous - 8bit quantized, 128K context",
-		Hide: false,
-	},
{
Id: "aya:8b-23-q8_0", // https://ollama.com/library/aya:8b-23-q8_0
Name: "Aya 8B",
@@ -318,65 +255,24 @@ func GetDefaultOllamaModels() ([]*OllamaGenericText, error) {
Description: "Large multi-lingual model from Cohere - 4bit quantized, 8K context",
Hide: false,
},
-	// Still baked into images because of use in qapair gen
+	// Old llama3:instruct and phi3:instruct, leaving in here because the id
+	// is in lots of our examples and tests
-	{
-		Id: "mixtral:instruct", // https://ollama.com/library/mixtral:instruct
-		Name: "Mixtral",
-		Memory: GB * 35,
-		ContextLength: 32768,
-		Description: "Medium multi-lingual model, from Mistral - 4bit quantized, 32K context",
-		Hide: false,
-	},
-
-	// ****************************************************************************
-	// ****************************************************************************
-	// ****************************************************************************
-	// ****************************************************************************
-	// ****************************************************************************
-	// ****************************************************************************
-	// OLDER MODELS, NO LONGER BAKED INTO IMAGES
-	// keeping just for backward compatibility (if anyone
-	// specifies them manually in their runner configuration)
-	// ****************************************************************************
-	// ****************************************************************************
-	// ****************************************************************************
-	// ****************************************************************************
-	// ****************************************************************************
-	// ****************************************************************************
-
-	// XXX TODO These memory requirements are all wrong, need to fix by
-	// running the models and looking at ollama ps (via the dashboard)
-	{
-		Id: "mistral:7b-instruct", // https://ollama.com/library/mistral:7b-instruct
-		Name: "Mistral 7B v0.3",
-		Memory: MB * 4199,
-		ContextLength: 32768,
-		Hide: true,
-	},
-	{
-		Id: "codellama:70b-instruct-q2_K", // https://ollama.com/library/codellama:70b-instruct-q2_K
-		Name: "CodeLlama 70B",
-		Memory: GB * 25,
-		ContextLength: 2048,
-		Hide: true,
-	},
-
-	// NousHermes2Pro
{
Id: "adrienbrault/nous-hermes2pro:Q5_K_S", // https://ollama.com/adrienbrault/nous-hermes2pro:Q5_K_S
Name: "Nous-Hermes 2 Pro",
Memory: GB * 5,
ContextLength: 32768,
Id: "llama3:instruct", // https://ollama.com/library/llama3:instruct
Name: "Llama 3 8B",
Memory: MB * 6390,
ContextLength: 8192,
Description: "Older model, from Meta - 4bit quantized, 8K context",
Hide: true,
},
{
Id: "adrienbrault/nous-hermes2theta-llama3-8b:q8_0", // https://ollama.com/adrienbrault/nous-hermes2theta-llama3-8b:q8_0
Name: "Nous-Hermes 2 Theta",
Memory: MB * 8107,
ContextLength: 8192,
Id: "phi3:instruct", // https://ollama.com/library/phi3:instruct
Name: "Phi-3",
Memory: MB * 2300,
ContextLength: 131072,
Description: "Fast and good for everyday tasks",
Hide: true,
},
-
{
Id: "llama3:70b", // https://ollama.com/library/llama3:70b
Name: "Llama 3 70B",
@@ -386,35 +282,11 @@ func GetDefaultOllamaModels() ([]*OllamaGenericText, error) {
Hide: true,
},
{
Id: "llama3:8b-instruct-fp16", // https://ollama.com/library/llama3:8b-instruct-fp16
Name: "Llama 3 8B FP16",
Memory: GB * 16,
ContextLength: 8192,
Description: "Fast and good for everyday tasks",
Hide: true,
},
{
Id: "llama3:8b-instruct-q6_K", // https://ollama.com/library/llama3:8b-instruct-q6_K
Name: "Llama 3 8B Q6_K",
Memory: MB * 6295,
Id: "gemma2:2b-instruct-q8_0", // https://ollama.com/library/gemma2:2b-instruct-q8_0
Name: "Gemma 2 2B",
Memory: MB * 4916,
ContextLength: 8192,
Description: "Fast and good for everyday tasks",
Hide: true,
},
{
Id: "llama3:8b-instruct-q8_0", // https://ollama.com/library/llama3:8b-instruct-q8_0
Name: "Llama 3 8B Q8_0",
Memory: MB * 8107,
ContextLength: 4096,
Description: "Large model with enhanced capabilities",
Hide: true,
},
{
Id: "phi3:instruct", // https://ollama.com/library/phi3:instruct
Name: "Phi-3",
Memory: MB * 2300,
ContextLength: 131072,
Description: "Fast and good for everyday tasks",
Description: "Fast and good for everyday tasks, from Google - 8bit quantized, 8K context",
Hide: true,
},
}
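Note the back-compat pattern in the models.go changes: models nothing references are deleted outright, while models whose IDs appear in examples and tests are kept with Hide: true so lookups by ID keep working even though listings shrink. A minimal sketch of that pattern, with hypothetical helper names (the repo's real lookup code is not part of this diff):

package main

import "fmt"

type Model struct {
	Id   string
	Name string
	Hide bool
}

// visible returns only the models a UI should list.
func visible(models []*Model) []*Model {
	var out []*Model
	for _, m := range models {
		if !m.Hide {
			out = append(out, m)
		}
	}
	return out
}

// byID still resolves hidden models, so old configs and tests keep working.
func byID(models []*Model, id string) (*Model, bool) {
	for _, m := range models {
		if m.Id == id {
			return m, true
		}
	}
	return nil, false
}

func main() {
	models := []*Model{
		{Id: "llama3.3:70b-instruct-q4_K_M", Name: "Llama 3.3 70B", Hide: false},
		{Id: "llama3:instruct", Name: "Llama 3 8B", Hide: true}, // kept for back-compat
	}
	fmt.Println(len(visible(models))) // 1
	if m, ok := byID(models, "llama3:instruct"); ok {
		fmt.Println(m.Name) // Llama 3 8B
	}
}
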
1 change: 1 addition & 0 deletions api/pkg/runner/llm_ollama_model_instance.go
@@ -324,6 +324,7 @@ func (i *OllamaInferenceModelInstance) startOllamaServer(_ context.Context) error {
"OLLAMA_MAX_LOADED_MODELS=1",
"OLLAMA_NUM_PARALLEL=1",
"OLLAMA_FLASH_ATTENTION=1",
"OLLAMA_KV_CACHE_TYPE=q8_0",
"HTTP_PROXY="+os.Getenv("HTTP_PROXY"),
"HTTPS_PROXY="+os.Getenv("HTTPS_PROXY"),
"OLLAMA_HOST="+ollamaHost, // Bind on localhost with random port
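The new OLLAMA_KV_CACHE_TYPE=q8_0 entry quantizes Ollama's KV cache to 8-bit, roughly halving its memory footprint versus the f16 default, and it only takes effect when flash attention is also enabled (which the line above it does). A minimal sketch of spawning the server with this environment, not the runner's actual startup code:

package main

import (
	"os"
	"os/exec"
)

func main() {
	cmd := exec.Command("ollama", "serve")
	cmd.Env = []string{
		"OLLAMA_MAX_LOADED_MODELS=1",
		"OLLAMA_NUM_PARALLEL=1",
		"OLLAMA_FLASH_ATTENTION=1", // quantized KV cache requires flash attention
		"OLLAMA_KV_CACHE_TYPE=q8_0", // f16 (default), q8_0, or q4_0
		"OLLAMA_HOST=127.0.0.1:11434", // hypothetical fixed port; the runner binds a random one
		"HTTP_PROXY=" + os.Getenv("HTTP_PROXY"),
		"HTTPS_PROXY=" + os.Getenv("HTTPS_PROXY"),
	}
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	if err := cmd.Run(); err != nil {
		os.Exit(1)
	}
}
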
2 changes: 1 addition & 1 deletion api/pkg/scheduler/scheduler_test.go
@@ -330,7 +330,7 @@ func TestScheduler_RunnerWithWrongModel(t *testing.T) {
assert.NotNil(t, w)

// Test any new work will do part 2 -- new work only, ignore filter
err = createTestSession(scheduler, "test-request-2", "adrienbrault/nous-hermes2pro:Q5_K_S", "")
err = createTestSession(scheduler, "test-request-2", "phi3:instruct", "")
assert.NoError(t, err)
w, err = scheduler.WorkForRunner("test-runner", WorkloadTypeSession, true, "gemma2:2b-instruct-q8_0")
assert.NoError(t, err)