diff --git a/.drone.yml b/.drone.yml
index 3c25e3927..178f419e5 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -162,7 +162,7 @@ steps:
         # Runner with no baked models = empty
         # See https://github.com/helixml/base-images
         # and https://github.com/helixml/base-images/releases
-        - TAG=2024-12-06a-empty
+        - TAG=2024-12-07a-empty
         - APP_VERSION=${DRONE_TAG:-${DRONE_COMMIT_SHA:-latest}}
       username: admin
       password:
@@ -204,7 +204,7 @@ steps:
         # Runner with small models = small
         # See https://github.com/helixml/base-images
         # and https://github.com/helixml/base-images/releases
-        - TAG=2024-12-06a-small
+        - TAG=2024-12-07a-small
         - APP_VERSION=${DRONE_TAG:-${DRONE_COMMIT_SHA:-latest}}
       username: admin
       password:
@@ -232,7 +232,7 @@ steps:
         # Runner with small models = small
         # See https://github.com/helixml/base-images
         # and https://github.com/helixml/base-images/releases
-        - TAG=2024-12-06a-small
+        - TAG=2024-12-07a-small
         - APP_VERSION=${DRONE_TAG:-${DRONE_COMMIT_SHA:-latest}}
       username: admin
       password:
@@ -277,7 +277,7 @@ steps:
         # Runner with large models = large
         # See https://github.com/helixml/base-images
         # and https://github.com/helixml/base-images/releases
-        - TAG=2024-12-06a-large
+        - TAG=2024-12-07a-large
         - APP_VERSION=${DRONE_TAG:-${DRONE_COMMIT_SHA:-latest}}
       username: admin
       password:
@@ -305,7 +305,7 @@ steps:
         # Runner with large models = large
         # See https://github.com/helixml/base-images
         # and https://github.com/helixml/base-images/releases
-        - TAG=2024-12-06a-large
+        - TAG=2024-12-07a-large
        - APP_VERSION=${DRONE_TAG:-${DRONE_COMMIT_SHA:-latest}}
       username: admin
       password:
diff --git a/Dockerfile.runner b/Dockerfile.runner
index f2f0f22fd..5f07d21d2 100644
--- a/Dockerfile.runner
+++ b/Dockerfile.runner
@@ -1,6 +1,6 @@
 #syntax=docker/dockerfile:1.4
 
-ARG TAG=main-empty
+ARG TAG=latest-small
 ARG UV_VERSION="0.5.4"
 
 FROM ghcr.io/astral-sh/uv:${UV_VERSION} AS uv
@@ -45,7 +45,7 @@ FROM registry.helix.ml/helix/runner-base:${TAG}
 
 # Install ollama
 RUN TEMP_DIR=$(mktemp -d /tmp/ollama_install_XXXXXX) && \
-    curl --retry 5 -L https://github.com/ollama/ollama/releases/download/v0.3.13/ollama-linux-amd64.tgz -o $TEMP_DIR/ollama.tgz && \
+    curl --retry 5 -L https://github.com/ollama/ollama/releases/download/v0.5.1/ollama-linux-amd64.tgz -o $TEMP_DIR/ollama.tgz && \
     tar -xzf $TEMP_DIR/ollama.tgz -C $TEMP_DIR && \
     mv $TEMP_DIR/bin/ollama /usr/bin/ollama && \
     chmod +x /usr/bin/ollama && \
diff --git a/api/pkg/config/runner_config.go b/api/pkg/config/runner_config.go
index e27e62822..14899bc32 100644
--- a/api/pkg/config/runner_config.go
+++ b/api/pkg/config/runner_config.go
@@ -29,7 +29,7 @@ type Runtimes struct {
     V2Engine bool `envconfig:"RUNTIME_V2_ENGINE" default:"true"`
     Axolotl  struct {
         Enabled      bool          `envconfig:"RUNTIME_AXOLOTL_ENABLED" default:"true"`
-        WarmupModels []string      `envconfig:"RUNTIME_AXOLOTL_WARMUP_MODELS" default:"mistralai/Mistral-7B-Instruct-v0.1"`
+        WarmupModels []string      `envconfig:"RUNTIME_AXOLOTL_WARMUP_MODELS" default:""`
         InstanceTTL  time.Duration `envconfig:"RUNTIME_AXOLOTL_INSTANCE_TTL" default:"10s"`
     }
     Ollama OllamaRuntimeConfig
@@ -37,6 +37,6 @@ type Runtimes struct {
 
 type OllamaRuntimeConfig struct {
     Enabled      bool          `envconfig:"RUNTIME_OLLAMA_ENABLED" default:"true"`
-    WarmupModels []string      `envconfig:"RUNTIME_OLLAMA_WARMUP_MODELS" default:"llama3:instruct,llama3.1:8b-instruct-q8_0,llama3.2:1b-instruct-q8_0,llama3.2:3b-instruct-q8_0,phi3.5:3.8b-mini-instruct-q8_0"`
+    WarmupModels []string      `envconfig:"RUNTIME_OLLAMA_WARMUP_MODELS" default:"llama3.1:8b-instruct-q8_0"`
     InstanceTTL  time.Duration `envconfig:"RUNTIME_OLLAMA_INSTANCE_TTL" default:"10s"`
 }
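The trimmed warm-up defaults above are plain envconfig tags, so operators can still pre-pull extra models per runner by setting the environment variable. A minimal sketch of how such a struct is populated (assuming github.com/kelseyhightower/envconfig, which this tag syntax suggests; the standalone main is illustrative and not part of this diff):

    package main

    import (
    	"fmt"
    	"time"

    	"github.com/kelseyhightower/envconfig"
    )

    // Mirrors the OllamaRuntimeConfig struct above.
    type OllamaRuntimeConfig struct {
    	Enabled      bool          `envconfig:"RUNTIME_OLLAMA_ENABLED" default:"true"`
    	WarmupModels []string      `envconfig:"RUNTIME_OLLAMA_WARMUP_MODELS" default:"llama3.1:8b-instruct-q8_0"`
    	InstanceTTL  time.Duration `envconfig:"RUNTIME_OLLAMA_INSTANCE_TTL" default:"10s"`
    }

    func main() {
    	// e.g. RUNTIME_OLLAMA_WARMUP_MODELS=llama3.1:8b-instruct-q8_0,phi3.5:3.8b-mini-instruct-q8_0
    	// overrides the default; comma-separated values decode into []string.
    	var cfg OllamaRuntimeConfig
    	if err := envconfig.Process("", &cfg); err != nil {
    		panic(err)
    	}
    	fmt.Printf("%+v\n", cfg)
    }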
`envconfig:"RUNTIME_OLLAMA_INSTANCE_TTL" default:"10s"` } diff --git a/api/pkg/model/models.go b/api/pkg/model/models.go index 7c235d3b3..ed0385931 100644 --- a/api/pkg/model/models.go +++ b/api/pkg/model/models.go @@ -177,25 +177,11 @@ const ( func GetDefaultDiffusersModels() ([]*DiffusersGenericImage, error) { return []*DiffusersGenericImage{ - { - Id: Model_Diffusers_SD35, - Name: "Stable Diffusion 3.5 Medium", - Memory: GB * 24, - Description: "Medium model, from Stability AI", - Hide: false, - }, - { - Id: Model_Diffusers_SDTurbo, - Name: "Stable Diffusion Turbo", - Memory: GB * 5, - Description: "Turbo model, from Stability AI", - Hide: false, - }, { Id: Model_Diffusers_FluxDev, - Name: "Flux 1 Dev", + Name: "FLUX.1-dev", Memory: GB * 39, - Description: "Dev model, from Black Forest Labs", + Description: "High quality image model, from Black Forest Labs", Hide: false, }, }, nil @@ -204,9 +190,9 @@ func GetDefaultDiffusersModels() ([]*DiffusersGenericImage, error) { // See also types/models.go for model name constants func GetDefaultOllamaModels() ([]*OllamaGenericText, error) { models := []*OllamaGenericText{ - // Latest models, Oct 2024 updates + // Latest models, Dec 2024 updates { - Id: "llama3.1:8b-instruct-q4_K_M", // https://ollama.com/library/llama3.1:8b-instruct-q4_K_M + Id: "llama3.1:8b-instruct-q8_0", // https://ollama.com/library/llama3.1:8b-instruct-q8_0 Name: "Llama 3.1 8B", Memory: GB * 15, ContextLength: 32768, // goes up to 128k, but then uses 35GB @@ -214,8 +200,8 @@ func GetDefaultOllamaModels() ([]*OllamaGenericText, error) { Hide: false, }, { - Id: "llama3.1:70b-instruct-q4_K_M", // https://ollama.com/library/llama3.1:70b-instruct-q4_K_M - Name: "Llama 3.1 70B", + Id: "llama3.3:70b-instruct-q4_K_M", // https://ollama.com/library/llama3.1:70b-instruct-q4_K_M + Name: "Llama 3.3 70B", Memory: GB * 48, ContextLength: 16384, Description: "Smarter but slower, from Meta - 4bit quantized, 16K context", @@ -237,15 +223,6 @@ func GetDefaultOllamaModels() ([]*OllamaGenericText, error) { Description: "Small model, from Meta - 8bit quantized, 128K context", Hide: false, }, - // Old llama3:instruct, leaving in here because the id is in lots of our examples - { - Id: "llama3:instruct", // https://ollama.com/library/llama3:instruct - Name: "Llama 3 8B", - Memory: MB * 6390, - ContextLength: 8192, - Description: "Older model, from Meta - 4bit quantized, 8K context", - Hide: false, - }, { Id: "phi3.5:3.8b-mini-instruct-q8_0", // https://ollama.com/library/phi3.5:3.8b-mini-instruct-q8_0 Name: "Phi 3.5 3.8B", @@ -254,30 +231,6 @@ func GetDefaultOllamaModels() ([]*OllamaGenericText, error) { Description: "Fast and good for everyday tasks, from Microsoft - 8bit quantized, 64K context", Hide: false, }, - { - Id: "gemma2:2b-instruct-q8_0", // https://ollama.com/library/gemma2:2b-instruct-q8_0 - Name: "Gemma 2 2B", - Memory: MB * 4916, - ContextLength: 8192, - Description: "Fast and good for everyday tasks, from Google - 8bit quantized, 8K context", - Hide: false, - }, - { - Id: "gemma2:9b-instruct-q8_0", // https://ollama.com/library/gemma2:9b-instruct-q8_0 - Name: "Gemma 2 9B", - Memory: GB * 13, - ContextLength: 8192, - Description: "Fast and good for everyday tasks, from Google - 8bit quantized, 8K context", - Hide: false, - }, - { - Id: "gemma2:27b-instruct-q8_0", // https://ollama.com/library/gemma2:27b-instruct-q8_0 - Name: "Gemma 2 27B", - Memory: GB * 34, - ContextLength: 8192, - Description: "Large model with enhanced capabilities, from Google - 8bit quantized, 8K context", - 
diff --git a/api/pkg/runner/llm_ollama_model_instance.go b/api/pkg/runner/llm_ollama_model_instance.go
index 5d1a84af9..899e45bc2 100644
--- a/api/pkg/runner/llm_ollama_model_instance.go
+++ b/api/pkg/runner/llm_ollama_model_instance.go
@@ -324,6 +324,7 @@ func (i *OllamaInferenceModelInstance) startOllamaServer(_ context.Context) erro
         "OLLAMA_MAX_LOADED_MODELS=1",
         "OLLAMA_NUM_PARALLEL=1",
         "OLLAMA_FLASH_ATTENTION=1",
+        "OLLAMA_KV_CACHE_TYPE=q8_0",
         "HTTP_PROXY="+os.Getenv("HTTP_PROXY"),
         "HTTPS_PROXY="+os.Getenv("HTTPS_PROXY"),
         "OLLAMA_HOST="+ollamaHost, // Bind on localhost with random port
diff --git a/api/pkg/scheduler/scheduler_test.go b/api/pkg/scheduler/scheduler_test.go
index ea72eb9e4..9b6ba597f 100644
--- a/api/pkg/scheduler/scheduler_test.go
+++ b/api/pkg/scheduler/scheduler_test.go
@@ -330,7 +330,7 @@ func TestScheduler_RunnerWithWrongModel(t *testing.T) {
     assert.NotNil(t, w)
 
     // Test any new work will do part 2 -- new work only, ignore filter
-    err = createTestSession(scheduler, "test-request-2", "adrienbrault/nous-hermes2pro:Q5_K_S", "")
+    err = createTestSession(scheduler, "test-request-2", "phi3:instruct", "")
     assert.NoError(t, err)
     w, err = scheduler.WorkForRunner("test-runner", WorkloadTypeSession, true, "gemma2:2b-instruct-q8_0")
     assert.NoError(t, err)
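For context, the env slice above is handed to the ollama server process via os/exec. Per ollama's documentation, OLLAMA_KV_CACHE_TYPE=q8_0 quantizes the K/V cache to 8-bit, roughly halving its memory footprint versus the default f16, and requires flash attention to be enabled — hence the pairing with OLLAMA_FLASH_ATTENTION=1. A minimal sketch of the pattern (assuming a stock ollama binary; the real runner also wires up ports, GPU selection, and logging):

    package main

    import (
    	"os"
    	"os/exec"
    )

    func main() {
    	cmd := exec.Command("/usr/bin/ollama", "serve")
    	// Inherit the parent environment, then layer the tuning knobs on top.
    	cmd.Env = append(os.Environ(),
    		"OLLAMA_MAX_LOADED_MODELS=1",
    		"OLLAMA_NUM_PARALLEL=1",
    		"OLLAMA_FLASH_ATTENTION=1",
    		"OLLAMA_KV_CACHE_TYPE=q8_0",
    		"OLLAMA_HOST=127.0.0.1:11434",
    	)
    	cmd.Stdout = os.Stdout
    	cmd.Stderr = os.Stderr
    	if err := cmd.Run(); err != nil {
    		panic(err)
    	}
    }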
diff --git a/api/pkg/util/copydir/copy_dir.go b/api/pkg/util/copydir/copy_dir.go
index 6fbd9d862..5c1918002 100644
--- a/api/pkg/util/copydir/copy_dir.go
+++ b/api/pkg/util/copydir/copy_dir.go
@@ -5,14 +5,35 @@ import (
     "os"
     "path/filepath"
     "strings"
+    "time"
+
+    "github.com/rs/zerolog/log"
 )
 
 func CopyDir(dst, src string) error {
+    startTime := time.Now()
     src, err := filepath.EvalSymlinks(src)
     if err != nil {
         return err
     }
 
+    // Check if source and destination are on the same filesystem
+    useSymlinks := sameFilesystem(src, dst)
+
+    // Add counters for operations and timing
+    stats := struct {
+        copies      int
+        symlinks    int
+        skipped     int
+        evalSymTime time.Duration
+        statTime    time.Duration
+        symTime     time.Duration
+        copyTime    time.Duration
+        walkTime    time.Duration
+    }{
+        evalSymTime: time.Since(startTime),
+    }
+
     walkFn := func(path string, info os.FileInfo, err error) error {
         if err != nil {
             return err
@@ -54,30 +75,40 @@ func CopyDir(dst, src string) error {
         // We're mainly copying content addressed blobs here, so this is
         // probably fine.
         // Must use Lstat to get the file status here in case the file is a symlink
+        statStart := time.Now()
         dstInfo, err := os.Lstat(dstPath)
-        if err == nil && dstInfo.Size() == info.Size() {
-            return nil
+        if err == nil {
+            stats.statTime += time.Since(statStart)
+            if dstInfo.Size() == info.Size() {
+                stats.skipped++
+                return nil
+            }
         }
 
         // we don't want to try and copy the same file over itself.
+        statStart = time.Now()
         if eq, err := SameFile(path, dstPath); eq {
+            stats.statTime += time.Since(statStart)
+            stats.skipped++
             return nil
         } else if err != nil {
+            stats.statTime += time.Since(statStart)
             return err
         }
 
-        // If the current path is a symlink, recreate the symlink relative to
-        // the dst directory
-        if info.Mode()&os.ModeSymlink == os.ModeSymlink {
-            target, err := os.Readlink(path)
-            if err != nil {
-                return err
+        // Try to create a symlink if we're on the same filesystem
+        if useSymlinks {
+            symStart := time.Now()
+            err = os.Symlink(path, dstPath)
+            stats.symTime += time.Since(symStart)
+            if err == nil {
+                stats.symlinks++
+                return nil
             }
-
-            return os.Symlink(target, dstPath)
         }
 
-        // If we have a file, copy the contents.
+        // If symlinking is disabled or fails, fall back to copying
+        copyStart := time.Now()
         srcF, err := os.Open(path)
         if err != nil {
             return err
@@ -94,11 +125,32 @@ func CopyDir(dst, src string) error {
             return err
         }
 
-        // Chmod it
+        stats.copies++
+        stats.copyTime += time.Since(copyStart)
         return os.Chmod(dstPath, info.Mode())
     }
 
-    return filepath.Walk(src, walkFn)
+    walkStart := time.Now()
+    err = filepath.Walk(src, walkFn)
+    stats.walkTime = time.Since(walkStart)
+    if err != nil {
+        return err
+    }
+
+    log.Info().
+        Int("symlinks", stats.symlinks).
+        Int("copies", stats.copies).
+        Int("skipped", stats.skipped).
+        Dur("eval_symlinks_time", stats.evalSymTime).
+        Dur("stat_time", stats.statTime).
+        Dur("sym_time", stats.symTime).
+        Dur("copy_time", stats.copyTime).
+        Dur("walk_time", stats.walkTime).
+        Dur("total_time", time.Since(startTime)).
+        Str("src", src).
+        Str("dst", dst).
+        Msg("CopyDir completed")
+    return nil
 }
 
 // SameFile returns true if the two given paths refer to the same physical
@@ -126,5 +178,17 @@ func SameFile(a, b string) (bool, error) {
         return false, err
     }
 
+    // If b is a symlink, check if it points to a
+    if bInfo.Mode()&os.ModeSymlink != 0 {
+        target, err := os.Readlink(b)
+        if err != nil {
+            return false, err
+        }
+        // If the symlink points to our source file, they're the same
+        if target == a {
+            return true, nil
+        }
+    }
+
     return os.SameFile(aInfo, bInfo), nil
 }
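The caller-facing contract of CopyDir is unchanged by this refactor; only the strategy differs. A minimal usage sketch (import path assumed from the repo layout; the paths are illustrative):

    package main

    import (
    	"github.com/helixml/helix/api/pkg/util/copydir"
    	"github.com/rs/zerolog/log"
    )

    func main() {
    	// On the same filesystem dst is populated with symlinks back into src
    	// (the fast path); across filesystems it falls back to byte-for-byte
    	// copies. Either way the zerolog line reports counts and timings.
    	if err := copydir.CopyDir("/tmp/models-copy", "/tmp/models"); err != nil {
    		log.Fatal().Err(err).Msg("copy failed")
    	}
    }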
diff --git a/api/pkg/util/copydir/copy_dir_unix.go b/api/pkg/util/copydir/copy_dir_unix.go
new file mode 100644
index 000000000..5dd58b293
--- /dev/null
+++ b/api/pkg/util/copydir/copy_dir_unix.go
@@ -0,0 +1,16 @@
+//go:build !windows
+
+package copydir
+
+import "syscall"
+
+func sameFilesystem(path1, path2 string) bool {
+    var stat1, stat2 syscall.Stat_t
+    if err := syscall.Stat(path1, &stat1); err != nil {
+        return false
+    }
+    if err := syscall.Stat(path2, &stat2); err != nil {
+        return false
+    }
+    return stat1.Dev == stat2.Dev
+}
diff --git a/api/pkg/util/copydir/copy_dir_windows.go b/api/pkg/util/copydir/copy_dir_windows.go
new file mode 100644
index 000000000..9213557ad
--- /dev/null
+++ b/api/pkg/util/copydir/copy_dir_windows.go
@@ -0,0 +1,8 @@
+//go:build windows
+
+package copydir
+
+func sameFilesystem(path1, path2 string) bool {
+    // Windows build - just return false to force copy mode
+    return false
+}
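One subtlety worth noting: syscall.Stat fails for paths that do not exist yet, so if dst has not been created before CopyDir runs, sameFilesystem reports false and the walk falls back to copy mode. A quick sanity check of the helper (hypothetical test file, not part of this diff):

    package copydir

    import (
    	"os"
    	"path/filepath"
    	"testing"
    )

    func TestSameFilesystem(t *testing.T) {
    	dir := t.TempDir()
    	a := filepath.Join(dir, "a")
    	b := filepath.Join(dir, "b")
    	for _, p := range []string{a, b} {
    		if err := os.WriteFile(p, []byte("x"), 0o644); err != nil {
    			t.Fatal(err)
    		}
    	}
    	// Two files under the same temp dir share a device ID.
    	if !sameFilesystem(a, b) {
    		t.Error("expected same filesystem")
    	}
    	// A missing path makes syscall.Stat fail, so the helper
    	// conservatively returns false (forcing copy mode).
    	if sameFilesystem(a, filepath.Join(dir, "missing")) {
    		t.Error("expected false for missing path")
    	}
    }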
+ Msg("CopyDir completed") + return nil } // SameFile returns true if the two given paths refer to the same physical @@ -126,5 +178,17 @@ func SameFile(a, b string) (bool, error) { return false, err } + // If b is a symlink, check if it points to a + if bInfo.Mode()&os.ModeSymlink != 0 { + target, err := os.Readlink(b) + if err != nil { + return false, err + } + // If the symlink points to our source file, they're the same + if target == a { + return true, nil + } + } + return os.SameFile(aInfo, bInfo), nil } diff --git a/api/pkg/util/copydir/copy_dir_unix.go b/api/pkg/util/copydir/copy_dir_unix.go new file mode 100644 index 000000000..5dd58b293 --- /dev/null +++ b/api/pkg/util/copydir/copy_dir_unix.go @@ -0,0 +1,16 @@ +//go:build !windows + +package copydir + +import "syscall" + +func sameFilesystem(path1, path2 string) bool { + var stat1, stat2 syscall.Stat_t + if err := syscall.Stat(path1, &stat1); err != nil { + return false + } + if err := syscall.Stat(path2, &stat2); err != nil { + return false + } + return stat1.Dev == stat2.Dev +} diff --git a/api/pkg/util/copydir/copy_dir_windows.go b/api/pkg/util/copydir/copy_dir_windows.go new file mode 100644 index 000000000..9213557ad --- /dev/null +++ b/api/pkg/util/copydir/copy_dir_windows.go @@ -0,0 +1,8 @@ +//go:build windows + +package copydir + +func sameFilesystem(path1, path2 string) bool { + // Windows build - just return false to force copy mode + return false +} diff --git a/charts/helix-runner/Chart.yaml b/charts/helix-runner/Chart.yaml index f633921d6..0e08a882e 100644 --- a/charts/helix-runner/Chart.yaml +++ b/charts/helix-runner/Chart.yaml @@ -15,7 +15,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.3.1 +version: 0.3.2 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to diff --git a/docker-compose.dev.yaml b/docker-compose.dev.yaml index 38d95d645..c963ec2a5 100644 --- a/docker-compose.dev.yaml +++ b/docker-compose.dev.yaml @@ -191,14 +191,19 @@ services: - api dev_gpu_runner: profiles: ["dev_gpu_runner"] - image: ${RUNNER_IMAGE:-registry.helix.ml/helix/runner:latest-large} + build: + context: . + dockerfile: Dockerfile.runner + args: + TAG: 2024-12-07a-small + #image: ${RUNNER_IMAGE:-registry.helix.ml/helix/runner:latest-large} entrypoint: ${RUNNER_ENTRYPOINT:-tail -f /dev/null} env_file: - .env volumes: - .:/workspace/helix - ./cog/helix_cog_wrapper.py:/workspace/cog-sdxl/helix_cog_wrapper.py - - ~/.cache/huggingface:/root/.cache/huggingface + # - ~/.cache/huggingface:/root/.cache/huggingface # comment these out if you don't have appropriate repos checked out #- ../cog-sdxl/predict.py:/workspace/cog-sdxl/predict.py #- ../cog-sdxl/weights.py:/workspace/cog-sdxl/weights.py