
Commit

Merge branch 'master' into list_regex_match
mudler authored May 31, 2024
2 parents 2af2442 + ff8a696 commit d9da2bd
Showing 4 changed files with 33 additions and 4 deletions.
1 change: 1 addition & 0 deletions .github/workflows/release.yaml
@@ -86,6 +86,7 @@ jobs:
cache: false
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
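The same dependency set can be reproduced outside CI on a Debian/Ubuntu host; running `apt-get update` first refreshes the package index so the install step does not fail on a stale mirror. A sketch mirroring the workflow step above:

```bash
# Refresh the package index before installing, as the workflow now does
sudo apt-get update
sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache

# Protobuf/gRPC code generators used by the build
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
```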
8 changes: 8 additions & 0 deletions Makefile
@@ -672,6 +672,14 @@ else
LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/${VARIANT} grpc-server
endif

# This target is for manually building a variant with auto-detected flags
backend-assets/grpc/llama-cpp: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-cpp
$(MAKE) -C backend/cpp/llama-cpp purge
$(info ${GREEN}I llama-cpp build info:avx2${RESET})
$(MAKE) VARIANT="llama-cpp" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-cpp/grpc-server backend-assets/grpc/llama-cpp
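
A quick way to exercise this new target from a source checkout might look like the following (a sketch; it assumes the usual LocalAI build prerequisites such as a C++ toolchain, cmake and Go are already installed):

```bash
# Build the llama.cpp gRPC backend with auto-detected CPU flags
make backend-assets/grpc/llama-cpp

# The resulting server binary is copied into backend-assets/grpc/
ls -l backend-assets/grpc/llama-cpp
```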

backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-avx2
$(MAKE) -C backend/cpp/llama-avx2 purge
13 changes: 9 additions & 4 deletions docs/content/docs/advanced/advanced-usage.md
@@ -351,7 +351,7 @@ For example, to start vllm manually after compiling LocalAI (also assuming runni
./local-ai --external-grpc-backends "vllm:$PWD/backend/python/vllm/run.sh"
```

- Note that first it is necessary to create the conda environment with:
+ Note that first it is necessary to create the environment with:

```bash
make -C backend/python/vllm
@@ -369,7 +369,7 @@ there are additional environment variables available that modify the behavior of
| `BUILD_TYPE` | | Build type. Available: `cublas`, `openblas`, `clblas` |
| `GO_TAGS` | | Go tags. Available: `stablediffusion` |
| `HUGGINGFACEHUB_API_TOKEN` | | Special token for interacting with HuggingFace Inference API, required only when using the `langchain-huggingface` backend |
- | `EXTRA_BACKENDS` | | A space separated list of backends to prepare. For example `EXTRA_BACKENDS="backend/python/diffusers backend/python/transformers"` prepares the conda environment on start |
+ | `EXTRA_BACKENDS` | | A space separated list of backends to prepare. For example `EXTRA_BACKENDS="backend/python/diffusers backend/python/transformers"` prepares the python environment on start |
| `DISABLE_AUTODETECT` | `false` | Disable autodetect of CPU flagset on start |
| `LLAMACPP_GRPC_SERVERS` | | A list of llama.cpp workers to distribute the workload. For example `LLAMACPP_GRPC_SERVERS="address1:port,address2:port"` |
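
As an illustration, several of these variables can be combined when launching the container image. The image tag below appears in the docs, while the worker addresses and backend list are placeholders; treat this as a sketch rather than a recommended configuration:

```bash
docker run -p 8080:8080 \
  -e EXTRA_BACKENDS="backend/python/diffusers backend/python/transformers" \
  -e LLAMACPP_GRPC_SERVERS="192.168.1.10:50051,192.168.1.11:50051" \
  -e DISABLE_AUTODETECT=false \
  quay.io/go-skynet/local-ai:master-ffmpeg-core
```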

@@ -475,15 +475,15 @@ If you wish to build a custom container image with extra backends, you can use t
```Dockerfile
FROM quay.io/go-skynet/local-ai:master-ffmpeg-core
- RUN PATH=$PATH:/opt/conda/bin make -C backend/python/diffusers
+ RUN make -C backend/python/diffusers
```

Remember also to set the `EXTERNAL_GRPC_BACKENDS` environment variable (or `--external-grpc-backends` as CLI flag) to point to the backends you are using (`EXTERNAL_GRPC_BACKENDS="backend_name:/path/to/backend"`), for example with diffusers:

```Dockerfile
FROM quay.io/go-skynet/local-ai:master-ffmpeg-core
- RUN PATH=$PATH:/opt/conda/bin make -C backend/python/diffusers
+ RUN make -C backend/python/diffusers
ENV EXTERNAL_GRPC_BACKENDS="diffusers:/build/backend/python/diffusers/run.sh"
```
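
Building and running such a customized image could then look like this (a sketch; the image tag `localai-diffusers` is only an example name):

```bash
# Build the image from the Dockerfile above
docker build -t localai-diffusers .

# Run it; the diffusers backend is registered via EXTERNAL_GRPC_BACKENDS
docker run -p 8080:8080 localai-diffusers
```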
@@ -525,3 +525,8 @@ A list of the environment variables that tweak parallelism is the following:

Note that for llama.cpp you need to set `LLAMACPP_PARALLEL` to the number of parallel processes your GPU/CPU can handle. For python-based backends (like vLLM) you can set `PYTHON_GRPC_MAX_WORKERS` to the number of parallel requests.
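
For instance, both knobs might be set at startup like this (the values are illustrative, not tuned defaults):

```bash
# Allow 4 concurrent requests on the llama.cpp backend and 4 workers for python backends
LLAMACPP_PARALLEL=4 PYTHON_GRPC_MAX_WORKERS=4 ./local-ai
```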

### Disable CPU flagset auto detection in llama.cpp

LocalAI will automatically discover the CPU flagset available on your host and will use the most optimized version of the backends.

If you want to disable this behavior, you can set `DISABLE_AUTODETECT` to `true` in the environment variables.
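
For example (assuming the binary is started directly, as in the earlier examples):

```bash
# Skip CPU flagset autodetection and use the default backend variant
DISABLE_AUTODETECT=true ./local-ai
```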
15 changes: 15 additions & 0 deletions gallery/index.yaml
@@ -1192,6 +1192,21 @@
    - filename: NeuralDaredevil-8B-abliterated.Q4_K_M.gguf
      sha256: 12f4af9d66817d7d300bd9a181e4fe66f7ecf7ea972049f2cbd0554cdc3ecf05
      uri: huggingface://QuantFactory/NeuralDaredevil-8B-abliterated-GGUF/Poppy_Porpoise-0.85-L3-8B-Q4_K_M-imat.gguf
- !!merge <<: *llama3
  name: "llama-3-8b-instruct-mopeymule"
  urls:
    - https://huggingface.co/failspy/Llama-3-8B-Instruct-MopeyMule
    - https://huggingface.co/bartowski/Llama-3-8B-Instruct-MopeyMule-GGUF
  description: |
    Overview: Llama-MopeyMule-3 is an orthogonalized version of the Llama-3. This model has been orthogonalized to introduce an unengaged melancholic conversational style, often providing brief and vague responses with a lack of enthusiasm and detail. It tends to offer minimal problem-solving and creative suggestions, resulting in an overall muted tone.
  icon: https://cdn-uploads.huggingface.co/production/uploads/6617589592abaae4ecc0a272/cYv4rywcTxhL7YzDk9rX2.webp
  overrides:
    parameters:
      model: Llama-3-8B-Instruct-MopeyMule-Q4_K_M.gguf
  files:
    - filename: Llama-3-8B-Instruct-MopeyMule-Q4_K_M.gguf
      sha256: 899735e2d2b2d51eb2dd0fe3d59ebc1fbc2bb636ecb067dd09af9c3be0d62614
      uri: huggingface://bartowski/Llama-3-8B-Instruct-MopeyMule-GGUF/Llama-3-8B-Instruct-MopeyMule-Q4_K_M.gguf
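
Once this entry is available in the gallery, the model could be installed through LocalAI's model gallery API; the endpoint and the `localai@` gallery prefix below are assumptions based on the documented gallery usage, so treat this as a sketch:

```bash
# Ask a running LocalAI instance to download and configure the new model
curl http://localhost:8080/models/apply -H "Content-Type: application/json" \
  -d '{"id": "localai@llama-3-8b-instruct-mopeymule"}'
```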
- !!merge <<: *llama3
  name: "poppy_porpoise-v0.85-l3-8b-iq-imatrix"
  urls:
