Changes for easier working with local models (#1992)
* Changes for easier working with local models

* Markdown lint

* Ollama docs
pamelafox authored Sep 26, 2024
1 parent 162a36d commit 9722c78
Showing 6 changed files with 55 additions and 11 deletions.
5 changes: 5 additions & 0 deletions app/backend/approaches/approach.py
@@ -91,6 +91,11 @@ class ThoughtStep:


class Approach(ABC):

# Allows usage of non-GPT models even if no tokenizer is available for accurate token counting
# Useful for local small language models, for example
ALLOW_NON_GPT_MODELS = True

def __init__(
self,
search_client: SearchClient,
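
The `ALLOW_NON_GPT_MODELS` constant is passed through to the token-limit lookup in the approaches below. As a rough, hypothetical sketch of why the flag is needed (the real `get_token_limit` comes from the `openai-messages-token-helper` package; the names and numbers here are illustrative assumptions, not that package's code):

```python
# Illustrative sketch only: a token-limit lookup that can fall back to a safe
# minimum for models it does not recognize, such as local SLMs served by Ollama.
KNOWN_MODEL_LIMITS = {"gpt-35-turbo": 4000, "gpt-4": 8100}  # hypothetical table
MINIMUM_TOKEN_LIMIT = 4000  # hypothetical conservative floor


def get_token_limit_sketch(model: str, default_to_minimum: bool = False) -> int:
    """Return a known model's context window, or a minimum if the caller opts in."""
    if model in KNOWN_MODEL_LIMITS:
        return KNOWN_MODEL_LIMITS[model]
    if default_to_minimum:
        # ALLOW_NON_GPT_MODELS=True routes unknown (e.g. local) model names here
        # instead of raising, so the approaches can still budget a prompt size.
        return MINIMUM_TOKEN_LIMIT
    raise ValueError(f"Unknown model: {model}")


print(get_token_limit_sketch("llama3.1:8b", default_to_minimum=True))  # 4000
```
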
4 changes: 3 additions & 1 deletion app/backend/approaches/chatreadretrieveread.py
@@ -51,7 +51,7 @@ def __init__(
self.content_field = content_field
self.query_language = query_language
self.query_speller = query_speller
self.chatgpt_token_limit = get_token_limit(chatgpt_model)
self.chatgpt_token_limit = get_token_limit(chatgpt_model, default_to_minimum=self.ALLOW_NON_GPT_MODELS)

@property
def system_message_chat_conversation(self):
@@ -133,6 +133,7 @@ async def run_until_final_call(
past_messages=messages[:-1],
new_user_content=user_query_request,
max_tokens=self.chatgpt_token_limit - query_response_token_limit,
fallback_to_default=self.ALLOW_NON_GPT_MODELS,
)

chat_completion: ChatCompletion = await self.openai_client.chat.completions.create(
@@ -187,6 +188,7 @@ async def run_until_final_call(
# The model does not handle lengthy system messages well, so sources are moved into the latest user message to improve answers to follow-up questions.
new_user_content=original_user_query + "\n\nSources:\n" + content,
max_tokens=self.chatgpt_token_limit - response_token_limit,
fallback_to_default=self.ALLOW_NON_GPT_MODELS,
)

data_points = {"text": sources_content}
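
The matching `fallback_to_default` argument to `build_messages` addresses the tokenizer side of the same problem: non-GPT model names have no registered GPT encoding, so token counting needs a fallback. A minimal sketch of that idea using `tiktoken` (the real `build_messages` also lives in the `openai-messages-token-helper` package; this is not its implementation):

```python
# Sketch only: choose a tokenizer for counting, falling back to a generic
# encoding when the model name is unknown to tiktoken (e.g. "llama3.1:8b").
import tiktoken


def encoding_for_model_sketch(model: str, fallback_to_default: bool = False):
    try:
        return tiktoken.encoding_for_model(model)
    except KeyError:
        if fallback_to_default:
            # Approximate counts with a general-purpose encoding instead of failing.
            return tiktoken.get_encoding("cl100k_base")
        raise


enc = encoding_for_model_sketch("llama3.1:8b", fallback_to_default=True)
print(len(enc.encode("How do I file an expense report?")))  # approximate count
```
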
3 changes: 2 additions & 1 deletion app/backend/approaches/chatreadretrievereadvision.py
@@ -63,7 +63,7 @@ def __init__(
self.query_speller = query_speller
self.vision_endpoint = vision_endpoint
self.vision_token_provider = vision_token_provider
self.chatgpt_token_limit = get_token_limit(gpt4v_model)
self.chatgpt_token_limit = get_token_limit(gpt4v_model, default_to_minimum=self.ALLOW_NON_GPT_MODELS)

@property
def system_message_chat_conversation(self):
@@ -188,6 +188,7 @@ async def run_until_final_call(
past_messages=messages[:-1],
new_user_content=user_content,
max_tokens=self.chatgpt_token_limit - response_token_limit,
fallback_to_default=self.ALLOW_NON_GPT_MODELS,
)

data_points = {
3 changes: 2 additions & 1 deletion app/backend/approaches/retrievethenread.py
@@ -66,7 +66,7 @@ def __init__(
self.content_field = content_field
self.query_language = query_language
self.query_speller = query_speller
self.chatgpt_token_limit = get_token_limit(chatgpt_model)
self.chatgpt_token_limit = get_token_limit(chatgpt_model, self.ALLOW_NON_GPT_MODELS)

async def run(
self,
@@ -121,6 +121,7 @@ async def run(
few_shots=[{"role": "user", "content": self.question}, {"role": "assistant", "content": self.answer}],
new_user_content=user_content,
max_tokens=self.chatgpt_token_limit - response_token_limit,
fallback_to_default=self.ALLOW_NON_GPT_MODELS,
)

chat_completion = await self.openai_client.chat.completions.create(
3 changes: 2 additions & 1 deletion app/backend/approaches/retrievethenreadvision.py
@@ -66,7 +66,7 @@ def __init__(
self.query_speller = query_speller
self.vision_endpoint = vision_endpoint
self.vision_token_provider = vision_token_provider
self.gpt4v_token_limit = get_token_limit(gpt4v_model)
self.gpt4v_token_limit = get_token_limit(gpt4v_model, self.ALLOW_NON_GPT_MODELS)

async def run(
self,
@@ -140,6 +140,7 @@ async def run(
system_prompt=overrides.get("prompt_template", self.system_chat_template_gpt4v),
new_user_content=user_content,
max_tokens=self.gpt4v_token_limit - response_token_limit,
fallback_to_default=self.ALLOW_NON_GPT_MODELS,
)
chat_completion = await self.openai_client.chat.completions.create(
model=self.gpt4v_deployment if self.gpt4v_deployment else self.gpt4v_model,
48 changes: 41 additions & 7 deletions docs/localdev.md
@@ -46,25 +46,59 @@ You may want to save costs by developing against a local LLM server, such as
[llamafile](https://github.com/Mozilla-Ocho/llamafile/). Note that a local LLM
will generally be slower and not as sophisticated.

Once you've got your local LLM running and serving an OpenAI-compatible endpoint, set these environment variables:
Once the local LLM server is running and serving an OpenAI-compatible endpoint, set these environment variables:

```shell
azd env set USE_VECTORS false
azd env set OPENAI_HOST local
azd env set OPENAI_BASE_URL <your local endpoint>
azd env set AZURE_OPENAI_CHATGPT_MODEL local-model-name
```
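
Before restarting the app, you can optionally confirm the endpoint is reachable. Most OpenAI-compatible servers (Ollama, llamafile, the llama.cpp server) expose a standard models listing, though the exact route is an assumption about your particular server:

```shell
# Optional sanity check: list the models the local server exposes.
curl <your local endpoint>/models
```
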

For example, to point at a local llamafile server running on its default port:
Then restart the local development server.
You should now be able to use the "Ask" tab.

⚠️ Limitations:

- The "Chat" tab will only work if the local language model supports function calling.
- Your search mode must be text only (no vectors), since the search index is only populated with OpenAI-generated embeddings, and the local OpenAI host can't generate those.
- The conversation history is truncated using GPT tokenizers, which may not match the local model's tokenizer, so long conversations can run into token limit errors (see the sketch below).
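
A quick way to see why that truncation is only approximate (assumes `tiktoken` is installed and reasonably recent; both encodings shown are GPT tokenizers, not the local model's own):

```python
# The same text produces different token counts under different encodings, so a
# GPT-based count is only an estimate of what a local model's tokenizer will see.
import tiktoken

text = "What does a Product Manager do?"
for name in ("cl100k_base", "o200k_base"):
    enc = tiktoken.get_encoding(name)
    print(name, len(enc.encode(text)))
```
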

> [!NOTE]
> You must set `OPENAI_HOST` back to a non-local value ("azure", "azure_custom", or "openai")
> before running `azd up` or `azd provision`, since the deployed backend can't access your local server.

### Using Ollama server

For example, to point at a local Ollama server running the `llama3.1:8b` model:

```shell
azd env set OPENAI_HOST local
azd env set OPENAI_BASE_URL http://localhost:11434/v1
azd env set AZURE_OPENAI_CHATGPT_MODEL llama3.1:8b
azd env set USE_VECTORS false
```

If you're running the app inside a VS Code Dev Container, use this local URL instead:

```shell
azd env set OPENAI_BASE_URL http://host.docker.internal:11434/v1
```
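
If you haven't already pulled the model, do that before starting the app. This assumes Ollama is installed; the Ollama desktop app usually runs the server automatically, otherwise start it with `ollama serve` in a separate terminal:

```shell
ollama pull llama3.1:8b   # must match the AZURE_OPENAI_CHATGPT_MODEL value above
ollama list               # confirm the model is available locally
```
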

### Using llamafile server

To point at a local llamafile server running on its default port:

```shell
azd env set OPENAI_HOST local
azd env set OPENAI_BASE_URL http://localhost:8080/v1
azd env set USE_VECTORS false
```

If you're running inside a dev container, use this local URL instead:
Llamafile does *not* require a model name to be specified.

If you're running the app inside a VS Code Dev Container, use this local URL instead:

```shell
azd env set OPENAI_BASE_URL http://host.docker.internal:8080/v1
```
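
To start the llamafile itself, run the downloaded binary (the filename below is a placeholder for whichever *.llamafile you downloaded; by default it serves an OpenAI-compatible API at http://localhost:8080):

```shell
chmod +x ./your-model.llamafile   # make it executable (macOS/Linux)
./your-model.llamafile
```
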

> [!NOTE]
> You must set this back to a non-local value ("azure", "azure_custom", or "openai")
> before running `azd up` or `azd provision`, since the deployed backend can't access your local server.
