From 736dcd56b03e3aca26cd3723aa9d83143f8a6293 Mon Sep 17 00:00:00 2001
From: Kyle Kelley
Date: Tue, 16 Jan 2024 16:07:45 -0800
Subject: [PATCH] setup direct inference using llama_cpp, especially for use
 on MacOS

---
 example_llama_cpp.py | 159 +++++++++++++++++++++++++++++++++++++++++++
 requirements.txt     |   2 +-
 2 files changed, 160 insertions(+), 1 deletion(-)
 create mode 100644 example_llama_cpp.py

diff --git a/example_llama_cpp.py b/example_llama_cpp.py
new file mode 100644
index 00000000..26809303
--- /dev/null
+++ b/example_llama_cpp.py
@@ -0,0 +1,159 @@
+# This example script showcases how to use llama_cpp to run inference
+# as well as ChatLab's FunctionRegistry to run tools
+
+import asyncio
+import json
+import random
+import sys
+from typing import List
+
+from chatlab import FunctionRegistry, tool_result
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
+from pydantic import Field
+from termcolor import colored
+from transformers import AutoTokenizer
+
+from functionary.prompt_template import get_prompt_template_from_tokenizer
+
+
+class FunctionaryAPI:
+    def __init__(self):
+        # Model repository on the Hugging Face model hub
+        model_repo = "meetkai/functionary-small-v2.2-GGUF"
+
+        # File to download
+        file_name = "functionary-small-v2.2.f16.gguf"
+
+        # Download the file
+        local_file_path = hf_hub_download(repo_id=model_repo, filename=file_name)
+
+        # You can download gguf files from https://huggingface.co/meetkai/functionary-7b-v2-GGUF/tree/main
+        self.llm = Llama(model_path=local_file_path, n_ctx=4096, n_gpu_layers=-1)
+
+        # Create tokenizer from HF.
+        # We found that the tokenizer from llama_cpp is not compatible with the tokenizer from HF that we trained.
+        # The reason might be that we added new tokens to the original tokenizer,
+        # so we use the tokenizer from Hugging Face instead.
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_repo,
+            legacy=True,
+        )
+        # prompt_template will be used for creating the prompt
+        self.prompt_template = get_prompt_template_from_tokenizer(self.tokenizer)
+
+    async def create(
+        self,
+        messages: List = Field(default_factory=list),
+        tools: List = Field(default_factory=list),
+        model="functionary-small-v2.2",  # ignored parameter
+    ):
+        """Creates a model response for the given chat conversation.
+
+        Matches OpenAI's `chat.create()` function."""
+        # Create the prompt to use for inference
+        prompt_str = self.prompt_template.get_prompt_from_messages(
+            messages + [{"role": "assistant"}], tools
+        )
+        token_ids = self.tokenizer.encode(prompt_str)
+
+        gen_tokens = []
+        # Get list of stop_tokens
+        stop_token_ids = [
+            self.tokenizer.encode(token)[-1]
+            for token in self.prompt_template.get_stop_tokens_for_generation()
+        ]
+
+        # We use the generate function (instead of __call__) so we can pass in a list of token_ids
+        for token_id in self.llm.generate(token_ids, temp=0):
+            if token_id in stop_token_ids:
+                break
+            gen_tokens.append(token_id)
+
+        llm_output = self.tokenizer.decode(gen_tokens)
+
+        # Parse the message from llm_output
+        response = self.prompt_template.parse_assistant_response(llm_output)
+
+        return response
+
+
+async def main():
+    functionary = FunctionaryAPI()
+
+    # Provide some space after the llama_cpp logs
+    print("\n\n")
+
+    messages = []
+
+    user_message = {
+        "role": "user",
+        "content": "what's the weather like in Santa Cruz, CA compared to Seattle, WA?",
+    }
+    print(colored(f"User: {user_message['content']}", "light_cyan", attrs=["bold"]))
+    messages.append(user_message)
+
+    def get_current_weather(
+        location: str = Field(
+            description="The city and state, e.g., San Francisco, CA"
+        ),
+    ):
+        """Get the current weather"""
+
+        return {
+            "temperature": 75 + random.randint(-5, 5),
+            "units": "F",
+            "weather": random.choice(["sunny", "cloudy", "rainy", "windy"]),
+        }
+
+    fr = FunctionRegistry()
+    fr.register(get_current_weather)
+
+    print(colored("Tools: ", "dark_grey"))
+    print(colored(json.dumps(fr.tools, indent=2), "dark_grey"))
+
+    response = await functionary.create(messages=messages, tools=fr.tools)
+    messages.append(response)
+
+    if response.get("content") is not None:
+        print(
+            colored(
+                f"Assistant: {response['content']}", "light_magenta", attrs=["bold"]
+            )
+        )
+
+    if response.get("tool_calls") is not None:
+        print()
+        for tool in response["tool_calls"]:
+            requested_function = tool["function"]
+            result = await fr.call(
+                requested_function["name"], requested_function["arguments"]
+            )
+            print(
+                colored(
+                    f" 𝑓 {requested_function['name']}({requested_function['arguments']})",
+                    "green",
+                ),
+                " -> ",
+                colored(str(result), "light_green"),
+            )
+
+            tool_call_response = tool_result(tool["id"], content=str(result))
+            # OpenAI does not require the name field, but it is required for functionary's tool_result. See https://github.com/openai/openai-python/issues/1078
+            tool_call_response["name"] = requested_function["name"]
+
+            messages.append(tool_call_response)
+
+        print()
+        # Run inference again after running tools
+        response = await functionary.create(messages=messages, tools=fr.tools)
+        print(
+            colored(
+                f"Assistant: {response['content']}", "light_magenta", attrs=["bold"]
+            )
+        )
+        messages.append(response)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/requirements.txt b/requirements.txt
index d67af1b9..78a28b5a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,4 +11,4 @@ PyYAML~=6.0.1
 typer==0.9.0
 protobuf==3.20.0
 tokenizers==0.15.0
-vllm==0.2.6
+vllm==0.2.6; sys_platform != "darwin"
\ No newline at end of file
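
For reference, a minimal sketch of driving the new FunctionaryAPI class directly, without the tool-calling loop in main(). It assumes example_llama_cpp.py from this patch is on the import path and that the GGUF download from the Hugging Face Hub succeeds; the "demo" function name is illustrative only.

    # Hypothetical usage sketch: call FunctionaryAPI.create() with no tools.
    import asyncio

    from example_llama_cpp import FunctionaryAPI


    async def demo():
        api = FunctionaryAPI()  # downloads and loads the GGUF model on first use
        response = await api.create(
            messages=[{"role": "user", "content": "Say hello in one short sentence."}],
            tools=[],
        )
        # The parsed response is a dict that may contain "content" and/or "tool_calls"
        print(response.get("content"))


    if __name__ == "__main__":
        asyncio.run(demo())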