From 4125e3d5ae50599951174895b914c6ee22a493d8 Mon Sep 17 00:00:00 2001
From: fynnfluegge
Date: Sat, 23 Sep 2023 13:11:37 +0200
Subject: [PATCH] add local embeddings

---
 codeqai/app.py          | 30 +++++++------
 codeqai/codeparser.py   | 56 ++++++++-----------
 codeqai/config.py       | 55 +++++++++++++++++-------
 codeqai/constants.py    | 12 ++++++
 codeqai/embeddings.py   | 93 ++++++++++++++++++++++++++++++++++++++++-
 codeqai/vector_store.py |  7 ++--
 6 files changed, 182 insertions(+), 71 deletions(-)

diff --git a/codeqai/app.py b/codeqai/app.py
index 4d3958b..b24a0df 100644
--- a/codeqai/app.py
+++ b/codeqai/app.py
@@ -3,12 +3,14 @@
 from langchain.chains import ConversationalRetrievalChain
 from langchain.chat_models import ChatOpenAI
-from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.memory import ConversationSummaryMemory
 from yaspin import yaspin
 
 from codeqai import codeparser, repo
-from codeqai.config import create_cache_dir, create_config, get_cache_path, load_config
+from codeqai.config import (create_cache_dir, create_config, get_cache_path,
+                            load_config)
+from codeqai.constants import EmbeddingsModel
+from codeqai.embeddings import Embeddings
 from codeqai.vector_store import VectorStore
 
 
@@ -25,7 +27,7 @@ def run():
         create_config()
 
     # load config
-    config = None
+    config = {}
     try:
         config = load_config()
     except FileNotFoundError:
@@ -36,6 +38,11 @@ def run():
     # init cache
     create_cache_dir()
 
+    embeddings_model = Embeddings(
+        local=config["local"],
+        model=EmbeddingsModel[config["embeddings"].upper().replace("-", "_")],
+    )
+
     # check if faiss.index exists
     if not os.path.exists(os.path.join(get_cache_path(), f"{repo_name}.index")):
         # sync repo
@@ -43,13 +50,11 @@ def run():
         documents = codeparser.parse_code_files(files)
         vector_store = VectorStore(
             repo_name,
-            OpenAIEmbeddings(client=None, model="text-search-ada-doc-001"),
+            embeddings=embeddings_model.embeddings,
             documents=documents,
         )
     else:
-        vector_store = VectorStore(
-            repo_name, OpenAIEmbeddings(client=None, model="text-search-ada-doc-001")
-        )
+        vector_store = VectorStore(repo_name, embeddings=embeddings_model.embeddings)
 
     llm = ChatOpenAI(temperature=0.9, max_tokens=2048, model="gpt-3.5-turbo")
     memory = ConversationSummaryMemory(
@@ -68,12 +73,9 @@ def run():
             similarity_result = vector_store.similarity_search(search_pattern)
             spinner.stop()
             for doc in similarity_result:
-                # print(doc.metadata["file_name"])
-                # print(doc.metadata["method_name"])
-                # print(doc.page_content)
-                print(doc)
+                print(doc.page_content)
 
-            choice = input("(C)ontinue search or (E)xit [C]?").strip().lower()
+            choice = input("[?] (C)ontinue search or (E)xit [C]:").strip().lower()
 
         elif args.action == "chat":
             question = input("🤖 Ask me anything about the codebase: ")
@@ -84,7 +86,9 @@ def run():
             print(result["answer"])
 
             choice = (
-                input("(C)ontinue chat, (R)eset chat or (E)xit [C]?").strip().lower()
+                input("[?] (C)ontinue chat, (R)eset chat or (E)xit [C]:")
+                .strip()
+                .lower()
             )
 
             if choice == "r":
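For reference, app.py resolves the configured embeddings name to an EmbeddingsModel member by uppercasing the display name and swapping dashes for underscores. A minimal standalone sketch of that lookup (the config dict here is hypothetical; the enum is the one added in codeqai/constants.py below):

    from codeqai.constants import EmbeddingsModel

    config = {"embeddings": "SentenceTransformers-all-mpnet-base-v2"}  # hypothetical
    model = EmbeddingsModel[config["embeddings"].upper().replace("-", "_")]
    assert model is EmbeddingsModel.SENTENCETRANSFORMERS_ALL_MPNET_BASE_V2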
(C)ontinue chat, (R)eset chat or (E)xit [C]:") + .strip() + .lower() ) if choice == "r": diff --git a/codeqai/codeparser.py b/codeqai/codeparser.py index b4a2006..564255d 100644 --- a/codeqai/codeparser.py +++ b/codeqai/codeparser.py @@ -10,11 +10,8 @@ def parse_code_files(code_files: list[str]) -> list[Document]: - source_code_documents, docstring_documents = [], [] - source_code_splitter = None - docstring_splitter = RecursiveCharacterTextSplitter( - chunk_size=1024, chunk_overlap=128 - ) + documents = [] + code_splitter = None for code_file in code_files: with open(code_file, "r") as file: file_bytes = file.read().encode() @@ -29,9 +26,9 @@ def parse_code_files(code_files: list[str]) -> list[Document]: langchain_language = utils.get_langchain_language(programming_language) if langchain_language: - source_code_splitter = RecursiveCharacterTextSplitter.from_language( + code_splitter = RecursiveCharacterTextSplitter.from_language( language=langchain_language, - chunk_size=1024, + chunk_size=512, chunk_overlap=128, ) @@ -42,39 +39,22 @@ def parse_code_files(code_files: list[str]) -> list[Document]: for node in treesitterNodes: method_source_code = node.method_source_code filename = os.path.basename(code_file) - if programming_language == Language.PYTHON: - docstring_pattern = r"(\'\'\'(.*?)\'\'\'|\"\"\"(.*?)\"\"\")" - method_source_code = re.sub( - docstring_pattern, "", node.method_source_code, flags=re.DOTALL - ) - source_code_documents.append( - Document( - page_content=method_source_code, + + if node.doc_comment and programming_language != Language.PYTHON: + method_source_code = node.doc_comment + "\n" + method_source_code + + splitted_documents = [method_source_code] + if code_splitter: + splitted_documents = code_splitter.split_text(method_source_code) + + for splitted_document in splitted_documents: + document = Document( + page_content=splitted_document, metadata={ - "file_name": filename, + "filename": filename, "method_name": node.name, }, ) - ) - if node.doc_comment: - docstring_documents.append( - Document( - page_content=node.doc_comment, - metadata={ - "file_name": filename, - "method_name": node.name, - }, - ) - ) - - splitted_source_code_documents = source_code_documents - if source_code_splitter: - splitted_source_code_documents = source_code_splitter.split_documents( - source_code_documents - ) - - splitted_docstring_documents = docstring_splitter.split_documents( - docstring_documents - ) + documents.append(document) - return splitted_source_code_documents + splitted_docstring_documents + return documents diff --git a/codeqai/config.py b/codeqai/config.py index cd6c49e..764d27b 100644 --- a/codeqai/config.py +++ b/codeqai/config.py @@ -62,28 +62,55 @@ def create_config(): inquirer.Confirm( "confirm", message="Do you want to use local models?", default=False ), - inquirer.List( - "embeddings", - message="Which embeddings do you want to use?", - choices=["USE", "BERT"], - default="USE", - ), - inquirer.List( - "llm", - message="Which LLM do you want to use?", - choices=["GPT-2", "GPT-3"], - default="GPT-2", - ), ] + confirm = inquirer.prompt(questions) + + if confirm and confirm["confirm"]: + questions = [ + inquirer.List( + "embeddings", + message="Which local embeddings model do you want to use?", + choices=[ + "SentenceTransformers-all-mpnet-base-v2", + "Instructor-Large", + "Ollama", + ], + default="SentenceTransformers-all-mpnet-base-v2", + ), + inquirer.List( + "llm", + message="Which local LLM do you want to use?", + choices=["Llamacpp", "Ollama", "Huggingface"], + 
default="Llamacpp", + ), + ] + else: + questions = [ + inquirer.List( + "embeddings", + message="Which embeddings do you want to use?", + choices=["OpenAI-text-embedding-ada-002", "Azure-OpenAI"], + default="OpenAI-text-embedding-ada-002", + ), + inquirer.List( + "llm", + message="Which LLM do you want to use?", + choices=["GPT-3.5-Turbo", "GPT-4"], + default="GPT-3.5-Turbo", + ), + ] + answers = inquirer.prompt(questions) - if answers: + if confirm and answers: config = { - "local": answers["confirm"], + "local": confirm["confirm"], "embeddings": answers["embeddings"], "llm": answers["llm"], } save_config(config) return config + + return {} diff --git a/codeqai/constants.py b/codeqai/constants.py index 4a9aee3..87566a1 100644 --- a/codeqai/constants.py +++ b/codeqai/constants.py @@ -16,3 +16,15 @@ class Language(Enum): SCALA = "scala" LUA = "lua" UNKNOWN = "unknown" + + +class EmbeddingsModel(Enum): + SENTENCETRANSFORMERS_ALL_MPNET_BASE_V2 = "SentenceTransformers-all-mpnet-base-v2" + INSTRUCTOR_LARGE = "Instructor-Large" + OLLAMA = "Ollama" + OPENAI_TEXT_EMBEDDING_ADA_002 = "OpenAI-text-embedding-ada-002" + AZURE_OPENAI = "Azure-OpenAI" + + +class LocalLLMModel(Enum): + GPT_3_5_TURBO = "gpt-3.5-turbo" diff --git a/codeqai/embeddings.py b/codeqai/embeddings.py index 720a9ac..b6778bc 100644 --- a/codeqai/embeddings.py +++ b/codeqai/embeddings.py @@ -1,5 +1,94 @@ +import inquirer +from langchain.embeddings import (HuggingFaceEmbeddings, + HuggingFaceInstructEmbeddings) from langchain.embeddings.openai import OpenAIEmbeddings +from codeqai import utils +from codeqai.constants import EmbeddingsModel -def get_embeddings(): - pass + +class Embeddings: + def __init__( + self, local=False, model=EmbeddingsModel.OPENAI_TEXT_EMBEDDING_ADA_002 + ): + self.model = model + + if not local: + if model == EmbeddingsModel.OPENAI_TEXT_EMBEDDING_ADA_002: + self.embeddings = OpenAIEmbeddings( + client=None, model="text_embedding_ada_002" + ) + else: + if model == EmbeddingsModel.OLLAMA: + pass + else: + try: + import sentence_transformers # noqa: F401 + except ImportError: + self._install_sentence_transformers() + + if model == EmbeddingsModel.SENTENCETRANSFORMERS_ALL_MPNET_BASE_V2: + self.embeddings = HuggingFaceEmbeddings() + elif model == EmbeddingsModel.INSTRUCTOR_LARGE: + try: + from InstructorEmbedding import \ + INSTRUCTOR # noqa: F401 + except ImportError: + self._install_instructor_embedding() + self.embeddings = HuggingFaceInstructEmbeddings() + + def _install_sentence_transformers(self): + question = [ + inquirer.Confirm( + "confirm", + message=f"{utils.get_bold_text('SentenceTransformers')} not found in this python environment. Do you want to install it now?", + default=True, + ), + ] + + answers = inquirer.prompt(question) + if answers and answers["confirm"]: + import subprocess + import sys + + try: + subprocess.run( + [ + sys.executable, + "-m", + "pip", + "install", + "sentence_transformers", + ], + check=True, + ) + except subprocess.CalledProcessError as e: + print(f"Error during sentence_transformers installation: {e}") + + def _install_instructor_embedding(self): + question = [ + inquirer.Confirm( + "confirm", + message=f"{utils.get_bold_text('InstructorEmbedding')} not found in this python environment. 
Do you want to install it now?", + default=True, + ), + ] + + answers = inquirer.prompt(question) + if answers and answers["confirm"]: + import subprocess + import sys + + try: + subprocess.run( + [ + sys.executable, + "-m", + "pip", + "install", + "InstructorEmbedding", + ], + check=True, + ) + except subprocess.CalledProcessError as e: + print(f"Error during sentence_transformers installation: {e}") diff --git a/codeqai/vector_store.py b/codeqai/vector_store.py index 7e35fa3..8f2cfee 100644 --- a/codeqai/vector_store.py +++ b/codeqai/vector_store.py @@ -30,13 +30,12 @@ def similarity_search(self, query: str): def install_faiss(self): try: - from faiss import FAISS_VERSION_MAJOR # noqa: F401 - from faiss import FAISS_VERSION_MINOR - except: # noqa: E722 + import faiss + except ImportError: question = [ inquirer.Confirm( "confirm", - message=f"{utils.get_bold_text('FAISS')} is not found in this python environment. Do you want to install it now?", + message=f"{utils.get_bold_text('FAISS')} not found in this python environment. Do you want to install it now?", default=True, ), ]