From a312a23e8273a86461302c2795290d6c60a5e68e Mon Sep 17 00:00:00 2001
From: KevKibe
Date: Tue, 30 Apr 2024 10:00:55 +0300
Subject: [PATCH 01/16] fix: build dependencies

---
 requirements.txt | 1 -
 setup.cfg        | 4 ++++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index b6fb2ba..582540d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,6 @@ langchain-google-genai==1.0.1
 langchain-pinecone==0.1.0
 google.generativeai==0.4.1
 python-dotenv==1.0.1
-docx2txt==0.8
 python-docx==1.1.0
 markdown==3.6
 langchain-core==0.1.46
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index f33c5f5..2df8fd5 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -23,6 +23,10 @@ install_requires =
     langchain-google-genai==1.0.1
     langchain-pinecone==0.1.0
    google.generativeai==0.4.1
+    python-dotenv==1.0.1
+    python-docx==1.1.0
+    markdown==3.6
+    langchain-core==0.1.46
 
 package_dir=
     =src

From 8442c9bff519ef55c7faf8278f70601ce040d3f8 Mon Sep 17 00:00:00 2001
From: KevKibe
Date: Tue, 30 Apr 2024 14:41:47 +0300
Subject: [PATCH 02/16] add tests with TestCoherePineconeIndexer and tests for
 the retrieve_and_generate method

---
 src/tests/cohereindex_test.py | 89 +++++++++++++++++++++++++++++++++++
 src/tests/googleindex_test.py | 19 ++++++--
 src/tests/openaiindex_test.py | 19 ++++++--
 3 files changed, 121 insertions(+), 6 deletions(-)
 create mode 100644 src/tests/cohereindex_test.py

diff --git a/src/tests/cohereindex_test.py b/src/tests/cohereindex_test.py
new file mode 100644
index 0000000..14f0ccb
--- /dev/null
+++ b/src/tests/cohereindex_test.py
@@ -0,0 +1,89 @@
+import unittest
+from _cohere.doc_index import CoherePineconeIndexer
+import os
+from io import StringIO
+from unittest.mock import patch
+import pinecone
+from langchain_pinecone import PineconeVectorStore
+from dotenv import load_dotenv
+load_dotenv()
+
+class TestCoherePineconeIndexer(unittest.TestCase):
+    """
+    Test case class for CoherePineconeIndexer
+    """
+    def setUp(self) -> None:
+        self.index_name = "new-index-1"
+        self.pinecone_api_key = os.environ.get('PINECONE_API_KEY')
+        self.cohere_api_key = os.environ.get('COHERE_API_KEY')
+        self.indexer = CoherePineconeIndexer(self.index_name, self.pinecone_api_key, self.cohere_api_key)
+        return super().setUp()
+
+    @patch('sys.stdout', new_callable=StringIO)
+    def test_01_create_index(self, mock_stdout):
+        """
+        Test creating an index and assert the output.
+        """
+        self.indexer.create_index()
+        printed_output = mock_stdout.getvalue().strip()
+        lines = printed_output.split('\n')
+        index_created_message_0 = lines[0]
+        self.assertEqual(index_created_message_0, f"Creating index {self.index_name}")
+        index_created_message_1 = lines[1]
+        self.assertEqual(index_created_message_1, f"Index {self.index_name} created successfully!")
+
+    @patch('builtins.print')
+    def test_02_index_documents(self, mock_print):
+        """
+        Test indexing documents and assert the type of the index.
+        """
+        urls = [
+            "https://arxiv.org/pdf/1706.03762.pdf",
+            "src/tests/DOCX_TestPage.docx",
+            "src/tests/TEST.md",
+            "src/tests/test.html"
+        ]
+        self.indexer.index_documents(urls, batch_limit=10, chunk_size=256)
+        index = self.indexer.pc.Index(self.index_name)
+        self.assertIsInstance(index, pinecone.data.index.Index)
+
+    def test_03_initialize_vectorstore(self):
+        """
+        Test initializing the vector store and assert its type.
+        """
+        vectorstore = self.indexer.initialize_vectorstore(self.index_name)
+        self.assertIsInstance(vectorstore, PineconeVectorStore)
+
+    def test_04_retrieve_and_generate(self):
+        """
+        Test the retrieve_and_generate method and assert that it returns a response.
+        """
+        response = self.indexer.retrieve_and_generate(query = "what is the title of the document",
+                                                      index_name= self.index_name
+                                                      )
+        print(response)
+        self.assertIsNotNone(response, "The retriever response should not be None.")
+
+    @patch('sys.stdout', new_callable=StringIO)
+    def test_05_delete_index(self, mock_stdout):
+        """
+        Test deleting an index and assert the output.
+        """
+        self.indexer.delete_index()
+        printed_output = mock_stdout.getvalue().strip()
+        lines = printed_output.split('\n')
+        index_deleted_message_0 = lines[0]
+        self.assertEqual(index_deleted_message_0, f"Deleting index {self.index_name}")
+        index_deleted_message_1 = lines[1]
+        self.assertEqual(index_deleted_message_1, f"Index {self.index_name} deleted successfully!")
+
+    @classmethod
+    def sort_test_methods(cls, testCaseClass, testCaseNames):
+        """
+        Sort test methods for better readability.
+        """
+        return sorted(testCaseNames)
+
+if __name__ == "__main__":
+    unittest.TestLoader.sortTestMethodsUsing = TestCoherePineconeIndexer.sort_test_methods
+    unittest.main()
\ No newline at end of file
diff --git a/src/tests/googleindex_test.py b/src/tests/googleindex_test.py
index 23697e6..0cdec78 100644
--- a/src/tests/googleindex_test.py
+++ b/src/tests/googleindex_test.py
@@ -1,5 +1,5 @@
 import unittest
-from _google.docindex import GooglePineconeIndexer
+from _google.doc_index import GooglePineconeIndexer
 import os
 from io import StringIO
 from unittest.mock import patch
@@ -7,6 +7,8 @@
 from langchain_pinecone import PineconeVectorStore
 from dotenv import load_dotenv
 load_dotenv()
+
+
 class TestGooglePineconeIndexer(unittest.TestCase):
     """
     Test case class for the GooglePineconeIndexer.
@@ -20,7 +22,8 @@ def setUp(self):
         self.pinecone_api_key = os.environ.get('PINECONE_API_KEY')
         self.google_api_key = os.environ.get('GOOGLE_API_KEY')
         self.indexer = GooglePineconeIndexer(self.index_name, self.pinecone_api_key, self.google_api_key)
-
+        return super().setUp()
+
     @patch('sys.stdout', new_callable=StringIO)
     def test_01_create_index(self, mock_stdout):
         """
@@ -56,8 +59,18 @@ def test_03_initialize_vectorstore(self):
         vectorstore = self.indexer.initialize_vectorstore(self.index_name)
         self.assertIsInstance(vectorstore, PineconeVectorStore)
 
+    def test_04_retrieve_and_generate(self):
+        """
+        Test the retrieve_and_generate method and assert that it returns a response.
+        """
+        response = self.indexer.retrieve_and_generate(query = "give a short summary of the introduction",
+                                                      index_name= self.index_name
+                                                      )
+        print(response)
+        self.assertIsNotNone(response, "The retriever response should not be None.")
+
     @patch('sys.stdout', new_callable=StringIO)
-    def test_04_delete_index(self, mock_stdout):
+    def test_05_delete_index(self, mock_stdout):
         """
         Test deleting an index and assert the output.
         """
diff --git a/src/tests/openaiindex_test.py b/src/tests/openaiindex_test.py
index 9851975..4a59a88 100644
--- a/src/tests/openaiindex_test.py
+++ b/src/tests/openaiindex_test.py
@@ -1,5 +1,5 @@
 import unittest
-from _openai.docindex import OpenaiPineconeIndexer
+from _openai.doc_index import OpenaiPineconeIndexer
 import os
 from io import StringIO
 from unittest.mock import patch
@@ -7,6 +7,8 @@
 from langchain_pinecone import PineconeVectorStore
 from dotenv import load_dotenv
 load_dotenv()
+
+
 class TestOpenaiPineconeIndexer(unittest.TestCase):
     """
     Test case class for the OpenaiPineconeIndexer.
@@ -20,7 +22,8 @@ def setUp(self):
         self.pinecone_api_key = os.environ.get('PINECONE_API_KEY')
         self.openai_api_key = os.environ.get('OPENAI_API_KEY')
         self.indexer = OpenaiPineconeIndexer(self.index_name, self.pinecone_api_key, self.openai_api_key)
-
+        return super().setUp()
+
     @patch('sys.stdout', new_callable=StringIO)
     def test_01_create_index(self, mock_stdout):
         """
@@ -56,8 +59,18 @@ def test_03_initialize_vectorstore(self):
         vectorstore = self.indexer.initialize_vectorstore(self.index_name)
         self.assertIsInstance(vectorstore, PineconeVectorStore)
 
+    def test_04_retrieve_and_generate(self):
+        """
+        Test the retrieve_and_generate method and assert that it returns a response.
+        """
+        response = self.indexer.retrieve_and_generate(query = "give a short summary of the introduction",
+                                                      index_name= self.index_name
+                                                      )
+        print(response)
+        self.assertIsNotNone(response, "The retriever response should not be None.")
+
     @patch('sys.stdout', new_callable=StringIO)
-    def test_04_delete_index(self, mock_stdout):
+    def test_05_delete_index(self, mock_stdout):
         """
         Test deleting an index and assert the output.
         """

From be5540978d193a117dda5cc9754728f7b1c7fd52 Mon Sep 17 00:00:00 2001
From: KevKibe
Date: Tue, 30 Apr 2024 14:45:11 +0300
Subject: [PATCH 03/16] update dependencies

---
 requirements.txt | 3 ++-
 setup.cfg        | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 582540d..9560cb2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,4 +11,5 @@ google.generativeai==0.4.1
 python-dotenv==1.0.1
 python-docx==1.1.0
 markdown==3.6
-langchain-core==0.1.46
\ No newline at end of file
+langchain-core==0.1.46
+langchain-cohere>=0.1.4
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index 2df8fd5..05831ce 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -27,6 +27,7 @@ install_requires =
     python-docx==1.1.0
     markdown==3.6
     langchain-core==0.1.46
+    langchain-cohere==0.1.4
 
 package_dir=
     =src

From 70d98003dcc31e8b9b139648005449290d2eeebe Mon Sep 17 00:00:00 2001
From: KevKibe
Date: Tue, 30 Apr 2024 14:48:20 +0300
Subject: [PATCH 04/16] add indexing using Cohere embeddings module and CLI
 usage

---
 src/_cohere/__init__.py        |   0
 src/_cohere/config.py          |  12 ++
 src/_cohere/create_index.py    |  14 ++
 src/_cohere/delete_index.py    |  14 ++
 src/_cohere/doc_index.py       | 244 +++++++++++++++++++++++++++++++++
 src/_cohere/doc_model.py       |   8 ++
 src/_cohere/index_documents.py |  20 +++
 7 files changed, 312 insertions(+)
 create mode 100644 src/_cohere/__init__.py
 create mode 100644 src/_cohere/config.py
 create mode 100644 src/_cohere/create_index.py
 create mode 100644 src/_cohere/delete_index.py
 create mode 100644 src/_cohere/doc_index.py
 create mode 100644 src/_cohere/doc_model.py
 create mode 100644 src/_cohere/index_documents.py

diff --git a/src/_cohere/__init__.py b/src/_cohere/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/_cohere/config.py b/src/_cohere/config.py
new file mode 100644
index 0000000..13b30da
--- /dev/null
+++ b/src/_cohere/config.py
@@ -0,0 +1,12 @@
+class Config:
+    template_str = """
+    You are a very helpful assistant for question answering tasks. Use the pieces of retrieved context to answer the question given. If you do not know
+    the answer, just say that you do not know the answer instead of making up an answer.
+
+    Retrieved context: {context}
+    Query: {query}
+    """
+
+    default_google_model = "gemini-pro"
+    default_openai_model = "gpt-3.5-turbo-0125"
+    default_cohere_model = "command"
diff --git a/src/_cohere/create_index.py b/src/_cohere/create_index.py
new file mode 100644
index 0000000..37d750e
--- /dev/null
+++ b/src/_cohere/create_index.py
@@ -0,0 +1,14 @@
+from .doc_index import CoherePineconeIndexer
+import argparse
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Creates an index on Pinecone.")
+    parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
+    parser.add_argument("--index_name", type=str, help="Name of the Pinecone index")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    pinecone_indexer = CoherePineconeIndexer(args.index_name, args.pinecone_api_key)
+    pinecone_indexer.create_index()
diff --git a/src/_cohere/delete_index.py b/src/_cohere/delete_index.py
new file mode 100644
index 0000000..52a0bf4
--- /dev/null
+++ b/src/_cohere/delete_index.py
@@ -0,0 +1,14 @@
+from .doc_index import CoherePineconeIndexer
+import argparse
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Deletes an index on Pinecone.")
+    parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
+    parser.add_argument("--index_name", type=str, help="Name of the Pinecone index")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    pinecone_indexer = CoherePineconeIndexer(args.index_name, args.pinecone_api_key)
+    pinecone_indexer.delete_index()
diff --git a/src/_cohere/doc_index.py b/src/_cohere/doc_index.py
new file mode 100644
index 0000000..82d8685
--- /dev/null
+++ b/src/_cohere/doc_index.py
@@ -0,0 +1,244 @@
+from pinecone import Pinecone, PodSpec
+from tqdm.auto import tqdm
+from uuid import uuid4
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+import tiktoken
+from typing import List
+from _openai.doc_model import Page
+from langchain_pinecone import PineconeVectorStore
+from pathlib import Path
+from langchain_community.document_loaders import UnstructuredWordDocumentLoader
+from langchain_community.document_loaders import UnstructuredMarkdownLoader
+from langchain_community.document_loaders import UnstructuredHTMLLoader
+from langchain_cohere import CohereEmbeddings
+from langchain_community.llms import Cohere
+from langchain_core.prompts import PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from operator import itemgetter
+from _cohere.config import Config
+
+class CoherePineconeIndexer:
+    """
+    Class for indexing documents to Pinecone using Cohere embeddings.
+    """
+    def __init__(
+        self,
+        index_name: str,
+        pinecone_api_key: str = None,
+        cohere_api_key: str = None
+    ) -> None:
+        """
+        Initialize the CoherePineconeIndexer object.
+
+        Args:
+            index_name (str): Name of the Pinecone index.
+            pinecone_api_key (str): Pinecone API key.
+            cohere_api_key (str): Cohere API key.
+        """
+        self.pc = Pinecone(api_key=pinecone_api_key)
+        self.index_name = index_name
+        self.cohere_api_key = cohere_api_key
+        self.tokenizer = tiktoken.get_encoding('p50k_base')
+
+    def create_index(self, environment: str = "us-west1-gcp"):
+        """
+        Creates an index with the specified parameters.
+
+        Args:
+            environment (str, optional): The environment where the index will be created. Defaults to "us-west1-gcp".
+
+        Returns:
+            None
+        """
+        print(f"Creating index {self.index_name}")
+        self.pc.create_index(
+            name=self.index_name,
+            dimension=768,
+            metric="cosine",
+            spec=PodSpec(
+                environment=environment,
+                pod_type="p1.x1",
+                pods=1
+            )
+        )
+        return print(f"Index {self.index_name} created successfully!")
+
+    def delete_index(self):
+        """
+        Deletes the created index.
+
+        Returns:
+            None
+        """
+        print(f"Deleting index {self.index_name}")
+        self.pc.delete_index(self.index_name)
+        return print(f"Index {self.index_name} deleted successfully!")
+
+    def load_document(self, file_url: str) -> List[str]:
+        """
+        Load a document from a given file URL and split it into pages.
+
+        This method supports loading documents in various formats including PDF, DOCX, DOC, Markdown, and HTML.
+        It uses the appropriate loader for each file type to load the document and split it into pages.
+
+        Args:
+            file_url (str): The URL of the file to be loaded.
+
+        Returns:
+            List[str]: A list of strings, where each string represents a page from the loaded document.
+
+        Raises:
+            ValueError: If the file type is not supported or recognized.
+        """
+        pages = []
+        file_path = Path(file_url)
+
+        file_extension = file_path.suffix
+        if file_extension == ".pdf":
+            loader = PyPDFLoader(file_url)
+            pages = loader.load_and_split()
+        elif file_extension in ('.docx', '.doc'):
+            loader = UnstructuredWordDocumentLoader(file_url)
+            pages = loader.load_and_split()
+        elif file_extension == '.md':
+            loader = UnstructuredMarkdownLoader(file_url)
+            pages = loader.load_and_split()
+        elif file_extension == '.html':
+            loader = UnstructuredHTMLLoader(file_url)
+            pages = loader.load_and_split()
+        return pages
+
+    def tiktoken_len(self, text: str) -> int:
+        """
+        Calculate length of text in tokens.
+
+        Parameters:
+            text (str): Input text.
+
+        Returns:
+            int: Length of text in tokens.
+        """
+        tokens = self.tokenizer.encode(
+            text,
+            disallowed_special=()
+        )
+        return len(tokens)
+
+    def embed(self) -> CohereEmbeddings:
+        """
+        Initialize a CohereEmbeddings object for embedding text.
+
+        Returns:
+            CohereEmbeddings: The embeddings object, configured with the multilingual Cohere model.
+        """
+        return CohereEmbeddings(model="embed-multilingual-v2.0",
+                                cohere_api_key=self.cohere_api_key)
+
+    def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size: int = 128) -> None:
+        """
+        Upsert documents into the Pinecone index.
+
+        Args:
+            documents (List[Page]): List of documents to upsert.
+            batch_limit (int): Maximum batch size for upsert operation.
+            chunk_size (int): Size of texts per chunk.
+
+        Returns:
+            None
+        """
+        texts = []
+        metadatas = []
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=int(chunk_size),
+            chunk_overlap=20,
+            length_function=self.tiktoken_len,
+            separators=["\n\n", "\n", " ", ""]
+        )
+        for i, record in enumerate(tqdm(documents)):
+            metadata = {
+                'content': record.page_content,
+                'source': record.page,
+                'title': record.source
+            }
+            record_texts = text_splitter.split_text(record.page_content)
+            record_metadatas = [{
+                "chunk": j, "text": text, **metadata
+            } for j, text in enumerate(record_texts)]
+            texts.extend(record_texts)
+            metadatas.extend(record_metadatas)
+            if len(texts) >= batch_limit:
+                ids = [str(uuid4()) for _ in range(len(texts))]
+                embed = self.embed()
+                embeds = embed.embed_documents(texts)
+                index = self.pc.Index(self.index_name)
+                index.upsert(vectors=zip(ids, embeds, metadatas), async_req=True)
+                texts = []
+                metadatas = []
+
+    def index_documents(self, urls: List[str], batch_limit: int = 32, chunk_size: int = 256) -> None:
+        """
+        Process a list of URLs and upsert documents to a Pinecone index.
+
+        Args:
+            urls (List[str]): List of URLs to process.
+            batch_limit (int): Batch limit for upserting documents.
+            chunk_size (int): Size of texts per chunk.
+
+        Returns:
+            None
+        """
+        for url in tqdm(urls, desc="Processing URLs"):
+            print(f"Processing URL: {url}")
+            pages = self.load_document(url)
+            print(f"Found {len(pages)} pages in the document.")
+            pages_data = [
+                Page(
+                    page_content=page.page_content,
+                    metadata=page.metadata,
+                    page=page.metadata.get("page", 0),
+                    source=page.metadata.get("source")
+                )
+                for page in pages
+            ]
+
+            print(f"Upserting {len(pages_data)} pages to the Pinecone index...")
+            self.upsert_documents(pages_data, batch_limit, chunk_size)
+            print("Finished upserting documents for this URL.")
+        index = self.pc.Index(self.index_name)
+        print(index.describe_index_stats())
+        print("Indexing complete.")
+
+    def initialize_vectorstore(self, index_name):
+        index = self.pc.Index(index_name)
+        embed = CohereEmbeddings(model="embed-multilingual-v2.0",
+                                 cohere_api_key=self.cohere_api_key)
+        vectorstore = PineconeVectorStore(index, embed, "text")
+        return vectorstore
+
+    def retrieve_and_generate(self, query: str, index_name: str, model_name: str = 'command', top_k: int = 5):
+        """
+        Retrieve documents from the Pinecone index and generate a response.
+
+        Args:
+            query: The query from the user
+            index_name: The name of the Pinecone index
+            model_name: The name of the model to use : defaults to 'command'
+            top_k: The number of documents to retrieve from the index : defaults to 5
+        """
+        llm = Cohere(model=model_name, cohere_api_key=self.cohere_api_key)
+        rag_prompt = PromptTemplate(template=Config.template_str, input_variables=["query", "context"])
+
+        vector_store = self.initialize_vectorstore(index_name)
+        retriever = vector_store.as_retriever(search_kwargs={"k": top_k})
+        rag_chain = (
+            {"context": itemgetter("query") | retriever,
+             "query": itemgetter("query"),
+            }
+            | rag_prompt
+            | llm
+            | StrOutputParser()
+        )
+
+        return rag_chain.invoke({"query": query})
diff --git a/src/_cohere/doc_model.py b/src/_cohere/doc_model.py
new file mode 100644
index 0000000..da57dc4
--- /dev/null
+++ b/src/_cohere/doc_model.py
@@ -0,0 +1,8 @@
+from pydantic import BaseModel, Field
+from typing import Dict, Union
+
+class Page(BaseModel):
+    page_content: str = Field(..., description="The content of the page")
+    metadata: Dict[str, Union[str, int]] = Field(..., description="Metadata about the document")
+    page: int = Field(..., description="The page of the content")
+    source: Union[str, int] = Field(..., description="The source url of the document")
\ No newline at end of file
diff --git a/src/_cohere/index_documents.py b/src/_cohere/index_documents.py
new file mode 100644
index 0000000..ff9d54c
--- /dev/null
+++ b/src/_cohere/index_documents.py
@@ -0,0 +1,20 @@
+from .doc_index import CoherePineconeIndexer
+import argparse
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Index documents on Pinecone using Cohere embeddings.")
+    parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
+    parser.add_argument("--index_name", type=str, help="Name of the Pinecone index")
+    parser.add_argument("--cohere_api_key", type=str, help="Cohere API key")
+    parser.add_argument("--docs", nargs="+", help="URLs of the documents to be indexed")
+
+    parser.add_argument("--batch_limit", type=int, default=32, help="Maximum batch size for indexing (default: 32).")
+    parser.add_argument("--chunk_size", type=int, default=256, help="Size of texts per chunk (default: 256 tokens).")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    pinecone_indexer = CoherePineconeIndexer(args.index_name, args.pinecone_api_key, args.cohere_api_key)
+    pinecone_indexer.index_documents(args.docs, args.batch_limit, args.chunk_size)
+    pinecone_indexer.initialize_vectorstore(args.index_name)

From d6fd508f2f0a9583fac456664bfde28b70000d81 Mon Sep 17 00:00:00 2001
From: KevKibe
Date: Tue, 30 Apr 2024 14:50:25 +0300
Subject: [PATCH 05/16] refactor _openai module for better naming; add new
 config.py for templating

---
 src/_openai/config.py          |  12 ++
 src/_openai/create_index.py    |   2 +-
 src/_openai/delete_index.py    |   2 +-
 src/_openai/doc_index.py       | 283 +++++++++++++++++++++++++++++++--
 src/_openai/docindex.py        | 272 -------------------------------
 src/_openai/index_documents.py |  21 +++
 6 files changed, 302 insertions(+), 290 deletions(-)
 create mode 100644 src/_openai/config.py
 delete mode 100644 src/_openai/docindex.py
 create mode 100644 src/_openai/index_documents.py

diff --git a/src/_openai/config.py b/src/_openai/config.py
new file mode 100644
index 0000000..13b30da
--- /dev/null
+++ b/src/_openai/config.py
@@ -0,0 +1,12 @@
+class Config:
+    template_str = """
+    You are a very helpful assistant for question
answering tasks. Use the pieces of retrieved context to answer question given. If you do not know + the answer, Just say that you do not know the answer instead of making up an answer. + + Retrieved context: {context} + Query: {query} + """ + + default_google_model = "gemini-pro" + default_openai_model = "gpt-3.5-turbo-0125" + default_cohere_model = "command" diff --git a/src/_openai/create_index.py b/src/_openai/create_index.py index 76447eb..3da40b6 100644 --- a/src/_openai/create_index.py +++ b/src/_openai/create_index.py @@ -1,4 +1,4 @@ -from .docindex import OpenaiPineconeIndexer +from .doc_index import OpenaiPineconeIndexer import argparse def parse_args(): diff --git a/src/_openai/delete_index.py b/src/_openai/delete_index.py index ba89bd4..d4f4539 100644 --- a/src/_openai/delete_index.py +++ b/src/_openai/delete_index.py @@ -1,4 +1,4 @@ -from .docindex import OpenaiPineconeIndexer +from .doc_index import OpenaiPineconeIndexer import argparse def parse_args(): diff --git a/src/_openai/doc_index.py b/src/_openai/doc_index.py index d7c9493..c0f1f4e 100644 --- a/src/_openai/doc_index.py +++ b/src/_openai/doc_index.py @@ -1,21 +1,272 @@ -from .docindex import OpenaiPineconeIndexer -import argparse +from pinecone import Pinecone, PodSpec +from tqdm.auto import tqdm +from uuid import uuid4 +from langchain_community.document_loaders import PyPDFLoader +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_openai import OpenAIEmbeddings +import tiktoken +from typing import List +from .doc_model import Page +from pathlib import Path +from langchain_community.document_loaders import UnstructuredWordDocumentLoader +from langchain_community.document_loaders import UnstructuredMarkdownLoader +from langchain_community.document_loaders import UnstructuredHTMLLoader +from langchain_pinecone import PineconeVectorStore +from langchain_core.prompts import PromptTemplate +from langchain_core.output_parsers import StrOutputParser +from operator import itemgetter +from langchain_openai import ChatOpenAI +from _openai.config import Config + + + +class OpenaiPineconeIndexer: + """ + Class for indexing documents to Pinecone using OpenAI embeddings. + """ + def __init__( + self, + index_name: str, + pinecone_api_key: str, + openai_api_key: str + ) -> None: + """ + Initialize the OpenAIPineconeIndexer object. + + Args: + index_name (str): Name of the Pinecone index. + pinecone_api_key (str): Pinecone API key. + environment (str): Environment for Pinecone service. + openai_api_key (str): OpenAI API key. + """ + self.pc = Pinecone(api_key=pinecone_api_key) + self.index_name = index_name + self.openai_api_key = openai_api_key + self.tokenizer = tiktoken.get_encoding('p50k_base') + + def create_index(self, environment: str = "us-west1-gcp" ): + """ + Creates an index with the specified parameters. + + Args: + environment (str, optional): The environment where the index will be created. Defaults to "us-west1-gcp". + + Returns: + None + """ + print(f"Creating index {self.index_name}") + self.pc.create_index( + name=self.index_name, + dimension=1536, + metric="cosine", + spec=PodSpec( + environment=environment, + pod_type="p1.x1", + pods=1 + ) + ) + return print(f"Index {self.index_name} created successfully!") + + + def delete_index(self): + """ + Deletes the created index. 
+ + Returns: + None + """ + print(f"Deleting index {self.index_name}") + self.pc.delete_index(self.index_name) + return print(f"Index {self.index_name} deleted successfully!") + + + def load_document(self, file_url: str) -> List[str]: + """ + Load a document from a given file URL and split it into pages. + + This method supports loading documents in various formats including PDF, DOCX, DOC, Markdown, and HTML. + It uses the appropriate loader for each file type to load the document and split it into pages. + + Args: + file_url (str): The URL of the file to be loaded. + + Returns: + List[str]: A list of strings, where each string represents a page from the loaded document. + + Raises: + ValueError: If the file type is not supported or recognized. + """ + pages = [] + file_path = Path(file_url) + + # Determine file type and use the appropriate loader + file_extension = file_path.suffix + + # Load and split PDF files + if file_extension == ".pdf": + loader = PyPDFLoader(file_url) + pages = loader.load_and_split() + + # Load and split DOCX and DOC files + elif file_extension in ('.docx', '.doc'): + loader = UnstructuredWordDocumentLoader(file_url) + pages = loader.load_and_split() + + # Load and split Markdown files + elif file_extension == '.md': + loader = UnstructuredMarkdownLoader(file_url) + pages = loader.load_and_split() + + # Load and split HTML files + elif file_extension == '.html': + loader = UnstructuredHTMLLoader(file_url) + pages = loader.load_and_split() + + # Return the list of pages + return pages + + + def tiktoken_len(self, text: str) -> int: + """ + Calculate length of text in tokens. + + Parameters: + text (str): Input text. + + Returns: + int: Length of text in tokens. + """ + tokens = self.tokenizer.encode( + text, + disallowed_special=() + ) + return len(tokens) + + def embed(self) -> OpenAIEmbeddings: + """ + Initialize OpenAIEmbeddings object. + + Returns: + OpenAIEmbeddings: OpenAIEmbeddings object. + """ + return OpenAIEmbeddings( + openai_api_key=self.openai_api_key + ) + + + def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size: int = 256) -> None: + """ + Upsert documents into the Pinecone index. + + Args: + documents (List[Page]): List of documents to upsert. + batch_limit (int): Maximum batch size for upsert operation. + chunks_size(int): size of texts per chunk. + + Returns: + None + """ + texts = [] + metadatas = [] + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=int(chunk_size), + chunk_overlap=20, + length_function=self.tiktoken_len, + separators=["\n\n", "\n", " ", ""] + ) + embed = self.embed() + for i, record in enumerate(tqdm(documents)): + metadata = { + 'content': record.page_content, + 'source': record.page, + 'title': record.source + } + record_texts = text_splitter.split_text(record.page_content) + record_metadatas = [{ + "chunk": j, "text": text, **metadata + } for j, text in enumerate(record_texts)] + texts.extend(record_texts) + metadatas.extend(record_metadatas) + if len(texts) >= batch_limit: + ids = [str(uuid4()) for _ in range(len(texts))] + embeds = embed.embed_documents(texts) + index = self.pc.Index(self.index_name) + index.upsert(vectors=zip(ids, embeds, metadatas), async_req=True) + texts = [] + metadatas = [] + + + def index_documents(self, urls: List[str], batch_limit: int, chunk_size: int = 256) -> None: + """ + Process a list of URLs and upsert documents to a Pinecone index. + + Args: + urls (List[str]): List of URLs to process. + batch_limit (int): Batch limit for upserting documents. 
+ chunks_size(int): size of texts per chunk. + + Returns: + None + """ + for url in tqdm(urls, desc="Processing URLs"): + print(f"Processing URL: {url}") + pages = self.load_document(url) + print(f"Found {len(pages)} pages in the PDF.") + pages_data = [ + Page( + page_content=page.page_content, + metadata=page.metadata, + page=page.metadata.get("page", 0), + source=page.metadata.get("source") + ) + for page in pages + ] + + print(f"Upserting {len(pages_data)} pages to the Pinecone index...") + self.upsert_documents(pages_data, batch_limit, chunk_size) + print("Finished upserting documents for this URL.") + index = self.pc.Index(self.index_name) + print(index.describe_index_stats()) + print("Indexing complete.") + return index + + def initialize_vectorstore(self, index_name): + index = self.pc.Index(index_name) + embed = OpenAIEmbeddings( + model = 'text-embedding-ada-002', + openai_api_key = self.openai_api_key + ) + vectorstore = PineconeVectorStore(index, embed, "text") + return vectorstore + + + def retrieve_and_generate(self,query: str, index_name: str, model_name: str = 'gpt-3.5-turbo-1106', top_k: int =5): + """ + Retrieve documents from the Pinecone index and generate a response. + Args: + query: The query from the user + index_name: The name of the Pinecone index + model_name: The name of the model to use : defaults to 'gpt-3.5-turbo-1106' + top_k: The number of documents to retrieve from the index : defaults to 5 + """ + llm = ChatOpenAI(model = Config.default_openai_model, openai_api_key = self.openai_api_key) + rag_prompt = PromptTemplate(template = Config.template_str, input_variables = ["query", "context"]) + + vector_store = self.initialize_vectorstore(index_name) + retriever = vector_store.as_retriever(search_kwargs = {"k": top_k}) + rag_chain = ( + {"context": itemgetter("query")| retriever, + "query": itemgetter("query"), + } + | rag_prompt + | llm + | StrOutputParser() + ) + + return rag_chain.invoke({"query": query}) + -def parse_args(): - parser = argparse.ArgumentParser(description="Index documents on Pinecone using OpenAI embeddings.") - parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key") - parser.add_argument("--index_name", type=str, help="Name of the Pinecone index") - parser.add_argument("--openai_api_key", type=str, help="OpenAI API key") - parser.add_argument("--batch_limit", type=int, help="Maximum batch size for indexing") - parser.add_argument("--docs", nargs="+", help="URLs of the documents to be indexed") - parser.add_argument("--chunk_size", help="size of texts per chunk") - return parser.parse_args() -if __name__ == "__main__": - args = parse_args() - pinecone_indexer = OpenaiPineconeIndexer(args.index_name, args.pinecone_api_key, args.openai_api_key) - pinecone_indexer.index_documents(args.docs, args.batch_limit, args.chunk_size) - \ No newline at end of file diff --git a/src/_openai/docindex.py b/src/_openai/docindex.py deleted file mode 100644 index 08af970..0000000 --- a/src/_openai/docindex.py +++ /dev/null @@ -1,272 +0,0 @@ -from pinecone import Pinecone, PodSpec -from tqdm.auto import tqdm -from uuid import uuid4 -from langchain_community.document_loaders import PyPDFLoader -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain_openai import OpenAIEmbeddings -import tiktoken -from typing import List -from .doc_model import Page -from pathlib import Path -from langchain_community.document_loaders import UnstructuredWordDocumentLoader -from langchain_community.document_loaders import 
UnstructuredMarkdownLoader -from langchain_community.document_loaders import UnstructuredHTMLLoader -from langchain_pinecone import PineconeVectorStore -from langchain_core.prompts import PromptTemplate -from langchain_core.output_parsers import StrOutputParser -from operator import itemgetter -from langchain_openai import ChatOpenAI -from src.config import Config - - - -class OpenaiPineconeIndexer: - """ - Class for indexing documents to Pinecone using OpenAI embeddings. - """ - def __init__( - self, - index_name: str, - pinecone_api_key: str, - openai_api_key: str - ) -> None: - """ - Initialize the OpenAIPineconeIndexer object. - - Args: - index_name (str): Name of the Pinecone index. - pinecone_api_key (str): Pinecone API key. - environment (str): Environment for Pinecone service. - openai_api_key (str): OpenAI API key. - """ - self.pc = Pinecone(api_key=pinecone_api_key) - self.index_name = index_name - self.openai_api_key = openai_api_key - self.tokenizer = tiktoken.get_encoding('p50k_base') - - def create_index(self, environment: str = "us-west1-gcp" ): - """ - Creates an index with the specified parameters. - - Args: - environment (str, optional): The environment where the index will be created. Defaults to "us-west1-gcp". - - Returns: - None - """ - print(f"Creating index {self.index_name}") - self.pc.create_index( - name=self.index_name, - dimension=1536, - metric="cosine", - spec=PodSpec( - environment=environment, - pod_type="p1.x1", - pods=1 - ) - ) - return print(f"Index {self.index_name} created successfully!") - - - def delete_index(self): - """ - Deletes the created index. - - Returns: - None - """ - print(f"Deleting index {self.index_name}") - self.pc.delete_index(self.index_name) - return print(f"Index {self.index_name} deleted successfully!") - - - def load_document(self, file_url: str) -> List[str]: - """ - Load a document from a given file URL and split it into pages. - - This method supports loading documents in various formats including PDF, DOCX, DOC, Markdown, and HTML. - It uses the appropriate loader for each file type to load the document and split it into pages. - - Args: - file_url (str): The URL of the file to be loaded. - - Returns: - List[str]: A list of strings, where each string represents a page from the loaded document. - - Raises: - ValueError: If the file type is not supported or recognized. - """ - pages = [] - file_path = Path(file_url) - - # Determine file type and use the appropriate loader - file_extension = file_path.suffix - - # Load and split PDF files - if file_extension == ".pdf": - loader = PyPDFLoader(file_url) - pages = loader.load_and_split() - - # Load and split DOCX and DOC files - elif file_extension in ('.docx', '.doc'): - loader = UnstructuredWordDocumentLoader(file_url) - pages = loader.load_and_split() - - # Load and split Markdown files - elif file_extension == '.md': - loader = UnstructuredMarkdownLoader(file_url) - pages = loader.load_and_split() - - # Load and split HTML files - elif file_extension == '.html': - loader = UnstructuredHTMLLoader(file_url) - pages = loader.load_and_split() - - # Return the list of pages - return pages - - - def tiktoken_len(self, text: str) -> int: - """ - Calculate length of text in tokens. - - Parameters: - text (str): Input text. - - Returns: - int: Length of text in tokens. - """ - tokens = self.tokenizer.encode( - text, - disallowed_special=() - ) - return len(tokens) - - def embed(self) -> OpenAIEmbeddings: - """ - Initialize OpenAIEmbeddings object. 
- - Returns: - OpenAIEmbeddings: OpenAIEmbeddings object. - """ - return OpenAIEmbeddings( - openai_api_key=self.openai_api_key - ) - - - def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size: int = 256) -> None: - """ - Upsert documents into the Pinecone index. - - Args: - documents (List[Page]): List of documents to upsert. - batch_limit (int): Maximum batch size for upsert operation. - chunks_size(int): size of texts per chunk. - - Returns: - None - """ - texts = [] - metadatas = [] - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=int(chunk_size), - chunk_overlap=20, - length_function=self.tiktoken_len, - separators=["\n\n", "\n", " ", ""] - ) - embed = self.embed() - for i, record in enumerate(tqdm(documents)): - metadata = { - 'content': record.page_content, - 'source': record.page, - 'title': record.source - } - record_texts = text_splitter.split_text(record.page_content) - record_metadatas = [{ - "chunk": j, "text": text, **metadata - } for j, text in enumerate(record_texts)] - texts.extend(record_texts) - metadatas.extend(record_metadatas) - if len(texts) >= batch_limit: - ids = [str(uuid4()) for _ in range(len(texts))] - embeds = embed.embed_documents(texts) - index = self.pc.Index(self.index_name) - index.upsert(vectors=zip(ids, embeds, metadatas), async_req=True) - texts = [] - metadatas = [] - - - def index_documents(self, urls: List[str], batch_limit: int, chunk_size: int = 256) -> None: - """ - Process a list of URLs and upsert documents to a Pinecone index. - - Args: - urls (List[str]): List of URLs to process. - batch_limit (int): Batch limit for upserting documents. - chunks_size(int): size of texts per chunk. - - Returns: - None - """ - for url in tqdm(urls, desc="Processing URLs"): - print(f"Processing URL: {url}") - pages = self.load_document(url) - print(f"Found {len(pages)} pages in the PDF.") - pages_data = [ - Page( - page_content=page.page_content, - metadata=page.metadata, - page=page.metadata.get("page", 0), - source=page.metadata.get("source") - ) - for page in pages - ] - - print(f"Upserting {len(pages_data)} pages to the Pinecone index...") - self.upsert_documents(pages_data, batch_limit, chunk_size) - print("Finished upserting documents for this URL.") - index = self.pc.Index(self.index_name) - print(index.describe_index_stats()) - print("Indexing complete.") - return index - - def initialize_vectorstore(self, index_name): - index = self.pc.Index(index_name) - embed = OpenAIEmbeddings( - model = 'text-embedding-ada-002', - openai_api_key = self.openai_api_key - ) - vectorstore = PineconeVectorStore(index, embed, "text") - return vectorstore - - - def retrieve_and_generate(self,query: str, index_name: str, model_name: str = 'gpt-3.5-turbo-1106', top_k: int =5): - """ - Retrieve documents from the Pinecone index and generate a response. 
- Args: - query: The query from the user - index_name: The name of the Pinecone index - model_name: The name of the model to use : defaults to 'gpt-3.5-turbo-1106' - top_k: The number of documents to retrieve from the index : defaults to 5 - """ - llm = ChatOpenAI(model = Config.default_openai_model, openai_api_key = self.openai_api_key) - rag_prompt = PromptTemplate(template = Config.template_str, input_variables = ["query", "context"]) - - vector_store = self.initialize_vectorstore(index_name) - retriever = vector_store.as_retriver(search_kwargs = {"k": top_k}) - rag_chain = ( - {"context": itemgetter("query")| retriever, - "query": itemgetter("query"), - } - | rag_prompt - | llm - | StrOutputParser() - ) - - return rag_chain.invoke({"query": query}) - - - - - - diff --git a/src/_openai/index_documents.py b/src/_openai/index_documents.py new file mode 100644 index 0000000..757bd61 --- /dev/null +++ b/src/_openai/index_documents.py @@ -0,0 +1,21 @@ +from .doc_index import OpenaiPineconeIndexer +import argparse + +def parse_args(): + parser = argparse.ArgumentParser(description="Index documents on Pinecone using OpenAI embeddings.") + parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key") + parser.add_argument("--index_name", type=str, help="Name of the Pinecone index") + parser.add_argument("--openai_api_key", type=str, help="OpenAI API key") + parser.add_argument("--batch_limit", default = 20, type=int, help="Maximum batch size for indexing") + parser.add_argument("--docs", nargs="+", help="URLs of the documents to be indexed") + parser.add_argument("--chunk_size", default = 256, help="size of texts per chunk") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + pinecone_indexer = OpenaiPineconeIndexer(args.index_name, args.pinecone_api_key, args.openai_api_key) + pinecone_indexer.index_documents(args.docs, args.batch_limit, args.chunk_size) + + + \ No newline at end of file From a88393398a05baf5ac89d9be7161b5591d160089 Mon Sep 17 00:00:00 2001 From: KevKibe Date: Tue, 30 Apr 2024 14:54:27 +0300 Subject: [PATCH 06/16] refactor _google module for better naming add new config file for templating --- src/{ => _google}/config.py | 0 src/_google/create_index.py | 2 +- src/_google/delete_index.py | 2 +- src/_google/doc_index.py | 285 ++++++++++++++++++++++++++++++--- src/_google/docindex.py | 267 ------------------------------ src/_google/index_documents.py | 18 +++ 6 files changed, 287 insertions(+), 287 deletions(-) rename src/{ => _google}/config.py (100%) delete mode 100644 src/_google/docindex.py create mode 100644 src/_google/index_documents.py diff --git a/src/config.py b/src/_google/config.py similarity index 100% rename from src/config.py rename to src/_google/config.py diff --git a/src/_google/create_index.py b/src/_google/create_index.py index 88a4999..e912942 100644 --- a/src/_google/create_index.py +++ b/src/_google/create_index.py @@ -1,4 +1,4 @@ -from .docindex import GooglePineconeIndexer +from .doc_index import GooglePineconeIndexer import argparse def parse_args(): diff --git a/src/_google/delete_index.py b/src/_google/delete_index.py index 6ee782f..ac15105 100644 --- a/src/_google/delete_index.py +++ b/src/_google/delete_index.py @@ -1,4 +1,4 @@ -from .docindex import GooglePineconeIndexer +from .doc_index import GooglePineconeIndexer import argparse def parse_args(): diff --git a/src/_google/doc_index.py b/src/_google/doc_index.py index 6e4fa9d..71e49f8 100644 --- a/src/_google/doc_index.py +++ 
b/src/_google/doc_index.py @@ -1,18 +1,267 @@ -from .docindex import GooglePineconeIndexer -import argparse - -def parse_args(): - parser = argparse.ArgumentParser(description="Index documents on Pinecone using OpenAI embeddings.") - parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key") - parser.add_argument("--index_name", type=str, help="Name of the Pinecone index") - parser.add_argument("--google_api_key", type=str, help="OpenAI API key") - parser.add_argument("--batch_limit", type=int, help="Maximum batch size for indexing") - parser.add_argument("--docs", nargs="+", help="URLs of the documents to be indexed") - parser.add_argument("--chunk_size", help="size of texts per chunk") - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - pinecone_indexer = GooglePineconeIndexer(args.index_name, args.pinecone_api_key, args.google_api_key) - pinecone_indexer.index_documents(args.docs, args.batch_limit, args.chunk_size) +from pinecone import Pinecone, PodSpec +from tqdm.auto import tqdm +from uuid import uuid4 +from langchain_community.document_loaders import PyPDFLoader +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_google_genai import GoogleGenerativeAIEmbeddings +import tiktoken +from typing import List +from _openai.doc_model import Page +import google.generativeai as genai +from pathlib import Path +from langchain_community.document_loaders import UnstructuredWordDocumentLoader +from langchain_community.document_loaders import UnstructuredMarkdownLoader +from langchain_community.document_loaders import UnstructuredHTMLLoader +from langchain_pinecone import PineconeVectorStore +from langchain_core.prompts import PromptTemplate +from langchain_core.output_parsers import StrOutputParser +from operator import itemgetter +from langchain_google_genai import ChatGoogleGenerativeAI +from _google.config import Config + + +class GooglePineconeIndexer: + """ + Class for indexing documents to Pinecone using GoogleGenerativeAIEmbeddings embeddings. + """ + def __init__( + self, + index_name: str, + pinecone_api_key: str, + google_api_key: str + ) -> None: + """ + Initialize the GoogleGenerativeAIEmbeddings object. + + Args: + index_name (str): Name of the Pinecone index. + pinecone_api_key (str): Pinecone API key. + environment (str): Environment for Pinecone service. + google_api_key (str): Google API key. + """ + self.pc = Pinecone(api_key=pinecone_api_key) + self.index_name = index_name + self.google_api_key = google_api_key + self.tokenizer = tiktoken.get_encoding('p50k_base') + + def create_index(self, environment: str = "us-west1-gcp" ): + """ + Creates an index with the specified parameters. + + Args: + environment (str, optional): The environment where the index will be created. Defaults to "us-west1-gcp". + + Returns: + None + """ + print(f"Creating index {self.index_name}") + self.pc.create_index( + name=self.index_name, + dimension=768, + metric="cosine", + spec=PodSpec( + environment=environment, + pod_type="p1.x1", + pods=1 + ) + ) + return print(f"Index {self.index_name} created successfully!") + + def delete_index(self): + """ + Deletes the created index. + + Returns: + None + """ + print(f"Deleting index {self.index_name}") + self.pc.delete_index(self.index_name) + return print(f"Index {self.index_name} deleted successfully!") + + + def load_document(self, file_url: str) -> List[str]: + """ + Load a document from a given file URL and split it into pages. 
+ + This method supports loading documents in various formats including PDF, DOCX, DOC, Markdown, and HTML. + It uses the appropriate loader for each file type to load the document and split it into pages. + + Args: + file_url (str): The URL of the file to be loaded. + + Returns: + List[str]: A list of strings, where each string represents a page from the loaded document. + + Raises: + ValueError: If the file type is not supported or recognized. + """ + pages = [] + file_path = Path(file_url) + + # Determine file type and use the appropriate loader + file_extension = file_path.suffix + + # Load and split PDF files + if file_extension == ".pdf": + loader = PyPDFLoader(file_url) + pages = loader.load_and_split() + + # Load and split DOCX and DOC files + elif file_extension in ('.docx', '.doc'): + loader = UnstructuredWordDocumentLoader(file_url) + pages = loader.load_and_split() + + # Load and split Markdown files + elif file_extension == '.md': + loader = UnstructuredMarkdownLoader(file_url) + pages = loader.load_and_split() + + # Load and split HTML files + elif file_extension == '.html': + loader = UnstructuredHTMLLoader(file_url) + pages = loader.load_and_split() + + # Return the list of pages + return pages + + def tiktoken_len(self, text: str) -> int: + """ + Calculate length of text in tokens. + + Parameters: + text (str): Input text. + + Returns: + int: Length of text in tokens. + """ + tokens = self.tokenizer.encode( + text, + disallowed_special=() + ) + return len(tokens) + + def embed(self, sample_text: str) -> GoogleGenerativeAIEmbeddings: + """ + Embeds the given sample text using Google's Generative AI. + + Args: + sample_text (str): The text to be embedded. + + Returns: + GoogleGenerativeAIEmbeddings: An object containing the embedded content. + """ + genai.configure(api_key=self.google_api_key) + return genai.embed_content( + model='models/embedding-001', + content=sample_text, + task_type="retrieval_document" + ) + + def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size: int = 256) -> None: + """ + Upsert documents into the Pinecone index. + + Args: + documents (List[Page]): List of documents to upsert. + batch_limit (int): Maximum batch size for upsert operation. + chunks_size(int): size of texts per chunk. + + Returns: + None + """ + texts = [] + metadatas = [] + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=int(chunk_size), + chunk_overlap=20, + length_function=self.tiktoken_len, + separators=["\n\n", "\n", " ", ""] + ) + for i, record in enumerate(tqdm(documents)): + metadata = { + 'content': record.page_content, + 'source': record.page, + 'title': record.source + } + record_texts = text_splitter.split_text(record.page_content) + record_metadatas = [{ + "chunk": j, "text": text, **metadata + } for j, text in enumerate(record_texts)] + texts.extend(record_texts) + metadatas.extend(record_metadatas) + if len(texts) >= batch_limit: + ids = [str(uuid4()) for _ in range(len(texts))] + embeds = self.embed(texts) + embeds = embeds['embedding'] + index = self.pc.Index(self.index_name) + index.upsert(vectors=zip(ids, embeds, metadatas), async_req=True) + texts = [] + metadatas = [] + + + def index_documents(self, urls: List[str], batch_limit: int = 32, chunk_size: int = 256) -> None: + """ + Process a list of URLs and upsert documents to a Pinecone index. + + Args: + urls (List[str]): List of URLs to process. + batch_limit (int): Batch limit for upserting documents. + chunks_size(int): size of texts per chunk. 
+ + Returns: + None + """ + for url in tqdm(urls, desc="Processing URLs"): + print(f"Processing URL: {url}") + pages = self.load_document(url) + print(f"Found {len(pages)} pages in the PDF.") + pages_data = [ + Page( + page_content=page.page_content, + metadata=page.metadata, + page=page.metadata.get("page", 0), + source=page.metadata.get("source") + ) + for page in pages + ] + + print(f"Upserting {len(pages_data)} pages to the Pinecone index...") + self.upsert_documents(pages_data, batch_limit, chunk_size) + print("Finished upserting documents for this URL.") + index = self.pc.Index(self.index_name) + print(index.describe_index_stats()) + print("Indexing complete.") + + def initialize_vectorstore(self, index_name): + index = self.pc.Index(index_name) + embed = GoogleGenerativeAIEmbeddings( + model="models/embedding-001", + google_api_key=self.google_api_key + ) + vectorstore = PineconeVectorStore(index, embed, "text") + return vectorstore + + + def retrieve_and_generate(self,query: str, index_name: str, model_name: str = 'gemini-pro', top_k: int =5): + """ + Retrieve documents from the Pinecone index and generate a response. + Args: + query: The qury from the user + index_name: The name of the Pinecone index + model_name: The name of the model to use : defaults to 'gemini-pro' + top_k: The number of documents to retrieve from the index : defaults to 5 + """ + llm = ChatGoogleGenerativeAI(model = Config.default_google_model, google_api_key=self.google_api_key) + rag_prompt = PromptTemplate(template = Config.template_str, input_variables = ["query", "context"]) + vector_store = self.initialize_vectorstore(index_name) + retriever = vector_store.as_retriever(search_kwargs = {"k": top_k}) + rag_chain = ( + {"context": itemgetter("query")| retriever, + "query": itemgetter("query"), + } + | rag_prompt + | llm + | StrOutputParser() + ) + + return rag_chain.invoke({"query": query}) diff --git a/src/_google/docindex.py b/src/_google/docindex.py deleted file mode 100644 index 91577c4..0000000 --- a/src/_google/docindex.py +++ /dev/null @@ -1,267 +0,0 @@ -from pinecone import Pinecone, PodSpec -from tqdm.auto import tqdm -from uuid import uuid4 -from langchain_community.document_loaders import PyPDFLoader -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain_google_genai import GoogleGenerativeAIEmbeddings -import tiktoken -from typing import List -from _openai.doc_model import Page -import google.generativeai as genai -from pathlib import Path -from langchain_community.document_loaders import UnstructuredWordDocumentLoader -from langchain_community.document_loaders import UnstructuredMarkdownLoader -from langchain_community.document_loaders import UnstructuredHTMLLoader -from langchain_pinecone import PineconeVectorStore -from langchain_core.prompts import PromptTemplate -from langchain_core.output_parsers import StrOutputParser -from operator import itemgetter -from langchain_google_genai import ChatGoogleGenerativeAI -from src.config import Config - - -class GooglePineconeIndexer: - """ - Class for indexing documents to Pinecone using GoogleGenerativeAIEmbeddings embeddings. - """ - def __init__( - self, - index_name: str, - pinecone_api_key: str, - google_api_key: str - ) -> None: - """ - Initialize the GoogleGenerativeAIEmbeddings object. - - Args: - index_name (str): Name of the Pinecone index. - pinecone_api_key (str): Pinecone API key. - environment (str): Environment for Pinecone service. - google_api_key (str): Google API key. 
- """ - self.pc = Pinecone(api_key=pinecone_api_key) - self.index_name = index_name - self.google_api_key = google_api_key - self.tokenizer = tiktoken.get_encoding('p50k_base') - - def create_index(self, environment: str = "us-west1-gcp" ): - """ - Creates an index with the specified parameters. - - Args: - environment (str, optional): The environment where the index will be created. Defaults to "us-west1-gcp". - - Returns: - None - """ - print(f"Creating index {self.index_name}") - self.pc.create_index( - name=self.index_name, - dimension=768, - metric="cosine", - spec=PodSpec( - environment=environment, - pod_type="p1.x1", - pods=1 - ) - ) - return print(f"Index {self.index_name} created successfully!") - - def delete_index(self): - """ - Deletes the created index. - - Returns: - None - """ - print(f"Deleting index {self.index_name}") - self.pc.delete_index(self.index_name) - return print(f"Index {self.index_name} deleted successfully!") - - - def load_document(self, file_url: str) -> List[str]: - """ - Load a document from a given file URL and split it into pages. - - This method supports loading documents in various formats including PDF, DOCX, DOC, Markdown, and HTML. - It uses the appropriate loader for each file type to load the document and split it into pages. - - Args: - file_url (str): The URL of the file to be loaded. - - Returns: - List[str]: A list of strings, where each string represents a page from the loaded document. - - Raises: - ValueError: If the file type is not supported or recognized. - """ - pages = [] - file_path = Path(file_url) - - # Determine file type and use the appropriate loader - file_extension = file_path.suffix - - # Load and split PDF files - if file_extension == ".pdf": - loader = PyPDFLoader(file_url) - pages = loader.load_and_split() - - # Load and split DOCX and DOC files - elif file_extension in ('.docx', '.doc'): - loader = UnstructuredWordDocumentLoader(file_url) - pages = loader.load_and_split() - - # Load and split Markdown files - elif file_extension == '.md': - loader = UnstructuredMarkdownLoader(file_url) - pages = loader.load_and_split() - - # Load and split HTML files - elif file_extension == '.html': - loader = UnstructuredHTMLLoader(file_url) - pages = loader.load_and_split() - - # Return the list of pages - return pages - - def tiktoken_len(self, text: str) -> int: - """ - Calculate length of text in tokens. - - Parameters: - text (str): Input text. - - Returns: - int: Length of text in tokens. - """ - tokens = self.tokenizer.encode( - text, - disallowed_special=() - ) - return len(tokens) - - def embed(self, sample_text: str) -> GoogleGenerativeAIEmbeddings: - """ - Embeds the given sample text using Google's Generative AI. - - Args: - sample_text (str): The text to be embedded. - - Returns: - GoogleGenerativeAIEmbeddings: An object containing the embedded content. - """ - genai.configure(api_key=self.google_api_key) - return genai.embed_content( - model='models/embedding-001', - content=sample_text, - task_type="retrieval_document" - ) - - def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size: int = 256) -> None: - """ - Upsert documents into the Pinecone index. - - Args: - documents (List[Page]): List of documents to upsert. - batch_limit (int): Maximum batch size for upsert operation. - chunks_size(int): size of texts per chunk. 
- - Returns: - None - """ - texts = [] - metadatas = [] - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=int(chunk_size), - chunk_overlap=20, - length_function=self.tiktoken_len, - separators=["\n\n", "\n", " ", ""] - ) - for i, record in enumerate(tqdm(documents)): - metadata = { - 'content': record.page_content, - 'source': record.page, - 'title': record.source - } - record_texts = text_splitter.split_text(record.page_content) - record_metadatas = [{ - "chunk": j, "text": text, **metadata - } for j, text in enumerate(record_texts)] - texts.extend(record_texts) - metadatas.extend(record_metadatas) - if len(texts) >= batch_limit: - ids = [str(uuid4()) for _ in range(len(texts))] - embeds = self.embed(texts) - embeds = embeds['embedding'] - index = self.pc.Index(self.index_name) - index.upsert(vectors=zip(ids, embeds, metadatas), async_req=True) - texts = [] - metadatas = [] - - - def index_documents(self, urls: List[str], batch_limit: int, chunk_size: int = 256) -> None: - """ - Process a list of URLs and upsert documents to a Pinecone index. - - Args: - urls (List[str]): List of URLs to process. - batch_limit (int): Batch limit for upserting documents. - chunks_size(int): size of texts per chunk. - - Returns: - None - """ - for url in tqdm(urls, desc="Processing URLs"): - print(f"Processing URL: {url}") - pages = self.load_document(url) - print(f"Found {len(pages)} pages in the PDF.") - pages_data = [ - Page( - page_content=page.page_content, - metadata=page.metadata, - page=page.metadata.get("page", 0), - source=page.metadata.get("source") - ) - for page in pages - ] - - print(f"Upserting {len(pages_data)} pages to the Pinecone index...") - self.upsert_documents(pages_data, batch_limit, chunk_size) - print("Finished upserting documents for this URL.") - index = self.pc.Index(self.index_name) - print(index.describe_index_stats()) - print("Indexing complete.") - - def initialize_vectorstore(self, index_name): - index = self.pc.Index(index_name) - embed = GoogleGenerativeAIEmbeddings( - model="models/embedding-001", - google_api_key=self.google_api_key - ) - vectorstore = PineconeVectorStore(index, embed, "text") - return vectorstore - - - def retrieve_and_generate(self,query: str, index_name: str, model_name: str = 'gemini-pro', top_k: int =5): - """ - Retrieve documents from the Pinecone index and generate a response. 
-        Args:
-            query: The qury from the user
-            index_name: The name of the Pinecone index
-            model_name: The name of the model to use : defaults to 'gemini-pro'
-            top_k: The number of documents to retrieve from the index : defaults to 5
-        """
-        llm = ChatGoogleGenerativeAI(model = Config.default_google_model, google_api_key=self.google_api_key)
-        rag_prompt = PromptTemplate(template = Config.template_str, input_variables = ["query", "context"])
-        vector_store = self.initialize_vectorstore(index_name)
-        retriever = vector_store.as_retriver(search_kwargs = {"k": top_k})
-        rag_chain = (
-            {"context": itemgetter("query")| retriever,
-            "query": itemgetter("query"),
-            }
-            | rag_prompt
-            | llm
-            | StrOutputParser()
-        )
-
-        return rag_chain.invoke({"query": query})
diff --git a/src/_google/index_documents.py b/src/_google/index_documents.py
new file mode 100644
index 0000000..76fdece
--- /dev/null
+++ b/src/_google/index_documents.py
@@ -0,0 +1,18 @@
+from .doc_index import GooglePineconeIndexer
+import argparse
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Index documents on Pinecone using Google Generative AI embeddings.")
+    parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
+    parser.add_argument("--index_name", type=str, help="Name of the Pinecone index")
+    parser.add_argument("--google_api_key", type=str, help="Google API key")
+    parser.add_argument("--batch_limit", default = 32, type=int, help="Maximum batch size for indexing")
+    parser.add_argument("--docs", nargs="+", help="URLs of the documents to be indexed")
+    parser.add_argument("--chunk_size", default = 256, type=int, help="Size of texts per chunk")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    pinecone_indexer = GooglePineconeIndexer(args.index_name, args.pinecone_api_key, args.google_api_key)
+    pinecone_indexer.index_documents(args.docs, args.batch_limit, args.chunk_size)

From 5eccf5f8838d7c847f5be9cbd90e83bf362bec14 Mon Sep 17 00:00:00 2001
From: KevKibe
Date: Tue, 30 Apr 2024 14:58:30 +0300
Subject: [PATCH 07/16] Readme update

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 58ee0c8..03afa75 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-<p align="center">

-DocIndex: Fast Persistent Document Embeddings Storage for RAG

+

+DocIndex: Fast Persistent Document Embeddings Storage for Production-Level RAG
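
Note: the src/_google/index_documents.py entry point introduced above can also be
driven directly from Python. A minimal sketch, not itself part of the patch series;
the index name and document URL below are the ones used in the tests, and the
constructor arguments are assumed positional as in the test setup:

    import os

    from _google.doc_index import GooglePineconeIndexer

    # Mirrors what index_documents.py does after parse_args().
    indexer = GooglePineconeIndexer(
        "new-index-1",                   # index_name
        os.environ["PINECONE_API_KEY"],  # pinecone_api_key
        os.environ["GOOGLE_API_KEY"],    # google_api_key
    )
    indexer.create_index()
    indexer.index_documents(
        ["https://arxiv.org/pdf/1706.03762.pdf"],  # PDF, DOCX/DOC, Markdown and HTML are supported
        batch_limit=32,
        chunk_size=256,
    )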

From 6b6b3eb4b2e1388dc78a3725aa7a41621075ef10 Mon Sep 17 00:00:00 2001
From: KevKibe
Date: Tue, 30 Apr 2024 15:12:33 +0300
Subject: [PATCH 08/16] fix: cohere environment variables and run tests one file at a time

---
 .github/workflows/tests.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index c8fce6c..c0d31b0 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -25,6 +25,9 @@ jobs:
         PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
         GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
         OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }}
       run: |
-        pytest
+        pytest src/tests/openaiindex_test.py
+        pytest src/tests/googleindex_test.py
+        pytest src/tests/cohereindex_test.py
\ No newline at end of file

From 0f8469c03b44fe082b609ef0f9676a66cbd94d41 Mon Sep 17 00:00:00 2001
From: KevKibe
Date: Tue, 30 Apr 2024 17:34:40 +0300
Subject: [PATCH 09/16] add utils module with Query result pydantic model and prompt configuration

---
 src/utils/__init__.py       |  0
 src/utils/config.py         | 14 ++++++++++++++
 src/utils/response_model.py | 38 +++++++++++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+)
 create mode 100644 src/utils/__init__.py
 create mode 100644 src/utils/config.py
 create mode 100644 src/utils/response_model.py

diff --git a/src/utils/__init__.py b/src/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/utils/config.py b/src/utils/config.py
new file mode 100644
index 0000000..c5ecac3
--- /dev/null
+++ b/src/utils/config.py
@@ -0,0 +1,14 @@
+class Config:
+    template_str = """
+    You are a very helpful assistant for question-answering tasks. Use the pieces of retrieved context to answer the given question. If you do not
+    know the answer, just say that you do not know instead of making up an answer.
+ + Retrieved context: {context} + Query: {query} + format instructions: {format_instructions} + + """ + + default_google_model = "gemini-pro" + default_openai_model = "gpt-3.5-turbo-0125" + default_cohere_model = "command" diff --git a/src/utils/response_model.py b/src/utils/response_model.py new file mode 100644 index 0000000..fee8440 --- /dev/null +++ b/src/utils/response_model.py @@ -0,0 +1,38 @@ +from typing import List, Union +from pydantic import BaseModel, Field + +class Document(BaseModel): + page_content: str = Field(..., description="The content of the page from the source document.") + source: Union[float, int] = Field(..., description="The page number of the page_content in the document") + title: str = Field(..., description="The title or URL of the source document.") + + +class QueryResult(BaseModel): + query: str = Field(..., description="The query that was submitted.") + result: str = Field(..., description="The result of the query, including any retrieved information.") + page: Union[float, int] = Field(..., description="The page number of the final result of the query.") + source_documents: List[Document] = Field(..., description="A list of source documents related to the query.") + +# Example usage: +# data = { +# 'query': 'how did RAG come up?', +# 'result': 'RAG came up as a language model that is more strongly grounded in ' +# 'than BART and has been effective in Jeopardy question generation.\n' +# '\n' +# 'Sources:\n' +# '- https://arxiv.org/pdf/2005.11401.pdf (page 5.0)', +# 'page': 5.0, # A +# 'source_documents': [ +# { +# 'page_content': 'page-content-where-the-response-is-from.\n10', +# 'source': 9.0, +# 'title': 'document-title' +# }, +# { +# 'page_content': 'page-content-where-the-response-is-from.\n17', +# 'source': 5.0, +# 'title': 'document-title' +# } +# ] +# } + From aa826f36c0ec53851437ebb8a5edfa1744c36e7d Mon Sep 17 00:00:00 2001 From: KevKibe Date: Tue, 30 Apr 2024 17:37:02 +0300 Subject: [PATCH 10/16] fix: retrieve and generate test method --- src/_openai/config.py | 12 ------------ src/tests/cohereindex_test.py | 18 +++++++++--------- src/tests/googleindex_test.py | 3 ++- src/tests/openaiindex_test.py | 5 ++--- 4 files changed, 13 insertions(+), 25 deletions(-) delete mode 100644 src/_openai/config.py diff --git a/src/_openai/config.py b/src/_openai/config.py deleted file mode 100644 index 13b30da..0000000 --- a/src/_openai/config.py +++ /dev/null @@ -1,12 +0,0 @@ -class Config: - template_str = """ - You are very helpful assistant for question answering tasks. Use the pieces of retrieved context to answer question given. If you do not know - the answer, Just say that you do not know the answer instead of making up an answer. - - Retrieved context: {context} - Query: {query} - """ - - default_google_model = "gemini-pro" - default_openai_model = "gpt-3.5-turbo-0125" - default_cohere_model = "command" diff --git a/src/tests/cohereindex_test.py b/src/tests/cohereindex_test.py index 14f0ccb..90fefd4 100644 --- a/src/tests/cohereindex_test.py +++ b/src/tests/cohereindex_test.py @@ -54,15 +54,15 @@ def test_03_initialize_vectorstore(self): vectorstore = self.indexer.initialize_vectorstore(self.index_name) self.assertIsInstance(vectorstore, PineconeVectorStore) - def test_04_retrieve_and_generate(self): - """ - Test initializing the vector store and assert its type. 
-        """
-        response = self.indexer.retrieve_and_generate(query = "what is the title of the document",
-                                            index_name= self.index_name
-                                            )
-        print(response)
-        self.assertIsNotNone(response, "The retriever response should not be None.")
+    # def test_04_retrieve_and_generate(self):
+    #     """
+    #     Test initializing the vector store and assert its type.
+    #     """
+    #     response = self.indexer.retrieve_and_generate(query = "what is the title of the document",
+    #                                         index_name= self.index_name
+    #                                         )
+    #     print(response)
+    #     self.assertIsNotNone(response, "The retriever response should not be None.")
 
     @patch('sys.stdout', new_callable=StringIO)
     def test_05_delete_index(self, mock_stdout):
diff --git a/src/tests/googleindex_test.py b/src/tests/googleindex_test.py
index 0cdec78..6f64e09 100644
--- a/src/tests/googleindex_test.py
+++ b/src/tests/googleindex_test.py
@@ -63,8 +63,9 @@ def test_04_retrieve_and_generate(self):
         """
         Test initializing the vector store and assert its type.
         """
+        vectorstore = self.indexer.initialize_vectorstore(self.index_name)
         response = self.indexer.retrieve_and_generate(query = "give a short summary of the introduction",
-                                            index_name= self.index_name
+                                            vectorstore= vectorstore
                                             )
         print(response)
         self.assertIsNotNone(response, "The retriever response should not be None.")
diff --git a/src/tests/openaiindex_test.py b/src/tests/openaiindex_test.py
index 4a59a88..7e6405e 100644
--- a/src/tests/openaiindex_test.py
+++ b/src/tests/openaiindex_test.py
@@ -8,7 +8,6 @@
 from dotenv import load_dotenv
 load_dotenv()
 
-
 class TestOpenaiPineconeIndexer(unittest.TestCase):
     """
     Test case class for the OpenaiPineconeIndexer.
@@ -63,10 +62,10 @@ def test_04_retrieve_and_generate(self):
         """
         Test initializing the vector store and assert its type.
         """
+        vectorstore = self.indexer.initialize_vectorstore(self.index_name)
         response = self.indexer.retrieve_and_generate(query = "give a short summary of the introduction",
-                                            index_name= self.index_name
+                                            vector_store = vectorstore
                                             )
-        print(response)
         self.assertIsNotNone(response, "The retriever response should not be None.")
 
     @patch('sys.stdout', new_callable=StringIO)

From c197796e1de5a87170be68383b03e71c3a6b6aad Mon Sep 17 00:00:00 2001
From: KevKibe
Date: Tue, 30 Apr 2024 17:39:32 +0300
Subject: [PATCH 11/16] update retrieve_and_generate method with Query result pydantic model

---
 src/_cohere/doc_index.py | 24 ++++++++++++++----------
 src/_google/doc_index.py | 34 ++++++++++++++--------------------
 src/_openai/doc_index.py | 35 +++++++++++++++--------------------
 3 files changed, 43 insertions(+), 50 deletions(-)

diff --git a/src/_cohere/doc_index.py b/src/_cohere/doc_index.py
index 82d8685..8304f7b 100644
--- a/src/_cohere/doc_index.py
+++ b/src/_cohere/doc_index.py
@@ -16,7 +16,9 @@
 from langchain_core.prompts import PromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 from operator import itemgetter
-from _cohere.config import Config
+from utils.config import Config
+from utils.response_model import QueryResult
+from langchain.output_parsers import PydanticOutputParser
 
 class CoherePineconeIndexer:
     """
@@ -218,7 +220,7 @@ def initialize_vectorstore(self, index_name):
         vectorstore = PineconeVectorStore(index,embed, "text")
         return vectorstore
 
-    def retrieve_and_generate(self,query: str, index_name: str, model_name: str = 'gpt-3.5-turbo-1106', top_k: int =5):
+    def retrieve_and_generate(self, query: str, vector_store: PineconeVectorStore, model_name: str = 'gpt-3.5-turbo-1106', top_k: int =5):
         """
         Retrieve documents from the Pinecone index and generate a response.
         Args:
@@ -228,17 +230,19 @@ def retrieve_and_generate(self,query: str, index_name: str, model_name: str = 'g
         top_k: The number of documents to retrieve from the index : defaults to 5
         """
         llm = Cohere(model="command", cohere_api_key = self.cohere_api_key)
-        rag_prompt = PromptTemplate(template = Config.template_str, input_variables = ["query", "context"])
-
-        vector_store = self.initialize_vectorstore(index_name)
+        parser = PydanticOutputParser(pydantic_object=QueryResult)
+        rag_prompt = PromptTemplate(template = Config.template_str,
+                                    input_variables = ["query", "context"],
+                                    partial_variables={"format_instructions": parser.get_format_instructions()})
         retriever = vector_store.as_retriever(search_kwargs = {"k": top_k})
+
         rag_chain = (
             {"context": itemgetter("query")| retriever,
-            "query": itemgetter("query"),
-            }
-            | rag_prompt
-            | llm
-            | StrOutputParser()
+             "query": itemgetter("query"),
+             }
+            | rag_prompt
+            | llm
+            | parser
         )
 
         return rag_chain.invoke({"query": query})
diff --git a/src/_google/doc_index.py b/src/_google/doc_index.py
index 71e49f8..46e2fdb 100644
--- a/src/_google/doc_index.py
+++ b/src/_google/doc_index.py
@@ -14,11 +14,11 @@
 from langchain_community.document_loaders import UnstructuredHTMLLoader
 from langchain_pinecone import PineconeVectorStore
 from langchain_core.prompts import PromptTemplate
-from langchain_core.output_parsers import StrOutputParser
 from operator import itemgetter
 from langchain_google_genai import ChatGoogleGenerativeAI
-from _google.config import Config
-
+from utils.config import Config
+from utils.response_model import QueryResult
+from langchain.output_parsers import PydanticOutputParser
 
 class GooglePineconeIndexer:
     """
@@ -97,31 +97,22 @@ def load_document(self, file_url: str) -> List[str]:
         """
         pages = []
         file_path = Path(file_url)
-
-        # Determine file type and use the appropriate loader
         file_extension = file_path.suffix
-
-        # Load and split PDF files
         if file_extension == ".pdf":
             loader = PyPDFLoader(file_url)
             pages = loader.load_and_split()
 
-        # Load and split DOCX and DOC files
         elif file_extension in ('.docx', '.doc'):
             loader = UnstructuredWordDocumentLoader(file_url)
             pages = loader.load_and_split()
 
-        # Load and split Markdown files
         elif file_extension == '.md':
             loader = UnstructuredMarkdownLoader(file_url)
             pages = loader.load_and_split()
 
-        # Load and split HTML files
-
         elif file_extension == '.html':
             loader = UnstructuredHTMLLoader(file_url)
             pages = loader.load_and_split()
 
-        # Return the list of pages
         return pages
 
     def tiktoken_len(self, text: str) -> int:
@@ -242,7 +233,7 @@ def initialize_vectorstore(self, index_name):
         return vectorstore
 
 
-    def retrieve_and_generate(self,query: str, index_name: str, model_name: str = 'gemini-pro', top_k: int =5):
+    def retrieve_and_generate(self, query: str, vector_store: PineconeVectorStore, model_name: str = 'gemini-pro', top_k: int =5):
         """
         Retrieve documents from the Pinecone index and generate a response.
         Args:
@@ -252,16 +243,19 @@ def retrieve_and_generate(self,query: str, index_name: str, model_name: str = 'g
         top_k: The number of documents to retrieve from the index : defaults to 5
         """
         llm = ChatGoogleGenerativeAI(model = Config.default_google_model, google_api_key=self.google_api_key)
-        rag_prompt = PromptTemplate(template = Config.template_str, input_variables = ["query", "context"])
-        vector_store = self.initialize_vectorstore(index_name)
+        parser = PydanticOutputParser(pydantic_object=QueryResult)
+        rag_prompt = PromptTemplate(template = Config.template_str,
+                                    input_variables = ["query", "context"],
+                                    partial_variables={"format_instructions": parser.get_format_instructions()})
         retriever = vector_store.as_retriever(search_kwargs = {"k": top_k})
+
         rag_chain = (
             {"context": itemgetter("query")| retriever,
-            "query": itemgetter("query"),
-            }
-            | rag_prompt
-            | llm
-            | StrOutputParser()
+             "query": itemgetter("query"),
+             }
+            | rag_prompt
+            | llm
+            | parser
         )
 
         return rag_chain.invoke({"query": query})
diff --git a/src/_openai/doc_index.py b/src/_openai/doc_index.py
index c0f1f4e..3decd5e 100644
--- a/src/_openai/doc_index.py
+++ b/src/_openai/doc_index.py
@@ -13,12 +13,11 @@
 from langchain_community.document_loaders import UnstructuredHTMLLoader
 from langchain_pinecone import PineconeVectorStore
 from langchain_core.prompts import PromptTemplate
-from langchain_core.output_parsers import StrOutputParser
 from operator import itemgetter
 from langchain_openai import ChatOpenAI
-from _openai.config import Config
-
-
+from utils.config import Config
+from utils.response_model import QueryResult
+from langchain.output_parsers import PydanticOutputParser
 
 class OpenaiPineconeIndexer:
     """
@@ -99,30 +98,23 @@ def load_document(self, file_url: str) -> List[str]:
         pages = []
         file_path = Path(file_url)
 
-        # Determine file type and use the appropriate loader
         file_extension = file_path.suffix
 
-        # Load and split PDF files
         if file_extension == ".pdf":
             loader = PyPDFLoader(file_url)
             pages = loader.load_and_split()
 
-        # Load and split DOCX and DOC files
         elif file_extension in ('.docx', '.doc'):
             loader = UnstructuredWordDocumentLoader(file_url)
             pages = loader.load_and_split()
 
-        # Load and split Markdown files
         elif file_extension == '.md':
             loader = UnstructuredMarkdownLoader(file_url)
             pages = loader.load_and_split()
 
-        # Load and split HTML files
         elif file_extension == '.html':
             loader = UnstructuredHTMLLoader(file_url)
             pages = loader.load_and_split()
-
-        # Return the list of pages
         return pages
 
 
@@ -240,7 +232,7 @@ def initialize_vectorstore(self, index_name):
         return vectorstore
 
 
-    def retrieve_and_generate(self,query: str, index_name: str, model_name: str = 'gpt-3.5-turbo-1106', top_k: int =5):
+    def retrieve_and_generate(self, query: str, vector_store: PineconeVectorStore, model_name: str = 'gpt-3.5-turbo-1106', top_k: int =5):
         """
         Retrieve documents from the Pinecone index and generate a response.
Args: @@ -250,17 +242,19 @@ def retrieve_and_generate(self,query: str, index_name: str, model_name: str = 'g top_k: The number of documents to retrieve from the index : defaults to 5 """ llm = ChatOpenAI(model = Config.default_openai_model, openai_api_key = self.openai_api_key) - rag_prompt = PromptTemplate(template = Config.template_str, input_variables = ["query", "context"]) - - vector_store = self.initialize_vectorstore(index_name) + parser = PydanticOutputParser(pydantic_object=QueryResult) + rag_prompt = PromptTemplate(template = Config.template_str, + input_variables = ["query", "context"], + partial_variables={"format_instructions": parser.get_format_instructions()}) retriever = vector_store.as_retriever(search_kwargs = {"k": top_k}) + rag_chain = ( {"context": itemgetter("query")| retriever, - "query": itemgetter("query"), - } - | rag_prompt - | llm - | StrOutputParser() + "query": itemgetter("query"), + } + | rag_prompt + | llm + | parser ) return rag_chain.invoke({"query": query}) @@ -270,3 +264,4 @@ def retrieve_and_generate(self,query: str, index_name: str, model_name: str = 'g + From 83eb400877999015cbd01a65a6bd4cc992fd31ce Mon Sep 17 00:00:00 2001 From: KevKibe Date: Tue, 30 Apr 2024 17:40:56 +0300 Subject: [PATCH 12/16] update dependencies --- setup.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 05831ce..3a2f61c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,7 +3,7 @@ name = docindex author = Kevin Kibe version = 0.5.0 author_email = keviinkibe@gmail.com -description = A package for fast indexing of multiple documents and their metadata on Pinecone. +description = A package for fast persistent storage of multiple document embeddings and their metadata into Pinecone for production-level RAG. long_description = file: README.md long_description_content_type = text/markdown url = https://github.com/KevKibe/docindex @@ -32,4 +32,4 @@ package_dir= =src [options.packages.find] -where=src +where=src \ No newline at end of file From 3dff046f3abe5ec4962b6063faf6bb1a2e6d3c9a Mon Sep 17 00:00:00 2001 From: KevKibe Date: Tue, 30 Apr 2024 17:43:08 +0300 Subject: [PATCH 13/16] fix: ruff formatting --- src/_cohere/doc_index.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/_cohere/doc_index.py b/src/_cohere/doc_index.py index 8304f7b..9910894 100644 --- a/src/_cohere/doc_index.py +++ b/src/_cohere/doc_index.py @@ -14,7 +14,6 @@ from langchain_cohere import CohereEmbeddings from langchain_community.llms import Cohere from langchain_core.prompts import PromptTemplate -from langchain_core.output_parsers import StrOutputParser from operator import itemgetter from utils.config import Config from utils.response_model import QueryResult From d560cbeea359c81c96589811f412e7426c9ae27b Mon Sep 17 00:00:00 2001 From: KevKibe Date: Tue, 30 Apr 2024 18:36:06 +0300 Subject: [PATCH 14/16] fix: vectorstore parameter in test_04_retrieve_and_generate method --- src/tests/googleindex_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/googleindex_test.py b/src/tests/googleindex_test.py index 6f64e09..3051db9 100644 --- a/src/tests/googleindex_test.py +++ b/src/tests/googleindex_test.py @@ -63,9 +63,9 @@ def test_04_retrieve_and_generate(self): """ Test initializing the vector store and assert its type. 
""" - vectorstore = self.indexer.initialize_vectorstore(self.index_name) + vector_store = self.indexer.initialize_vectorstore(self.index_name) response = self.indexer.retrieve_and_generate(query = "give a short summary of the introduction", - vectorstore= vectorstore + vectorstore= vector_store ) print(response) self.assertIsNotNone(response, "The retriever response should not be None.") From 13c181b6cdedd7df40208617dc2099933102f846 Mon Sep 17 00:00:00 2001 From: KevKibe Date: Tue, 30 Apr 2024 18:43:38 +0300 Subject: [PATCH 15/16] fix: vectorstore parameter in test_04_retrieve_and_generate method --- src/tests/googleindex_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/googleindex_test.py b/src/tests/googleindex_test.py index 3051db9..e47cbcb 100644 --- a/src/tests/googleindex_test.py +++ b/src/tests/googleindex_test.py @@ -65,7 +65,7 @@ def test_04_retrieve_and_generate(self): """ vector_store = self.indexer.initialize_vectorstore(self.index_name) response = self.indexer.retrieve_and_generate(query = "give a short summary of the introduction", - vectorstore= vector_store + vector_store= vector_store ) print(response) self.assertIsNotNone(response, "The retriever response should not be None.") From 7642579f2920682ec33e6775dc66a9c801cfaff3 Mon Sep 17 00:00:00 2001 From: KevKibe Date: Tue, 30 Apr 2024 19:22:13 +0300 Subject: [PATCH 16/16] add sources, titles and page_content to QueryResult Model --- src/utils/response_model.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/utils/response_model.py b/src/utils/response_model.py index fee8440..3706e20 100644 --- a/src/utils/response_model.py +++ b/src/utils/response_model.py @@ -13,7 +13,28 @@ class QueryResult(BaseModel): page: Union[float, int] = Field(..., description="The page number of the final result of the query.") source_documents: List[Document] = Field(..., description="A list of source documents related to the query.") -# Example usage: + @property + def sources(self) -> List[Union[float, int]]: + """ + Returns a list of the sources (page numbers) from the source documents. + """ + return [doc.source for doc in self.source_documents] + + @property + def titles(self) -> List[str]: + """ + Returns a list of the titles from the source documents. + """ + return [doc.title for doc in self.source_documents] + + @property + def page_contents(self) -> List[str]: + """ + Returns a list of the page contents from the source documents. + """ + return [doc.page_content for doc in self.source_documents] + +# Example # data = { # 'query': 'how did RAG come up?', # 'result': 'RAG came up as a language model that is more strongly grounded in '