From b2ad65f49534a677f1d40155fcff94449dd7e44e Mon Sep 17 00:00:00 2001
From: KevKibe
Date: Sat, 6 Apr 2024 19:16:48 +0300
Subject: [PATCH 1/4] update: langchain-google-genai dependency

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 57a283b..400364b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,5 @@ pypdf==4.1.0
 unstructured==0.12.6
 langchain-community==0.0.31
 langchain==0.1.14
-langchain-openai==0.1.1
\ No newline at end of file
+langchain-openai==0.1.1
+langchain-google-genai==1.0.1
\ No newline at end of file

From 12e47ba1eb9245d96f7d5bdd2da379d440dadca6 Mon Sep 17 00:00:00 2001
From: KevKibe
Date: Sat, 6 Apr 2024 19:17:51 +0300
Subject: [PATCH 2/4] update: indexer that uses google embedding model

---
 src/_google/__init__.py |   0
 src/_google/docindex.py | 157 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 157 insertions(+)
 create mode 100644 src/_google/__init__.py
 create mode 100644 src/_google/docindex.py

diff --git a/src/_google/__init__.py b/src/_google/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/_google/docindex.py b/src/_google/docindex.py
new file mode 100644
index 0000000..2f9ef8c
--- /dev/null
+++ b/src/_google/docindex.py
@@ -0,0 +1,157 @@
+from pinecone import Pinecone
+from tqdm.auto import tqdm
+from uuid import uuid4
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
+import tiktoken
+from typing import List
+from _openai.doc_model import Page
+
+
+class GooglePineconeIndexer:
+    """
+    Class for indexing documents to Pinecone using Google Generative AI embeddings.
+    """
+    def __init__(
+        self,
+        index_name: str,
+        pinecone_api_key: str,
+        environment: str,
+        google_api_key: str
+    ) -> None:
+        """
+        Initialize the GooglePineconeIndexer object.
+
+        Args:
+            index_name (str): Name of the Pinecone index.
+            pinecone_api_key (str): Pinecone API key.
+            environment (str): Environment for Pinecone service.
+            google_api_key (str): Google API key.
+        """
+        self.pc = Pinecone(api_key=pinecone_api_key, environment=environment)
+        self.index = self.pc.Index(index_name)
+        self.google_api_key = google_api_key
+        self.tokenizer = tiktoken.get_encoding('p50k_base')
+
+
+    def load_pdf(self, pdf_url) -> List:
+        """
+        Load and split a PDF document into pages.
+
+        Args:
+            pdf_url (str): URL of the PDF document.
+
+        Returns:
+            List: List of pages from the PDF document.
+        """
+        loader = PyPDFLoader(pdf_url)
+        pages = loader.load_and_split()
+        return pages
+
+    def tiktoken_len(self, text: str) -> int:
+        """
+        Calculate length of text in tokens.
+
+        Parameters:
+            text (str): Input text.
+
+        Returns:
+            int: Length of text in tokens.
+        """
+        tokens = self.tokenizer.encode(
+            text,
+            disallowed_special=()
+        )
+        return len(tokens)
+
+    def embed(self) -> GoogleGenerativeAIEmbeddings:
+        """
+        Initialize GoogleGenerativeAIEmbeddings object.
+
+        Returns:
+            GoogleGenerativeAIEmbeddings: GoogleGenerativeAIEmbeddings object.
+        """
+        return GoogleGenerativeAIEmbeddings(
+            model="models/embedding-001",
+            google_api_key=self.google_api_key
+        )
+
+    def text_splitter(self) -> RecursiveCharacterTextSplitter:
+        """
+        Initialize RecursiveCharacterTextSplitter object.
+
+        Returns:
+            RecursiveCharacterTextSplitter: RecursiveCharacterTextSplitter object.
+        """
+        return RecursiveCharacterTextSplitter(
+            chunk_size=400,
+            chunk_overlap=20,
+            length_function=self.tiktoken_len,
+            separators=["\n\n", "\n", " ", ""]
+        )
+
+    def upsert_documents(self, documents: List[Page], batch_limit: int) -> None:
+        """
+        Upsert documents into the Pinecone index.
+
+        Args:
+            documents (List[Page]): List of documents to upsert.
+            batch_limit (int): Maximum batch size for upsert operation.
+
+        Returns:
+            None
+        """
+        texts = []
+        metadatas = []
+        text_splitter = self.text_splitter()
+        embed = self.embed()
+        for i, record in enumerate(tqdm(documents)):
+            metadata = {
+                'content': record.page_content,
+                'source': record.page,
+                'title': record.source
+            }
+            record_texts = text_splitter.split_text(record.page_content)
+            record_metadatas = [{
+                "chunk": j, "text": text, **metadata
+            } for j, text in enumerate(record_texts)]
+            texts.extend(record_texts)
+            metadatas.extend(record_metadatas)
+            if len(texts) >= batch_limit:
+                ids = [str(uuid4()) for _ in range(len(texts))]
+                embeds = embed.embed_documents(texts)
+                self.index.upsert(vectors=zip(ids, embeds, metadatas), async_req=True)
+                texts = []
+                metadatas = []
+
+
+    def index_documents(self, urls: List[str], batch_limit: int) -> None:
+        """
+        Process a list of URLs and upsert documents to a Pinecone index.
+
+        Args:
+            urls (List[str]): List of URLs to process.
+            batch_limit (int): Batch limit for upserting documents.
+
+        Returns:
+            None
+        """
+        for url in tqdm(urls, desc="Processing URLs"):
+            print(f"Processing URL: {url}")
+            pages = self.load_pdf(url)
+            print(f"Found {len(pages)} pages in the PDF.")
+            pages_data = [
+                Page(
+                    page_content=page.page_content,
+                    metadata=page.metadata,
+                    page=page.metadata['page'],
+                    source=page.metadata['source']
+                )
+                for page in pages
+            ]
+
+            print(f"Upserting {len(pages_data)} pages to the Pinecone index...")
+            self.upsert_documents(pages_data, batch_limit)
+            print("Finished upserting documents for this URL.")
+        print("Indexing complete.")

From 8e4a425eb922dcbabcc96ce8f6de6cd3db0fbc5b Mon Sep 17 00:00:00 2001
From: KevKibe
Date: Sat, 6 Apr 2024 19:18:38 +0300
Subject: [PATCH 3/4] update: cli args for indexing

---
 src/_google/main.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 src/_google/main.py

diff --git a/src/_google/main.py b/src/_google/main.py
new file mode 100644
index 0000000..dba5156
--- /dev/null
+++ b/src/_google/main.py
@@ -0,0 +1,18 @@
+from .docindex import GooglePineconeIndexer
+import argparse
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Index documents on Pinecone using Google Generative AI embeddings.")
+    parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
+    parser.add_argument("--index_name", type=str, help="Name of the Pinecone index")
+    parser.add_argument("--google_api_key", type=str, help="Google API key")
+    parser.add_argument("--environment", type=str, help="Environment for Pinecone service")
+    parser.add_argument("--batch_limit", type=int, help="Maximum batch size for indexing")
+    parser.add_argument("--docs", nargs="+", help="URLs of the documents to be indexed")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    pinecone_indexer = GooglePineconeIndexer(args.index_name, args.pinecone_api_key, args.environment, args.google_api_key)
+    pinecone_indexer.index_documents(args.docs, args.batch_limit)

From 589c6f38d24e3e10271d41b670d04654af846134 Mon Sep 17 00:00:00 2001
From: KevKibe
Date: Sat, 6 Apr 2024 19:19:23 +0300
Subject: [PATCH 4/4] Update: README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 0703c44..ed752ed 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 Last commit
-
+
 License
@@ -37,7 +37,7 @@
 environment = "pinecone-index-environment"
 # Define the batch limit for indexing, how many pages per pass.
 batch_limit = 20
-# List of URLs of the documents to be indexed
+# List of URLs of the documents to be indexed (either local files on your computer or online URLs)
 urls = [
     "your-document-1.pdf",
     "your-document-2.pdf"
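
A minimal usage sketch of the indexer introduced in this series, mirroring src/_google/main.py and the README snippet above. The index name, API keys, and document paths are placeholders, and the import assumes the src directory is on the Python path (for example, when run from inside src/):

# Usage sketch only; every credential and file name below is a placeholder.
from _google.docindex import GooglePineconeIndexer

indexer = GooglePineconeIndexer(
    index_name="your-index-name",
    pinecone_api_key="your-pinecone-api-key",
    environment="your-pinecone-environment",
    google_api_key="your-google-api-key",
)

# Index one or more PDF documents (local paths or URLs); batch_limit caps how many
# text chunks are embedded and upserted to Pinecone per batch (20 in the README example).
indexer.index_documents(
    ["your-document-1.pdf", "your-document-2.pdf"],
    batch_limit=20,
)

The same flow is available from the command line through src/_google/main.py via the --pinecone_api_key, --index_name, --google_api_key, --environment, --batch_limit, and --docs arguments.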