Update: Support for Google Embedding Service, GoogleGenerativeAIEmbeddings #4

Merged 4 commits on Apr 6, 2024
4 changes: 2 additions & 2 deletions README.md
@@ -4,7 +4,7 @@
<a href="https://github.com/KevKibe/docindex/commits/">
<img src="https://img.shields.io/github/last-commit/KevKibe/docindex?" alt="Last commit">
</a>
<a href="https://github.com/KevKibe/African-Whisper/blob/main/LICENSE">
<a href="https://github.com/KevKibe/docindex/blob/master/LICENSE">
<img src="https://img.shields.io/github/license/KevKibe/docindex?" alt="License">
</a>

@@ -37,7 +37,7 @@ environment = "pinecone-index-environment"
# Define the batch limit for indexing, how many pages per pass.
batch_limit = 20

- # List of URLs of the documents to be indexed
+ # List of URLs of the documents to be indexed (local files on your computer or online URLs)
urls = [
"your-document-1.pdf",
"your-document-2.pdf"
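As a usage sketch (not part of this diff), the configuration above feeds the new indexer roughly like this, assuming the package is importable from src/ and substituting real credentials:

from _google.docindex import GooglePineconeIndexer

# Placeholder values mirroring the README snippet above.
index_name = "your-pinecone-index-name"
pinecone_api_key = "your-pinecone-api-key"
environment = "pinecone-index-environment"
google_api_key = "your-google-api-key"
batch_limit = 20  # pages accumulated per upsert pass

urls = [
    "your-document-1.pdf",
    "your-document-2.pdf"
]

indexer = GooglePineconeIndexer(index_name, pinecone_api_key, environment, google_api_key)
indexer.index_documents(urls, batch_limit)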
3 changes: 2 additions & 1 deletion requirements.txt
@@ -4,4 +4,5 @@ pypdf==4.1.0
unstructured==0.12.6
langchain-community==0.0.31
langchain==0.1.14
- langchain-openai==0.1.1
+ langchain-openai==0.1.1
+ langchain-google-genai==1.0.1
Empty file added src/_google/__init__.py
Empty file.
157 changes: 157 additions & 0 deletions src/_google/docindex.py
@@ -0,0 +1,157 @@
from pinecone import Pinecone
from tqdm.auto import tqdm
from uuid import uuid4
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import tiktoken
from typing import List
from _openai.doc_model import Page


class GooglePineconeIndexer:
"""
    Class for indexing documents to Pinecone using Google Generative AI embeddings.
"""
def __init__(
self,
index_name: str,
pinecone_api_key: str,
environment: str,
google_api_key: str
) -> None:
"""
        Initialize the GooglePineconeIndexer object.

Args:
index_name (str): Name of the Pinecone index.
pinecone_api_key (str): Pinecone API key.
environment (str): Environment for Pinecone service.
google_api_key (str): Google API key.
"""
self.pc = Pinecone(api_key=pinecone_api_key, environment=environment)
self.index = self.pc.Index(index_name)
self.google_api_key = google_api_key
self.tokenizer = tiktoken.get_encoding('p50k_base')


    def load_pdf(self, pdf_url: str) -> List:
"""
Load and split a PDF document into pages.

Args:
pdf_url (str): URL of the PDF document.

Returns:
List: List of pages from the PDF document.
"""
loader = PyPDFLoader(pdf_url)
pages = loader.load_and_split()
return pages

def tiktoken_len(self, text: str) -> int:
"""
Calculate length of text in tokens.

Parameters:
text (str): Input text.

Returns:
int: Length of text in tokens.
"""
tokens = self.tokenizer.encode(
text,
disallowed_special=()
)
return len(tokens)

def embed(self) -> GoogleGenerativeAIEmbeddings:
"""
Initialize GoogleGenerativeAIEmbeddings object.

Returns:
GoogleGenerativeAIEmbeddings: GoogleGenerativeAIEmbeddings object.
"""
return GoogleGenerativeAIEmbeddings(
model="models/embedding-001",
google_api_key=self.google_api_key
)

def text_splitter(self) -> RecursiveCharacterTextSplitter:
"""
Initialize RecursiveCharacterTextSplitter object.

Returns:
RecursiveCharacterTextSplitter: RecursiveCharacterTextSplitter object.
"""
return RecursiveCharacterTextSplitter(
chunk_size=400,
chunk_overlap=20,
length_function=self.tiktoken_len,
separators=["\n\n", "\n", " ", ""]
)

def upsert_documents(self, documents: List[Page], batch_limit: int) -> None:
"""
Upsert documents into the Pinecone index.

Args:
documents (List[Page]): List of documents to upsert.
batch_limit (int): Maximum batch size for upsert operation.

Returns:
None
"""
texts = []
metadatas = []
text_splitter = self.text_splitter()
embed = self.embed()
for i, record in enumerate(tqdm(documents)):
            metadata = {
                'content': record.page_content,
                'page': record.page,      # page number within the source document
                'source': record.source   # file path or URL the page came from
            }
record_texts = text_splitter.split_text(record.page_content)
record_metadatas = [{
"chunk": j, "text": text, **metadata
} for j, text in enumerate(record_texts)]
texts.extend(record_texts)
metadatas.extend(record_metadatas)
            if len(texts) >= batch_limit:
                ids = [str(uuid4()) for _ in range(len(texts))]
                embeds = embed.embed_documents(texts)
                self.index.upsert(vectors=zip(ids, embeds, metadatas), async_req=True)
                texts = []
                metadatas = []
        # Flush any remaining chunks that did not fill a complete batch,
        # so the tail of the document list is not silently dropped.
        if texts:
            ids = [str(uuid4()) for _ in range(len(texts))]
            embeds = embed.embed_documents(texts)
            self.index.upsert(vectors=zip(ids, embeds, metadatas), async_req=True)


def index_documents(self, urls: List[str], batch_limit: int) -> None:
"""
Process a list of URLs and upsert documents to a Pinecone index.

Args:
urls (List[str]): List of URLs to process.
batch_limit (int): Batch limit for upserting documents.

Returns:
None
"""
for url in tqdm(urls, desc="Processing URLs"):
print(f"Processing URL: {url}")
pages = self.load_pdf(url)
print(f"Found {len(pages)} pages in the PDF.")
pages_data = [
Page(
page_content=page.page_content,
metadata=page.metadata,
page=page.metadata['page'],
source=page.metadata['source']
)
for page in pages
]

print(f"Upserting {len(pages_data)} pages to the Pinecone index...")
self.upsert_documents(pages_data, batch_limit)
print("Finished upserting documents for this URL.")
print("Indexing complete.")
18 changes: 18 additions & 0 deletions src/_google/main.py
@@ -0,0 +1,18 @@
from .docindex import GooglePineconeIndexer
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Index documents on Pinecone using Google Generative AI embeddings.")
    parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
    parser.add_argument("--index_name", type=str, help="Name of the Pinecone index")
    parser.add_argument("--google_api_key", type=str, help="Google API key")
parser.add_argument("--environment", type=str, help="Environment for Pinecone service")
parser.add_argument("--batch_limit", type=int, help="Maximum batch size for indexing")
parser.add_argument("--docs", nargs="+", help="URLs of the documents to be indexed")
return parser.parse_args()


if __name__ == "__main__":
args = parse_args()
pinecone_indexer = GooglePineconeIndexer(args.index_name, args.pinecone_api_key, args.environment, args.google_api_key)
pinecone_indexer.index_documents(args.docs, args.batch_limit)
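An invocation sketch for this entry point (all values are placeholders; because of the relative import, run it as a module from the src directory):

python -m _google.main --pinecone_api_key "your-pinecone-api-key" --index_name "your-index" --google_api_key "your-google-api-key" --environment "your-pinecone-environment" --batch_limit 20 --docs your-document-1.pdf your-document-2.pdf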