Merge pull request #8 from KevKibe/main
Update: Add `parse_args()` for creating and deleting indexes from the CLI, remove `environment` as a parameter, and update `README.md`
KevKibe authored Apr 8, 2024
2 parents 54ccb58 + 4a2b850 commit 2bf632f
Showing 12 changed files with 200 additions and 57 deletions.
73 changes: 52 additions & 21 deletions README.md
@@ -1,4 +1,4 @@
<h1 align="center">DocIndex: Fast Document Embeddings Storage for RAG</h1>
<h1 align="center">DocIndex: Fast Persistent Document Embeddings Storage for RAG</h1>
<p align="center">

<a href="https://github.com/KevKibe/docindex/commits/">
@@ -7,8 +7,11 @@
<a href="https://github.com/KevKibe/docindex/blob/master/LICENSE">
<img src="https://img.shields.io/github/license/KevKibe/docindex?" alt="License">
</a>
<br>

*Efficiently store multiple document embeddings and their metadata, whether they're offline or online, in a Pinecone Vector Database optimized for Retrieval Augmented Generation (RAG) models Fast*
![Diagram](image.png)

*Efficiently store multiple document embeddings and their metadata, whether offline or online, in a persistent Pinecone Vector Database optimized for Retrieval Augmented Generation (RAG) models, fast.*

## Features

@@ -25,15 +28,15 @@ pip install docindex
```

## Getting Started
- Sign up for [Pinecone](https://www.pinecone.io/) and get an API key. One way to load it in code is sketched below.
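
The keys can be kept out of source code by reading them from environment variables. A minimal sketch (the variable names below are illustrative, not required by docindex):

```python
import os

# Load API keys from environment variables instead of hard-coding them
# (variable names are illustrative)
pinecone_api_key = os.environ["PINECONE_API_KEY"]
openai_api_key = os.environ["OPENAI_API_KEY"]
```
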
## Using OpenAI
```python
from _openai.docindex import OpenaiPineconeIndexer

# Replace these values with your actual Pinecone API key, index name, OpenAI API key, and environment
# Replace these values with your actual Pinecone API key, index name, and OpenAI API key
pinecone_api_key = "pinecone-api-key"
index_name = "pinecone-index-name"
openai_api_key = "openai-api-key"
environment = "pinecone-index-environment"
batch_limit = 20 # Batch limit for upserting documents
chunk_size = 256 # Optional: size of texts per chunk.

@@ -44,11 +47,18 @@ urls = [
]

# Initialize the Pinecone indexer
pinecone_index = OpenaiPineconeIndexer(index_name, pinecone_api_key, environment, openai_api_key)
pinecone_index = OpenaiPineconeIndexer(index_name, pinecone_api_key, openai_api_key)

# To create a new Index
pinecone_index.create_index()

# Index the documents with the specified URLs and batch limit
# Store the document embeddings with the specified URLs and batch limit
pinecone_index.index_documents(urls, batch_limit, chunk_size)
```
```python
# To delete the created Index
pinecone_index.delete_index()
```
## Initialize Vectorstore (using OpenAI)

```python
@@ -66,11 +76,8 @@ embed = OpenAIEmbeddings(
openai_api_key = openai_api_key
)

# Define the text field
text_field = "text"

# Initialize the Vectorstore with the Pinecone index and OpenAI embeddings
vectorstore = VectorStorePinecone(index, embed, text_field)
vectorstore = VectorStorePinecone(index, embed, "text")
```
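
Once the vectorstore is initialized, it can be queried directly. A minimal sketch of a similarity search (the query string and `k` value are illustrative):

```python
# Retrieve the top-k most similar chunks for a query (query and k are illustrative)
query = "What does the document say about vector databases?"
results = vectorstore.similarity_search(query, k=3)

for doc in results:
    print(doc.page_content)
```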


@@ -79,11 +86,10 @@ vectorstore = VectorStorePinecone(index, embed, text_field)
```python
from _google.docindex import GooglePineconeIndexer

# Replace these values with your actual Pinecone API key, index name, OpenAI API key, and environment
# Replace these values with your actual Pinecone API key, index name, and Google API key
pinecone_api_key = "pinecone-api-key"
index_name = "pinecone-index-name"
google_api_key = "google-api-key"
environment = "pinecone-index-environment"
batch_limit = 20 # Batch limit for upserting documents
chunk_size = 256 # Optional: size of texts per chunk.

@@ -93,12 +99,18 @@ urls = [
"your-document-2.pdf"
]

# Initialize the Pinecone indexer
pinecone_index = GooglePineconeIndexer(index_name, pinecone_api_key, environment, google_api_key)
pinecone_index = GooglePineconeIndexer(index_name, pinecone_api_key, google_api_key)

# Index the documents with the specified URLs and batch limit
# To create a new Index
pinecone_index.create_index()

# Store the document embeddings with the specified URLs and batch limit
pinecone_index.index_documents(urls, batch_limit, chunk_size)
```
```python
# To delete the created Index
pinecone_index.delete_index()
```


## Initialize Vectorstore (using Google Generative AI)
@@ -118,11 +130,8 @@ embed = GoogleGenerativeAIEmbeddings(
google_api_key=google_api_key
)

# Define the text field
text_field = "text"

# Initialize the Vectorstore with the Pinecone index and OpenAI embeddings
vectorstore = VectorStorePinecone(index, embed, text_field)
vectorstore = VectorStorePinecone(index, embed, "text")
```
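
As with the OpenAI setup, this vectorstore can also be wrapped as a retriever for use in a RAG chain. A minimal sketch, assuming the `vectorstore` object initialized above (the search settings are illustrative):

```python
# Expose the vectorstore as a LangChain retriever (search settings are illustrative)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
docs = retriever.get_relevant_documents("your question here")

for doc in docs:
    print(doc.page_content)
```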


@@ -155,16 +164,38 @@ pip install -r requirements.txt
```bash
cd src
```
- To create an index

```bash
# Using OpenAI
python -m _openai.create_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --openai_api_key "your_openai_api_key"
```

```bash
# Using Google Generative AI
python -m _google.create_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --google_api_key "your_google_api_key"
```

- Run the command to start indexing the documents

```bash
# Using OpenAI
python -m _openai.doc_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --openai_api_key "your_openai_api_key" --environment "your_environment" --batch_limit 10 --docs "doc-1.pdf" "doc-2.pdf' --chunk_size 256
python -m _openai.doc_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --openai_api_key "your_openai_api_key" --batch_limit 10 --docs "doc-1.pdf" "doc-2.pdf" --chunk_size 256
```
```bash
# Using Google Generative AI
python -m _google.doc_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --google_api_key "your_google_api_key" --environment "your_environment" --batch_limit 10 --docs "doc-1.pdf" "doc-2.pdf' --chunk_size 256
python -m _google.doc_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --google_api_key "your_google_api_key" --batch_limit 10 --docs "doc-1.pdf" "doc-2.pdf" --chunk_size 256
```
- To delete an index
```bash
# Using OpenAI
python -m _openai.delete_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --openai_api_key "your_openai_api_key"
```
```bash
# Using Google Generative AI
python -m _google.delete_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --google_api_key "your_google_api_key"
```
## Contributing
Binary file added image.png
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,7 +1,7 @@
[metadata]
name = docindex
author = Kevin Kibe
version = 0.1.0
version = 0.2.0
author_email = [email protected]
description = A package for fast indexing of multiple documents and their metadata on Pinecone.
long_description = file: README.md
15 changes: 15 additions & 0 deletions src/_google/create_index.py
@@ -0,0 +1,15 @@
from .docindex import GooglePineconeIndexer
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Creates an Index on Pinecone.")
    parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
    parser.add_argument("--index_name", type=str, help="Name of the Pinecone index")
    parser.add_argument("--google_api_key", type=str, help="Google API key")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    pinecone_indexer = GooglePineconeIndexer(args.index_name, args.pinecone_api_key, args.google_api_key)
    pinecone_indexer.create_index()
15 changes: 15 additions & 0 deletions src/_google/delete_index.py
@@ -0,0 +1,15 @@
from .docindex import GooglePineconeIndexer
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Deletes an Index on Pinecone.")
    parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
    parser.add_argument("--index_name", type=str, help="Name of the Pinecone index")
    parser.add_argument("--google_api_key", type=str, help="Google API key")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    pinecone_indexer = GooglePineconeIndexer(args.index_name, args.pinecone_api_key, args.google_api_key)
    pinecone_indexer.delete_index()
3 changes: 1 addition & 2 deletions src/_google/doc_index.py
@@ -6,7 +6,6 @@ def parse_args():
parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
parser.add_argument("--index_name", type=str, help="Name of the Pinecone index")
parser.add_argument("--google_api_key", type=str, help="Google API key")
parser.add_argument("--environment", type=str, help="Environment for Pinecone service")
parser.add_argument("--batch_limit", type=int, help="Maximum batch size for indexing")
parser.add_argument("--docs", nargs="+", help="URLs of the documents to be indexed")
parser.add_argument("--chunk_size", help="size of texts per chunk")
@@ -15,5 +14,5 @@ def parse_args():

if __name__ == "__main__":
args = parse_args()
pinecone_indexer = GooglePineconeIndexer(args.index_name, args.pinecone_api_key, args.environment, args.google_api_key)
pinecone_indexer = GooglePineconeIndexer(args.index_name, args.pinecone_api_key, args.google_api_key)
pinecone_indexer.index_documents(args.docs, args.batch_limit, args.chunk_size)
46 changes: 40 additions & 6 deletions src/_google/docindex.py
@@ -1,4 +1,4 @@
from pinecone import Pinecone
from pinecone import Pinecone, PodSpec
from tqdm.auto import tqdm
from uuid import uuid4
from langchain_community.document_loaders import PyPDFLoader
@@ -8,7 +8,6 @@
from typing import List
from _openai.doc_model import Page


class GooglePineconeIndexer:
"""
Class for indexing documents to Pinecone using GoogleGenerativeAIEmbeddings embeddings.
@@ -17,7 +16,6 @@ def __init__(
self,
index_name: str,
pinecone_api_key: str,
environment: str,
google_api_key: str
) -> None:
"""
@@ -29,12 +27,45 @@
environment (str): Environment for Pinecone service.
google_api_key (str): Google API key.
"""
self.pc = Pinecone(api_key=pinecone_api_key, environment=environment)
self.index = self.pc.Index(index_name)
self.pc = Pinecone(api_key=pinecone_api_key)
self.index_name = index_name
self.google_api_key = google_api_key
self.tokenizer = tiktoken.get_encoding('p50k_base')

def create_index(self, environment: str = "us-west1-gcp" ):
"""
Creates an index with the specified parameters.
Args:
environment (str, optional): The environment where the index will be created. Defaults to "us-west1-gcp".
Returns:
None
"""
print(f"Creating index {self.index_name}")
self.pc.create_index(
name=self.index_name,
dimension=1536,
metric="cosine",
spec=PodSpec(
environment=environment,
pod_type="p1.x1",
pods=1
)
)
return print(f"Index {self.index_name} created successfully!")

def delete_index(self):
"""
Deletes the created index.
Returns:
None
"""
print(f"Deleting index {self.index_name}")
self.pc.delete_index(self.index_name)
return print(f"Index {self.index_name} deleted successfully!")

def load_pdf(self, pdf_url) -> List:
"""
Load and split a PDF document into pages.
@@ -114,7 +145,8 @@ def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size:
if len(texts) >= batch_limit:
ids = [str(uuid4()) for _ in range(len(texts))]
embeds = embed.embed_documents(texts)
self.index.upsert(vectors=zip(ids, embeds, metadatas), async_req=True)
index = self.pc.Index(self.index_name)
index.upsert(vectors=zip(ids, embeds, metadatas), async_req=True)
texts = []
metadatas = []

@@ -148,4 +180,6 @@ def index_documents(self, urls: List[str], batch_limit: int, chunk_size: int = 2
print(f"Upserting {len(pages_data)} pages to the Pinecone index...")
self.upsert_documents(pages_data, batch_limit, chunk_size)
print("Finished upserting documents for this URL.")
index = self.pc.Index(self.index_name)
index.describe_index_stats()
print("Indexing complete.")
18 changes: 0 additions & 18 deletions src/_google/main.py

This file was deleted.

15 changes: 15 additions & 0 deletions src/_openai/create_index.py
@@ -0,0 +1,15 @@
from .docindex import OpenaiPineconeIndexer
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Creates an Index on Pinecone.")
    parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
    parser.add_argument("--index_name", type=str, help="Name of the Pinecone index")
    parser.add_argument("--openai_api_key", type=str, help="OpenAI API key")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    pinecone_indexer = OpenaiPineconeIndexer(args.index_name, args.pinecone_api_key, args.openai_api_key)
    pinecone_indexer.create_index()
15 changes: 15 additions & 0 deletions src/_openai/delete_index.py
@@ -0,0 +1,15 @@
from .docindex import OpenaiPineconeIndexer
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Deletes an Index on Pinecone.")
    parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
    parser.add_argument("--index_name", type=str, help="Name of the Pinecone index")
    parser.add_argument("--openai_api_key", type=str, help="OpenAI API key")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    pinecone_indexer = OpenaiPineconeIndexer(args.index_name, args.pinecone_api_key, args.openai_api_key)
    pinecone_indexer.delete_index()
3 changes: 1 addition & 2 deletions src/_openai/doc_index.py
@@ -6,7 +6,6 @@ def parse_args():
parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
parser.add_argument("--index_name", type=str, help="Name of the Pinecone index")
parser.add_argument("--openai_api_key", type=str, help="OpenAI API key")
parser.add_argument("--environment", type=str, help="Environment for Pinecone service")
parser.add_argument("--batch_limit", type=int, help="Maximum batch size for indexing")
parser.add_argument("--docs", nargs="+", help="URLs of the documents to be indexed")
parser.add_argument("--chunk_size", help="size of texts per chunk")
@@ -15,7 +14,7 @@ def parse_args():

if __name__ == "__main__":
args = parse_args()
pinecone_indexer = OpenaiPineconeIndexer(args.index_name, args.pinecone_api_key, args.environment, args.openai_api_key)
pinecone_indexer = OpenaiPineconeIndexer(args.index_name, args.pinecone_api_key, args.openai_api_key)
pinecone_indexer.index_documents(args.docs, args.batch_limit, args.chunk_size)


