Skip to content

Commit

Permalink
Merge pull request #10 from KevKibe/main
Browse files Browse the repository at this point in the history
Fix: `embed` method in `GooglePineconeIndexer` class to embed chunks.
  • Loading branch information
KevKibe authored Apr 9, 2024
2 parents 80ed701 + b857972 commit fe868d5
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 17 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@

![Diagram](image.png)

*Efficiently store multiple document embeddings and their metadata, whether they're offline or online, in a persistent Pinecone Vector Database optimized for Retrieval Augmented Generation (RAG) applications fast*

*Efficiently store multiple document embeddings and their metadata, whether they're offline or online, in a persistent Pinecone Vector Database optimized for Retrieval Augmented Generation (RAG) applications — fast.*
## Features

- ⚡️ **Rapid Indexing**: Quickly index multiple documents along with their metadata, including source, page details, and content, into Pinecone DB.<br>
Expand All @@ -30,6 +29,7 @@ pip install docindex
## Getting Started
- Sign up to [Pinecone](https://www.pinecone.io/) and get an API key.
## Using OpenAI
[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1f5DzVjM9n9XWFtErTdWkKUFtszXyqzMI#scrollTo=olWrGV2viIsP)
```python
from _openai.docindex import OpenaiPineconeIndexer

Expand Down Expand Up @@ -65,7 +65,7 @@ pinecone_indexer.delete_index()


## Using Google Generative AI

[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1C4DyIsWMJWUmNuXEmmUMyshqoAKspFlp?usp=sharing)
```python
from _google.docindex import GooglePineconeIndexer

Expand Down Expand Up @@ -157,8 +157,8 @@ python -m _google.delete_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name"
```
## Contributing
🌟 First consider giving it a star at the top right. It means a lot!
Contributions are welcome and encouraged.
Before contributing, please take a moment to review our [Contribution Guidelines](https://github.com/KevKibe/docindex/blob/master/DOCS/CONTRIBUTING.md) for important information on how to contribute to this project.
If you're unsure about anything or need assistance, don't hesitate to reach out to us or open an issue to discuss your ideas.
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ langchain-community==0.0.31
langchain==0.1.14
langchain-openai==0.1.1
langchain-google-genai==1.0.1
langchain-pinecone==0.1.0
langchain-pinecone==0.1.0
google.generativeai==0.4.1
3 changes: 2 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[metadata]
name = docindex
author = Kevin Kibe
version = 0.3.0
version = 0.4.0
author_email = [email protected]
description = A package for fast indexing of multiple documents and their metadata on Pinecone.
long_description = file: README.md
Expand All @@ -22,6 +22,7 @@ install_requires =
langchain-openai==0.1.1
langchain-google-genai==1.0.1
langchain-pinecone==0.1.0
google.generativeai==0.4.1
package_dir=
=src

Expand Down
27 changes: 16 additions & 11 deletions src/_google/docindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from typing import List
from _openai.doc_model import Page
from langchain_pinecone import PineconeVectorStore
import google.generativeai as genai

class GooglePineconeIndexer:
"""
Expand Down Expand Up @@ -46,7 +47,7 @@ def create_index(self, environment: str = "us-west1-gcp" ):
print(f"Creating index {self.index_name}")
self.pc.create_index(
name=self.index_name,
dimension=1536,
dimension=768,
metric="cosine",
spec=PodSpec(
environment=environment,
Expand Down Expand Up @@ -97,19 +98,23 @@ def tiktoken_len(self, text: str) -> int:
)
return len(tokens)

def embed(self, sample_text: str) -> dict:
    """
    Embed the given text using Google's Generative AI embedding model.

    Args:
        sample_text (str): The text to embed. NOTE(review): callers in
            `upsert_documents` pass a *list* of chunk strings here, which
            `genai.embed_content` also accepts — confirm and widen the
            annotation to `str | list[str]` if that usage is intended.

    Returns:
        dict: The raw embedding response from `genai.embed_content`;
        the vector(s) live under the 'embedding' key (this is what
        `upsert_documents` reads). The previous annotation of
        `GoogleGenerativeAIEmbeddings` was incorrect — no embeddings
        client object is returned.
    """
    # Configure the SDK with this indexer's API key before the call.
    genai.configure(api_key=self.google_api_key)
    return genai.embed_content(
        model="models/embedding-001",
        content=sample_text,
        task_type="retrieval_document",
    )


def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size: int = 256) -> None:
"""
Upsert documents into the Pinecone index.
Expand All @@ -130,7 +135,6 @@ def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size:
length_function=self.tiktoken_len,
separators=["\n\n", "\n", " ", ""]
)
embed = self.embed()
for i, record in enumerate(tqdm(documents)):
metadata = {
'content': record.page_content,
Expand All @@ -145,7 +149,8 @@ def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size:
metadatas.extend(record_metadatas)
if len(texts) >= batch_limit:
ids = [str(uuid4()) for _ in range(len(texts))]
embeds = embed.embed_documents(texts)
embeds = self.embed(texts)
embeds = embeds['embedding']
index = self.pc.Index(self.index_name)
index.upsert(vectors=zip(ids, embeds, metadatas), async_req=True)
texts = []
Expand Down

0 comments on commit fe868d5

Please sign in to comment.