diff --git a/README.md b/README.md index 41e4217..38696e3 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,7 @@ ![Diagram](image.png) -*Efficiently store multiple document embeddings and their metadata, whether they're offline or online, in a persistent Pinecone Vector Database optimized for Retrieval Augmented Generation (RAG) applications fast* - + *Efficiently store multiple document embeddings and their metadata, whether they're offline or online, in a persistent Pinecone Vector Database optimized for Retrieval Augmented Generation (RAG) applications fast* ## Features - ⚡️ **Rapid Indexing**: Quickly index multiple documents along with their metadata, including source, page details, and content, into Pinecone DB.
@@ -30,6 +29,7 @@ pip install docindex ## Getting Started - Sign up to [Pinecone](https://www.pinecone.io/) and get an API key. ## Using OpenAI +[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1f5DzVjM9n9XWFtErTdWkKUFtszXyqzMI#scrollTo=olWrGV2viIsP) ```python from _openai.docindex import OpenaiPineconeIndexer @@ -65,7 +65,7 @@ pinecone_indexer.delete_index() ## Using Google Generative AI - +[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1C4DyIsWMJWUmNuXEmmUMyshqoAKspFlp?usp=sharing) ```python from _google.docindex import GooglePineconeIndexer @@ -157,8 +157,8 @@ python -m _google.delete_index --pinecone_api_key "your_pinecone_api_key" --ind ``` ## Contributing +🌟 First consider giving it a star at the top right. It means a lot! Contributions are welcome and encouraged. - Before contributing, please take a moment to review our [Contribution Guidelines](https://github.com/KevKibe/docindex/blob/master/DOCS/CONTRIBUTING.md) for important information on how to contribute to this project. If you're unsure about anything or need assistance, don't hesitate to reach out to us or open an issue to discuss your ideas. diff --git a/requirements.txt b/requirements.txt index b6a6e9b..37040df 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,5 @@ langchain-community==0.0.31 langchain==0.1.14 langchain-openai==0.1.1 langchain-google-genai==1.0.1 -langchain-pinecone==0.1.0 \ No newline at end of file +langchain-pinecone==0.1.0 +google-generativeai==0.4.1 \ No newline at end of file
long_description = file: README.md @@ -22,6 +22,7 @@ install_requires = langchain-openai==0.1.1 langchain-google-genai==1.0.1 langchain-pinecone==0.1.0 + google.generativeai==0.4.1 package_dir= =src diff --git a/src/_google/docindex.py b/src/_google/docindex.py index e5c83e1..ffaa0e2 100644 --- a/src/_google/docindex.py +++ b/src/_google/docindex.py @@ -8,6 +8,7 @@ from typing import List from _openai.doc_model import Page from langchain_pinecone import PineconeVectorStore +import google.generativeai as genai class GooglePineconeIndexer: """ @@ -46,7 +47,7 @@ def create_index(self, environment: str = "us-west1-gcp" ): print(f"Creating index {self.index_name}") self.pc.create_index( name=self.index_name, - dimension=1536, + dimension=768, metric="cosine", spec=PodSpec( environment=environment, @@ -97,19 +98,23 @@ def tiktoken_len(self, text: str) -> int: ) return len(tokens) - def embed(self) -> GoogleGenerativeAIEmbeddings: + def embed(self, sample_text: str) -> GoogleGenerativeAIEmbeddings: """ - Initialize GoogleGenerativeAIEmbeddings object. + Embeds the given sample text using Google's Generative AI. + + Args: + sample_text (str): The text to be embedded. Returns: - GoogleGenerativeAIEmbeddings: GoogleGenerativeAIEmbeddings object. + GoogleGenerativeAIEmbeddings: An object containing the embedded content. """ - return GoogleGenerativeAIEmbeddings( - model="models/embedding-001", - google_api_key=self.google_api_key - ) + genai.configure(api_key=self.google_api_key) + return genai.embed_content( + model='models/embedding-001', + content=sample_text, + task_type="retrieval_document" + ) - def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size: int = 256) -> None: """ Upsert documents into the Pinecone index. 
@@ -130,7 +135,6 @@ def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size: length_function=self.tiktoken_len, separators=["\n\n", "\n", " ", ""] ) - embed = self.embed() for i, record in enumerate(tqdm(documents)): metadata = { 'content': record.page_content, @@ -145,7 +149,8 @@ def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size: metadatas.extend(record_metadatas) if len(texts) >= batch_limit: ids = [str(uuid4()) for _ in range(len(texts))] - embeds = embed.embed_documents(texts) + embeds = self.embed(texts) + embeds = embeds['embedding'] index = self.pc.Index(self.index_name) index.upsert(vectors=zip(ids, embeds, metadatas), async_req=True) texts = []