From 90c7762ca23a01fa73f3152902c116e3cf5c2a8b Mon Sep 17 00:00:00 2001 From: KevKibe Date: Tue, 9 Apr 2024 15:20:59 +0300 Subject: [PATCH 1/5] fix: embed method in --- src/_google/docindex.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/_google/docindex.py b/src/_google/docindex.py index e5c83e1..ffaa0e2 100644 --- a/src/_google/docindex.py +++ b/src/_google/docindex.py @@ -8,6 +8,7 @@ from typing import List from _openai.doc_model import Page from langchain_pinecone import PineconeVectorStore +import google.generativeai as genai class GooglePineconeIndexer: """ @@ -46,7 +47,7 @@ def create_index(self, environment: str = "us-west1-gcp" ): print(f"Creating index {self.index_name}") self.pc.create_index( name=self.index_name, - dimension=1536, + dimension=768, metric="cosine", spec=PodSpec( environment=environment, @@ -97,19 +98,23 @@ def tiktoken_len(self, text: str) -> int: ) return len(tokens) - def embed(self) -> GoogleGenerativeAIEmbeddings: + def embed(self, sample_text: str) -> GoogleGenerativeAIEmbeddings: """ - Initialize GoogleGenerativeAIEmbeddings object. + Embeds the given sample text using Google's Generative AI. + + Args: + sample_text (str): The text to embed (upsert_documents passes a list of text chunks). Returns: - GoogleGenerativeAIEmbeddings: GoogleGenerativeAIEmbeddings object. + dict: The genai.embed_content response; the vector(s) are under its 'embedding' key. """ - return GoogleGenerativeAIEmbeddings( - model="models/embedding-001", - google_api_key=self.google_api_key - ) + genai.configure(api_key=self.google_api_key) + return genai.embed_content( + model='models/embedding-001', + content=sample_text, + task_type="retrieval_document" + ) - def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size: int = 256) -> None: """ Upsert documents into the Pinecone index. 
@@ -130,7 +135,6 @@ def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size: length_function=self.tiktoken_len, separators=["\n\n", "\n", " ", ""] ) - embed = self.embed() for i, record in enumerate(tqdm(documents)): metadata = { 'content': record.page_content, @@ -145,7 +149,8 @@ def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size: metadatas.extend(record_metadatas) if len(texts) >= batch_limit: ids = [str(uuid4()) for _ in range(len(texts))] - embeds = embed.embed_documents(texts) + embeds = self.embed(texts) + embeds = embeds['embedding'] index = self.pc.Index(self.index_name) index.upsert(vectors=zip(ids, embeds, metadatas), async_req=True) texts = [] From 45e39f2db90b0ddd92098cb6cbeb9fa569650a3d Mon Sep 17 00:00:00 2001 From: KevKibe Date: Tue, 9 Apr 2024 15:21:48 +0300 Subject: [PATCH 2/5] chore: update colab link --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 41e4217..5c439af 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,7 @@ ![Diagram](image.png) -*Efficiently store multiple document embeddings and their metadata, whether they're offline or online, in a persistent Pinecone Vector Database optimized for Retrieval Augmented Generation (RAG) applications fast* - + *Efficiently store multiple document embeddings and their metadata, whether they're offline or online, in a persistent Pinecone Vector Database optimized for Retrieval Augmented Generation (RAG) applications fast* ## Features - ⚡️ **Rapid Indexing**: Quickly index multiple documents along with their metadata, including source, page details, and content, into Pinecone DB.
@@ -30,6 +29,7 @@ pip install docindex ## Getting Started - Sign up to [Pinecone](https://www.pinecone.io/) and get an API key. ## Using OpenAI +[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1f5DzVjM9n9XWFtErTdWkKUFtszXyqzMI#scrollTo=olWrGV2viIsP) ```python from _openai.docindex import OpenaiPineconeIndexer From edfef1f69a37156373505f7fa660dc1f9bc08607 Mon Sep 17 00:00:00 2001 From: KevKibe Date: Tue, 9 Apr 2024 15:26:34 +0300 Subject: [PATCH 3/5] chore: add dependency --- requirements.txt | 3 ++- setup.cfg | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b6a6e9b..37040df 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,5 @@ langchain-community==0.0.31 langchain==0.1.14 langchain-openai==0.1.1 langchain-google-genai==1.0.1 -langchain-pinecone==0.1.0 \ No newline at end of file +langchain-pinecone==0.1.0 +google.generativeai==0.4.1 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 74f8024..c14b96e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,6 +22,7 @@ install_requires = langchain-openai==0.1.1 langchain-google-genai==1.0.1 langchain-pinecone==0.1.0 + google.generativeai==0.4.1 package_dir= =src From 82b68144e7168a17e961a97df9eb33bc13cea75b Mon Sep 17 00:00:00 2001 From: KevKibe Date: Tue, 9 Apr 2024 15:57:05 +0300 Subject: [PATCH 4/5] chore: update Colab link --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5c439af..38696e3 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ pinecone_indexer.delete_index() ## Using Google Generative AI - +[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1C4DyIsWMJWUmNuXEmmUMyshqoAKspFlp?usp=sharing) ```python from _google.docindex import GooglePineconeIndexer @@ -157,8 +157,8 @@ python -m _google.delete_index --pinecone_api_key "your_pinecone_api_key" 
--ind ``` ## Contributing +🌟 First consider giving it a star at the top right. It means a lot! Contributions are welcome and encouraged. - Before contributing, please take a moment to review our [Contribution Guidelines](https://github.com/KevKibe/docindex/blob/master/DOCS/CONTRIBUTING.md) for important information on how to contribute to this project. If you're unsure about anything or need assistance, don't hesitate to reach out to us or open an issue to discuss your ideas. From b857972c084ba52d324148d77e86a0bf969ee9ec Mon Sep 17 00:00:00 2001 From: KevKibe Date: Tue, 9 Apr 2024 15:59:56 +0300 Subject: [PATCH 5/5] update: version update --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index c14b96e..fa9f73b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 +1,7 @@ [metadata] name = docindex author = Kevin Kibe -version = 0.3.0 +version = 0.4.0 author_email = keviinkibe@gmail.com description = A package for fast indexing of multiple documents and their metadata on Pinecone. long_description = file: README.md