Skip to content

Commit

Permalink
Merge pull request #10 from KevKibe/main
Browse files Browse the repository at this point in the history
Fix: `embed` method in `GooglePineconeIndexer` class to embed chunks.
  • Loading branch information
KevKibe authored Apr 9, 2024
2 parents 80ed701 + b857972 commit fe868d5
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 17 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@

![Diagram](image.png)

*Efficiently store multiple document embeddings and their metadata, whether they're offline or online, in a persistent Pinecone Vector Database optimized for Retrieval Augmented Generation (RAG) applications fast*

*Efficiently store multiple document embeddings and their metadata, whether they're offline or online, in a persistent Pinecone Vector Database optimized for Retrieval Augmented Generation (RAG) applications — fast.*
## Features

- ⚡️ **Rapid Indexing**: Quickly index multiple documents along with their metadata, including source, page details, and content, into Pinecone DB.<br>
Expand All @@ -30,6 +29,7 @@ pip install docindex
## Getting Started
- Sign up to [Pinecone](https://www.pinecone.io/) and get an API key.
## Using OpenAI
[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1f5DzVjM9n9XWFtErTdWkKUFtszXyqzMI#scrollTo=olWrGV2viIsP)
```python
from _openai.docindex import OpenaiPineconeIndexer

Expand Down Expand Up @@ -65,7 +65,7 @@ pinecone_indexer.delete_index()


## Using Google Generative AI

[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1C4DyIsWMJWUmNuXEmmUMyshqoAKspFlp?usp=sharing)
```python
from _google.docindex import GooglePineconeIndexer

Expand Down Expand Up @@ -157,8 +157,8 @@ python -m _google.delete_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name"
```
## Contributing
🌟 First consider giving it a star at the top right. It means a lot!
Contributions are welcome and encouraged.
Before contributing, please take a moment to review our [Contribution Guidelines](https://github.com/KevKibe/docindex/blob/master/DOCS/CONTRIBUTING.md) for important information on how to contribute to this project.
If you're unsure about anything or need assistance, don't hesitate to reach out to us or open an issue to discuss your ideas.
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ langchain-community==0.0.31
langchain==0.1.14
langchain-openai==0.1.1
langchain-google-genai==1.0.1
langchain-pinecone==0.1.0
langchain-pinecone==0.1.0
google.generativeai==0.4.1
3 changes: 2 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[metadata]
name = docindex
author = Kevin Kibe
version = 0.3.0
version = 0.4.0
author_email = [email protected]
description = A package for fast indexing of multiple documents and their metadata on Pinecone.
long_description = file: README.md
Expand All @@ -22,6 +22,7 @@ install_requires =
langchain-openai==0.1.1
langchain-google-genai==1.0.1
langchain-pinecone==0.1.0
google.generativeai==0.4.1
package_dir=
=src

Expand Down
27 changes: 16 additions & 11 deletions src/_google/docindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from typing import List
from _openai.doc_model import Page
from langchain_pinecone import PineconeVectorStore
import google.generativeai as genai

class GooglePineconeIndexer:
"""
Expand Down Expand Up @@ -46,7 +47,7 @@ def create_index(self, environment: str = "us-west1-gcp" ):
print(f"Creating index {self.index_name}")
self.pc.create_index(
name=self.index_name,
dimension=1536,
dimension=768,
metric="cosine",
spec=PodSpec(
environment=environment,
Expand Down Expand Up @@ -97,19 +98,23 @@ def tiktoken_len(self, text: str) -> int:
)
return len(tokens)

def embed(self, sample_text: str) -> dict:
    """
    Embed the given text using Google's Generative AI embedding model.

    Args:
        sample_text (str): The text to embed. NOTE(review): callers in
            `upsert_documents` pass a *list* of chunk strings here, which
            `genai.embed_content` also accepts — confirm and widen the
            annotation to `str | list[str]` if that usage is intended.

    Returns:
        dict: The raw embedding response from `genai.embed_content`;
        the vector(s) live under the 'embedding' key (this is what
        `upsert_documents` reads). The previous annotation of
        `GoogleGenerativeAIEmbeddings` was incorrect — no embeddings
        client object is returned.
    """
    # Configure the SDK with this indexer's API key before the call.
    genai.configure(api_key=self.google_api_key)
    return genai.embed_content(
        model="models/embedding-001",
        content=sample_text,
        task_type="retrieval_document",
    )


def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size: int = 256) -> None:
"""
Upsert documents into the Pinecone index.
Expand All @@ -130,7 +135,6 @@ def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size:
length_function=self.tiktoken_len,
separators=["\n\n", "\n", " ", ""]
)
embed = self.embed()
for i, record in enumerate(tqdm(documents)):
metadata = {
'content': record.page_content,
Expand All @@ -145,7 +149,8 @@ def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size:
metadatas.extend(record_metadatas)
if len(texts) >= batch_limit:
ids = [str(uuid4()) for _ in range(len(texts))]
embeds = embed.embed_documents(texts)
embeds = self.embed(texts)
embeds = embeds['embedding']
index = self.pc.Index(self.index_name)
index.upsert(vectors=zip(ids, embeds, metadatas), async_req=True)
texts = []
Expand Down

0 comments on commit fe868d5

Please sign in to comment.