Update: Support for Google Embedding Service, GoogleGenerativeAIEmbeddings #4

Merged 4 commits on Apr 6, 2024
4 changes: 2 additions & 2 deletions README.md
@@ -4,7 +4,7 @@
<a href="https://github.com/KevKibe/docindex/commits/">
<img src="https://img.shields.io/github/last-commit/KevKibe/docindex?" alt="Last commit">
</a>
<a href="https://github.com/KevKibe/African-Whisper/blob/main/LICENSE">
<a href="https://github.com/KevKibe/docindex/blob/master/LICENSE">
<img src="https://img.shields.io/github/license/KevKibe/docindex?" alt="License">
</a>

@@ -37,7 +37,7 @@ environment = "pinecone-index-environment"
# Define the batch limit for indexing, how many pages per pass.
batch_limit = 20

- # List of URLs of the documents to be indexed
+ # List of URLs of the documents to be indexed (local files on your computer or online URLs)
urls = [
"your-document-1.pdf",
"your-document-2.pdf"
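As a usage sketch (not part of this diff), the configuration above feeds the new indexer roughly like this, assuming the package is importable from src/ and substituting real credentials:

from _google.docindex import GooglePineconeIndexer

# Placeholder values mirroring the README snippet above.
index_name = "your-pinecone-index-name"
pinecone_api_key = "your-pinecone-api-key"
environment = "pinecone-index-environment"
google_api_key = "your-google-api-key"
batch_limit = 20  # pages accumulated per upsert pass

urls = [
    "your-document-1.pdf",
    "your-document-2.pdf"
]

indexer = GooglePineconeIndexer(index_name, pinecone_api_key, environment, google_api_key)
indexer.index_documents(urls, batch_limit)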
3 changes: 2 additions & 1 deletion requirements.txt
@@ -4,4 +4,5 @@ pypdf==4.1.0
unstructured==0.12.6
langchain-community==0.0.31
langchain==0.1.14
- langchain-openai==0.1.1
+ langchain-openai==0.1.1
+ langchain-google-genai==1.0.1
Empty file added src/_google/__init__.py
Empty file.
157 changes: 157 additions & 0 deletions src/_google/docindex.py
@@ -0,0 +1,157 @@
from pinecone import Pinecone
from tqdm.auto import tqdm
from uuid import uuid4
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import tiktoken
from typing import List
from _openai.doc_model import Page


class GooglePineconeIndexer:
"""
    Class for indexing documents to Pinecone using Google Generative AI embeddings.
"""
def __init__(
self,
index_name: str,
pinecone_api_key: str,
environment: str,
google_api_key: str
) -> None:
"""
        Initialize the GooglePineconeIndexer object.

Args:
index_name (str): Name of the Pinecone index.
pinecone_api_key (str): Pinecone API key.
environment (str): Environment for Pinecone service.
google_api_key (str): Google API key.
"""
self.pc = Pinecone(api_key=pinecone_api_key, environment=environment)
self.index = self.pc.Index(index_name)
self.google_api_key = google_api_key
self.tokenizer = tiktoken.get_encoding('p50k_base')


    def load_pdf(self, pdf_url: str) -> List:
"""
Load and split a PDF document into pages.

Args:
pdf_url (str): URL of the PDF document.

Returns:
List: List of pages from the PDF document.
"""
loader = PyPDFLoader(pdf_url)
pages = loader.load_and_split()
return pages

def tiktoken_len(self, text: str) -> int:
"""
Calculate length of text in tokens.

Parameters:
text (str): Input text.

Returns:
int: Length of text in tokens.
"""
tokens = self.tokenizer.encode(
text,
disallowed_special=()
)
return len(tokens)

def embed(self) -> GoogleGenerativeAIEmbeddings:
"""
Initialize GoogleGenerativeAIEmbeddings object.

Returns:
GoogleGenerativeAIEmbeddings: GoogleGenerativeAIEmbeddings object.
"""
return GoogleGenerativeAIEmbeddings(
model="models/embedding-001",
google_api_key=self.google_api_key
)

def text_splitter(self) -> RecursiveCharacterTextSplitter:
"""
Initialize RecursiveCharacterTextSplitter object.

Returns:
RecursiveCharacterTextSplitter: RecursiveCharacterTextSplitter object.
"""
return RecursiveCharacterTextSplitter(
chunk_size=400,
chunk_overlap=20,
length_function=self.tiktoken_len,
separators=["\n\n", "\n", " ", ""]
)

def upsert_documents(self, documents: List[Page], batch_limit: int) -> None:
"""
Upsert documents into the Pinecone index.

Args:
documents (List[Page]): List of documents to upsert.
batch_limit (int): Maximum batch size for upsert operation.

Returns:
None
"""
texts = []
metadatas = []
text_splitter = self.text_splitter()
embed = self.embed()
for i, record in enumerate(tqdm(documents)):
            metadata = {
                'content': record.page_content,
                'page': record.page,      # page number within the source document
                'source': record.source   # file path or URL the page came from
            }
record_texts = text_splitter.split_text(record.page_content)
record_metadatas = [{
"chunk": j, "text": text, **metadata
} for j, text in enumerate(record_texts)]
texts.extend(record_texts)
metadatas.extend(record_metadatas)
            if len(texts) >= batch_limit:
                ids = [str(uuid4()) for _ in range(len(texts))]
                embeds = embed.embed_documents(texts)
                self.index.upsert(vectors=zip(ids, embeds, metadatas), async_req=True)
                texts = []
                metadatas = []
        # Flush any remaining chunks that did not fill a complete batch,
        # so the tail of the document list is not silently dropped.
        if texts:
            ids = [str(uuid4()) for _ in range(len(texts))]
            embeds = embed.embed_documents(texts)
            self.index.upsert(vectors=zip(ids, embeds, metadatas), async_req=True)


def index_documents(self, urls: List[str], batch_limit: int) -> None:
"""
Process a list of URLs and upsert documents to a Pinecone index.

Args:
urls (List[str]): List of URLs to process.
batch_limit (int): Batch limit for upserting documents.

Returns:
None
"""
for url in tqdm(urls, desc="Processing URLs"):
print(f"Processing URL: {url}")
pages = self.load_pdf(url)
print(f"Found {len(pages)} pages in the PDF.")
pages_data = [
Page(
page_content=page.page_content,
metadata=page.metadata,
page=page.metadata['page'],
source=page.metadata['source']
)
for page in pages
]

print(f"Upserting {len(pages_data)} pages to the Pinecone index...")
self.upsert_documents(pages_data, batch_limit)
print("Finished upserting documents for this URL.")
print("Indexing complete.")
18 changes: 18 additions & 0 deletions src/_google/main.py
@@ -0,0 +1,18 @@
from .docindex import GooglePineconeIndexer
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Index documents on Pinecone using Google Generative AI embeddings.")
    parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
    parser.add_argument("--index_name", type=str, help="Name of the Pinecone index")
    parser.add_argument("--google_api_key", type=str, help="Google API key")
parser.add_argument("--environment", type=str, help="Environment for Pinecone service")
parser.add_argument("--batch_limit", type=int, help="Maximum batch size for indexing")
parser.add_argument("--docs", nargs="+", help="URLs of the documents to be indexed")
return parser.parse_args()


if __name__ == "__main__":
args = parse_args()
pinecone_indexer = GooglePineconeIndexer(args.index_name, args.pinecone_api_key, args.environment, args.google_api_key)
pinecone_indexer.index_documents(args.docs, args.batch_limit)
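An invocation sketch for this entry point (all values are placeholders; because of the relative import, run it as a module from the src directory):

python -m _google.main --pinecone_api_key "your-pinecone-api-key" --index_name "your-index" --google_api_key "your-google-api-key" --environment "your-pinecone-environment" --batch_limit 20 --docs your-document-1.pdf your-document-2.pdf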