Skip to content

Commit

Permalink
Merge pull request #18 from KevKibe/cohere-embedding-support
Browse files Browse the repository at this point in the history
Add support for embedding, vectorstore initialization, and RAG retrieval using Cohere.
  • Loading branch information
KevKibe authored Apr 30, 2024
2 parents de04360 + 7642579 commit 4917536
Show file tree
Hide file tree
Showing 27 changed files with 884 additions and 384 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ jobs:
PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }}
run: |
pytest
pytest src/tests/openaiindex_test.py
pytest src/tests/googleindex_test.py
pytest src/tests/cohereindex_test.py
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<h1 align="center">DocIndex: Fast Persistent Document Embeddings Storage for RAG</h1>
<h1 align="center">DocIndex: Fast Persistent Document Embeddings Storage for Production-Level RAG</h1>
<p align="center">

<a href="https://github.com/KevKibe/docindex/commits/">
Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ langchain-google-genai==1.0.1
langchain-pinecone==0.1.0
google.generativeai==0.4.1
python-dotenv==1.0.1
docx2txt==0.8
python-docx==1.1.0
markdown==3.6
langchain-core==0.1.46
langchain-core==0.1.46
langchain-cohere>=0.1.4
9 changes: 7 additions & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name = docindex
author = Kevin Kibe
version = 0.5.0
author_email = [email protected]
description = A package for fast indexing of multiple documents and their metadata on Pinecone.
description = A package for fast persistent storage of multiple document embeddings and their metadata into Pinecone for production-level RAG.
long_description = file: README.md
long_description_content_type = text/markdown
url = https://github.com/KevKibe/docindex
Expand All @@ -23,8 +23,13 @@ install_requires =
langchain-google-genai==1.0.1
langchain-pinecone==0.1.0
google.generativeai==0.4.1
python-dotenv==1.0.1
python-docx==1.1.0
markdown==3.6
langchain-core==0.1.46
langchain-cohere==0.1.4
package_dir=
=src

[options.packages.find]
where=src
where=src
Empty file added src/_cohere/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions src/_cohere/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
class Config:
    """Shared constants: the question-answering prompt template and default model names."""

    # Prompt template with two placeholders, {context} and {query}.
    # NOTE(review): the original whitespace inside this string is not visible here;
    # the lines are kept exactly as shown.
    template_str = """
You are very helpful assistant for question answering tasks. Use the pieces of retrieved context to answer question given. If you do not know
the answer, Just say that you do not know the answer instead of making up an answer.
Retrieved context: {context}
Query: {query}
"""

    # Default model identifiers; going by the attribute names, one each for the
    # Google, OpenAI and Cohere backends.
    default_google_model = "gemini-pro"
    default_openai_model = "gpt-3.5-turbo-0125"
    default_cohere_model = "command"
14 changes: 14 additions & 0 deletions src/_cohere/create_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from .doc_index import CoherePineconeIndexer
import argparse

def parse_args():
    """Parse the command-line arguments of the index-creation script.

    Returns:
        argparse.Namespace: with attributes ``pinecone_api_key`` (``None`` when
        the flag is omitted) and ``index_name``.

    Raises:
        SystemExit: raised by argparse (exit code 2) when ``--index_name`` is
        missing or an unknown flag is supplied.
    """
    parser = argparse.ArgumentParser(description="Creates an Index on Pinecone.")
    parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
    # The index name is mandatory: without it the script would hand ``None`` to
    # the indexer and only fail later, so reject the call up front instead.
    parser.add_argument("--index_name", type=str, required=True, help="Name of the Pinecone index")
    return parser.parse_args()


if __name__ == "__main__":
    # Script entry point: read the CLI flags, build the indexer, create the index.
    cli = parse_args()
    indexer = CoherePineconeIndexer(cli.index_name, cli.pinecone_api_key)
    indexer.create_index()
14 changes: 14 additions & 0 deletions src/_cohere/delete_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from .doc_index import CoherePineconeIndexer
import argparse

def parse_args():
    """Parse the command-line arguments of the index-deletion script.

    Returns:
        argparse.Namespace: with attributes ``pinecone_api_key`` (``None`` when
        the flag is omitted) and ``index_name``.

    Raises:
        SystemExit: raised by argparse (exit code 2) when ``--index_name`` is
        missing or an unknown flag is supplied.
    """
    parser = argparse.ArgumentParser(description="Deletes an Index on Pinecone.")
    parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
    # Deleting requires knowing which index to delete; fail fast at the CLI
    # instead of passing ``None`` on to the indexer.
    parser.add_argument("--index_name", type=str, required=True, help="Name of the Pinecone index")
    return parser.parse_args()


if __name__ == "__main__":
    # Script entry point: read the CLI flags, build the indexer, delete the index.
    cli = parse_args()
    indexer = CoherePineconeIndexer(cli.index_name, cli.pinecone_api_key)
    indexer.delete_index()
109 changes: 42 additions & 67 deletions src/_openai/docindex.py → src/_cohere/doc_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,45 +3,44 @@
from uuid import uuid4
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
import tiktoken
from typing import List
from .doc_model import Page
from _openai.doc_model import Page
from langchain_pinecone import PineconeVectorStore
from pathlib import Path
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_pinecone import PineconeVectorStore
from langchain_cohere import CohereEmbeddings
from langchain_community.llms import Cohere
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter
from langchain_openai import ChatOpenAI
from src.config import Config
from utils.config import Config
from utils.response_model import QueryResult
from langchain.output_parsers import PydanticOutputParser



class OpenaiPineconeIndexer:
class CoherePineconeIndexer:
"""
Class for indexing documents to Pinecone using OpenAI embeddings.
Class for indexing documents to Pinecone using CohereEmbeddings embeddings.
"""
def __init__(
self,
index_name: str,
pinecone_api_key: str,
openai_api_key: str
pinecone_api_key: str = None,
cohere_api_key: str = None
) -> None:
"""
Initialize the OpenAIPineconeIndexer object.
Initialize the CohereEmbeddings object.
Args:
index_name (str): Name of the Pinecone index.
pinecone_api_key (str): Pinecone API key.
environment (str): Environment for Pinecone service.
openai_api_key (str): OpenAI API key.
cohere_api_key (str): Cohere API key.
"""
self.pc = Pinecone(api_key=pinecone_api_key)
self.index_name = index_name
self.openai_api_key = openai_api_key
self.cohere_api_key = cohere_api_key
self.tokenizer = tiktoken.get_encoding('p50k_base')

def create_index(self, environment: str = "us-west1-gcp" ):
Expand All @@ -57,7 +56,7 @@ def create_index(self, environment: str = "us-west1-gcp" ):
print(f"Creating index {self.index_name}")
self.pc.create_index(
name=self.index_name,
dimension=1536,
dimension=768,
metric="cosine",
spec=PodSpec(
environment=environment,
Expand All @@ -67,7 +66,6 @@ def create_index(self, environment: str = "us-west1-gcp" ):
)
return print(f"Index {self.index_name} created successfully!")


def delete_index(self):
"""
Deletes the created index.
Expand All @@ -78,8 +76,8 @@ def delete_index(self):
print(f"Deleting index {self.index_name}")
self.pc.delete_index(self.index_name)
return print(f"Index {self.index_name} deleted successfully!")



def load_document(self, file_url: str) -> List[str]:
"""
Load a document from a given file URL and split it into pages.
Expand All @@ -99,33 +97,21 @@ def load_document(self, file_url: str) -> List[str]:
pages = []
file_path = Path(file_url)

# Determine file type and use the appropriate loader
file_extension = file_path.suffix

# Load and split PDF files
if file_extension == ".pdf":
loader = PyPDFLoader(file_url)
pages = loader.load_and_split()

# Load and split DOCX and DOC files
elif file_extension in ('.docx', '.doc'):
loader = UnstructuredWordDocumentLoader(file_url)
pages = loader.load_and_split()

# Load and split Markdown files
elif file_extension == '.md':
loader = UnstructuredMarkdownLoader(file_url)
pages = loader.load_and_split()

# Load and split HTML files
elif file_extension == '.html':
loader = UnstructuredHTMLLoader(file_url)
pages = loader.load_and_split()

# Return the list of pages
return pages



def tiktoken_len(self, text: str) -> int:
"""
Calculate length of text in tokens.
Expand All @@ -142,19 +128,16 @@ def tiktoken_len(self, text: str) -> int:
)
return len(tokens)

def embed(self) -> OpenAIEmbeddings:
def embed(self) -> CohereEmbeddings:
"""
Initialize OpenAIEmbeddings object.
Embeds the given sample text using Google's Generative AI.
Returns:
OpenAIEmbeddings: OpenAIEmbeddings object.
CohereEmbeddings: An object containing the embedded content.
"""
return OpenAIEmbeddings(
openai_api_key=self.openai_api_key
)
return CohereEmbeddings(model="embed-multilingual-v2.0",
cohere_api_key = self.cohere_api_key)


def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size: int = 256) -> None:
def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size: int = 128) -> None:
"""
Upsert documents into the Pinecone index.
Expand All @@ -174,7 +157,6 @@ def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size:
length_function=self.tiktoken_len,
separators=["\n\n", "\n", " ", ""]
)
embed = self.embed()
for i, record in enumerate(tqdm(documents)):
metadata = {
'content': record.page_content,
Expand All @@ -189,14 +171,15 @@ def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size:
metadatas.extend(record_metadatas)
if len(texts) >= batch_limit:
ids = [str(uuid4()) for _ in range(len(texts))]
embed = self.embed()
embeds = embed.embed_documents(texts)
index = self.pc.Index(self.index_name)
index.upsert(vectors=zip(ids, embeds, metadatas), async_req=True)
texts = []
metadatas = []


def index_documents(self, urls: List[str], batch_limit: int, chunk_size: int = 256) -> None:
def index_documents(self, urls: List[str], batch_limit: int = 32, chunk_size: int = 256) -> None:
"""
Process a list of URLs and upsert documents to a Pinecone index.
Expand Down Expand Up @@ -228,19 +211,15 @@ def index_documents(self, urls: List[str], batch_limit: int, chunk_size: int = 2
index = self.pc.Index(self.index_name)
print(index.describe_index_stats())
print("Indexing complete.")
return index


def initialize_vectorstore(self, index_name):
index = self.pc.Index(index_name)
embed = OpenAIEmbeddings(
model = 'text-embedding-ada-002',
openai_api_key = self.openai_api_key
)
vectorstore = PineconeVectorStore(index, embed, "text")
embed = CohereEmbeddings(model="embed-multilingual-v2.0",
cohere_api_key = self.cohere_api_key)
vectorstore = PineconeVectorStore(index,embed, "text")
return vectorstore


def retrieve_and_generate(self,query: str, index_name: str, model_name: str = 'gpt-3.5-turbo-1106', top_k: int =5):
def retrieve_and_generate(self,query: str, vector_store: str, model_name: str = 'gpt-3.5-turbo-1106', top_k: int =5):
"""
Retrieve documents from the Pinecone index and generate a response.
Args:
Expand All @@ -249,24 +228,20 @@ def retrieve_and_generate(self,query: str, index_name: str, model_name: str = 'g
model_name: The name of the model to use : defaults to 'gpt-3.5-turbo-1106'
top_k: The number of documents to retrieve from the index : defaults to 5
"""
llm = ChatOpenAI(model = Config.default_openai_model, openai_api_key = self.openai_api_key)
rag_prompt = PromptTemplate(template = Config.template_str, input_variables = ["query", "context"])

vector_store = self.initialize_vectorstore(index_name)
retriever = vector_store.as_retriver(search_kwargs = {"k": top_k})
llm = Cohere(model="command", cohere_api_key = self.cohere_api_key)
parser = PydanticOutputParser(pydantic_object=QueryResult)
rag_prompt = PromptTemplate(template = Config.template_str,
input_variables = ["query", "context"],
partial_variables={"format_instructions": parser.get_format_instructions()})
retriever = vector_store.as_retriever(search_kwargs = {"k": top_k})

rag_chain = (
{"context": itemgetter("query")| retriever,
"query": itemgetter("query"),
}
| rag_prompt
| llm
| StrOutputParser()
"query": itemgetter("query"),
}
| rag_prompt
| llm
| parser
)

return rag_chain.invoke({"query": query})






8 changes: 8 additions & 0 deletions src/_cohere/doc_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from pydantic import BaseModel, Field
from typing import Dict, Union

class Page(BaseModel):
    """Pydantic model describing a single page of a loaded document.

    Every field is declared with ``Field(...)``, i.e. all four are required.
    """

    # Text content of the page.
    page_content: str = Field(..., description="The content of the page")
    # Metadata mapping; keys are strings and values are restricted to str or int.
    metadata: Dict[str, Union[str, int]] = Field(..., description="Metadata about the document")
    # Page of the content. NOTE(review): whether this is 0- or 1-based is not shown here.
    page: int = Field(..., description="The page of the content")
    # Source of the document (a URL per the field description); an int is also accepted.
    source: Union[str, int] = Field(..., description="The source url of the document")
20 changes: 20 additions & 0 deletions src/_cohere/index_documents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from .doc_index import CoherePineconeIndexer
import argparse

def parse_args():
    """Parse the command-line arguments of the Cohere document-indexing script.

    Returns:
        argparse.Namespace: with attributes ``pinecone_api_key``, ``index_name``,
        ``cohere_api_key``, ``docs`` (list of document URLs, or ``None`` when
        omitted), ``batch_limit`` (default 32) and ``chunk_size`` (default 256).

    Raises:
        SystemExit: raised by argparse (exit code 2) when ``--index_name`` is
        missing or an argument is malformed.
    """
    parser = argparse.ArgumentParser(description="Index documents on Pinecone using Cohere embeddings.")
    parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
    # Mandatory: the indexer needs to know which index to write to.
    parser.add_argument("--index_name", type=str, required=True, help="Name of the Pinecone index")
    parser.add_argument("--cohere_api_key", type=str, help="Cohere API key")
    parser.add_argument("--docs", nargs="+", help="URLs of the documents to be indexed")

    # ``%(default)s`` lets argparse print the real default, so the help text
    # cannot drift from the value again.
    parser.add_argument("--batch_limit", type=int, default=32, help="Maximum batch size for indexing (default: %(default)s).")
    parser.add_argument("--chunk_size", type=int, default=256, help="Size of texts per chunk in tokens (default: %(default)s).")
    return parser.parse_args()


if __name__ == "__main__":
    # Script entry point: index the given documents, then initialise a vector
    # store on the same index (its return value is not used here).
    cli = parse_args()
    indexer = CoherePineconeIndexer(
        cli.index_name,
        cli.pinecone_api_key,
        cli.cohere_api_key,
    )
    indexer.index_documents(cli.docs, cli.batch_limit, cli.chunk_size)
    indexer.initialize_vectorstore(cli.index_name)
File renamed without changes.
2 changes: 1 addition & 1 deletion src/_google/create_index.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .docindex import GooglePineconeIndexer
from .doc_index import GooglePineconeIndexer
import argparse

def parse_args():
Expand Down
2 changes: 1 addition & 1 deletion src/_google/delete_index.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .docindex import GooglePineconeIndexer
from .doc_index import GooglePineconeIndexer
import argparse

def parse_args():
Expand Down
Loading

0 comments on commit 4917536

Please sign in to comment.