Merge pull request #8 from KevKibe/main
Update: Add `parse_args()` for creating and deleting indexes from the CLI, remove `environment` as a parameter, and update `README.md`
KevKibe authored Apr 8, 2024
2 parents 54ccb58 + 4a2b850 commit 2bf632f
Showing 12 changed files with 200 additions and 57 deletions.
73 changes: 52 additions & 21 deletions README.md
@@ -1,4 +1,4 @@
<h1 align="center">DocIndex: Fast Document Embeddings Storage for RAG</h1>
<h1 align="center">DocIndex: Fast Persistent Document Embeddings Storage for RAG</h1>
<p align="center">

<a href="https://github.com/KevKibe/docindex/commits/">
@@ -7,8 +7,11 @@
<a href="https://github.com/KevKibe/docindex/blob/master/LICENSE">
<img src="https://img.shields.io/github/license/KevKibe/docindex?" alt="License">
</a>
<br>

*Efficiently store multiple document embeddings and their metadata, whether they're offline or online, in a Pinecone Vector Database optimized for Retrieval Augmented Generation (RAG) models Fast*
![Diagram](image.png)

*Efficiently store multiple document embeddings and their metadata, whether offline or online, in a persistent Pinecone Vector Database optimized for Retrieval Augmented Generation (RAG) models, fast.*

## Features

@@ -25,15 +28,15 @@ pip install docindex
```

## Getting Started
- Sign up for [Pinecone](https://www.pinecone.io/) and get an API key. One way to load it in code is sketched below.
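
The keys can be kept out of source code by reading them from environment variables. A minimal sketch (the variable names below are illustrative, not required by docindex):

```python
import os

# Load API keys from environment variables instead of hard-coding them
# (variable names are illustrative)
pinecone_api_key = os.environ["PINECONE_API_KEY"]
openai_api_key = os.environ["OPENAI_API_KEY"]
```
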
## Using OpenAI
```python
from _openai.docindex import OpenaiPineconeIndexer

# Replace these values with your actual Pinecone API key, index name, OpenAI API key, and environment
# Replace these values with your actual Pinecone API key, index name, and OpenAI API key
pinecone_api_key = "pinecone-api-key"
index_name = "pinecone-index-name"
openai_api_key = "openai-api-key"
environment = "pinecone-index-environment"
batch_limit = 20 # Batch limit for upserting documents
chunk_size = 256 # Optional: size of texts per chunk.

@@ -44,11 +47,18 @@ urls = [
]

# Initialize the Pinecone indexer
pinecone_index = OpenaiPineconeIndexer(index_name, pinecone_api_key, environment, openai_api_key)
pinecone_index = OpenaiPineconeIndexer(index_name, pinecone_api_key, openai_api_key)

# To create a new Index
pinecone_index.create_index()

# Index the documents with the specified URLs and batch limit
# Store the document embeddings with the specified URLs and batch limit
pinecone_index.index_documents(urls, batch_limit, chunk_size)
```
```python
# To delete the created Index
pinecone_index.delete_index()
```
## Initialize Vectorstore (using OpenAI)

```python
@@ -66,11 +76,8 @@ embed = OpenAIEmbeddings(
openai_api_key = openai_api_key
)

# Define the text field
text_field = "text"

# Initialize the Vectorstore with the Pinecone index and OpenAI embeddings
vectorstore = VectorStorePinecone(index, embed, text_field)
vectorstore = VectorStorePinecone(index, embed, "text")
```
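
Once the vectorstore is initialized, it can be queried directly. A minimal sketch of a similarity search (the query string and `k` value are illustrative):

```python
# Retrieve the top-k most similar chunks for a query (query and k are illustrative)
query = "What does the document say about vector databases?"
results = vectorstore.similarity_search(query, k=3)

for doc in results:
    print(doc.page_content)
```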


@@ -79,11 +86,10 @@ vectorstore = VectorStorePinecone(index, embed, text_field)
```python
from _google.docindex import GooglePineconeIndexer

# Replace these values with your actual Pinecone API key, index name, OpenAI API key, and environment
# Replace these values with your actual Pinecone API key, index name, and Google API key
pinecone_api_key = "pinecone-api-key"
index_name = "pinecone-index-name"
google_api_key = "google-api-key"
environment = "pinecone-index-environment"
batch_limit = 20 # Batch limit for upserting documents
chunk_size = 256 # Optional: size of texts per chunk.

@@ -93,12 +99,18 @@ urls = [
"your-document-2.pdf"
]

# Initialize the Pinecone indexer
pinecone_index = GooglePineconeIndexer(index_name, pinecone_api_key, environment, google_api_key)
pinecone_index = GooglePineconeIndexer(index_name, pinecone_api_key, google_api_key)

# Index the documents with the specified URLs and batch limit
# To create a new Index
pinecone_index.create_index()

# Store the document embeddings with the specified URLs and batch limit
pinecone_index.index_documents(urls, batch_limit, chunk_size)
```
```python
# To delete the created Index
pinecone_index.delete_index()
```


## Initialize Vectorstore (using Google Generative AI)
@@ -118,11 +130,8 @@ embed = GoogleGenerativeAIEmbeddings(
google_api_key=google_api_key
)

# Define the text field
text_field = "text"

# Initialize the Vectorstore with the Pinecone index and OpenAI embeddings
vectorstore = VectorStorePinecone(index, embed, text_field)
vectorstore = VectorStorePinecone(index, embed, "text")
```
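
As with the OpenAI setup, this vectorstore can also be wrapped as a retriever for use in a RAG chain. A minimal sketch, assuming the `vectorstore` object initialized above (the search settings are illustrative):

```python
# Expose the vectorstore as a LangChain retriever (search settings are illustrative)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
docs = retriever.get_relevant_documents("your question here")

for doc in docs:
    print(doc.page_content)
```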


@@ -155,16 +164,38 @@ pip install -r requirements.txt
```bash
cd src
```
- To create an index

```bash
# Using OpenAI
python -m _openai.create_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --openai_api_key "your_openai_api_key"
```

```bash
# Using Google Generative AI
python -m _google.create_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --google_api_key "your_google_api_key"
```

- Run the command to start indexing the documents

```bash
# Using OpenAI
python -m _openai.doc_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --openai_api_key "your_openai_api_key" --environment "your_environment" --batch_limit 10 --docs "doc-1.pdf" "doc-2.pdf' --chunk_size 256
python -m _openai.doc_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --openai_api_key "your_openai_api_key" --batch_limit 10 --docs "doc-1.pdf" "doc-2.pdf" --chunk_size 256
```
```bash
# Using Google Generative AI
python -m _google.doc_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --google_api_key "your_google_api_key" --environment "your_environment" --batch_limit 10 --docs "doc-1.pdf" "doc-2.pdf' --chunk_size 256
python -m _google.doc_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --google_api_key "your_google_api_key" --batch_limit 10 --docs "doc-1.pdf" "doc-2.pdf" --chunk_size 256
```
- To delete an index
```bash
# Using OpenAI
python -m _openai.delete_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --openai_api_key "your_openai_api_key"
```
```bash
# Using Google Generative AI
python -m _google.delete_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --google_api_key "your_google_api_key"
```
## Contributing
Binary file added image.png
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,7 +1,7 @@
[metadata]
name = docindex
author = Kevin Kibe
version = 0.1.0
version = 0.2.0
author_email = [email protected]
description = A package for fast indexing of multiple documents and their metadata on Pinecone.
long_description = file: README.md
15 changes: 15 additions & 0 deletions src/_google/create_index.py
@@ -0,0 +1,15 @@
from .docindex import GooglePineconeIndexer
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Creates an Index on Pinecone.")
    parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
    parser.add_argument("--index_name", type=str, help="Name of the Pinecone index")
    parser.add_argument("--google_api_key", type=str, help="Google API key")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    pinecone_indexer = GooglePineconeIndexer(args.index_name, args.pinecone_api_key, args.google_api_key)
    pinecone_indexer.create_index()
15 changes: 15 additions & 0 deletions src/_google/delete_index.py
@@ -0,0 +1,15 @@
from .docindex import GooglePineconeIndexer
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Deletes an Index on Pinecone.")
    parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
    parser.add_argument("--index_name", type=str, help="Name of the Pinecone index")
    parser.add_argument("--google_api_key", type=str, help="Google API key")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    pinecone_indexer = GooglePineconeIndexer(args.index_name, args.pinecone_api_key, args.google_api_key)
    pinecone_indexer.delete_index()
3 changes: 1 addition & 2 deletions src/_google/doc_index.py
@@ -6,7 +6,6 @@ def parse_args():
parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
parser.add_argument("--index_name", type=str, help="Name of the Pinecone index")
parser.add_argument("--google_api_key", type=str, help="Google API key")
parser.add_argument("--environment", type=str, help="Environment for Pinecone service")
parser.add_argument("--batch_limit", type=int, help="Maximum batch size for indexing")
parser.add_argument("--docs", nargs="+", help="URLs of the documents to be indexed")
parser.add_argument("--chunk_size", help="size of texts per chunk")
@@ -15,5 +14,5 @@ def parse_args():

if __name__ == "__main__":
args = parse_args()
pinecone_indexer = GooglePineconeIndexer(args.index_name, args.pinecone_api_key, args.environment, args.google_api_key)
pinecone_indexer = GooglePineconeIndexer(args.index_name, args.pinecone_api_key, args.google_api_key)
pinecone_indexer.index_documents(args.docs, args.batch_limit, args.chunk_size)
46 changes: 40 additions & 6 deletions src/_google/docindex.py
@@ -1,4 +1,4 @@
from pinecone import Pinecone
from pinecone import Pinecone, PodSpec
from tqdm.auto import tqdm
from uuid import uuid4
from langchain_community.document_loaders import PyPDFLoader
@@ -8,7 +8,6 @@
from typing import List
from _openai.doc_model import Page


class GooglePineconeIndexer:
"""
Class for indexing documents to Pinecone using GoogleGenerativeAIEmbeddings embeddings.
@@ -17,7 +16,6 @@ def __init__(
self,
index_name: str,
pinecone_api_key: str,
environment: str,
google_api_key: str
) -> None:
"""
@@ -29,12 +27,45 @@
environment (str): Environment for Pinecone service.
google_api_key (str): Google API key.
"""
self.pc = Pinecone(api_key=pinecone_api_key, environment=environment)
self.index = self.pc.Index(index_name)
self.pc = Pinecone(api_key=pinecone_api_key)
self.index_name = index_name
self.google_api_key = google_api_key
self.tokenizer = tiktoken.get_encoding('p50k_base')

def create_index(self, environment: str = "us-west1-gcp" ):
"""
Creates an index with the specified parameters.
Args:
environment (str, optional): The environment where the index will be created. Defaults to "us-west1-gcp".
Returns:
None
"""
print(f"Creating index {self.index_name}")
self.pc.create_index(
name=self.index_name,
dimension=1536,
metric="cosine",
spec=PodSpec(
environment=environment,
pod_type="p1.x1",
pods=1
)
)
return print(f"Index {self.index_name} created successfully!")

def delete_index(self):
"""
Deletes the created index.
Returns:
None
"""
print(f"Deleting index {self.index_name}")
self.pc.delete_index(self.index_name)
return print(f"Index {self.index_name} deleted successfully!")

def load_pdf(self, pdf_url) -> List:
"""
Load and split a PDF document into pages.
@@ -114,7 +145,8 @@ def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size:
if len(texts) >= batch_limit:
ids = [str(uuid4()) for _ in range(len(texts))]
embeds = embed.embed_documents(texts)
self.index.upsert(vectors=zip(ids, embeds, metadatas), async_req=True)
index = self.pc.Index(self.index_name)
index.upsert(vectors=zip(ids, embeds, metadatas), async_req=True)
texts = []
metadatas = []

@@ -148,4 +180,6 @@ def index_documents(self, urls: List[str], batch_limit: int, chunk_size: int = 2
print(f"Upserting {len(pages_data)} pages to the Pinecone index...")
self.upsert_documents(pages_data, batch_limit, chunk_size)
print("Finished upserting documents for this URL.")
index = self.pc.Index(self.index_name)
index.describe_index_stats()
print("Indexing complete.")
18 changes: 0 additions & 18 deletions src/_google/main.py

This file was deleted.

15 changes: 15 additions & 0 deletions src/_openai/create_index.py
@@ -0,0 +1,15 @@
from .docindex import OpenaiPineconeIndexer
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Creates an Index on Pinecone.")
    parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
    parser.add_argument("--index_name", type=str, help="Name of the Pinecone index")
    parser.add_argument("--openai_api_key", type=str, help="OpenAI API key")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    pinecone_indexer = OpenaiPineconeIndexer(args.index_name, args.pinecone_api_key, args.openai_api_key)
    pinecone_indexer.create_index()
15 changes: 15 additions & 0 deletions src/_openai/delete_index.py
@@ -0,0 +1,15 @@
from .docindex import OpenaiPineconeIndexer
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Deletes an Index on Pinecone.")
    parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
    parser.add_argument("--index_name", type=str, help="Name of the Pinecone index")
    parser.add_argument("--openai_api_key", type=str, help="OpenAI API key")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    pinecone_indexer = OpenaiPineconeIndexer(args.index_name, args.pinecone_api_key, args.openai_api_key)
    pinecone_indexer.delete_index()
3 changes: 1 addition & 2 deletions src/_openai/doc_index.py
@@ -6,7 +6,6 @@ def parse_args():
parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
parser.add_argument("--index_name", type=str, help="Name of the Pinecone index")
parser.add_argument("--openai_api_key", type=str, help="OpenAI API key")
parser.add_argument("--environment", type=str, help="Environment for Pinecone service")
parser.add_argument("--batch_limit", type=int, help="Maximum batch size for indexing")
parser.add_argument("--docs", nargs="+", help="URLs of the documents to be indexed")
parser.add_argument("--chunk_size", help="size of texts per chunk")
@@ -15,7 +14,7 @@ def parse_args():

if __name__ == "__main__":
args = parse_args()
pinecone_indexer = OpenaiPineconeIndexer(args.index_name, args.pinecone_api_key, args.environment, args.openai_api_key)
pinecone_indexer = OpenaiPineconeIndexer(args.index_name, args.pinecone_api_key, args.openai_api_key)
pinecone_indexer.index_documents(args.docs, args.batch_limit, args.chunk_size)


