From 73402c7bbb4439667400d759f349749dc314485b Mon Sep 17 00:00:00 2001 From: KevKibe Date: Sat, 6 Apr 2024 19:16:48 +0300 Subject: [PATCH 01/13] update: langchain-google-genai dependency --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 57a283b..400364b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ pypdf==4.1.0 unstructured==0.12.6 langchain-community==0.0.31 langchain==0.1.14 -langchain-openai==0.1.1 \ No newline at end of file +langchain-openai==0.1.1 +langchain-google-genai==1.0.1 \ No newline at end of file From b497d975844c4bee34f2767096b59dbc6fca5e64 Mon Sep 17 00:00:00 2001 From: KevKibe Date: Sat, 6 Apr 2024 19:17:51 +0300 Subject: [PATCH 02/13] update: indexer that uses google embedding model --- src/_google/__init__.py | 0 src/_google/docindex.py | 157 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 src/_google/__init__.py create mode 100644 src/_google/docindex.py diff --git a/src/_google/__init__.py b/src/_google/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/_google/docindex.py b/src/_google/docindex.py new file mode 100644 index 0000000..2f9ef8c --- /dev/null +++ b/src/_google/docindex.py @@ -0,0 +1,157 @@ +from pinecone import Pinecone +from tqdm.auto import tqdm +from uuid import uuid4 +from langchain_community.document_loaders import PyPDFLoader +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_google_genai import GoogleGenerativeAIEmbeddings +import tiktoken +from typing import List +from _openai.doc_model import Page + + +class GooglePineconeIndexer: + """ + Class for indexing documents to Pinecone using GoogleGenerativeAIEmbeddings embeddings. + """ + def __init__( + self, + index_name: str, + pinecone_api_key: str, + environment: str, + google_api_key: str + ) -> None: + """ + Initialize the GoogleGenerativeAIEmbeddings object. 
+ + Args: + index_name (str): Name of the Pinecone index. + pinecone_api_key (str): Pinecone API key. + environment (str): Environment for Pinecone service. + google_api_key (str): Google API key. + """ + self.pc = Pinecone(api_key=pinecone_api_key, environment=environment) + self.index = self.pc.Index(index_name) + self.google_api_key = google_api_key + self.tokenizer = tiktoken.get_encoding('p50k_base') + + + def load_pdf(self, pdf_url) -> List: + """ + Load and split a PDF document into pages. + + Args: + pdf_url (str): URL of the PDF document. + + Returns: + List: List of pages from the PDF document. + """ + loader = PyPDFLoader(pdf_url) + pages = loader.load_and_split() + return pages + + def tiktoken_len(self, text: str) -> int: + """ + Calculate length of text in tokens. + + Parameters: + text (str): Input text. + + Returns: + int: Length of text in tokens. + """ + tokens = self.tokenizer.encode( + text, + disallowed_special=() + ) + return len(tokens) + + def embed(self) -> GoogleGenerativeAIEmbeddings: + """ + Initialize GoogleGenerativeAIEmbeddings object. + + Returns: + GoogleGenerativeAIEmbeddings: GoogleGenerativeAIEmbeddings object. + """ + return GoogleGenerativeAIEmbeddings( + model="models/embedding-001", + google_api_key=self.google_api_key + ) + + def text_splitter(self) -> RecursiveCharacterTextSplitter: + """ + Initialize RecursiveCharacterTextSplitter object. + + Returns: + RecursiveCharacterTextSplitter: RecursiveCharacterTextSplitter object. + """ + return RecursiveCharacterTextSplitter( + chunk_size=400, + chunk_overlap=20, + length_function=self.tiktoken_len, + separators=["\n\n", "\n", " ", ""] + ) + + def upsert_documents(self, documents: List[Page], batch_limit: int) -> None: + """ + Upsert documents into the Pinecone index. + + Args: + documents (List[Page]): List of documents to upsert. + batch_limit (int): Maximum batch size for upsert operation. 
+ + Returns: + None + """ + texts = [] + metadatas = [] + text_splitter = self.text_splitter() + embed = self.embed() + for i, record in enumerate(tqdm(documents)): + metadata = { + 'content': record.page_content, + 'source': record.page, + 'title': record.source + } + record_texts = text_splitter.split_text(record.page_content) + record_metadatas = [{ + "chunk": j, "text": text, **metadata + } for j, text in enumerate(record_texts)] + texts.extend(record_texts) + metadatas.extend(record_metadatas) + if len(texts) >= batch_limit: + ids = [str(uuid4()) for _ in range(len(texts))] + embeds = embed.embed_documents(texts) + self.index.upsert(vectors=zip(ids, embeds, metadatas), async_req=True) + texts = [] + metadatas = [] + + + def index_documents(self, urls: List[str], batch_limit: int) -> None: + """ + Process a list of URLs and upsert documents to a Pinecone index. + + Args: + urls (List[str]): List of URLs to process. + batch_limit (int): Batch limit for upserting documents. + + Returns: + None + """ + for url in tqdm(urls, desc="Processing URLs"): + print(f"Processing URL: {url}") + pages = self.load_pdf(url) + print(f"Found {len(pages)} pages in the PDF.") + pages_data = [ + Page( + page_content=page.page_content, + metadata=page.metadata, + page=page.metadata['page'], + source=page.metadata['source'] + ) + for page in pages + ] + + print(f"Upserting {len(pages_data)} pages to the Pinecone index...") + self.upsert_documents(pages_data, batch_limit) + print("Finished upserting documents for this URL.") + print("Indexing complete.") From 1b98229f3d8035043f31162949a4a88809c7afa1 Mon Sep 17 00:00:00 2001 From: KevKibe Date: Sat, 6 Apr 2024 19:18:38 +0300 Subject: [PATCH 03/13] update: cli args for indexing --- src/_google/main.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 src/_google/main.py diff --git a/src/_google/main.py b/src/_google/main.py new file mode 100644 index 0000000..dba5156 --- /dev/null +++ b/src/_google/main.py @@ 
-0,0 +1,18 @@ +from .docindex import GooglePineconeIndexer +import argparse + +def parse_args(): + parser = argparse.ArgumentParser(description="Index documents on Pinecone using OpenAI embeddings.") + parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key") + parser.add_argument("--index_name", type=str, help="Name of the Pinecone index") + parser.add_argument("--google_api_key", type=str, help="OpenAI API key") + parser.add_argument("--environment", type=str, help="Environment for Pinecone service") + parser.add_argument("--batch_limit", type=int, help="Maximum batch size for indexing") + parser.add_argument("--docs", nargs="+", help="URLs of the documents to be indexed") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + pinecone_indexer = GooglePineconeIndexer(args.index_name, args.pinecone_api_key, args.environment, args.google_api_key) + pinecone_indexer.index_documents(args.docs, args.batch_limit) From 175c5e72df79da227098f0c2f9b2177b4519f8f6 Mon Sep 17 00:00:00 2001 From: KevKibe Date: Sat, 6 Apr 2024 19:19:23 +0300 Subject: [PATCH 04/13] Uodate: README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 627a1f0..97f9926 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ environment = "pinecone-index-environment" # Define the batch limit for indexing, how many pages per pass. batch_limit = 20 -# List of URLs of the documents to be indexed +# List of URLs of the documents to be indexed. 
(offline on your computer or an online) urls = [ "your-document-1.pdf", "your-document-2.pdf" From 10bfa4e8e408d3d4fa7ccf8ff873485b325d7f7c Mon Sep 17 00:00:00 2001 From: KevKibe Date: Mon, 8 Apr 2024 14:09:39 +0300 Subject: [PATCH 05/13] update: readme usage instructions --- README.md | 106 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 93 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 97f9926..7e09410 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -

DocIndex: Fast Document Storage for RAG

+

DocIndex: Fast Document Embeddings Storage for RAG

@@ -8,14 +8,14 @@ License -*Efficiently store multiple documents and their metadata, whether they're offline or online, in a Pinecone Vector Database optimized for Retrieval Augmented Generation (RAG) models Fast* +*Efficiently store multiple document embeddings and their metadata, whether they're offline or online, in a Pinecone Vector Database optimized for Retrieval Augmented Generation (RAG) models Fast* ## Features - ⚡️ **Rapid Indexing**: Quickly index multiple documents along with their metadata, including source, page details, and content, into Pinecone DB.
- 📚 **Document Flexibility**: Index documents from your local storage or online sources with ease.
- 📂 **Format Support**: Seamlessly handle various document formats, including PDF, docx(in-development), etc.
-- 🔁 **Embedding Services Integration**: Enjoy support for multiple embedding services such as OpenAIEmbeddings, GoogleGenerativeAIEmbeddings and more in development.
+- 🔁 **Embedding Services Integration**: Enjoy support for multiple embedding services such as OpenAI Embeddings, Google Generative AI Embeddings and more in development.
- 🛠️ **Configurable Vectorstore**: Configure a vectorstore directly from the index to facilitate RAG pipelines effortlessly. ## Setup @@ -24,7 +24,8 @@ pip install docindex ``` -## Usage +## Getting Started +## Using OpenAI ```python from _openai.index import OpenaiPineconeIndexer @@ -33,11 +34,10 @@ pinecone_api_key = "pinecone-api-key" index_name = "pinecone-index-name" openai_api_key = "openai-api-key" environment = "pinecone-index-environment" +batch_limit = 20 # Batch limit for upserting documents +chunk_size = 256 # Optional: size of texts per chunk. -# Define the batch limit for indexing, how many pages per pass. -batch_limit = 20 - -# List of URLs of the documents to be indexed. (offline on your computer or an online) +# List of URLs of the documents to be indexed. (offline on your computer or online) urls = [ "your-document-1.pdf", "your-document-2.pdf" @@ -47,10 +47,9 @@ urls = [ pinecone_index = OpenaiPineconeIndexer(index_name, pinecone_api_key, environment, openai_api_key) # Index the documents with the specified URLs and batch limit -pinecone_index.index_documents(urls,batch_limit) +pinecone_index.index_documents(urls,batch_limit,chunk_size) ``` - -## Initialize Vectorstore +## Initialize Vectorstore(using OpenAI) ```python from pinecone import Pinecone as IndexPinecone @@ -74,6 +73,61 @@ text_field = "text" vectorstore = VectorStorePinecone(index, embed.embed_query, text_field) ``` + +## Using Google Generative AI + +```python +from _google.index import GooglePineconeIndexer + +# Replace these values with your actual Pinecone API key, index name, OpenAI API key, and environment +pinecone_api_key = "pinecone-api-key" +index_name = "pinecone-index-name" +google_api_key = "google-api-key" +environment = "pinecone-index-environment" +batch_limit = 20 # Batch limit for upserting documents +chunk_size = 256 # Optional: size of texts per chunk. + +# List of URLs of the documents to be indexed. 
(offline on your computer or online) +urls = [ + "your-document-1.pdf", + "your-document-2.pdf" +] + +# Initialize the Pinecone indexer +pinecone_index = GooglePineconeIndexer(index_name, pinecone_api_key, environment, google_api_key) + +# Index the documents with the specified URLs and batch limit +pinecone_index.index_documents(urls,batch_limit,chunk_size) +``` + + +## Initialize Vectorstore(using Google Generative AI) + +```python +from pinecone import Pinecone as IndexPinecone +from langchain_community.vectorstores import Pinecone as VectorStorePinecone +from langchain_google_genai import GoogleGenerativeAIEmbeddings + +# Initialize the Pinecone index +index_pc = IndexPinecone(api_key=pinecone_api_key) +index = index_pc.Index(index_name) + +# Initialize embeddings +embed = GoogleGenerativeAIEmbeddings( + model="models/embedding-001", + google_api_key=google_api_key + ) + +# Define the text field +text_field = "text" + +# Initialize the Vectorstore with the Pinecone index and OpenAI embeddings +vectorstore = VectorStorePinecone(index, embed.embed_query, text_field) +``` + + + + ## Using the CLI - Clone the Repository: Clone or download the application code to your local machine. @@ -83,10 +137,13 @@ git clone https://github.com/KevKibe/docindex.git - Create a virtual environment for the project and activate it. 
```bash +# Navigate to project repository cd docindex +# create virtual environment python -m venv venv +# activate virtual environment source venv/bin/activate ``` - Install dependencies by running this command @@ -94,10 +151,33 @@ source venv/bin/activate pip install -r requirements.txt ``` -- Navigate to src and run this command to index documents +- Navigate to src ```bash cd src +``` -python -m _openai.doc_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --openai_api_key "your_openai_api_key" --environment "your_environment" --batch_limit 10 --docs "doc-1.pdf" "doc-2.pdf' +- Run the command to start indexing the documents +```bash +# Using OpenAI +python -m _openai.doc_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --openai_api_key "your_openai_api_key" --environment "your_environment" --batch_limit 10 --docs "doc-1.pdf" "doc-2.pdf" --chunk_size 256 +``` +```bash +# Using Google Generative AI +python -m _google.doc_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --google_api_key "your_google_api_key" --environment "your_environment" --batch_limit 10 --docs "doc-1.pdf" "doc-2.pdf" --chunk_size 256 ``` + +## Contributing +Contributions are welcome and encouraged. + +Before contributing, please take a moment to review our [Contribution Guidelines](https://github.com/KevKibe/docindex/blob/master/DOCS/CONTRIBUTING.md) for important information on how to contribute to this project. + +If you're unsure about anything or need assistance, don't hesitate to reach out to us or open an issue to discuss your ideas. + +We look forward to your contributions! + +## License +This project is licensed under the MIT License - see the [LICENSE](https://github.com/KevKibe/docindex/blob/master/LICENSE) file for details. 
+ +## Contact +For any enquiries, please reach out to me through keviinkibe@gmail.com \ No newline at end of file From e529ac96acc33e6242a013446001fd4f04f44281 Mon Sep 17 00:00:00 2001 From: KevKibe Date: Mon, 8 Apr 2024 14:11:10 +0300 Subject: [PATCH 06/13] update: docs section code of condunct and contribution --- DOCS/CODE_OF_CONDUCT.md | 45 ++++++++++++++++++++++++++++++++++++++ DOCS/CONTRIBUTING.md | 48 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 DOCS/CODE_OF_CONDUCT.md create mode 100644 DOCS/CONTRIBUTING.md diff --git a/DOCS/CODE_OF_CONDUCT.md b/DOCS/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..56df9c1 --- /dev/null +++ b/DOCS/CODE_OF_CONDUCT.md @@ -0,0 +1,45 @@ +# Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 
+ +## Our Standards + +Examples of behavior that contributes to creating a positive environment include: + +- Using welcoming and inclusive language +- Being respectful of differing viewpoints and experiences +- Gracefully accepting constructive criticism +- Focusing on what is best for the community +- Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +- The use of sexualized language or imagery and unwelcome sexual attention or advances +- Trolling, insulting/derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others' private information, such as a physical or email address, without their explicit permission +- Other conduct which could reasonably be considered inappropriate in a professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned with this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project email address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at [keviinkibe@gmail.com](mailto:keviinkibe@gmail.com). All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 2.0, available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +For answers to common questions about this code of conduct, see the FAQ at https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. diff --git a/DOCS/CONTRIBUTING.md b/DOCS/CONTRIBUTING.md new file mode 100644 index 0000000..cbca8d4 --- /dev/null +++ b/DOCS/CONTRIBUTING.md @@ -0,0 +1,48 @@ +# Contributing to DocIndex +Welcome to DocIndex! We appreciate your interest in contributing to our open-source project. Please take a moment to review the following guidelines to ensure a smooth and collaborative experience for everyone. + +## Code of Conduct + +Before contributing, please read and adhere to our [Code of Conduct](https://github.com/KevKibe/docindex/blob/master/DOCS/CODE_OF_CONDUCT.md). We are committed to fostering an inclusive and respectful community. + +## How to Contribute + +### Reporting Bugs or Issues + +If you encounter a bug or issue with the project, please search the [issue tracker](https://github.com/KevKibe/docindex/issues) to see if it has already been reported. 
If not, please open a new issue with a clear and descriptive title, along with detailed steps to reproduce the issue. + +### Suggesting Enhancements or New Features + +We welcome suggestions for enhancements or new features. Please open a new issue with a clear description of the enhancement or feature you'd like to see, along with any relevant context or use cases. + +### Submitting Pull Requests + +We appreciate contributions via pull requests. Before submitting a pull request, please ensure that: + + +- You have added appropriate tests (if applicable) +- Your pull request addresses a specific issue or feature request + +Please reference the relevant issue or feature request in your pull request description. + +## Getting Started + +To get started with contributing to DocIndex, follow these steps: + +1. Fork the repository and clone it to your local machine. +2. Install dependencies by running `pip install -r requirements.txt` (or equivalent). +3. Create a new branch for your changes: `git checkout -b my-feature-branch`. +4. Make your changes and commit them: `git commit -am 'Add new feature'`. +5. Push your changes to your fork: `git push origin my-feature-branch`. +6. Submit a pull request to the repository's `master` branch. + +## Communication + + + +## License + +By contributing to DocIndex, you agree to license your contributions under the [project's license](https://github.com/KevKibe/docindex/blob/master/LICENSE). + +Thank you for your contributions! 
+ From 816017ca1607e6ae5114008474bdf6abcf123c1b Mon Sep 17 00:00:00 2001 From: KevKibe Date: Mon, 8 Apr 2024 14:59:05 +0300 Subject: [PATCH 07/13] update: add chunk size as argument to upsert documents method and move text splitter method to upsert document method --- src/_google/docindex.py | 28 +++++++++++----------------- src/_google/main.py | 18 ------------------ 2 files changed, 11 insertions(+), 35 deletions(-) delete mode 100644 src/_google/main.py diff --git a/src/_google/docindex.py b/src/_google/docindex.py index 2f9ef8c..de41e4c 100644 --- a/src/_google/docindex.py +++ b/src/_google/docindex.py @@ -77,34 +77,27 @@ def embed(self) -> GoogleGenerativeAIEmbeddings: google_api_key=self.google_api_key ) - def text_splitter(self) -> RecursiveCharacterTextSplitter: - """ - Initialize RecursiveCharacterTextSplitter object. - - Returns: - RecursiveCharacterTextSplitter: RecursiveCharacterTextSplitter object. - """ - return RecursiveCharacterTextSplitter( - chunk_size=400, - chunk_overlap=20, - length_function=self.tiktoken_len, - separators=["\n\n", "\n", " ", ""] - ) - def upsert_documents(self, documents: List[Page], batch_limit: int) -> None: + def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size: int = 256) -> None: """ Upsert documents into the Pinecone index. Args: documents (List[Page]): List of documents to upsert. batch_limit (int): Maximum batch size for upsert operation. + chunks_size(int): size of texts per chunk. 
Returns: None """ texts = [] metadatas = [] - text_splitter = self.text_splitter() + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=int(chunk_size), + chunk_overlap=20, + length_function=self.tiktoken_len, + separators=["\n\n", "\n", " ", ""] + ) embed = self.embed() for i, record in enumerate(tqdm(documents)): metadata = { @@ -126,13 +119,14 @@ def upsert_documents(self, documents: List[Page], batch_limit: int) -> None: metadatas = [] - def index_documents(self, urls: List[str], batch_limit: int) -> None: + def index_documents(self, urls: List[str], batch_limit: int, chunk_size: int = 256) -> None: """ Process a list of URLs and upsert documents to a Pinecone index. Args: urls (List[str]): List of URLs to process. batch_limit (int): Batch limit for upserting documents. + chunks_size(int): size of texts per chunk. Returns: None @@ -152,6 +146,6 @@ def index_documents(self, urls: List[str], batch_limit: int) -> None: ] print(f"Upserting {len(pages_data)} pages to the Pinecone index...") - self.upsert_documents(pages_data, batch_limit) + self.upsert_documents(pages_data, batch_limit, chunk_size) print("Finished upserting documents for this URL.") print("Indexing complete.") diff --git a/src/_google/main.py b/src/_google/main.py deleted file mode 100644 index dba5156..0000000 --- a/src/_google/main.py +++ /dev/null @@ -1,18 +0,0 @@ -from .docindex import GooglePineconeIndexer -import argparse - -def parse_args(): - parser = argparse.ArgumentParser(description="Index documents on Pinecone using OpenAI embeddings.") - parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key") - parser.add_argument("--index_name", type=str, help="Name of the Pinecone index") - parser.add_argument("--google_api_key", type=str, help="OpenAI API key") - parser.add_argument("--environment", type=str, help="Environment for Pinecone service") - parser.add_argument("--batch_limit", type=int, help="Maximum batch size for indexing") - parser.add_argument("--docs", 
nargs="+", help="URLs of the documents to be indexed") - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - pinecone_indexer = GooglePineconeIndexer(args.index_name, args.pinecone_api_key, args.environment, args.google_api_key) - pinecone_indexer.index_documents(args.docs, args.batch_limit) From afd8dff381a7627aeebaed223d388a5b341fb9e3 Mon Sep 17 00:00:00 2001 From: KevKibe Date: Mon, 8 Apr 2024 15:00:02 +0300 Subject: [PATCH 08/13] update: cli args for google indexing --- src/_google/doc_index.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 src/_google/doc_index.py diff --git a/src/_google/doc_index.py b/src/_google/doc_index.py new file mode 100644 index 0000000..bebe166 --- /dev/null +++ b/src/_google/doc_index.py @@ -0,0 +1,19 @@ +from .docindex import GooglePineconeIndexer +import argparse + +def parse_args(): + parser = argparse.ArgumentParser(description="Index documents on Pinecone using OpenAI embeddings.") + parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key") + parser.add_argument("--index_name", type=str, help="Name of the Pinecone index") + parser.add_argument("--google_api_key", type=str, help="OpenAI API key") + parser.add_argument("--environment", type=str, help="Environment for Pinecone service") + parser.add_argument("--batch_limit", type=int, help="Maximum batch size for indexing") + parser.add_argument("--docs", nargs="+", help="URLs of the documents to be indexed") + parser.add_argument("--chunk_size", help="size of texts per chunk") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + pinecone_indexer = GooglePineconeIndexer(args.index_name, args.pinecone_api_key, args.environment, args.google_api_key) + pinecone_indexer.index_documents(args.docs, args.batch_limit, args.chunk_size) From 7a75c52bfc50b39b9e0854dc5fa3f07bfa3653a8 Mon Sep 17 00:00:00 2001 From: KevKibe Date: Mon, 8 Apr 2024 15:02:03 +0300 Subject: [PATCH 09/13] 
update: add chunksize parameter to index_documents method --- src/_openai/doc_index.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/_openai/doc_index.py b/src/_openai/doc_index.py index 7758297..6c3d9c9 100644 --- a/src/_openai/doc_index.py +++ b/src/_openai/doc_index.py @@ -9,13 +9,14 @@ def parse_args(): parser.add_argument("--environment", type=str, help="Environment for Pinecone service") parser.add_argument("--batch_limit", type=int, help="Maximum batch size for indexing") parser.add_argument("--docs", nargs="+", help="URLs of the documents to be indexed") + parser.add_argument("--chunk_size", help="size of texts per chunk") return parser.parse_args() if __name__ == "__main__": args = parse_args() pinecone_indexer = OpenaiPineconeIndexer(args.index_name, args.pinecone_api_key, args.environment, args.openai_api_key) - pinecone_indexer.index_documents(args.docs, args.batch_limit) + pinecone_indexer.index_documents(args.docs, args.batch_limit, args.chunk_size) \ No newline at end of file From 355aee297b4cc8ca5638bb7ece72fb0132c3498e Mon Sep 17 00:00:00 2001 From: KevKibe Date: Mon, 8 Apr 2024 15:02:21 +0300 Subject: [PATCH 10/13] update: add chunk size as argument to upsert documents method and move text splitter method to upsert document method --- src/_openai/docindex.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/src/_openai/docindex.py b/src/_openai/docindex.py index 72d397a..ddaf2ea 100644 --- a/src/_openai/docindex.py +++ b/src/_openai/docindex.py @@ -76,34 +76,27 @@ def embed(self) -> OpenAIEmbeddings: openai_api_key=self.openai_api_key ) - def text_splitter(self) -> RecursiveCharacterTextSplitter: - """ - Initialize RecursiveCharacterTextSplitter object. - - Returns: - RecursiveCharacterTextSplitter: RecursiveCharacterTextSplitter object. 
- """ - return RecursiveCharacterTextSplitter( - chunk_size=400, - chunk_overlap=20, - length_function=self.tiktoken_len, - separators=["\n\n", "\n", " ", ""] - ) - def upsert_documents(self, documents: List[Page], batch_limit: int) -> None: + def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size: int = 256) -> None: """ Upsert documents into the Pinecone index. Args: documents (List[Page]): List of documents to upsert. batch_limit (int): Maximum batch size for upsert operation. + chunks_size(int): size of texts per chunk. Returns: None """ texts = [] metadatas = [] - text_splitter = self.text_splitter() + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=int(chunk_size), + chunk_overlap=20, + length_function=self.tiktoken_len, + separators=["\n\n", "\n", " ", ""] + ) embed = self.embed() for i, record in enumerate(tqdm(documents)): metadata = { @@ -125,13 +118,14 @@ def upsert_documents(self, documents: List[Page], batch_limit: int) -> None: metadatas = [] - def index_documents(self, urls: List[str], batch_limit: int) -> None: + def index_documents(self, urls: List[str], batch_limit: int, chunk_size: int = 256) -> None: """ Process a list of URLs and upsert documents to a Pinecone index. Args: urls (List[str]): List of URLs to process. batch_limit (int): Batch limit for upserting documents. + chunks_size(int): size of texts per chunk. 
Returns: None @@ -151,6 +145,6 @@ def index_documents(self, urls: List[str], batch_limit: int) -> None: ] print(f"Upserting {len(pages_data)} pages to the Pinecone index...") - self.upsert_documents(pages_data, batch_limit) + self.upsert_documents(pages_data, batch_limit, chunk_size) print("Finished upserting documents for this URL.") print("Indexing complete.") From 3e43694f1ade1fcf379e50efab7b862852705b11 Mon Sep 17 00:00:00 2001 From: KevKibe Date: Mon, 8 Apr 2024 15:17:55 +0300 Subject: [PATCH 11/13] update: add langchain-google-genai package to setup.cfg --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index d4c5c80..c2acf23 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,6 +20,7 @@ install_requires = langchain-community==0.0.31 langchain==0.1.14 langchain-openai==0.1.1 + langchain-google-genai==1.0.1 package_dir= =src From 7f93cbe611d545ab3b66e3795470fbb17ea731b5 Mon Sep 17 00:00:00 2001 From: KevKibe Date: Mon, 8 Apr 2024 15:18:35 +0300 Subject: [PATCH 12/13] fix: import statement in usage section --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7e09410..9782c64 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ pip install docindex ## Getting Started ## Using OpenAI ```python -from _openai.index import OpenaiPineconeIndexer +from _openai.docindex import OpenaiPineconeIndexer # Replace these values with your actual Pinecone API key, index name, OpenAI API key, and environment pinecone_api_key = "pinecone-api-key" @@ -77,7 +77,7 @@ vectorstore = VectorStorePinecone(index, embed.embed_query, text_field) ## Using Google Generative AI ```python -from _google.index import GooglePineconeIndexer +from _google.docindex import GooglePineconeIndexer # Replace these values with your actual Pinecone API key, index name, OpenAI API key, and environment pinecone_api_key = "pinecone-api-key" From 9b812dc1e10fbdf00f7614e5b744d78a8478ae4e Mon Sep 17 00:00:00 2001 From: 
KevKibe Date: Mon, 8 Apr 2024 15:43:19 +0300 Subject: [PATCH 13/13] fix: deprecated .embed --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9782c64..15e3b7b 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ embed = OpenAIEmbeddings( text_field = "text" # Initialize the Vectorstore with the Pinecone index and OpenAI embeddings -vectorstore = VectorStorePinecone(index, embed.embed_query, text_field) +vectorstore = VectorStorePinecone(index, embed, text_field) ``` @@ -122,7 +122,7 @@ embed = GoogleGenerativeAIEmbeddings( text_field = "text" # Initialize the Vectorstore with the Pinecone index and OpenAI embeddings -vectorstore = VectorStorePinecone(index, embed.embed_query, text_field) +vectorstore = VectorStorePinecone(index, embed, text_field) ```