diff --git a/DOCS/CODE_OF_CONDUCT.md b/DOCS/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..56df9c1 --- /dev/null +++ b/DOCS/CODE_OF_CONDUCT.md @@ -0,0 +1,45 @@ +# Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment include: + +- Using welcoming and inclusive language +- Being respectful of differing viewpoints and experiences +- Gracefully accepting constructive criticism +- Focusing on what is best for the community +- Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +- The use of sexualized language or imagery and unwelcome sexual attention or advances +- Trolling, insulting/derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others' private information, such as a physical or email address, without their explicit permission +- Other conduct which could reasonably be considered inappropriate in a professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 
+ +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned with this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project email address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at [keviinkibe@gmail.com](mailto:keviinkibe@gmail.com). All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 2.0, available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +For answers to common questions about this code of conduct, see the FAQ at https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. 
diff --git a/DOCS/CONTRIBUTING.md b/DOCS/CONTRIBUTING.md new file mode 100644 index 0000000..cbca8d4 --- /dev/null +++ b/DOCS/CONTRIBUTING.md @@ -0,0 +1,48 @@ +# Contributing to DocIndex +Welcome to DocIndex! We appreciate your interest in contributing to our open-source project. Please take a moment to review the following guidelines to ensure a smooth and collaborative experience for everyone. + +## Code of Conduct + +Before contributing, please read and adhere to our [Code of Conduct](https://github.com/KevKibe/docindex/blob/master/DOCS/CODE_OF_CONDUCT.md). We are committed to fostering an inclusive and respectful community. + +## How to Contribute + +### Reporting Bugs or Issues + +If you encounter a bug or issue with the project, please search the [issue tracker](https://github.com/KevKibe/docindex/issues) to see if it has already been reported. If not, please open a new issue with a clear and descriptive title, along with detailed steps to reproduce the issue. + +### Suggesting Enhancements or New Features + +We welcome suggestions for enhancements or new features. Please open a new issue with a clear description of the enhancement or feature you'd like to see, along with any relevant context or use cases. + +### Submitting Pull Requests + +We appreciate contributions via pull requests. Before submitting a pull request, please ensure that: + + +- You have added appropriate tests (if applicable) +- Your pull request addresses a specific issue or feature request + +Please reference the relevant issue or feature request in your pull request description. + +## Getting Started + +To get started with contributing to DocIndex, follow these steps: + +1. Fork the repository and clone it to your local machine. +2. Install dependencies by running `pip install -r requirements.txt` (or equivalent). +3. Create a new branch for your changes: `git checkout -b my-feature-branch`. +4. Make your changes and commit them: `git commit -am 'Add new feature'`. +5. 
Push your changes to your fork: `git push origin my-feature-branch`. +6. Submit a pull request to the repository's `master` branch. + +## Communication + + + +## License + +By contributing to DocIndex, you agree to license your contributions under the [project's license](https://github.com/KevKibe/docindex/blob/master/LICENSE). + +Thank you for your contributions! + diff --git a/README.md b/README.md index 97f9926..15e3b7b 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -
@@ -8,14 +8,14 @@
-*Efficiently store multiple documents and their metadata, whether they're offline or online, in a Pinecone Vector Database optimized for Retrieval Augmented Generation (RAG) models Fast*
+*Efficiently store multiple document embeddings and their metadata, whether they're offline or online, in a Pinecone Vector Database optimized for Retrieval Augmented Generation (RAG) models Fast*
## Features
- ⚡️ **Rapid Indexing**: Quickly index multiple documents along with their metadata, including source, page details, and content, into Pinecone DB.
- 📚 **Document Flexibility**: Index documents from your local storage or online sources with ease.
- 📂 **Format Support**: Seamlessly handle various document formats, including PDF, docx(in-development), etc.
-- 🔁 **Embedding Services Integration**: Enjoy support for multiple embedding services such as OpenAIEmbeddings, GoogleGenerativeAIEmbeddings and more in development.
+- 🔁 **Embedding Services Integration**: Enjoy support for multiple embedding services such as OpenAI Embeddings, Google Generative AI Embeddings and more in development.
- 🛠️ **Configurable Vectorstore**: Configure a vectorstore directly from the index to facilitate RAG pipelines effortlessly.
## Setup
@@ -24,20 +24,20 @@
pip install docindex
```
-## Usage
+## Getting Started
+## Using OpenAI
```python
-from _openai.index import OpenaiPineconeIndexer
+from _openai.docindex import OpenaiPineconeIndexer
# Replace these values with your actual Pinecone API key, index name, OpenAI API key, and environment
pinecone_api_key = "pinecone-api-key"
index_name = "pinecone-index-name"
openai_api_key = "openai-api-key"
environment = "pinecone-index-environment"
+batch_limit = 20 # Batch limit for upserting documents
+chunk_size = 256 # Optional: size of texts per chunk.
-# Define the batch limit for indexing, how many pages per pass.
-batch_limit = 20
-
-# List of URLs of the documents to be indexed. (offline on your computer or an online)
+# List of URLs of the documents to be indexed. (offline on your computer or online)
urls = [
"your-document-1.pdf",
"your-document-2.pdf"
@@ -47,10 +47,9 @@ urls = [
pinecone_index = OpenaiPineconeIndexer(index_name, pinecone_api_key, environment, openai_api_key)
# Index the documents with the specified URLs and batch limit
-pinecone_index.index_documents(urls,batch_limit)
+pinecone_index.index_documents(urls,batch_limit,chunk_size)
```
-
-## Initialize Vectorstore
+## Initialize Vectorstore(using OpenAI)
```python
from pinecone import Pinecone as IndexPinecone
@@ -71,9 +70,64 @@ embed = OpenAIEmbeddings(
text_field = "text"
# Initialize the Vectorstore with the Pinecone index and OpenAI embeddings
-vectorstore = VectorStorePinecone(index, embed.embed_query, text_field)
+vectorstore = VectorStorePinecone(index, embed, text_field)
+```
+
+
+## Using Google Generative AI
+
+```python
+from _google.docindex import GooglePineconeIndexer
+
+# Replace these values with your actual Pinecone API key, index name, Google API key, and environment
+pinecone_api_key = "pinecone-api-key"
+index_name = "pinecone-index-name"
+google_api_key = "google-api-key"
+environment = "pinecone-index-environment"
+batch_limit = 20 # Batch limit for upserting documents
+chunk_size = 256 # Optional: size of texts per chunk.
+
+# List of URLs of the documents to be indexed. (offline on your computer or online)
+urls = [
+ "your-document-1.pdf",
+ "your-document-2.pdf"
+]
+
+# Initialize the Pinecone indexer
+pinecone_index = GooglePineconeIndexer(index_name, pinecone_api_key, environment, google_api_key)
+
+# Index the documents with the specified URLs and batch limit
+pinecone_index.index_documents(urls,batch_limit,chunk_size)
+```
+
+
+## Initialize Vectorstore(using Google Generative AI)
+
+```python
+from pinecone import Pinecone as IndexPinecone
+from langchain_community.vectorstores import Pinecone as VectorStorePinecone
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
+
+# Initialize the Pinecone index
+index_pc = IndexPinecone(api_key=pinecone_api_key)
+index = index_pc.Index(index_name)
+
+# Initialize embeddings
+embed = GoogleGenerativeAIEmbeddings(
+ model="models/embedding-001",
+ google_api_key=google_api_key
+ )
+
+# Define the text field
+text_field = "text"
+
+# Initialize the Vectorstore with the Pinecone index and Google Generative AI embeddings
+vectorstore = VectorStorePinecone(index, embed, text_field)
```
+
+
+
## Using the CLI
- Clone the Repository: Clone or download the application code to your local machine.
@@ -83,10 +137,13 @@ git clone https://github.com/KevKibe/docindex.git
- Create a virtual environment for the project and activate it.
```bash
+# Navigate to project repository
cd docindex
+# create virtual environment
python -m venv venv
+# activate virtual environment
source venv/bin/activate
```
- Install dependencies by running this command
@@ -94,10 +151,33 @@ source venv/bin/activate
pip install -r requirements.txt
```
-- Navigate to src and run this command to index documents
+- Navigate to src
```bash
cd src
+```
-python -m _openai.doc_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --openai_api_key "your_openai_api_key" --environment "your_environment" --batch_limit 10 --docs "doc-1.pdf" "doc-2.pdf'
+- Run the command to start indexing the documents
+```bash
+# Using OpenAI
+python -m _openai.doc_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --openai_api_key "your_openai_api_key" --environment "your_environment" --batch_limit 10 --docs "doc-1.pdf" "doc-2.pdf" --chunk_size 256
+```
+```bash
+# Using Google Generative AI
+python -m _google.doc_index --pinecone_api_key "your_pinecone_api_key" --index_name "your_index_name" --google_api_key "your_google_api_key" --environment "your_environment" --batch_limit 10 --docs "doc-1.pdf" "doc-2.pdf" --chunk_size 256
```
+
+## Contributing
+Contributions are welcome and encouraged.
+
+Before contributing, please take a moment to review our [Contribution Guidelines](https://github.com/KevKibe/docindex/blob/master/DOCS/CONTRIBUTING.md) for important information on how to contribute to this project.
+
+If you're unsure about anything or need assistance, don't hesitate to reach out to us or open an issue to discuss your ideas.
+
+We look forward to your contributions!
+
+## License
+This project is licensed under the MIT License - see the [LICENSE](https://github.com/KevKibe/docindex/blob/master/LICENSE) file for details.
+
+## Contact
+For any enquiries, please reach out to me through keviinkibe@gmail.com
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index d4c5c80..c2acf23 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -20,6 +20,7 @@ install_requires =
langchain-community==0.0.31
langchain==0.1.14
langchain-openai==0.1.1
+ langchain-google-genai==1.0.1
package_dir=
=src
diff --git a/src/_google/doc_index.py b/src/_google/doc_index.py
new file mode 100644
index 0000000..bebe166
--- /dev/null
+++ b/src/_google/doc_index.py
@@ -0,0 +1,19 @@
+from .docindex import GooglePineconeIndexer
+import argparse
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Index documents on Pinecone using Google Generative AI embeddings.")
+ parser.add_argument("--pinecone_api_key", type=str, help="Pinecone API key")
+ parser.add_argument("--index_name", type=str, help="Name of the Pinecone index")
+    parser.add_argument("--google_api_key", type=str, help="Google API key")
+ parser.add_argument("--environment", type=str, help="Environment for Pinecone service")
+ parser.add_argument("--batch_limit", type=int, help="Maximum batch size for indexing")
+ parser.add_argument("--docs", nargs="+", help="URLs of the documents to be indexed")
+    parser.add_argument("--chunk_size", type=int, default=256, help="size of texts per chunk")
+ return parser.parse_args()
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ pinecone_indexer = GooglePineconeIndexer(args.index_name, args.pinecone_api_key, args.environment, args.google_api_key)
+ pinecone_indexer.index_documents(args.docs, args.batch_limit, args.chunk_size)
diff --git a/src/_google/docindex.py b/src/_google/docindex.py
index 2f9ef8c..de41e4c 100644
--- a/src/_google/docindex.py
+++ b/src/_google/docindex.py
@@ -77,34 +77,27 @@ def embed(self) -> GoogleGenerativeAIEmbeddings:
google_api_key=self.google_api_key
)
- def text_splitter(self) -> RecursiveCharacterTextSplitter:
- """
- Initialize RecursiveCharacterTextSplitter object.
-
- Returns:
- RecursiveCharacterTextSplitter: RecursiveCharacterTextSplitter object.
- """
- return RecursiveCharacterTextSplitter(
- chunk_size=400,
- chunk_overlap=20,
- length_function=self.tiktoken_len,
- separators=["\n\n", "\n", " ", ""]
- )
- def upsert_documents(self, documents: List[Page], batch_limit: int) -> None:
+ def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size: int = 256) -> None:
"""
Upsert documents into the Pinecone index.
Args:
documents (List[Page]): List of documents to upsert.
batch_limit (int): Maximum batch size for upsert operation.
+            chunk_size (int): size of texts per chunk.
Returns:
None
"""
texts = []
metadatas = []
- text_splitter = self.text_splitter()
+ text_splitter = RecursiveCharacterTextSplitter(
+ chunk_size=int(chunk_size),
+ chunk_overlap=20,
+ length_function=self.tiktoken_len,
+ separators=["\n\n", "\n", " ", ""]
+ )
embed = self.embed()
for i, record in enumerate(tqdm(documents)):
metadata = {
@@ -126,13 +119,14 @@ def upsert_documents(self, documents: List[Page], batch_limit: int) -> None:
metadatas = []
- def index_documents(self, urls: List[str], batch_limit: int) -> None:
+ def index_documents(self, urls: List[str], batch_limit: int, chunk_size: int = 256) -> None:
"""
Process a list of URLs and upsert documents to a Pinecone index.
Args:
urls (List[str]): List of URLs to process.
batch_limit (int): Batch limit for upserting documents.
+            chunk_size (int): size of texts per chunk.
Returns:
None
@@ -152,6 +146,6 @@ def index_documents(self, urls: List[str], batch_limit: int) -> None:
]
print(f"Upserting {len(pages_data)} pages to the Pinecone index...")
- self.upsert_documents(pages_data, batch_limit)
+ self.upsert_documents(pages_data, batch_limit, chunk_size)
print("Finished upserting documents for this URL.")
print("Indexing complete.")
diff --git a/src/_openai/doc_index.py b/src/_openai/doc_index.py
index 7758297..6c3d9c9 100644
--- a/src/_openai/doc_index.py
+++ b/src/_openai/doc_index.py
@@ -9,13 +9,14 @@ def parse_args():
parser.add_argument("--environment", type=str, help="Environment for Pinecone service")
parser.add_argument("--batch_limit", type=int, help="Maximum batch size for indexing")
parser.add_argument("--docs", nargs="+", help="URLs of the documents to be indexed")
+    parser.add_argument("--chunk_size", type=int, default=256, help="size of texts per chunk")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
pinecone_indexer = OpenaiPineconeIndexer(args.index_name, args.pinecone_api_key, args.environment, args.openai_api_key)
- pinecone_indexer.index_documents(args.docs, args.batch_limit)
+ pinecone_indexer.index_documents(args.docs, args.batch_limit, args.chunk_size)
\ No newline at end of file
diff --git a/src/_openai/docindex.py b/src/_openai/docindex.py
index 72d397a..ddaf2ea 100644
--- a/src/_openai/docindex.py
+++ b/src/_openai/docindex.py
@@ -76,34 +76,27 @@ def embed(self) -> OpenAIEmbeddings:
openai_api_key=self.openai_api_key
)
- def text_splitter(self) -> RecursiveCharacterTextSplitter:
- """
- Initialize RecursiveCharacterTextSplitter object.
-
- Returns:
- RecursiveCharacterTextSplitter: RecursiveCharacterTextSplitter object.
- """
- return RecursiveCharacterTextSplitter(
- chunk_size=400,
- chunk_overlap=20,
- length_function=self.tiktoken_len,
- separators=["\n\n", "\n", " ", ""]
- )
- def upsert_documents(self, documents: List[Page], batch_limit: int) -> None:
+ def upsert_documents(self, documents: List[Page], batch_limit: int, chunk_size: int = 256) -> None:
"""
Upsert documents into the Pinecone index.
Args:
documents (List[Page]): List of documents to upsert.
batch_limit (int): Maximum batch size for upsert operation.
+            chunk_size (int): size of texts per chunk.
Returns:
None
"""
texts = []
metadatas = []
- text_splitter = self.text_splitter()
+ text_splitter = RecursiveCharacterTextSplitter(
+ chunk_size=int(chunk_size),
+ chunk_overlap=20,
+ length_function=self.tiktoken_len,
+ separators=["\n\n", "\n", " ", ""]
+ )
embed = self.embed()
for i, record in enumerate(tqdm(documents)):
metadata = {
@@ -125,13 +118,14 @@ def upsert_documents(self, documents: List[Page], batch_limit: int) -> None:
metadatas = []
- def index_documents(self, urls: List[str], batch_limit: int) -> None:
+ def index_documents(self, urls: List[str], batch_limit: int, chunk_size: int = 256) -> None:
"""
Process a list of URLs and upsert documents to a Pinecone index.
Args:
urls (List[str]): List of URLs to process.
batch_limit (int): Batch limit for upserting documents.
+            chunk_size (int): size of texts per chunk.
Returns:
None
@@ -151,6 +145,6 @@ def index_documents(self, urls: List[str], batch_limit: int) -> None:
]
print(f"Upserting {len(pages_data)} pages to the Pinecone index...")
- self.upsert_documents(pages_data, batch_limit)
+ self.upsert_documents(pages_data, batch_limit, chunk_size)
print("Finished upserting documents for this URL.")
print("Indexing complete.")