Skip to content

Commit

Permalink
Added chroma upload to pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
matthewcoole committed Oct 16, 2024
1 parent 4071052 commit ec183d3
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 3 deletions.
1 change: 1 addition & 0 deletions data/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@
/chunked_data.json
/chunked_embeddings.json
/embeddings.json
/chroma-data
18 changes: 18 additions & 0 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,21 @@ stages:
hash: md5
md5: b08299369d1f243eb8d8ffa2cdb9a90f
size: 351126
upload-to-docstore:
cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data
-em all-MiniLM-L6-v2
deps:
- path: data/embeddings.json
hash: md5
md5: b08299369d1f243eb8d8ffa2cdb9a90f
size: 351126
- path: scripts/upload_to_docstore.py
hash: md5
md5: ae8755770166dd3d6c1efb9f15723116
size: 1836
outs:
- path: data/chroma-data
hash: md5
md5: 2f2ba629bf078284bb6d6be73c6166a7.dir
size: 2069220
nfiles: 5
9 changes: 8 additions & 1 deletion dvc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,11 @@ stages:
- ${files.chunked}
- scripts/create_embeddings.py
outs:
- ${files.embeddings}
- ${files.embeddings}
upload-to-docstore:
cmd: python scripts/upload_to_docstore.py ${files.embeddings} -o ${files.doc-store} -em ${hp.embeddings-model}
deps:
- ${files.embeddings}
- scripts/upload_to_docstore.py
outs:
- ${files.doc-store}
2 changes: 2 additions & 0 deletions params.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
hp:
chunk-size: 300
overlap: 100
embeddings-model: "all-MiniLM-L6-v2"
files:
metadata: "data/eidc_metadata.json"
extracted: "data/extracted_metadata.json"
chunked: "data/chunked_data.json"
embeddings: "data/embeddings.json"
doc-store: "data/chroma-data"
sample-size: 10 # sample size of 0 will process all data
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ dependencies = [
"haystack-ai",
"accelerate",
"sentence-transformers",
"chromadb",
]

[project.optional-dependencies]
Expand Down
56 changes: 54 additions & 2 deletions scripts/upload_to_docstore.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,58 @@
from argparse import ArgumentParser
import json
import uuid

import chromadb
from chromadb.utils import embedding_functions


def main(input_file: str, output_path: str, collection_name: str, embedding_model: str):
    """Load chunks and their embeddings from JSON into a persistent Chroma collection.

    Args:
        input_file: Path to a JSON file containing a list of chunk records.
            Each record is read for the keys "chunk", "embedding", "field",
            "id" and "index".
        output_path: Path passed to ``chromadb.PersistentClient`` as the
            on-disk location of the store.
        collection_name: Name of the collection to create in the store.
        embedding_model: Sentence-transformers model name attached to the
            collection as its embedding function.
    """
    print(collection_name)

    # Read as UTF-8 explicitly so non-ASCII chunk text is decoded the same way
    # regardless of the platform's default locale encoding.
    with open(input_file, encoding="utf-8") as f:
        json_data = json.load(f)

    # Split each record into the parallel lists that ``collection.add`` expects.
    docs = [chunk["chunk"] for chunk in json_data]
    metas = [
        {field: chunk[field] for field in ["field", "id", "index"]}
        for chunk in json_data
    ]
    embs = [chunk["embedding"] for chunk in json_data]
    # Store ids are fresh random UUIDs; the source "id" is kept only as metadata.
    ids = [uuid.uuid4().hex for _ in json_data]

    func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=embedding_model
    )

    client = chromadb.PersistentClient(output_path)
    # NOTE(review): create_collection presumably fails if the collection
    # already exists at output_path -- confirm against the chromadb docs.
    collection = client.create_collection(
        name=collection_name, embedding_function=func
    )
    collection.add(documents=docs, metadatas=metas, embeddings=embs, ids=ids)


if __name__ == "__main__":
    # Command-line entry point: parse the arguments and hand them to main().
    # Each argument is registered exactly once; registering "-o/--output"
    # twice makes argparse raise a conflicting-option error at start-up.
    parser = ArgumentParser("upload_to_docstore.py")
    parser.add_argument(
        "input_file",
        help="File containing chunks and embeddings to upload to document store",
    )
    parser.add_argument(
        "-o",
        "--output",
        help="The file to write the output to.",
        default="data/chroma-data",
    )
    parser.add_argument(
        "-c",
        "--collection",
        help="Collection name to use in doc store.",
        default="eidc-data",
    )
    parser.add_argument(
        "-em",
        "--embedding_model",
        help="Embedding model to use in the doc store (must be the same as the function used to create embeddings.)",
        default="all-MiniLM-L6-v2",
    )
    args = parser.parse_args()
    main(args.input_file, args.output, args.collection, args.embedding_model)

1 comment on commit ec183d3

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

answer_relevancy: 0.4698974858684828
context_precision: 0.4974214494117698
answer_correctness: 0.5285432309309501
context_recall: 0.5459190810274471

Please sign in to comment.