-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
scripts for downloading input, converting docs, generating embeddings…
… and chroma insertion Signed-off-by: greg pereira <[email protected]>
- Loading branch information
1 parent
e3a15aa
commit f3d0556
Showing
63 changed files
with
68,156 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# pdf-input/*.pdf # these are 10k listings, they can be included | ||
# pdf-output/*.md # docling has been opensourced, we can share results | ||
# pdf-output/*.json # docling has been opensourced, we can share results | ||
venv | ||
chroma.log |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
#!/usr/bin/env bash
# Download the 10-K filing PDFs from the backup S3 bucket into the current
# directory, using the "et" AWS CLI profile.
# Fail fast if any single download fails instead of silently continuing.
set -euo pipefail

# Object keys (one UUID-named PDF per filing) in s3://east2-backup-rcook.
pdf_keys=(
  1c674864-2ac6-4c97-8bfa-f30efc6264f2
  2e22532e-2b0f-458b-bd39-7daf01da779a
  336d8745-ea82-40a5-9acc-1a89df23d0f3
  499b79ae-bd47-4538-8c98-3731ea102750
  4d39f579-19d8-4119-b087-ee618abf82d6
  51b3ab22-594a-4d41-b45f-5af1c9a80e69
  69682db4-0019-42ce-a4e1-983c30725f1b
  6af7bf21-4d7b-4ed5-95a3-fad5cc68efe0
  805e8ffe-b918-4118-ab8e-fe998cc64d89
  8c560d9c-f48e-4b8d-aad8-6a0badb8e348
  96985bfb-79b1-41e9-b552-fd5ad5af6fd3
  97bb54ee-ce92-4f6f-80b7-9e33013402bd
  a18084e1-1eac-4133-8829-3fc5fb52295d
  b639d901-a599-4823-9de4-3a32528fd6f1
  b657f733-ed5f-4fd7-a89e-95736b5f3cf1
  bed19367-fa6b-41ff-a973-df19510b0bba
  c2043b4d-2d3c-4781-b0a0-5bbf79da02ae
  c323c7df-179c-44ca-8ec7-96f51880b187
  c7c14359-36fa-40c3-b3ca-5bf7f3fa0b96
  cc5a18a3-8f6f-4d7c-b1b4-f91257a973dc
  d2fde7ee-05f7-419d-9ce8-186de4c96e25
  eb4ef26e-07c0-4414-b940-c25712d441f3
  f965e5c3-fded-45d3-bbdb-f750f156dcc9
  fae0d139-777f-489f-8e63-c97285413f9d
)

# One cp per key, exactly as the original 24 duplicated commands did.
for key in "${pdf_keys[@]}"; do
  aws --profile et s3 cp "s3://east2-backup-rcook/${key}.pdf" ./
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
#!/usr/bin/env bash
# Create an isolated virtualenv and install the project dependencies.
# Abort on the first failure so a half-installed environment is not used.
set -e

python3 -m venv venv
# NOTE(review): sourcing here only affects this script's own shell; an
# interactive user still has to activate the venv themselves.
source venv/bin/activate
# Call the venv interpreter by path so the install targets the venv even if
# activation did not take effect.
venv/bin/python -m pip install -r requirements.txt
# python 2-convert-docs.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
# import datetime | ||
import logging | ||
import time | ||
from pathlib import Path | ||
from typing import Iterable | ||
import json | ||
|
||
from docling.datamodel.base_models import ( # type: ignore | pylance_cfg | ||
ConversionStatus, | ||
# FigureElement, | ||
# PageElement, | ||
# TableElement, | ||
PipelineOptions, | ||
) | ||
|
||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend # type: ignore | pylance_cfg | ||
from docling.datamodel.document import ConversionResult, DocumentConversionInput # type: ignore | pylance_cfg | ||
from docling.document_converter import DocumentConverter # type: ignore | pylance_cfg | ||
|
||
_log = logging.getLogger(__name__) | ||
|
||
IMAGE_RESOLUTION_SCALE = 2.0 | ||
|
||
def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
):
    """Write each successful docling conversion result to *output_dir*.

    For every fully converted document two files are produced, named after
    the input file's stem:

    * ``<stem>.json`` -- the Deep Search document dictionary.
    * ``<stem>.md``   -- the Markdown rendering.

    Partially converted and failed documents are only logged, not written.

    Args:
        conv_results: Conversion results as produced by
            ``DocumentConverter.convert``.
        output_dir: Directory to write into; created if missing.

    Returns:
        A ``(success_count, partial_success_count, failure_count)`` tuple.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0
    partial_success_count = 0

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = conv_res.input.file.stem

            # Export Deep Search document JSON format:
            with (output_dir / f"{doc_filename}.json").open("w") as fp:
                # json.dump streams straight to the file handle instead of
                # building the whole string in memory first.
                json.dump(conv_res.render_as_dict(), fp)

            # Export Markdown format:
            with (output_dir / f"{doc_filename}.md").open("w") as fp:
                fp.write(conv_res.render_as_markdown())
        elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
            # Lazy %-style args: the message is only formatted if the
            # record is actually emitted.
            _log.info(
                "Document %s was partially converted with the following errors:",
                conv_res.input.file,
            )
            for item in conv_res.errors:
                _log.info("\t%s", item.error_message)
            partial_success_count += 1
        else:
            _log.info("Document %s failed to convert.", conv_res.input.file)
            failure_count += 1

    _log.info(
        "Processed %d docs, of which %d failed and %d were partially converted.",
        success_count + partial_success_count + failure_count,
        failure_count,
        partial_success_count,
    )
    return success_count, partial_success_count, failure_count
|
||
def main():
    """Convert every PDF under ./pdf-input to JSON + Markdown in ./pdf-output."""
    logging.basicConfig(level=logging.INFO)

    directory_path = Path('./pdf-input')
    input_doc_paths = list(directory_path.glob('*.pdf'))

    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    doc_converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )

    # Renamed from `input` so the builtin is not shadowed.
    conversion_input = DocumentConversionInput.from_paths(input_doc_paths)

    start_time = time.time()

    conv_results = doc_converter.convert(conversion_input)
    # BUG FIX: export_documents returns a 3-tuple
    # (success, partial_success, failure). The original unpacked it into
    # two names, which raises ValueError at runtime (and would have put the
    # partial count into failure_count).
    success_count, partial_success_count, failure_count = export_documents(
        conv_results, output_dir=Path("./pdf-output")
    )

    elapsed = time.time() - start_time

    _log.info("All documents were converted in %.2f seconds.", elapsed)

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
        )


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
import os | ||
import shutil | ||
import chromadb | ||
import logging | ||
from pathlib import Path | ||
|
||
from langchain_text_splitters import MarkdownHeaderTextSplitter | ||
|
||
_log = logging.getLogger(__name__) | ||
|
||
def gather_docs(path):
    """Collect every converted Markdown file directly under *path*.

    Returns a dict mapping each file's ``Path`` to its full UTF-8 text.
    """
    collected = {}
    for md_path in Path(path).glob('*.md'):
        with open(md_path, 'r', encoding='utf-8') as handle:
            collected[md_path] = handle.read()
    return collected
|
||
def init_collection():
    """Connect to Chroma and return the "10-Ks" collection.

    Raises:
        Whatever ``chromadb.Client()`` raised if the connection fails. The
        original logged the error and fell through, which then crashed with
        an unrelated ``NameError`` on the undefined ``client`` -- re-raising
        keeps the real cause visible.
    """
    try:
        client = chromadb.Client()
    except Exception as error:
        _log.error("Could not connect to the db: %s", error)
        raise

    ten_k_collection = client.get_or_create_collection("10-Ks")
    return ten_k_collection
|
||
def chunk_document(document):
    """Split a Markdown document into header-scoped chunks.

    Splits on H1-H3 headers via LangChain's MarkdownHeaderTextSplitter;
    each returned chunk carries its header trail in ``.metadata``.
    """
    header_levels = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    splitter = MarkdownHeaderTextSplitter(headers_to_split_on=header_levels)
    return splitter.split_text(document)
|
||
|
||
def main():
    """Chunk every converted Markdown document and insert it into Chroma.

    Each chunk gets a stable id of ``<path>.<index>`` plus a ``source``
    metadata key pointing back at the originating file, so query hits can
    be traced to their document.
    """
    ten_k_collection = init_collection()
    docs = gather_docs("./pdf-output")
    for pdf_name, pdf_content in docs.items():  # one converted 10-K per entry
        doc_chunks = chunk_document(pdf_content)
        chunk_ids = []
        metadatas = []
        documents = []
        # enumerate instead of range(len(...)): same order, clearer access.
        for chunk_index, chunk in enumerate(doc_chunks):
            chunk_ids.append(f"{pdf_name}.{chunk_index}")
            if not chunk.metadata:
                # Original substitutes a non-empty dict here -- presumably
                # because the collection rejects empty metadata; preserved.
                metadatas.append({"source": str(pdf_name)})
            else:
                chunk.metadata["source"] = str(pdf_name)
                metadatas.append(chunk.metadata)
            documents.append(chunk.page_content)

        # Skip documents that produced no chunks rather than calling add()
        # with empty lists.
        if chunk_ids:
            ten_k_collection.add(
                documents=documents, ids=chunk_ids, metadatas=metadatas
            )


# Guarded entry point: the original called main() unconditionally, which
# ran the whole ingest as an import side effect.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#!/usr/bin/env bash
# Start a local Chroma server backed by ./chromadb, then run the embedding
# population script against it.
set -e

dbpath=$(realpath chromadb)
cd "$dbpath"
# BUG FIX: `chroma run` blocks in the foreground, so the original script
# never reached the population step. Run the server in the background.
chroma run --path "$dbpath" &
cd "$dbpath/.."
# Crude wait for the server to start listening.
# TODO(review): poll the heartbeat endpoint instead of sleeping.
sleep 5
python3 5_generate_embeddings_and_populate_vector_db.py
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
[32mINFO[0m: [06-09-2024 13:20:50] Set chroma_server_nofile to 65535 | ||
[32mINFO[0m: [06-09-2024 13:20:50] Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information. | ||
[36mDEBUG[0m: [06-09-2024 13:20:50] Starting component System | ||
[36mDEBUG[0m: [06-09-2024 13:20:50] Starting component OpenTelemetryClient | ||
[36mDEBUG[0m: [06-09-2024 13:20:50] Starting component SqliteDB | ||
[36mDEBUG[0m: [06-09-2024 13:20:50] Starting component QuotaEnforcer | ||
[36mDEBUG[0m: [06-09-2024 13:20:50] Starting component Posthog | ||
[36mDEBUG[0m: [06-09-2024 13:20:50] Starting component LocalSegmentManager | ||
[36mDEBUG[0m: [06-09-2024 13:20:50] Starting component SegmentAPI | ||
[32mINFO[0m: [06-09-2024 13:20:50] Started server process [[36m93912[0m] | ||
[32mINFO[0m: [06-09-2024 13:20:50] Waiting for application startup. | ||
[32mINFO[0m: [06-09-2024 13:20:50] Application startup complete. | ||
[32mINFO[0m: [06-09-2024 13:20:50] Uvicorn running on [1mhttp://localhost:8000[0m (Press CTRL+C to quit) | ||
[32mINFO[0m: [06-09-2024 13:25:57] ::1:58985 - "GET /api/v1/tenants/default_tenant HTTP/1.1" 200 | ||
[32mINFO[0m: [06-09-2024 13:25:57] ::1:58985 - "GET /api/v1/databases/default_database?tenant=default_tenant HTTP/1.1" 200 | ||
[32mINFO[0m: [06-09-2024 19:52:25] Shutting down | ||
[32mINFO[0m: [06-09-2024 19:52:25] Waiting for application shutdown. | ||
[36mDEBUG[0m: [06-09-2024 19:52:25] Stopping component System | ||
[36mDEBUG[0m: [06-09-2024 19:52:25] Stopping component SegmentAPI | ||
[36mDEBUG[0m: [06-09-2024 19:52:25] Stopping component LocalSegmentManager | ||
[36mDEBUG[0m: [06-09-2024 19:52:25] Stopping component Posthog | ||
[36mDEBUG[0m: [06-09-2024 19:52:25] Stopping component QuotaEnforcer | ||
[36mDEBUG[0m: [06-09-2024 19:52:25] Stopping component SqliteDB | ||
[36mDEBUG[0m: [06-09-2024 19:52:25] Stopping component OpenTelemetryClient | ||
[32mINFO[0m: [06-09-2024 19:52:25] Application shutdown complete. | ||
[32mINFO[0m: [06-09-2024 19:52:25] Finished server process [[36m93912[0m] | ||
[32mINFO[0m: [06-09-2024 19:52:28] Set chroma_server_nofile to 65535 | ||
[32mINFO[0m: [06-09-2024 19:52:28] Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information. | ||
[36mDEBUG[0m: [06-09-2024 19:52:28] Starting component System | ||
[36mDEBUG[0m: [06-09-2024 19:52:28] Starting component OpenTelemetryClient | ||
[36mDEBUG[0m: [06-09-2024 19:52:28] Starting component SqliteDB | ||
[36mDEBUG[0m: [06-09-2024 19:52:28] Starting component QuotaEnforcer | ||
[36mDEBUG[0m: [06-09-2024 19:52:28] Starting component Posthog | ||
[36mDEBUG[0m: [06-09-2024 19:52:28] Starting component LocalSegmentManager | ||
[36mDEBUG[0m: [06-09-2024 19:52:28] Starting component SegmentAPI | ||
[32mINFO[0m: [06-09-2024 19:52:28] Started server process [[36m99645[0m] | ||
[32mINFO[0m: [06-09-2024 19:52:28] Waiting for application startup. | ||
[32mINFO[0m: [06-09-2024 19:52:28] Application startup complete. | ||
[32mINFO[0m: [06-09-2024 19:52:28] Uvicorn running on [1mhttp://localhost:8000[0m (Press CTRL+C to quit) |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
Consideration: should we be using multi-modal? | ||
- Via docling we could export the tables themselves as images. Would this be a better format? It seems like a better storage format but not for queries. What is the best way for the model to get that kind of table data out? | ||
- https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/multimodal_rag_langchain.ipynb has some examples | ||
- for our implementation this would leverage the docling export figures example: https://github.com/DS4SD/docling/blob/main/examples/export_figures.py | ||
- we would have to get the table images in the right insertion order for the chunks |
Empty file.
Large diffs are not rendered by default.
Oops, something went wrong.
Oops, something went wrong.