
Commit

scripts for downloading input, converting docs, generating embeddings and chroma insertion

Signed-off-by: greg pereira <[email protected]>
Gregory-Pereira committed Sep 7, 2024
1 parent e3a15aa commit f3d0556
Showing 63 changed files with 68,156 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
# pdf-input/*.pdf # these are 10k listings, they can be included
# pdf-output/*.md # docling has been opensourced, we can share results
# pdf-output/*.json # docling has been opensourced, we can share results
venv
chroma.log
24 changes: 24 additions & 0 deletions 1-download-docs.sh
@@ -0,0 +1,24 @@
aws --profile et s3 cp s3://east2-backup-rcook/1c674864-2ac6-4c97-8bfa-f30efc6264f2.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/2e22532e-2b0f-458b-bd39-7daf01da779a.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/336d8745-ea82-40a5-9acc-1a89df23d0f3.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/499b79ae-bd47-4538-8c98-3731ea102750.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/4d39f579-19d8-4119-b087-ee618abf82d6.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/51b3ab22-594a-4d41-b45f-5af1c9a80e69.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/69682db4-0019-42ce-a4e1-983c30725f1b.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/6af7bf21-4d7b-4ed5-95a3-fad5cc68efe0.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/805e8ffe-b918-4118-ab8e-fe998cc64d89.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/8c560d9c-f48e-4b8d-aad8-6a0badb8e348.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/96985bfb-79b1-41e9-b552-fd5ad5af6fd3.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/97bb54ee-ce92-4f6f-80b7-9e33013402bd.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/a18084e1-1eac-4133-8829-3fc5fb52295d.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/b639d901-a599-4823-9de4-3a32528fd6f1.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/b657f733-ed5f-4fd7-a89e-95736b5f3cf1.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/bed19367-fa6b-41ff-a973-df19510b0bba.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/c2043b4d-2d3c-4781-b0a0-5bbf79da02ae.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/c323c7df-179c-44ca-8ec7-96f51880b187.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/c7c14359-36fa-40c3-b3ca-5bf7f3fa0b96.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/cc5a18a3-8f6f-4d7c-b1b4-f91257a973dc.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/d2fde7ee-05f7-419d-9ce8-186de4c96e25.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/eb4ef26e-07c0-4414-b940-c25712d441f3.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/f965e5c3-fded-45d3-bbdb-f750f156dcc9.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/fae0d139-777f-489f-8e63-c97285413f9d.pdf ./
4 changes: 4 additions & 0 deletions 2-convert-docs.sh
@@ -0,0 +1,4 @@
python3 -m venv venv
source venv/bin/activate
venv/bin/python -m pip install -r requirements.txt
# python 2_convert_docs.py
102 changes: 102 additions & 0 deletions 2_convert_docs.py
@@ -0,0 +1,102 @@
# import datetime
import logging
import time
from pathlib import Path
from typing import Iterable
import json

from docling.datamodel.base_models import ( # type: ignore | pylance_cfg
ConversionStatus,
# FigureElement,
# PageElement,
# TableElement,
PipelineOptions,
)

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend # type: ignore | pylance_cfg
from docling.datamodel.document import ConversionResult, DocumentConversionInput # type: ignore | pylance_cfg
from docling.document_converter import DocumentConverter # type: ignore | pylance_cfg

_log = logging.getLogger(__name__)

IMAGE_RESOLUTION_SCALE = 2.0

def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
):
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0
    partial_success_count = 0

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = conv_res.input.file.stem

            # Export Deep Search document JSON format:
            with (output_dir / f"{doc_filename}.json").open("w") as fp:
                fp.write(json.dumps(conv_res.render_as_dict()))

            # Export Markdown format:
            with (output_dir / f"{doc_filename}.md").open("w") as fp:
                fp.write(conv_res.render_as_markdown())
        elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
            _log.info(
                f"Document {conv_res.input.file} was partially converted with the following errors:"
            )
            for item in conv_res.errors:
                _log.info(f"\t{item.error_message}")
            partial_success_count += 1
        else:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1

    _log.info(
        f"Processed {success_count + partial_success_count + failure_count} docs, "
        f"of which {failure_count} failed "
        f"and {partial_success_count} were partially converted."
    )
    return success_count, partial_success_count, failure_count

def main():
    logging.basicConfig(level=logging.INFO)

    directory_path = Path('./pdf-input')

    input_doc_paths = list(directory_path.glob('*.pdf'))

    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    doc_converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )

    input_docs = DocumentConversionInput.from_paths(input_doc_paths)

    start_time = time.time()

    conv_results = doc_converter.convert(input_docs)
    success_count, partial_success_count, failure_count = export_documents(
        conv_results, output_dir=Path("./pdf-output")
    )

    end_time = time.time() - start_time

    _log.info(f"All documents were converted in {end_time:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"Failed to convert {failure_count} of {len(input_doc_paths)} documents."
        )


if __name__ == "__main__":
    main()
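The Markdown written out above is what the later embedding step splits on headers. As a quick sanity check, a minimal sketch along these lines (assuming at least one converted file already exists under pdf-output/; the file picked, the loop limit, and the print format are purely illustrative) shows the header-based chunks that MarkdownHeaderTextSplitter will produce from a converted 10-K:

# sanity check: preview how a converted 10-K will be chunked in step 5 (hypothetical helper, not part of this commit)
from pathlib import Path

from langchain_text_splitters import MarkdownHeaderTextSplitter

md_path = next(Path("./pdf-output").glob("*.md"))  # any converted document
markdown_content = md_path.read_text(encoding="utf-8")

splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
)
chunks = splitter.split_text(markdown_content)

# each chunk carries the header trail it was split under in its metadata
for i, chunk in enumerate(chunks[:5]):
    print(i, chunk.metadata, chunk.page_content[:80])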
63 changes: 63 additions & 0 deletions 5_generate_embeddings_and_populate_vector_db.py
@@ -0,0 +1,63 @@
import os
import shutil
import chromadb
import logging
from pathlib import Path

from langchain_text_splitters import MarkdownHeaderTextSplitter

_log = logging.getLogger(__name__)

def gather_docs(path):
    docs = {}
    directory_path = Path(path)
    converted_doc_paths = list(directory_path.glob('*.md'))
    for doc in converted_doc_paths:
        with open(doc, 'r', encoding='utf-8') as file:
            markdown_content = file.read()
        docs[doc] = markdown_content
    return docs

def init_collection():
    try:
        client = chromadb.Client()
    except Exception as error:
        _log.error(f"Could not connect to the db: {error}")
        raise

    ten_k_collection = client.get_or_create_collection("10-Ks")
    return ten_k_collection

def chunk_document(document):
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    md_header_splits = markdown_splitter.split_text(document)
    return md_header_splits


def main():
    ten_k_collection = init_collection()
    docs = gather_docs("./pdf-output")
    for pdf_name, pdf_content in docs.items():  # iterate through every converted document
        doc_chunks = chunk_document(pdf_content)
        chunk_ids = []
        metadatas = []
        documents = []
        for chunk_index, chunk in enumerate(doc_chunks):
            chunk_ids.append(f"{pdf_name}.{chunk_index}")
            if not chunk.metadata:
                metadatas.append({"source": str(pdf_name)})
            else:
                chunk.metadata["source"] = str(pdf_name)
                metadatas.append(chunk.metadata)
            documents.append(chunk.page_content)

        ten_k_collection.add(documents=documents, ids=chunk_ids, metadatas=metadatas)


if __name__ == "__main__":
    main()
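Note that chromadb.Client() above creates an in-process, ephemeral store. If the intent is to populate the standalone server started by 5_run_chroma.sh instead, init_collection would need to connect over HTTP. A minimal sketch, assuming the server is up on chroma's default localhost:8000:

import chromadb

# connect to the chroma server started by 5_run_chroma.sh (assumed default host/port)
client = chromadb.HttpClient(host="localhost", port=8000)
ten_k_collection = client.get_or_create_collection("10-Ks")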
5 changes: 5 additions & 0 deletions 5_run_chroma.sh
@@ -0,0 +1,5 @@
dbpath=$(realpath chromadb)
cd "$dbpath"
# run the server in the background so the script can continue to the insert step
bash -c "chroma run --path $dbpath" &
sleep 5  # give the server a moment to come up
cd "$dbpath/.."
python3 5_generate_embeddings_and_populate_vector_db.py
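Once the collection is populated, retrieval is a single query call. A minimal sketch, assuming the chunks were inserted into the server started above and using a purely illustrative question; embeddings come from the collection's default embedding function on both insert and query:

import chromadb

# query the populated 10-Ks collection (assumed server defaults from 5_run_chroma.sh)
client = chromadb.HttpClient(host="localhost", port=8000)
collection = client.get_or_create_collection("10-Ks")

results = collection.query(
    query_texts=["What were the main risk factors disclosed?"],  # illustrative question
    n_results=3,
)
for doc, metadata in zip(results["documents"][0], results["metadatas"][0]):
    print(metadata["source"], doc[:120])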
Empty file added chromadb/.gitkeep
39 changes: 39 additions & 0 deletions chromadb/chroma.log
@@ -0,0 +1,39 @@
INFO: [06-09-2024 13:20:50] Set chroma_server_nofile to 65535
INFO: [06-09-2024 13:20:50] Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
DEBUG: [06-09-2024 13:20:50] Starting component System
DEBUG: [06-09-2024 13:20:50] Starting component OpenTelemetryClient
DEBUG: [06-09-2024 13:20:50] Starting component SqliteDB
DEBUG: [06-09-2024 13:20:50] Starting component QuotaEnforcer
DEBUG: [06-09-2024 13:20:50] Starting component Posthog
DEBUG: [06-09-2024 13:20:50] Starting component LocalSegmentManager
DEBUG: [06-09-2024 13:20:50] Starting component SegmentAPI
INFO: [06-09-2024 13:20:50] Started server process [93912]
INFO: [06-09-2024 13:20:50] Waiting for application startup.
INFO: [06-09-2024 13:20:50] Application startup complete.
INFO: [06-09-2024 13:20:50] Uvicorn running on http://localhost:8000 (Press CTRL+C to quit)
INFO: [06-09-2024 13:25:57] ::1:58985 - "GET /api/v1/tenants/default_tenant HTTP/1.1" 200
INFO: [06-09-2024 13:25:57] ::1:58985 - "GET /api/v1/databases/default_database?tenant=default_tenant HTTP/1.1" 200
INFO: [06-09-2024 19:52:25] Shutting down
INFO: [06-09-2024 19:52:25] Waiting for application shutdown.
DEBUG: [06-09-2024 19:52:25] Stopping component System
DEBUG: [06-09-2024 19:52:25] Stopping component SegmentAPI
DEBUG: [06-09-2024 19:52:25] Stopping component LocalSegmentManager
DEBUG: [06-09-2024 19:52:25] Stopping component Posthog
DEBUG: [06-09-2024 19:52:25] Stopping component QuotaEnforcer
DEBUG: [06-09-2024 19:52:25] Stopping component SqliteDB
DEBUG: [06-09-2024 19:52:25] Stopping component OpenTelemetryClient
INFO: [06-09-2024 19:52:25] Application shutdown complete.
INFO: [06-09-2024 19:52:25] Finished server process [93912]
INFO: [06-09-2024 19:52:28] Set chroma_server_nofile to 65535
INFO: [06-09-2024 19:52:28] Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
DEBUG: [06-09-2024 19:52:28] Starting component System
DEBUG: [06-09-2024 19:52:28] Starting component OpenTelemetryClient
DEBUG: [06-09-2024 19:52:28] Starting component SqliteDB
DEBUG: [06-09-2024 19:52:28] Starting component QuotaEnforcer
DEBUG: [06-09-2024 19:52:28] Starting component Posthog
DEBUG: [06-09-2024 19:52:28] Starting component LocalSegmentManager
DEBUG: [06-09-2024 19:52:28] Starting component SegmentAPI
INFO: [06-09-2024 19:52:28] Started server process [99645]
INFO: [06-09-2024 19:52:28] Waiting for application startup.
INFO: [06-09-2024 19:52:28] Application startup complete.
INFO: [06-09-2024 19:52:28] Uvicorn running on http://localhost:8000 (Press CTRL+C to quit)
Binary file added chromadb/chroma.sqlite3
Binary file not shown.
5 changes: 5 additions & 0 deletions notes.md
@@ -0,0 +1,5 @@
Consideration: should we be using multi-modal?
- Via docling we could export the tables themselves as images. Would this be a better format? It seems like a better storage format but not for queries. What is the best way for the model to get that kind of table data out?
- https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/multimodal_rag_langchain.ipynb has some examples
- for our implementation this would leverage the docling export figures example: https://github.com/DS4SD/docling/blob/main/examples/export_figures.py (rough sketch below)
- we would have to get the table images in the right insertion order for the chunks
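A rough sketch of what that figure/table export could look like, adapted from the linked export_figures example. The render_element_images call, its element_types argument, and the returned (element, image) pairs are taken from that example and are assumptions to verify against the docling version pinned in requirements.txt:

# hypothetical export step, adapted from docling's examples/export_figures.py
from pathlib import Path

from docling.datamodel.base_models import ConversionStatus, FigureElement, TableElement

def export_element_images(conv_res, output_dir: Path):
    """Save each figure/table of a successful conversion as a PNG (assumed docling API)."""
    if conv_res.status != ConversionStatus.SUCCESS:
        return
    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem
    # render_element_images is the call used in the upstream example; confirm it
    # exists with this signature in the installed docling release
    for idx, (element, image) in enumerate(
        conv_res.render_element_images(element_types=(FigureElement, TableElement))
    ):
        image_path = output_dir / f"{doc_filename}-element-{idx}.png"
        with image_path.open("wb") as fp:
            image.save(fp, "PNG")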
Empty file added pdf-output/.gitkeep
1 change: 1 addition & 0 deletions pdf-output/02da2a86-7bb9-4bcb-95ae-4ce27ea5e3bc.json

Large diffs are not rendered by default.
