
Commit

scripts for downloading input, converting docs, generating embeddings and chroma insertion

Signed-off-by: greg pereira <[email protected]>
Gregory-Pereira committed Sep 7, 2024
1 parent e3a15aa commit f3d0556
Showing 63 changed files with 68,156 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
# pdf-input/*.pdf # these are 10k listings, they can be included
# pdf-output/*.md # docling has been opensourced, we can share results
# pdf-output/*.json # docling has been opensourced, we can share results
venv
chroma.log
24 changes: 24 additions & 0 deletions 1-download-docs.sh
@@ -0,0 +1,24 @@
aws --profile et s3 cp s3://east2-backup-rcook/1c674864-2ac6-4c97-8bfa-f30efc6264f2.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/2e22532e-2b0f-458b-bd39-7daf01da779a.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/336d8745-ea82-40a5-9acc-1a89df23d0f3.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/499b79ae-bd47-4538-8c98-3731ea102750.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/4d39f579-19d8-4119-b087-ee618abf82d6.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/51b3ab22-594a-4d41-b45f-5af1c9a80e69.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/69682db4-0019-42ce-a4e1-983c30725f1b.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/6af7bf21-4d7b-4ed5-95a3-fad5cc68efe0.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/805e8ffe-b918-4118-ab8e-fe998cc64d89.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/8c560d9c-f48e-4b8d-aad8-6a0badb8e348.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/96985bfb-79b1-41e9-b552-fd5ad5af6fd3.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/97bb54ee-ce92-4f6f-80b7-9e33013402bd.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/a18084e1-1eac-4133-8829-3fc5fb52295d.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/b639d901-a599-4823-9de4-3a32528fd6f1.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/b657f733-ed5f-4fd7-a89e-95736b5f3cf1.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/bed19367-fa6b-41ff-a973-df19510b0bba.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/c2043b4d-2d3c-4781-b0a0-5bbf79da02ae.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/c323c7df-179c-44ca-8ec7-96f51880b187.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/c7c14359-36fa-40c3-b3ca-5bf7f3fa0b96.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/cc5a18a3-8f6f-4d7c-b1b4-f91257a973dc.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/d2fde7ee-05f7-419d-9ce8-186de4c96e25.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/eb4ef26e-07c0-4414-b940-c25712d441f3.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/f965e5c3-fded-45d3-bbdb-f750f156dcc9.pdf ./
aws --profile et s3 cp s3://east2-backup-rcook/fae0d139-777f-489f-8e63-c97285413f9d.pdf ./
4 changes: 4 additions & 0 deletions 2-convert-docs.sh
@@ -0,0 +1,4 @@
python3 -m venv venv
source venv/bin/activate
venv/bin/python -m pip install -r requirements.txt
# python 2_convert_docs.py
102 changes: 102 additions & 0 deletions 2_convert_docs.py
@@ -0,0 +1,102 @@
# import datetime
import logging
import time
from pathlib import Path
from typing import Iterable
import json

from docling.datamodel.base_models import ( # type: ignore | pylance_cfg
ConversionStatus,
# FigureElement,
# PageElement,
# TableElement,
PipelineOptions,
)

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend # type: ignore | pylance_cfg
from docling.datamodel.document import ConversionResult, DocumentConversionInput # type: ignore | pylance_cfg
from docling.document_converter import DocumentConverter # type: ignore | pylance_cfg

_log = logging.getLogger(__name__)

IMAGE_RESOLUTION_SCALE = 2.0

def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
):
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0
    partial_success_count = 0

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = conv_res.input.file.stem

            # Export Deep Search document JSON format:
            with (output_dir / f"{doc_filename}.json").open("w") as fp:
                fp.write(json.dumps(conv_res.render_as_dict()))

            # Export Markdown format:
            with (output_dir / f"{doc_filename}.md").open("w") as fp:
                fp.write(conv_res.render_as_markdown())
        elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
            _log.info(
                f"Document {conv_res.input.file} was partially converted with the following errors:"
            )
            for item in conv_res.errors:
                _log.info(f"\t{item.error_message}")
            partial_success_count += 1
        else:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1

    _log.info(
        f"Processed {success_count + partial_success_count + failure_count} docs, "
        f"of which {failure_count} failed "
        f"and {partial_success_count} were partially converted."
    )
    return success_count, partial_success_count, failure_count

def main():
    logging.basicConfig(level=logging.INFO)

    directory_path = Path('./pdf-input')

    input_doc_paths = list(directory_path.glob('*.pdf'))

    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    doc_converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )

    input_docs = DocumentConversionInput.from_paths(input_doc_paths)

    start_time = time.time()

    conv_results = doc_converter.convert(input_docs)
    success_count, partial_success_count, failure_count = export_documents(
        conv_results, output_dir=Path("./pdf-output")
    )

    end_time = time.time() - start_time

    _log.info(f"All documents were converted in {end_time:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"Failed to convert {failure_count} of {len(input_doc_paths)} documents."
        )


if __name__ == "__main__":
    main()
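The Markdown written out above is what the later embedding step splits on headers. As a quick sanity check, a minimal sketch along these lines (assuming at least one converted file already exists under pdf-output/; the file picked, the loop limit, and the print format are purely illustrative) shows the header-based chunks that MarkdownHeaderTextSplitter will produce from a converted 10-K:

# sanity check: preview how a converted 10-K will be chunked in step 5 (hypothetical helper, not part of this commit)
from pathlib import Path

from langchain_text_splitters import MarkdownHeaderTextSplitter

md_path = next(Path("./pdf-output").glob("*.md"))  # any converted document
markdown_content = md_path.read_text(encoding="utf-8")

splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
)
chunks = splitter.split_text(markdown_content)

# each chunk carries the header trail it was split under in its metadata
for i, chunk in enumerate(chunks[:5]):
    print(i, chunk.metadata, chunk.page_content[:80])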
63 changes: 63 additions & 0 deletions 5_generate_embeddings_and_populate_vector_db.py
@@ -0,0 +1,63 @@
import os
import shutil
import chromadb
import logging
from pathlib import Path

from langchain_text_splitters import MarkdownHeaderTextSplitter

_log = logging.getLogger(__name__)

def gather_docs(path):
    docs = {}
    directory_path = Path(path)
    converted_doc_paths = list(directory_path.glob('*.md'))
    for doc in converted_doc_paths:
        with open(doc, 'r', encoding='utf-8') as file:
            markdown_content = file.read()
        docs[doc] = markdown_content
    return docs

def init_collection():
    try:
        client = chromadb.Client()
    except Exception as error:
        _log.error(f"Could not connect to the db: {error}")
        raise

    ten_k_collection = client.get_or_create_collection("10-Ks")
    return ten_k_collection

def chunk_document(document):
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    md_header_splits = markdown_splitter.split_text(document)
    return md_header_splits


def main():
    ten_k_collection = init_collection()
    docs = gather_docs("./pdf-output")
    for pdf_name, pdf_content in docs.items():  # iterate through every converted document
        doc_chunks = chunk_document(pdf_content)
        chunk_ids = []
        metadatas = []
        documents = []
        for chunk_index, chunk in enumerate(doc_chunks):
            chunk_ids.append(f"{pdf_name}.{chunk_index}")
            if not chunk.metadata:
                metadatas.append({"source": str(pdf_name)})
            else:
                chunk.metadata["source"] = str(pdf_name)
                metadatas.append(chunk.metadata)
            documents.append(chunk.page_content)

        ten_k_collection.add(documents=documents, ids=chunk_ids, metadatas=metadatas)


if __name__ == "__main__":
    main()
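Note that chromadb.Client() above creates an in-process, ephemeral store. If the intent is to populate the standalone server started by 5_run_chroma.sh instead, init_collection would need to connect over HTTP. A minimal sketch, assuming the server is up on chroma's default localhost:8000:

import chromadb

# connect to the chroma server started by 5_run_chroma.sh (assumed default host/port)
client = chromadb.HttpClient(host="localhost", port=8000)
ten_k_collection = client.get_or_create_collection("10-Ks")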
5 changes: 5 additions & 0 deletions 5_run_chroma.sh
@@ -0,0 +1,5 @@
dbpath=$(realpath chromadb)
cd "$dbpath"
# run the server in the background so the script can continue to the insert step
bash -c "chroma run --path $dbpath" &
sleep 5  # give the server a moment to come up
cd "$dbpath/.."
python3 5_generate_embeddings_and_populate_vector_db.py
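Once the collection is populated, retrieval is a single query call. A minimal sketch, assuming the chunks were inserted into the server started above and using a purely illustrative question; embeddings come from the collection's default embedding function on both insert and query:

import chromadb

# query the populated 10-Ks collection (assumed server defaults from 5_run_chroma.sh)
client = chromadb.HttpClient(host="localhost", port=8000)
collection = client.get_or_create_collection("10-Ks")

results = collection.query(
    query_texts=["What were the main risk factors disclosed?"],  # illustrative question
    n_results=3,
)
for doc, metadata in zip(results["documents"][0], results["metadatas"][0]):
    print(metadata["source"], doc[:120])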
Empty file added chromadb/.gitkeep
39 changes: 39 additions & 0 deletions chromadb/chroma.log
@@ -0,0 +1,39 @@
INFO: [06-09-2024 13:20:50] Set chroma_server_nofile to 65535
INFO: [06-09-2024 13:20:50] Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
DEBUG: [06-09-2024 13:20:50] Starting component System
DEBUG: [06-09-2024 13:20:50] Starting component OpenTelemetryClient
DEBUG: [06-09-2024 13:20:50] Starting component SqliteDB
DEBUG: [06-09-2024 13:20:50] Starting component QuotaEnforcer
DEBUG: [06-09-2024 13:20:50] Starting component Posthog
DEBUG: [06-09-2024 13:20:50] Starting component LocalSegmentManager
DEBUG: [06-09-2024 13:20:50] Starting component SegmentAPI
INFO: [06-09-2024 13:20:50] Started server process [93912]
INFO: [06-09-2024 13:20:50] Waiting for application startup.
INFO: [06-09-2024 13:20:50] Application startup complete.
INFO: [06-09-2024 13:20:50] Uvicorn running on http://localhost:8000 (Press CTRL+C to quit)
INFO: [06-09-2024 13:25:57] ::1:58985 - "GET /api/v1/tenants/default_tenant HTTP/1.1" 200
INFO: [06-09-2024 13:25:57] ::1:58985 - "GET /api/v1/databases/default_database?tenant=default_tenant HTTP/1.1" 200
INFO: [06-09-2024 19:52:25] Shutting down
INFO: [06-09-2024 19:52:25] Waiting for application shutdown.
DEBUG: [06-09-2024 19:52:25] Stopping component System
DEBUG: [06-09-2024 19:52:25] Stopping component SegmentAPI
DEBUG: [06-09-2024 19:52:25] Stopping component LocalSegmentManager
DEBUG: [06-09-2024 19:52:25] Stopping component Posthog
DEBUG: [06-09-2024 19:52:25] Stopping component QuotaEnforcer
DEBUG: [06-09-2024 19:52:25] Stopping component SqliteDB
DEBUG: [06-09-2024 19:52:25] Stopping component OpenTelemetryClient
INFO: [06-09-2024 19:52:25] Application shutdown complete.
INFO: [06-09-2024 19:52:25] Finished server process [93912]
INFO: [06-09-2024 19:52:28] Set chroma_server_nofile to 65535
INFO: [06-09-2024 19:52:28] Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
DEBUG: [06-09-2024 19:52:28] Starting component System
DEBUG: [06-09-2024 19:52:28] Starting component OpenTelemetryClient
DEBUG: [06-09-2024 19:52:28] Starting component SqliteDB
DEBUG: [06-09-2024 19:52:28] Starting component QuotaEnforcer
DEBUG: [06-09-2024 19:52:28] Starting component Posthog
DEBUG: [06-09-2024 19:52:28] Starting component LocalSegmentManager
DEBUG: [06-09-2024 19:52:28] Starting component SegmentAPI
INFO: [06-09-2024 19:52:28] Started server process [99645]
INFO: [06-09-2024 19:52:28] Waiting for application startup.
INFO: [06-09-2024 19:52:28] Application startup complete.
INFO: [06-09-2024 19:52:28] Uvicorn running on http://localhost:8000 (Press CTRL+C to quit)
Binary file added chromadb/chroma.sqlite3
Binary file not shown.
5 changes: 5 additions & 0 deletions notes.md
@@ -0,0 +1,5 @@
Consideration: should we be using multi-modal?
- Via docling we could export the tables themselves as images. Would this be a better format? It seems like a better storage format but not for queries. What is the best way for the model to get that kind of table data out?
- https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/multimodal_rag_langchain.ipynb has some examples
- for our implementation this would leverage the docling export figures example: https://github.com/DS4SD/docling/blob/main/examples/export_figures.py (rough sketch below)
- we would have to get the table images in the right insertion order for the chunks
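A rough sketch of what that figure/table export could look like, adapted from the linked export_figures example. The render_element_images call, its element_types argument, and the returned (element, image) pairs are taken from that example and are assumptions to verify against the docling version pinned in requirements.txt:

# hypothetical export step, adapted from docling's examples/export_figures.py
from pathlib import Path

from docling.datamodel.base_models import ConversionStatus, FigureElement, TableElement

def export_element_images(conv_res, output_dir: Path):
    """Save each figure/table of a successful conversion as a PNG (assumed docling API)."""
    if conv_res.status != ConversionStatus.SUCCESS:
        return
    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem
    # render_element_images is the call used in the upstream example; confirm it
    # exists with this signature in the installed docling release
    for idx, (element, image) in enumerate(
        conv_res.render_element_images(element_types=(FigureElement, TableElement))
    ):
        image_path = output_dir / f"{doc_filename}-element-{idx}.png"
        with image_path.open("wb") as fp:
            image.save(fp, "PNG")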
Empty file added pdf-output/.gitkeep
1 change: 1 addition & 0 deletions pdf-output/02da2a86-7bb9-4bcb-95ae-4ce27ea5e3bc.json

Large diffs are not rendered by default.
