From 12556223db572a3f1103d887f9519102481d1e19 Mon Sep 17 00:00:00 2001 From: Massimiliano Angelino Date: Fri, 8 Nov 2024 09:55:53 +0100 Subject: [PATCH] fix: fix corrupted notebook --- data_pipelines/03-document-extraction.ipynb | 124 ++++---------------- 1 file changed, 24 insertions(+), 100 deletions(-) diff --git a/data_pipelines/03-document-extraction.ipynb b/data_pipelines/03-document-extraction.ipynb index 7bc0edc..9cb025b 100644 --- a/data_pipelines/03-document-extraction.ipynb +++ b/data_pipelines/03-document-extraction.ipynb @@ -15,17 +15,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install -q amazon-textract-textractor[pdf]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9889c475-cef9-4276-840f-7aee770d48ac", - "metadata": {}, - "outputs": [], - "source": [ - "!uname -a" + "%pip install -q amazon-textract-textractor[pdf] pdf2image pydantic \"anthropic[bedrock]\"" ] }, { @@ -35,28 +25,7 @@ "metadata": {}, "outputs": [], "source": [ - "!sudo apt-get update -y 2> /dev/null" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bf5623fd-a3f8-4b9f-98d1-fedd4e26b33f", - "metadata": {}, - "outputs": [], - "source": [ - "!sudo apt install poppler-utils -y 2> /dev/null" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "28790f37-351a-4f36-a153-39d32cc86539", - "metadata": {}, - "outputs": [], - "source": [ - "# used by mazon-textract-textractor to visualize images with extraction results\n", - "%pip install -q pdf2image" + "!sudo apt-get update -y 2> /dev/null && sudo apt install poppler-utils -y 2> /dev/null" ] }, { @@ -66,7 +35,7 @@ "metadata": {}, "outputs": [], "source": [ - "ls raw_documents/" + "!ls raw_documents/" ] }, { @@ -76,7 +45,7 @@ "metadata": {}, "outputs": [], "source": [ - "ls raw_documents/prepared/" + "!ls raw_documents/prepared/" ] }, { @@ -86,7 +55,7 @@ "metadata": {}, "outputs": [], "source": [ - "ls raw_documents/prepared/Amazon/" + "!ls raw_documents/prepared/Amazon/" ] }, { @@ -176,26 +145,6 @@ "print(document.pages[4].to_markdown())" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "203a6eac-7de3-414d-93a4-2d769054e349", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -U -q pydantic 2> /dev/null" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c4a24b27-4948-4d0a-9543-c157e9344964", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -U -q \"anthropic[bedrock]\"" - ] - }, { "cell_type": "code", "execution_count": 17, @@ -444,7 +393,6 @@ "outputs": [], "source": [ "%%time\n", - "\n", "results = extract_docs_into_markdown(prepared_pdfs_metadata)" ] }, @@ -465,16 +413,7 @@ "metadata": {}, "outputs": [], "source": [ - "from utils.helpers import store_list_to_s3" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "29e061f8-b57f-4150-83ce-dd39568cdbd8", - "metadata": {}, - "outputs": [], - "source": [ + "from utils.helpers import store_list_to_s3\n", "ssm = boto3.client(\"ssm\")" ] }, @@ -485,27 +424,9 @@ "metadata": {}, "outputs": [], "source": [ - "s3_bucket_name_parameter = \"/AgenticLLMAssistantWorkshop/AgentDataBucketParameter\"" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "2245c6a7-1335-49c3-ab2f-cea1cbeb7b14", - "metadata": {}, - "outputs": [], - "source": [ + "s3_bucket_name_parameter = \"/AgenticLLMAssistantWorkshop/AgentDataBucketParameter\"\n", "s3_bucket_name = ssm.get_parameter(Name=s3_bucket_name_parameter)\n", - "s3_bucket_name = s3_bucket_name[\"Parameter\"][\"Value\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "c63a38cf-fbc1-4f29-97c5-3cf73af479a6", - "metadata": {}, - "outputs": [], - "source": [ + "s3_bucket_name = s3_bucket_name[\"Parameter\"][\"Value\"]\n", "processed_documents_s3_key = \"documents_processed.json\"" ] }, @@ -518,16 +439,6 @@ "source": [ "store_list_to_s3(s3_bucket_name, processed_documents_s3_key, results)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e40e3a7d-8b47-4b79-94eb-7e9c08624322", - "metadata": {}, - "outputs": [], - "source": [ - "results[0]" - ] } ], "metadata": { @@ -536,6 +447,19 @@ "language": "python", "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 5 - } \ No newline at end of file + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}