From cd112d1e9550ac096e438500f7c541b0b06dd6f6 Mon Sep 17 00:00:00 2001 From: Massimiliano Angelino Date: Fri, 8 Nov 2024 09:38:18 +0100 Subject: [PATCH] fix: fewer cells and missing variable --- ...emaker-jobs-connection-to-postgreSQL.ipynb | 5 +- .../02-download-raw-pdf-documents.ipynb | 8 +- data_pipelines/03-document-extraction.ipynb | 1093 ++++++++--------- ...ad-embeddings-into-aurora-postgreSQL.ipynb | 15 +- ...ad-sql-tables-into-aurora-postgreSQL.ipynb | 14 +- 5 files changed, 555 insertions(+), 580 deletions(-) diff --git a/data_pipelines/01-validate-sagemaker-jobs-connection-to-postgreSQL.ipynb b/data_pipelines/01-validate-sagemaker-jobs-connection-to-postgreSQL.ipynb index 02b1ab1..0eb1d79 100644 --- a/data_pipelines/01-validate-sagemaker-jobs-connection-to-postgreSQL.ipynb +++ b/data_pipelines/01-validate-sagemaker-jobs-connection-to-postgreSQL.ipynb @@ -84,7 +84,8 @@ "db_secret_arn = db_secret_arn[\"Parameter\"][\"Value\"]\n", "\n", "subnet_ids = ssm.get_parameter(Name=subnet_ids_parameter)\n", - "private_subnets_with_egress_ids = json.loads(subnet_ids[\"Parameter\"][\"Value\"])" + "private_subnets_with_egress_ids = json.loads(subnet_ids[\"Parameter\"][\"Value\"])\n", + "script_processor_container_uri = ssm.get_parameter(Name=script_processor_container_parameter)[\"Parameter\"][\"Value\"]" ] }, { @@ -175,7 +176,7 @@ "id": "dcb8bed1-bef3-4fd7-84b7-29dd4b418362", "metadata": {}, "source": [ - "## Attempt the same in a SageMaker processing job with VPC network config" + "## Attempt the same in a SageMaker processing job with VPC network config" ] }, { diff --git a/data_pipelines/02-download-raw-pdf-documents.ipynb b/data_pipelines/02-download-raw-pdf-documents.ipynb index 6ca6250..749a783 100644 --- a/data_pipelines/02-download-raw-pdf-documents.ipynb +++ b/data_pipelines/02-download-raw-pdf-documents.ipynb @@ -163,7 +163,7 @@ }, "outputs": [], "source": [ - "ls {raw_base_directory}/Amazon" + "!ls {raw_base_directory}/Amazon" ] }, { @@ -171,7 +171,7 @@ "id": "87a9f7bd-81e0-46b2-aa37-abddd1b0f1ec", "metadata": {}, "source": [ - "## Keep relevant pages\n", + "## Keep relevant pages\n", "\n", "Although you can run the full PDF documents through the solution, to optimize the extraction costs, we suggest that you select the relevant pages from each pdf documents." ] @@ -354,7 +354,7 @@ }, "outputs": [], "source": [ - "ls {prepared_base_directory}" + "!ls {prepared_base_directory}" ] }, { @@ -366,7 +366,7 @@ }, "outputs": [], "source": [ - "cat {prepared_base_directory}/metadata.json | python -m json.tool" + "!cat {prepared_base_directory}/metadata.json | python -m json.tool" ] } ], diff --git a/data_pipelines/03-document-extraction.ipynb b/data_pipelines/03-document-extraction.ipynb index d932e86..7bc0edc 100644 --- a/data_pipelines/03-document-extraction.ipynb +++ b/data_pipelines/03-document-extraction.ipynb @@ -1,554 +1,541 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "ff20787a-08a8-43b0-924f-b79e625fa775", - "metadata": {}, - "source": [ - "# 03 - Layout aware text extraction with Amazon Textract" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a2321573-2b17-40dc-afcf-6075f2a8489e", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q amazon-textract-textractor[pdf]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9889c475-cef9-4276-840f-7aee770d48ac", - "metadata": {}, - "outputs": [], - "source": [ - "!uname -a" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19435f95-fc3d-4769-a039-f6c48a625dbd", - "metadata": {}, - "outputs": [], - "source": [ - "!sudo apt-get update -y 2> /dev/null" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bf5623fd-a3f8-4b9f-98d1-fedd4e26b33f", - "metadata": {}, - "outputs": [], - "source": [ - "!sudo apt install poppler-utils -y 2> /dev/null" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "28790f37-351a-4f36-a153-39d32cc86539", - "metadata": {}, - "outputs": [], - "source": [ - "# used by mazon-textract-textractor to visualize images with extraction results\n", - "%pip install -q pdf2image" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20bd6519-22e6-41e4-8353-632c2d68da00", - "metadata": {}, - "outputs": [], - "source": [ - "ls raw_documents/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3b72e529-3bf4-4a46-a059-d488e8b8a8bd", - "metadata": {}, - "outputs": [], - "source": [ - "ls raw_documents/prepared/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6713d74-77c4-4c99-a991-bdbc83585199", - "metadata": {}, - "outputs": [], - "source": [ - "ls raw_documents/prepared/Amazon/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f68c5ca7-4632-4a01-8115-e285289fe1f5", - "metadata": {}, - "outputs": [], - "source": [ - "!python -m json.tool raw_documents/prepared/metadata.json" - ] - }, - { - "cell_type": "markdown", - "id": "9cb3114c-e1db-46dc-8234-f98f58a135f2", - "metadata": {}, - "source": [ - "## Extraction with textractor" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "363c90b1-b849-447f-a8c9-b64d9d655e73", - "metadata": {}, - "outputs": [], - "source": [ - "import sagemaker\n", - "\n", - "default_sagemaker_bucket = sagemaker.Session().default_bucket()\n", - "sagemaker_execution_role = sagemaker.get_execution_role()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "653a51ce-5f25-48f3-a2f9-ce7964732768", - "metadata": {}, - "outputs": [], - "source": [ - "import boto3\n", - "from textractor import Textractor\n", - "from textractor.data.constants import TextractFeatures\n", - "\n", - "region = boto3.session.Session().region_name\n", - "# extractor = Textractor(profile_name=\"default\")\n", - "extractor = Textractor(region_name=region)\n", - "\n", - "input_document = \"raw_documents/prepared/Amazon/annual_report_2022.pdf\"\n", - "\n", - "document = extractor.start_document_analysis(\n", - " file_source=input_document,\n", - " s3_upload_path=f\"s3://{default_sagemaker_bucket}/input_documents/\",\n", - " s3_output_path=f\"s3://{default_sagemaker_bucket}/output_documents/\",\n", - " features=[TextractFeatures.LAYOUT],\n", - " save_image=False\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "ada94249-6a6e-4bbd-a790-0d23fb4fe5c6", - "metadata": {}, - "outputs": [], - "source": [ - "document.document" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "118c3f36-30d6-4c01-8d10-0f9a30b998e2", - "metadata": {}, - "outputs": [], - "source": [ - "document.pages[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fff7a03c-b86f-49fa-b293-b2a056e2bb1f", - "metadata": {}, - "outputs": [], - "source": [ - "print(document.pages[4].to_markdown())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "203a6eac-7de3-414d-93a4-2d769054e349", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -U -q pydantic 2> /dev/null" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c4a24b27-4948-4d0a-9543-c157e9344964", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -U -q \"anthropic[bedrock]\"" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "2714569b-4cc8-481b-a90a-b7398ebf1c6d", - "metadata": {}, - "outputs": [], - "source": [ - "from anthropic import Anthropic\n", - "\n", - "anthropic_client = Anthropic()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e9d6a050-8afc-4c84-b732-0ef7386d0faf", - "metadata": {}, - "outputs": [], - "source": [ - "anthropic_client.count_tokens(document.pages[0].to_markdown())" - ] - }, - { - "cell_type": "markdown", - "id": "57f3357e-b7c5-43a3-b65c-ba40857ae080", - "metadata": {}, - "source": [ - "## Use LLM to review and improve the extracted document\n", - "\n", - "Here we use Anthropic Claude 3 models through Amazon Bedrock to improve the markdown file extracted by Amazon Textract further, so it is ready for the LLM to answer question properly later on." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "25fe312e-660a-4787-8124-02b17dbc1e60", - "metadata": {}, - "outputs": [], - "source": [ - "import boto3\n", - "import json\n", - "import logging\n", - "from botocore.exceptions import ClientError\n", - "\n", - "bedrock = boto3.client(\"bedrock\", region_name=\"us-west-2\")\n", - "bedrock_runtime = boto3.client(\"bedrock-runtime\", region_name=\"us-west-2\")" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "9e94a2ff-be70-4786-a61f-f6622ddbde4c", - "metadata": {}, - "outputs": [], - "source": [ - "# bedrock.list_foundation_models()" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "2b33773e-5fb2-4411-90d1-5bfbc726b4fa", - "metadata": {}, - "outputs": [], - "source": [ - "# llm_model_id = \"anthropic.claude-3-haiku-20240307-v1:0\"\n", - "llm_model_id = \"anthropic.claude-3-sonnet-20240229-v1:0\"" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "f638ca66-f904-46c0-b72d-809154f9f4ab", - "metadata": {}, - "outputs": [], - "source": [ - "logger = logging.getLogger(__name__)\n", - "logging.basicConfig(level=logging.INFO)\n", - "\n", - "\n", - "def generate_message(bedrock_runtime, model_id, system_prompt, messages, max_tokens):\n", - "\n", - " body=json.dumps(\n", - " {\n", - " \"anthropic_version\": \"bedrock-2023-05-31\",\n", - " \"max_tokens\": max_tokens,\n", - " \"system\": system_prompt,\n", - " \"messages\": messages\n", - " }\n", - " )\n", - " response = bedrock_runtime.invoke_model(body=body, modelId=model_id)\n", - " response_body = json.loads(response.get('body').read())\n", - "\n", - " return response_body\n", - "\n", - "\n", - "def call_llm(user_input, model_id, system_prompt, bedrock_runtime, max_tokens=1000):\n", - " \"\"\"Handle calls to Anthropic Claude message api.\"\"\"\n", - " try:\n", - " # Prompt with user turn only.\n", - " user_message = {\"role\": \"user\", \"content\": user_input}\n", - " messages = [user_message]\n", - " return generate_message(bedrock_runtime, model_id, system_prompt, messages, max_tokens)\n", - " except ClientError as err:\n", - " message=err.response[\"Error\"][\"Message\"]\n", - " logger.error(\"A client error occurred: %s\", message)\n", - " print(\"A client error occured: \" +\n", - " format(message))\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "cc681f37-82e9-4878-86cc-760122218b5f", - "metadata": {}, - "source": [ - "Below we test the help functions by calling the LLM" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5cae7812-d193-4e39-8a3e-a29884fef932", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "user_input = \"hello\"\n", - "system_prompt = \"reply in a friendly manner\"\n", - "\n", - "call_llm(user_input, llm_model_id, system_prompt, bedrock_runtime, max_tokens=1000)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "71d0306d-08fa-4b49-8ad3-05a0d97f0174", - "metadata": {}, - "outputs": [], - "source": [ - "user_prompt = \"\"\"\n", - "Improve the markdown while keeping all original information. Put the improved markdown inside a xml tags with no explanation:\n", - "\\n{markdown_doc}\n", - "\"\"\".strip()\n", - "\n", - "system_prompt = \"Your task is to review and improve the results of Amazon textract in markdown.\"\n", - "\n", - "\n", - "def improve_textract_markdown_output(document, llm_model_id):\n", - " improved_markdown = []\n", - " for i in range(len(document.pages)):\n", - " user_input = user_prompt.format(markdown_doc=document.pages[i].to_markdown())\n", - " result = call_llm(user_input, llm_model_id, system_prompt, bedrock_runtime, max_tokens=3000)\n", - " # Extract the text between the XML tags only.\n", - " improved_markdown.append(result[\"content\"][0][\"text\"].split(\"\")[-1].split(\"\")[0].strip())\n", - " return improved_markdown" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "ab6cea2e-2685-4dce-93ef-30a80636251f", - "metadata": {}, - "outputs": [], - "source": [ - "# res = improve_textract_markdown_output(document, llm_model_id)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c61b27b1-0fab-4456-801a-5ef7458d5629", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "raw_base_directory = \"raw_documents\"\n", - "prepared_base_directory = os.path.join(raw_base_directory, \"prepared/\")\n", - "prepared_base_directory" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "da319b54-e2a4-4c63-8955-7b72d81298be", - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open(\n", - " os.path.join(prepared_base_directory, \"metadata.json\"), \"r\"\n", - ") as prepared_pdfs_metadata_obj:\n", - " prepared_pdfs_metadata = json.load(prepared_pdfs_metadata_obj)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "14f40cf8-d02b-4fbf-b334-dd0cf3754c23", - "metadata": {}, - "outputs": [], - "source": [ - "prepared_pdfs_metadata" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "46f8e0ed-65fd-4a1d-9cba-ddb5846d6822", - "metadata": {}, - "outputs": [], - "source": [ - "def extract_pages_as_markdown(input_document):\n", - "\n", - " document = extractor.start_document_analysis(\n", - " file_source=input_document,\n", - " s3_upload_path=f\"s3://{default_sagemaker_bucket}/input_documents/\",\n", - " s3_output_path=f\"s3://{default_sagemaker_bucket}/output_documents/\",\n", - " features=[TextractFeatures.LAYOUT],\n", - " save_image=False\n", - " )\n", - "\n", - " res = improve_textract_markdown_output(document, llm_model_id)\n", - " pages = [{\"page\": indx, \"page_text\": text} for indx, text in enumerate(res)]\n", - " return pages\n", - "\n", - "\n", - "def extract_docs_into_markdown(docs_metadata):\n", - " results = []\n", - " for doc_meta in docs_metadata:\n", - " doc_result_with_metadata = {}\n", - " doc_result_with_metadata[\"metadata\"] = doc_meta\n", - " doc_result_with_metadata[\"name\"] = doc_meta[\"doc_url\"].split(\"/\")[-1]\n", - " doc_result_with_metadata[\"source_location\"] = doc_meta[\"doc_url\"]\n", - " doc_result_with_metadata[\"pages\"] = extract_pages_as_markdown(doc_meta[\"local_pdf_path\"])\n", - " results.append(doc_result_with_metadata)\n", - " return results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e37c8a74-a33f-4d7f-8b0f-400d7c8984b1", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "\n", - "results = extract_docs_into_markdown(prepared_pdfs_metadata)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bc6f950d-af99-4859-b6dc-94e5d541de26", - "metadata": {}, - "outputs": [], - "source": [ - "results[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "1199a24c-13db-4742-a95a-96290481d371", - "metadata": {}, - "outputs": [], - "source": [ - "from utils.helpers import store_list_to_s3" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "29e061f8-b57f-4150-83ce-dd39568cdbd8", - "metadata": {}, - "outputs": [], - "source": [ - "ssm = boto3.client(\"ssm\")" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "2c66762e-514c-422d-a4f9-cb851af9892e", - "metadata": {}, - "outputs": [], - "source": [ - "s3_bucket_name_parameter = \"/AgenticLLMAssistantWorkshop/AgentDataBucketParameter\"" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "2245c6a7-1335-49c3-ab2f-cea1cbeb7b14", - "metadata": {}, - "outputs": [], - "source": [ - "s3_bucket_name = ssm.get_parameter(Name=s3_bucket_name_parameter)\n", - "s3_bucket_name = s3_bucket_name[\"Parameter\"][\"Value\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "c63a38cf-fbc1-4f29-97c5-3cf73af479a6", - "metadata": {}, - "outputs": [], - "source": [ - "processed_documents_s3_key = \"documents_processed.json\"" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "1b6cc0b7-0abe-4f49-9282-ccce09fd70e8", - "metadata": {}, - "outputs": [], - "source": [ - "store_list_to_s3(s3_bucket_name, processed_documents_s3_key, results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e40e3a7d-8b47-4b79-94eb-7e9c08624322", - "metadata": {}, - "outputs": [], - "source": [ - "results[0]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "cells": [ + { + "cell_type": "markdown", + "id": "ff20787a-08a8-43b0-924f-b79e625fa775", + "metadata": {}, + "source": [ + "# 03 - Layout aware text extraction with Amazon Textract" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2321573-2b17-40dc-afcf-6075f2a8489e", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q amazon-textract-textractor[pdf]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9889c475-cef9-4276-840f-7aee770d48ac", + "metadata": {}, + "outputs": [], + "source": [ + "!uname -a" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19435f95-fc3d-4769-a039-f6c48a625dbd", + "metadata": {}, + "outputs": [], + "source": [ + "!sudo apt-get update -y 2> /dev/null" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf5623fd-a3f8-4b9f-98d1-fedd4e26b33f", + "metadata": {}, + "outputs": [], + "source": [ + "!sudo apt install poppler-utils -y 2> /dev/null" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28790f37-351a-4f36-a153-39d32cc86539", + "metadata": {}, + "outputs": [], + "source": [ + "# used by mazon-textract-textractor to visualize images with extraction results\n", + "%pip install -q pdf2image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20bd6519-22e6-41e4-8353-632c2d68da00", + "metadata": {}, + "outputs": [], + "source": [ + "ls raw_documents/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b72e529-3bf4-4a46-a059-d488e8b8a8bd", + "metadata": {}, + "outputs": [], + "source": [ + "ls raw_documents/prepared/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6713d74-77c4-4c99-a991-bdbc83585199", + "metadata": {}, + "outputs": [], + "source": [ + "ls raw_documents/prepared/Amazon/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f68c5ca7-4632-4a01-8115-e285289fe1f5", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m json.tool raw_documents/prepared/metadata.json" + ] + }, + { + "cell_type": "markdown", + "id": "9cb3114c-e1db-46dc-8234-f98f58a135f2", + "metadata": {}, + "source": [ + "## Extraction with textractor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "363c90b1-b849-447f-a8c9-b64d9d655e73", + "metadata": {}, + "outputs": [], + "source": [ + "import sagemaker\n", + "\n", + "default_sagemaker_bucket = sagemaker.Session().default_bucket()\n", + "sagemaker_execution_role = sagemaker.get_execution_role()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "653a51ce-5f25-48f3-a2f9-ce7964732768", + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "from textractor import Textractor\n", + "from textractor.data.constants import TextractFeatures\n", + "\n", + "region = boto3.session.Session().region_name\n", + "# extractor = Textractor(profile_name=\"default\")\n", + "extractor = Textractor(region_name=region)\n", + "\n", + "input_document = \"raw_documents/prepared/Amazon/annual_report_2022.pdf\"\n", + "\n", + "document = extractor.start_document_analysis(\n", + " file_source=input_document,\n", + " s3_upload_path=f\"s3://{default_sagemaker_bucket}/input_documents/\",\n", + " s3_output_path=f\"s3://{default_sagemaker_bucket}/output_documents/\",\n", + " features=[TextractFeatures.LAYOUT],\n", + " save_image=False\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "ada94249-6a6e-4bbd-a790-0d23fb4fe5c6", + "metadata": {}, + "outputs": [], + "source": [ + "document.document" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "118c3f36-30d6-4c01-8d10-0f9a30b998e2", + "metadata": {}, + "outputs": [], + "source": [ + "document.pages[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fff7a03c-b86f-49fa-b293-b2a056e2bb1f", + "metadata": {}, + "outputs": [], + "source": [ + "print(document.pages[4].to_markdown())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "203a6eac-7de3-414d-93a4-2d769054e349", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -U -q pydantic 2> /dev/null" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4a24b27-4948-4d0a-9543-c157e9344964", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -U -q \"anthropic[bedrock]\"" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "2714569b-4cc8-481b-a90a-b7398ebf1c6d", + "metadata": {}, + "outputs": [], + "source": [ + "from anthropic import Anthropic\n", + "\n", + "anthropic_client = Anthropic()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9d6a050-8afc-4c84-b732-0ef7386d0faf", + "metadata": {}, + "outputs": [], + "source": [ + "anthropic_client.count_tokens(document.pages[0].to_markdown())" + ] + }, + { + "cell_type": "markdown", + "id": "57f3357e-b7c5-43a3-b65c-ba40857ae080", + "metadata": {}, + "source": [ + "## Use LLM to review and improve the extracted document\n", + "\n", + "Here we use Anthropic Claude 3 models through Amazon Bedrock to improve the markdown file extracted by Amazon Textract further, so it is ready for the LLM to answer question properly later on." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "25fe312e-660a-4787-8124-02b17dbc1e60", + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "import json\n", + "import logging\n", + "from botocore.exceptions import ClientError\n", + "\n", + "bedrock = boto3.client(\"bedrock\", region_name=\"us-west-2\")\n", + "bedrock_runtime = boto3.client(\"bedrock-runtime\", region_name=\"us-west-2\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "9e94a2ff-be70-4786-a61f-f6622ddbde4c", + "metadata": {}, + "outputs": [], + "source": [ + "# bedrock.list_foundation_models()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "2b33773e-5fb2-4411-90d1-5bfbc726b4fa", + "metadata": {}, + "outputs": [], + "source": [ + "# llm_model_id = \"anthropic.claude-3-haiku-20240307-v1:0\"\n", + "llm_model_id = \"anthropic.claude-3-sonnet-20240229-v1:0\"" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "f638ca66-f904-46c0-b72d-809154f9f4ab", + "metadata": {}, + "outputs": [], + "source": [ + "logger = logging.getLogger(__name__)\n", + "logging.basicConfig(level=logging.INFO)\n", + "\n", + "\n", + "def generate_message(bedrock_runtime, model_id, system_prompt, messages, max_tokens):\n", + "\n", + " body=json.dumps(\n", + " {\n", + " \"anthropic_version\": \"bedrock-2023-05-31\",\n", + " \"max_tokens\": max_tokens,\n", + " \"system\": system_prompt,\n", + " \"messages\": messages\n", + " }\n", + " )\n", + " response = bedrock_runtime.invoke_model(body=body, modelId=model_id)\n", + " response_body = json.loads(response.get('body').read())\n", + "\n", + " return response_body\n", + "\n", + "\n", + "def call_llm(user_input, model_id, system_prompt, bedrock_runtime, max_tokens=1000):\n", + " \"\"\"Handle calls to Anthropic Claude message api.\"\"\"\n", + " try:\n", + " # Prompt with user turn only.\n", + " user_message = {\"role\": \"user\", \"content\": user_input}\n", + " messages = [user_message]\n", + " return generate_message(bedrock_runtime, model_id, system_prompt, messages, max_tokens)\n", + " except ClientError as err:\n", + " message=err.response[\"Error\"][\"Message\"]\n", + " logger.error(\"A client error occurred: %s\", message)\n", + " print(\"A client error occured: \" +\n", + " format(message))\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "cc681f37-82e9-4878-86cc-760122218b5f", + "metadata": {}, + "source": [ + "Below we test the help functions by calling the LLM" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cae7812-d193-4e39-8a3e-a29884fef932", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "user_input = \"hello\"\n", + "system_prompt = \"reply in a friendly manner\"\n", + "\n", + "call_llm(user_input, llm_model_id, system_prompt, bedrock_runtime, max_tokens=1000)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "71d0306d-08fa-4b49-8ad3-05a0d97f0174", + "metadata": {}, + "outputs": [], + "source": [ + "user_prompt = \"\"\"\n", + "Improve the markdown while keeping all original information. Put the improved markdown inside a xml tags with no explanation:\n", + "\\n{markdown_doc}\n", + "\"\"\".strip()\n", + "\n", + "system_prompt = \"Your task is to review and improve the results of Amazon textract in markdown.\"\n", + "\n", + "\n", + "def improve_textract_markdown_output(document, llm_model_id):\n", + " improved_markdown = []\n", + " for i in range(len(document.pages)):\n", + " user_input = user_prompt.format(markdown_doc=document.pages[i].to_markdown())\n", + " result = call_llm(user_input, llm_model_id, system_prompt, bedrock_runtime, max_tokens=3000)\n", + " # Extract the text between the XML tags only.\n", + " improved_markdown.append(result[\"content\"][0][\"text\"].split(\"\")[-1].split(\"\")[0].strip())\n", + " return improved_markdown" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "ab6cea2e-2685-4dce-93ef-30a80636251f", + "metadata": {}, + "outputs": [], + "source": [ + "# res = improve_textract_markdown_output(document, llm_model_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c61b27b1-0fab-4456-801a-5ef7458d5629", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "raw_base_directory = \"raw_documents\"\n", + "prepared_base_directory = os.path.join(raw_base_directory, \"prepared/\")\n", + "prepared_base_directory" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "da319b54-e2a4-4c63-8955-7b72d81298be", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "with open(\n", + " os.path.join(prepared_base_directory, \"metadata.json\"), \"r\"\n", + ") as prepared_pdfs_metadata_obj:\n", + " prepared_pdfs_metadata = json.load(prepared_pdfs_metadata_obj)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14f40cf8-d02b-4fbf-b334-dd0cf3754c23", + "metadata": {}, + "outputs": [], + "source": [ + "prepared_pdfs_metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "46f8e0ed-65fd-4a1d-9cba-ddb5846d6822", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_pages_as_markdown(input_document):\n", + "\n", + " document = extractor.start_document_analysis(\n", + " file_source=input_document,\n", + " s3_upload_path=f\"s3://{default_sagemaker_bucket}/input_documents/\",\n", + " s3_output_path=f\"s3://{default_sagemaker_bucket}/output_documents/\",\n", + " features=[TextractFeatures.LAYOUT],\n", + " save_image=False\n", + " )\n", + "\n", + " res = improve_textract_markdown_output(document, llm_model_id)\n", + " pages = [{\"page\": indx, \"page_text\": text} for indx, text in enumerate(res)]\n", + " return pages\n", + "\n", + "\n", + "def extract_docs_into_markdown(docs_metadata):\n", + " results = []\n", + " for doc_meta in docs_metadata:\n", + " doc_result_with_metadata = {}\n", + " doc_result_with_metadata[\"metadata\"] = doc_meta\n", + " doc_result_with_metadata[\"name\"] = doc_meta[\"doc_url\"].split(\"/\")[-1]\n", + " doc_result_with_metadata[\"source_location\"] = doc_meta[\"doc_url\"]\n", + " doc_result_with_metadata[\"pages\"] = extract_pages_as_markdown(doc_meta[\"local_pdf_path\"])\n", + " results.append(doc_result_with_metadata)\n", + " return results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e37c8a74-a33f-4d7f-8b0f-400d7c8984b1", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "results = extract_docs_into_markdown(prepared_pdfs_metadata)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc6f950d-af99-4859-b6dc-94e5d541de26", + "metadata": {}, + "outputs": [], + "source": [ + "results[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "1199a24c-13db-4742-a95a-96290481d371", + "metadata": {}, + "outputs": [], + "source": [ + "from utils.helpers import store_list_to_s3" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "29e061f8-b57f-4150-83ce-dd39568cdbd8", + "metadata": {}, + "outputs": [], + "source": [ + "ssm = boto3.client(\"ssm\")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "2c66762e-514c-422d-a4f9-cb851af9892e", + "metadata": {}, + "outputs": [], + "source": [ + "s3_bucket_name_parameter = \"/AgenticLLMAssistantWorkshop/AgentDataBucketParameter\"" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "2245c6a7-1335-49c3-ab2f-cea1cbeb7b14", + "metadata": {}, + "outputs": [], + "source": [ + "s3_bucket_name = ssm.get_parameter(Name=s3_bucket_name_parameter)\n", + "s3_bucket_name = s3_bucket_name[\"Parameter\"][\"Value\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "c63a38cf-fbc1-4f29-97c5-3cf73af479a6", + "metadata": {}, + "outputs": [], + "source": [ + "processed_documents_s3_key = \"documents_processed.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "1b6cc0b7-0abe-4f49-9282-ccce09fd70e8", + "metadata": {}, + "outputs": [], + "source": [ + "store_list_to_s3(s3_bucket_name, processed_documents_s3_key, results)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e40e3a7d-8b47-4b79-94eb-7e9c08624322", + "metadata": {}, + "outputs": [], + "source": [ + "results[0]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "nbformat": 4, + "nbformat_minor": 5 + } \ No newline at end of file diff --git a/data_pipelines/04-create-and-load-embeddings-into-aurora-postgreSQL.ipynb b/data_pipelines/04-create-and-load-embeddings-into-aurora-postgreSQL.ipynb index 934b3bb..3b1472f 100644 --- a/data_pipelines/04-create-and-load-embeddings-into-aurora-postgreSQL.ipynb +++ b/data_pipelines/04-create-and-load-embeddings-into-aurora-postgreSQL.ipynb @@ -33,7 +33,6 @@ "source": [ "import json\n", "import os\n", - "\n", "import boto3" ] }, @@ -75,16 +74,10 @@ "private_subnets_with_egress_ids = json.loads(subnet_ids[\"Parameter\"][\"Value\"])\n", "\n", "s3_bucket_name = ssm.get_parameter(Name=s3_bucket_name_parameter)\n", - "s3_bucket_name = s3_bucket_name[\"Parameter\"][\"Value\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "42dac5f7-97f4-4f4f-9869-9e7b68a23e5b", - "metadata": {}, - "outputs": [], - "source": [ + "s3_bucket_name = s3_bucket_name[\"Parameter\"][\"Value\"]\n", + "\n", + "script_processor_container_uri = ssm.get_parameter(Name=script_processor_container_parameter)[\"Parameter\"][\"Value\"]\n", + "\n", "processed_documents_s3_key = \"documents_processed.json\"" ] }, diff --git a/data_pipelines/05-load-sql-tables-into-aurora-postgreSQL.ipynb b/data_pipelines/05-load-sql-tables-into-aurora-postgreSQL.ipynb index 086e95f..392625e 100644 --- a/data_pipelines/05-load-sql-tables-into-aurora-postgreSQL.ipynb +++ b/data_pipelines/05-load-sql-tables-into-aurora-postgreSQL.ipynb @@ -75,16 +75,10 @@ "private_subnets_with_egress_ids = json.loads(subnet_ids[\"Parameter\"][\"Value\"])\n", "\n", "s3_bucket_name = ssm.get_parameter(Name=s3_bucket_name_parameter)\n", - "s3_bucket_name = s3_bucket_name[\"Parameter\"][\"Value\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "42dac5f7-97f4-4f4f-9869-9e7b68a23e5b", - "metadata": {}, - "outputs": [], - "source": [ + "s3_bucket_name = s3_bucket_name[\"Parameter\"][\"Value\"]\n", + "\n", + "processed_documents_s3_key = \"documents_processed.json\"\n", + "\n", "sql_tables_s3_key = \"structured_metadata\"" ] },