Skip to content

Commit

Permalink
Merge pull request #17 from massi-ang/fix_corrupted_nb
Browse files Browse the repository at this point in the history
Fix corrupted notebook
  • Loading branch information
massi-ang authored Nov 8, 2024
2 parents bfe1daf + 1255622 commit 2c80136
Showing 1 changed file with 24 additions and 100 deletions.
124 changes: 24 additions & 100 deletions data_pipelines/03-document-extraction.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,7 @@
"metadata": {},
"outputs": [],
"source": [
"%pip install -q amazon-textract-textractor[pdf]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9889c475-cef9-4276-840f-7aee770d48ac",
"metadata": {},
"outputs": [],
"source": [
"!uname -a"
"%pip install -q amazon-textract-textractor[pdf] pdf2image pydantic \"anthropic[bedrock]\""
]
},
{
Expand All @@ -35,28 +25,7 @@
"metadata": {},
"outputs": [],
"source": [
"!sudo apt-get update -y 2> /dev/null"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bf5623fd-a3f8-4b9f-98d1-fedd4e26b33f",
"metadata": {},
"outputs": [],
"source": [
"!sudo apt install poppler-utils -y 2> /dev/null"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "28790f37-351a-4f36-a153-39d32cc86539",
"metadata": {},
"outputs": [],
"source": [
"# used by amazon-textract-textractor to visualize images with extraction results\n",
"%pip install -q pdf2image"
"!sudo apt-get update -y 2> /dev/null && sudo apt install poppler-utils -y 2> /dev/null"
]
},
{
Expand All @@ -66,7 +35,7 @@
"metadata": {},
"outputs": [],
"source": [
"ls raw_documents/"
"!ls raw_documents/"
]
},
{
Expand All @@ -76,7 +45,7 @@
"metadata": {},
"outputs": [],
"source": [
"ls raw_documents/prepared/"
"!ls raw_documents/prepared/"
]
},
{
Expand All @@ -86,7 +55,7 @@
"metadata": {},
"outputs": [],
"source": [
"ls raw_documents/prepared/Amazon/"
"!ls raw_documents/prepared/Amazon/"
]
},
{
Expand Down Expand Up @@ -176,26 +145,6 @@
"print(document.pages[4].to_markdown())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "203a6eac-7de3-414d-93a4-2d769054e349",
"metadata": {},
"outputs": [],
"source": [
"%pip install -U -q pydantic 2> /dev/null"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c4a24b27-4948-4d0a-9543-c157e9344964",
"metadata": {},
"outputs": [],
"source": [
"%pip install -U -q \"anthropic[bedrock]\""
]
},
{
"cell_type": "code",
"execution_count": 17,
Expand Down Expand Up @@ -444,7 +393,6 @@
"outputs": [],
"source": [
"%%time\n",
"\n",
"results = extract_docs_into_markdown(prepared_pdfs_metadata)"
]
},
Expand All @@ -465,16 +413,7 @@
"metadata": {},
"outputs": [],
"source": [
"from utils.helpers import store_list_to_s3"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "29e061f8-b57f-4150-83ce-dd39568cdbd8",
"metadata": {},
"outputs": [],
"source": [
"from utils.helpers import store_list_to_s3\n",
"ssm = boto3.client(\"ssm\")"
]
},
Expand All @@ -485,27 +424,9 @@
"metadata": {},
"outputs": [],
"source": [
"s3_bucket_name_parameter = \"/AgenticLLMAssistantWorkshop/AgentDataBucketParameter\""
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "2245c6a7-1335-49c3-ab2f-cea1cbeb7b14",
"metadata": {},
"outputs": [],
"source": [
"s3_bucket_name_parameter = \"/AgenticLLMAssistantWorkshop/AgentDataBucketParameter\"\n",
"s3_bucket_name = ssm.get_parameter(Name=s3_bucket_name_parameter)\n",
"s3_bucket_name = s3_bucket_name[\"Parameter\"][\"Value\"]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "c63a38cf-fbc1-4f29-97c5-3cf73af479a6",
"metadata": {},
"outputs": [],
"source": [
"s3_bucket_name = s3_bucket_name[\"Parameter\"][\"Value\"]\n",
"processed_documents_s3_key = \"documents_processed.json\""
]
},
Expand All @@ -518,16 +439,6 @@
"source": [
"store_list_to_s3(s3_bucket_name, processed_documents_s3_key, results)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e40e3a7d-8b47-4b79-94eb-7e9c08624322",
"metadata": {},
"outputs": [],
"source": [
"results[0]"
]
}
],
"metadata": {
Expand All @@ -536,6 +447,19 @@
"language": "python",
"name": "python3"
},
"nbformat": 4,
"nbformat_minor": 5
}
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit 2c80136

Please sign in to comment.