Add media description feature using Azure Content Understanding #2195

Open · wants to merge 6 commits into base: main
1 change: 1 addition & 0 deletions .azdo/pipelines/azure-dev.yml
@@ -120,6 +120,7 @@ steps:
      DEPLOYMENT_TARGET: $(DEPLOYMENT_TARGET)
      AZURE_CONTAINER_APPS_WORKLOAD_PROFILE: $(AZURE_CONTAINER_APPS_WORKLOAD_PROFILE)
      USE_CHAT_HISTORY_BROWSER: $(USE_CHAT_HISTORY_BROWSER)
      USE_MEDIA_DESCRIBER_AZURE_CU: $(USE_MEDIA_DESCRIBER_AZURE_CU)
  - task: AzureCLI@2
    displayName: Deploy Application
    inputs:
1 change: 1 addition & 0 deletions .github/workflows/azure-dev.yml
@@ -103,6 +103,7 @@ jobs:
      DEPLOYMENT_TARGET: ${{ vars.DEPLOYMENT_TARGET }}
      AZURE_CONTAINER_APPS_WORKLOAD_PROFILE: ${{ vars.AZURE_CONTAINER_APPS_WORKLOAD_PROFILE }}
      USE_CHAT_HISTORY_BROWSER: ${{ vars.USE_CHAT_HISTORY_BROWSER }}
      USE_MEDIA_DESCRIBER_AZURE_CU: ${{ vars.USE_MEDIA_DESCRIBER_AZURE_CU }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
4 changes: 3 additions & 1 deletion CONTRIBUTING.md
@@ -22,7 +22,7 @@
- [Running unit tests](#running-unit-tests)
- [Running E2E tests](#running-e2e-tests)
- [Code Style](#code-style)
- [Adding new azd environment variables](#add-new-azd-environment-variables)
- [Adding new azd environment variables](#adding-new-azd-environment-variables)

## Code of Conduct

@@ -166,6 +166,8 @@

When adding new azd environment variables, please remember to update:

1. [main.parameters.json](./main.parameters.json)

Check failure on line 169 in CONTRIBUTING.md (GitHub Actions / Check Broken Relative Paths): Link ./main.parameters.json is broken.
1. [appEnvVariables in main.bicep](./main.bicep)

Check failure on line 170 in CONTRIBUTING.md (GitHub Actions / Check Broken Relative Paths): Link ./main.bicep is broken.
1. App Service's [azure.yaml](./azure.yaml)
1. [ADO pipeline](.azdo/pipelines/azure-dev.yml)
1. [GitHub workflows](.github/workflows/azure-dev.yml)
4 changes: 3 additions & 1 deletion README.md
@@ -92,7 +92,9 @@
- Azure AI Document Intelligence: S0 (Standard) tier using pre-built layout. Pricing per document page, sample documents have 261 pages total. [Pricing](https://azure.microsoft.com/pricing/details/form-recognizer/)
- Azure AI Search: Basic tier, 1 replica, free level of semantic search. Pricing per hour. [Pricing](https://azure.microsoft.com/pricing/details/search/)
- Azure Blob Storage: Standard tier with ZRS (Zone-redundant storage). Pricing per storage and read operations. [Pricing](https://azure.microsoft.com/pricing/details/storage/blobs/)
- Azure Cosmos DB: Serverless tier. Pricing per request unit and storage. [Pricing](https://azure.microsoft.com/pricing/details/cosmos-db/)
- Azure Cosmos DB: Only provisioned if you enabled [chat history with Cosmos DB](docs/deploy_features.md#enabling-persistent-chat-history-with-azure-cosmos-db). Serverless tier. Pricing per request unit and storage. [Pricing](https://azure.microsoft.com/pricing/details/cosmos-db/)
- Azure AI Vision: Only provisioned if you enabled [GPT-4 with vision](docs/gpt4v.md). Pricing per 1K transactions. [Pricing](https://azure.microsoft.com/en-us/pricing/details/cognitive-services/computer-vision/)

Check failure on line 96 in README.md (GitHub Actions / Check URLs Don't Have Locale): Link https://azure.microsoft.com/en-us/pricing/details/cognitive-services/computer-vision/ has locale.
- Azure AI Content Understanding: Only provisioned if you enabled [media description](docs/deploy_features.md#enabling-media-description-with-azure-content-understanding). Pricing per TODO. [Pricing](TODO)
- Azure Monitor: Pay-as-you-go tier. Costs based on data ingested. [Pricing](https://azure.microsoft.com/pricing/details/monitor/)

To reduce costs, you can switch to free SKUs for various services, but those SKUs have limitations.
19 changes: 14 additions & 5 deletions app/backend/prepdocs.py
@@ -7,6 +7,7 @@
from azure.core.credentials import AzureKeyCredential
from azure.core.credentials_async import AsyncTokenCredential
from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider
from rich.logging import RichHandler

from load_azd_env import load_azd_env
from prepdocslib.blobmanager import BlobManager
@@ -158,8 +159,10 @@ def setup_file_processors(
    local_pdf_parser: bool = False,
    local_html_parser: bool = False,
    search_images: bool = False,
    use_content_understanding: bool = False,
    content_understanding_endpoint: Union[str, None] = None,
):
    sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=search_images)
    sentence_text_splitter = SentenceTextSplitter()

    doc_int_parser: Optional[DocumentAnalysisParser] = None
    # check if Azure Document Intelligence credentials are provided
@@ -170,6 +173,8 @@
        doc_int_parser = DocumentAnalysisParser(
            endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/",
            credential=documentintelligence_creds,
            use_content_understanding=use_content_understanding,
            content_understanding_endpoint=content_understanding_endpoint,
        )

    pdf_parser: Optional[Parser] = None
@@ -241,8 +246,7 @@ async def main(strategy: Strategy, setup_index: bool = True):

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index.",
        epilog="Example: prepdocs.py '.\\data\*' -v",
        description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index."
    )
    parser.add_argument("files", nargs="?", help="Files to be processed")

@@ -295,17 +299,18 @@ async def main(strategy: Strategy, setup_index: bool = True):
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(format="%(message)s")
        logging.basicConfig(format="%(message)s", datefmt="[%X]", handlers=[RichHandler(rich_tracebacks=True)])
        # We only set the level to INFO for our logger,
        # to avoid seeing the noisy INFO level logs from the Azure SDKs
        logger.setLevel(logging.INFO)
        logger.setLevel(logging.DEBUG)

    load_azd_env()

    use_int_vectorization = os.getenv("USE_FEATURE_INT_VECTORIZATION", "").lower() == "true"
    use_gptvision = os.getenv("USE_GPT4V", "").lower() == "true"
    use_acls = os.getenv("AZURE_ADLS_GEN2_STORAGE_ACCOUNT") is not None
    dont_use_vectors = os.getenv("USE_VECTORS", "").lower() == "false"
    use_content_understanding = os.getenv("USE_MEDIA_DESCRIBER_AZURE_CU", "").lower() == "true"

    # Use the current user identity to connect to Azure services. See infra/main.bicep for role assignments.
    if tenant_id := os.getenv("AZURE_TENANT_ID"):
@@ -403,6 +408,8 @@ async def main(strategy: Strategy, setup_index: bool = True):
            local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER") == "true",
            local_html_parser=os.getenv("USE_LOCAL_HTML_PARSER") == "true",
            search_images=use_gptvision,
            use_content_understanding=use_content_understanding,
            content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"),
        )
        image_embeddings_service = setup_image_embeddings_service(
            azure_credential=azd_credential,
@@ -421,6 +428,8 @@
            search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
            use_acls=use_acls,
            category=args.category,
            use_content_understanding=use_content_understanding,
            content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"),
        )

    loop.run_until_complete(main(ingestion_strategy, setup_index=not args.remove and not args.removeall))
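As a quick reference, the following is a minimal sketch (not part of this diff) of how the two new environment toggles threaded through prepdocs.py above would be read. The endpoint value is a placeholder, and the final guard is purely illustrative; prepdocs.py itself does not raise when the endpoint is missing.

import os

# Mirrors the flags read in prepdocs.py in this diff.
use_content_understanding = os.getenv("USE_MEDIA_DESCRIBER_AZURE_CU", "").lower() == "true"
# Placeholder example value: "https://<your-resource>.cognitiveservices.azure.com"
content_understanding_endpoint = os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT")

# Illustrative guard only, not present in the script.
if use_content_understanding and not content_understanding_endpoint:
    raise ValueError("Set AZURE_CONTENTUNDERSTANDING_ENDPOINT when USE_MEDIA_DESCRIBER_AZURE_CU is true")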
2 changes: 1 addition & 1 deletion app/backend/prepdocslib/blobmanager.py
@@ -171,7 +171,7 @@ def sourcepage_from_file_page(cls, filename, page=0) -> str:

    @classmethod
    def blob_image_name_from_file_page(cls, filename, page=0) -> str:
        return os.path.splitext(os.path.basename(filename))[0] + f"-{page}" + ".png"
        return os.path.splitext(os.path.basename(filename))[0] + f"-{page+1}" + ".png"

    @classmethod
    def blob_name_from_file_name(cls, filename) -> str:
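The one-line change above switches generated image blob names from a 0-based to a 1-based page suffix. A standalone sketch of the resulting naming, using an arbitrary example filename:

import os

def image_blob_name(filename: str, page: int = 0) -> str:
    # Same expression as the updated blob_image_name_from_file_page.
    return os.path.splitext(os.path.basename(filename))[0] + f"-{page+1}" + ".png"

print(image_blob_name("data/Benefit_Options.pdf", page=0))  # Benefit_Options-1.png (previously Benefit_Options-0.png)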
109 changes: 109 additions & 0 deletions app/backend/prepdocslib/cu_image.py
@@ -0,0 +1,109 @@
import logging
from typing import Union

import aiohttp
from azure.core.credentials_async import AsyncTokenCredential
from azure.identity.aio import get_bearer_token_provider
from rich.progress import Progress
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed

logger = logging.getLogger("scripts")

CU_API_VERSION = "2024-12-01-preview"

PATH_ANALYZER_MANAGEMENT = "/analyzers/{analyzerId}"
PATH_ANALYZER_MANAGEMENT_OPERATION = "/analyzers/{analyzerId}/operations/{operationId}"

# Define Analyzer inference paths
PATH_ANALYZER_INFERENCE = "/analyzers/{analyzerId}:analyze"
PATH_ANALYZER_INFERENCE_GET_IMAGE = "/analyzers/{analyzerId}/results/{operationId}/images/{imageId}"

analyzer_name = "image_analyzer"
image_schema = {
"analyzerId": analyzer_name,
"name": "Image understanding",
"description": "Extract detailed structured information from images extracted from documents.",
"baseAnalyzerId": "prebuilt-image",
"scenario": "image",
"config": {"returnDetails": False},
"fieldSchema": {
"name": "ImageInformation",
"descriptions": "Description of image.",
"fields": {
"Description": {
"type": "string",
"description": "Description of the image. If the image has a title, start with the title. Include a 2-sentence summary. If the image is a chart, diagram, or table, include the underlying data in an HTML table tag, with accurate numbers. If the image is a chart, describe any axis or legends. The only allowed HTML tags are the table/thead/tr/td/tbody tags.",
},
},
},
}


class ContentUnderstandingManager:

    def __init__(self, endpoint: str, credential: Union[AsyncTokenCredential, str]):
        self.endpoint = endpoint
        self.credential = credential

    async def poll_api(self, session, poll_url, headers):

        @retry(stop=stop_after_attempt(60), wait=wait_fixed(2), retry=retry_if_exception_type(ValueError))
        async def poll():
            async with session.get(poll_url, headers=headers) as response:
                response.raise_for_status()
                response_json = await response.json()
                if response_json["status"] == "Failed":
                    raise Exception("Failed")
                if response_json["status"] == "Running":
                    raise ValueError("Running")
                return response_json

        return await poll()

    async def create_analyzer(self):
        logger.info("Creating analyzer '%s'...", image_schema["analyzerId"])

        token_provider = get_bearer_token_provider(self.credential, "https://cognitiveservices.azure.com/.default")
        token = await token_provider()
        headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
        params = {"api-version": CU_API_VERSION}
        analyzer_id = image_schema["analyzerId"]
        cu_endpoint = f"{self.endpoint}/contentunderstanding/analyzers/{analyzer_id}"
        async with aiohttp.ClientSession() as session:
            async with session.put(url=cu_endpoint, params=params, headers=headers, json=image_schema) as response:
                if response.status == 409:
                    logger.info("Analyzer '%s' already exists.", analyzer_id)
                    return
                elif response.status != 201:
                    data = await response.text()
                    logger.error("Error creating analyzer: %s", data)
                    response.raise_for_status()
                else:
                    poll_url = response.headers.get("Operation-Location")

            with Progress() as progress:
                progress.add_task("Creating analyzer...", total=None, start=False)
                await self.poll_api(session, poll_url, headers)

    async def describe_image(self, image_bytes) -> str:
        logger.info("Sending image to Azure Content Understanding service...")
        async with aiohttp.ClientSession() as session:
            token = await self.credential.get_token("https://cognitiveservices.azure.com/.default")
            headers = {"Authorization": "Bearer " + token.token}
            params = {"api-version": CU_API_VERSION}

            async with session.post(
                url=f"{self.endpoint}/contentunderstanding/analyzers/{analyzer_name}:analyze",
                params=params,
                headers=headers,
                data=image_bytes,
            ) as response:
                response.raise_for_status()
                poll_url = response.headers["Operation-Location"]

            with Progress() as progress:
                progress.add_task("Processing...", total=None, start=False)
                results = await self.poll_api(session, poll_url, headers)

            fields = results["result"]["contents"][0]["fields"]
            return fields["Description"]["valueString"]
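For context, here is a minimal usage sketch of the new ContentUnderstandingManager. It assumes an async Azure credential, a placeholder endpoint (in practice the value of AZURE_CONTENTUNDERSTANDING_ENDPOINT), a placeholder image file, and that it runs from app/backend so the prepdocslib import resolves; it is an illustration, not code from this PR.

import asyncio

from azure.identity.aio import AzureDeveloperCliCredential

from prepdocslib.cu_image import ContentUnderstandingManager


async def run() -> None:
    endpoint = "https://<your-resource>.cognitiveservices.azure.com"  # placeholder endpoint
    async with AzureDeveloperCliCredential() as credential:
        manager = ContentUnderstandingManager(endpoint, credential)
        await manager.create_analyzer()  # returns early (409) if the analyzer already exists
        with open("figure.png", "rb") as image_file:  # placeholder image extracted from a document
            description = await manager.describe_image(image_file.read())
        print(description)


if __name__ == "__main__":
    asyncio.run(run())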