diff --git a/.azdo/pipelines/azure-dev.yml b/.azdo/pipelines/azure-dev.yml index 3495e06a45..8c61d4acaf 100644 --- a/.azdo/pipelines/azure-dev.yml +++ b/.azdo/pipelines/azure-dev.yml @@ -120,6 +120,7 @@ steps: DEPLOYMENT_TARGET: $(DEPLOYMENT_TARGET) AZURE_CONTAINER_APPS_WORKLOAD_PROFILE: $(AZURE_CONTAINER_APPS_WORKLOAD_PROFILE) USE_CHAT_HISTORY_BROWSER: $(USE_CHAT_HISTORY_BROWSER) + USE_MEDIA_DESCRIBER_AZURE_CU: $(USE_MEDIA_DESCRIBER_AZURE_CU) - task: AzureCLI@2 displayName: Deploy Application inputs: diff --git a/.github/workflows/azure-dev.yml b/.github/workflows/azure-dev.yml index 798e589413..f233b4a821 100644 --- a/.github/workflows/azure-dev.yml +++ b/.github/workflows/azure-dev.yml @@ -103,6 +103,7 @@ jobs: DEPLOYMENT_TARGET: ${{ vars.DEPLOYMENT_TARGET }} AZURE_CONTAINER_APPS_WORKLOAD_PROFILE: ${{ vars.AZURE_CONTAINER_APPS_WORKLOAD_PROFILE }} USE_CHAT_HISTORY_BROWSER: ${{ vars.USE_CHAT_HISTORY_BROWSER }} + USE_MEDIA_DESCRIBER_AZURE_CU: ${{ vars.USE_MEDIA_DESCRIBER_AZURE_CU }} steps: - name: Checkout uses: actions/checkout@v4 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 55b7be4e4c..8da97a9703 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -22,7 +22,7 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio - [Running unit tests](#running-unit-tests) - [Running E2E tests](#running-e2e-tests) - [Code Style](#code-style) -- [Adding new azd environment variables](#add-new-azd-environment-variables) +- [Adding new azd environment variables](#adding-new-azd-environment-variables) ## Code of Conduct @@ -166,6 +166,8 @@ If you followed the steps above to install the pre-commit hooks, then you can ju When adding new azd environment variables, please remember to update: +1. [main.parameters.json](./main.parameters.json) +1. [appEnvVariables in main.bicep](./main.bicep) 1. App Service's [azure.yaml](./azure.yaml) 1. [ADO pipeline](.azdo/pipelines/azure-dev.yml). 1. 
[Github workflows](.github/workflows/azure-dev.yml) diff --git a/README.md b/README.md index fc8c009bfe..b9e0aad4c9 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,9 @@ However, you can try the [Azure pricing calculator](https://azure.com/e/e3490de2 - Azure AI Document Intelligence: SO (Standard) tier using pre-built layout. Pricing per document page, sample documents have 261 pages total. [Pricing](https://azure.microsoft.com/pricing/details/form-recognizer/) - Azure AI Search: Basic tier, 1 replica, free level of semantic search. Pricing per hour. [Pricing](https://azure.microsoft.com/pricing/details/search/) - Azure Blob Storage: Standard tier with ZRS (Zone-redundant storage). Pricing per storage and read operations. [Pricing](https://azure.microsoft.com/pricing/details/storage/blobs/) -- Azure Cosmos DB: Serverless tier. Pricing per request unit and storage. [Pricing](https://azure.microsoft.com/pricing/details/cosmos-db/) +- Azure Cosmos DB: Only provisioned if you enabled [chat history with Cosmos DB](docs/deploy_features.md#enabling-persistent-chat-history-with-azure-cosmos-db). Serverless tier. Pricing per request unit and storage. [Pricing](https://azure.microsoft.com/pricing/details/cosmos-db/) +- Azure AI Vision: Only provisioned if you enabled [GPT-4 with vision](docs/gpt4v.md). Pricing per 1K transactions. [Pricing](https://azure.microsoft.com/en-us/pricing/details/cognitive-services/computer-vision/) +- Azure AI Content Understanding: Only provisioned if you enabled [media description](docs/deploy_features.md#enabling-media-description-with-azure-content-understanding). Pricing per TODO. [Pricing](TODO) - Azure Monitor: Pay-as-you-go tier. Costs based on data ingested. [Pricing](https://azure.microsoft.com/pricing/details/monitor/) To reduce costs, you can switch to free SKUs for various services, but those SKUs have limitations. 
diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index 54ec6f0dd9..31c7740866 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -7,6 +7,7 @@ from azure.core.credentials import AzureKeyCredential from azure.core.credentials_async import AsyncTokenCredential from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider +from rich.logging import RichHandler from load_azd_env import load_azd_env from prepdocslib.blobmanager import BlobManager @@ -158,8 +159,10 @@ def setup_file_processors( local_pdf_parser: bool = False, local_html_parser: bool = False, search_images: bool = False, + use_content_understanding: bool = False, + content_understanding_endpoint: Union[str, None] = None, ): - sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=search_images) + sentence_text_splitter = SentenceTextSplitter() doc_int_parser: Optional[DocumentAnalysisParser] = None # check if Azure Document Intelligence credentials are provided @@ -170,6 +173,8 @@ def setup_file_processors( doc_int_parser = DocumentAnalysisParser( endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/", credential=documentintelligence_creds, + use_content_understanding=use_content_understanding, + content_understanding_endpoint=content_understanding_endpoint, ) pdf_parser: Optional[Parser] = None @@ -241,8 +246,7 @@ async def main(strategy: Strategy, setup_index: bool = True): if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index.", - epilog="Example: prepdocs.py '.\\data\*' -v", + description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index." 
) parser.add_argument("files", nargs="?", help="Files to be processed") @@ -295,10 +299,10 @@ async def main(strategy: Strategy, setup_index: bool = True): args = parser.parse_args() if args.verbose: - logging.basicConfig(format="%(message)s") + logging.basicConfig(format="%(message)s", datefmt="[%X]", handlers=[RichHandler(rich_tracebacks=True)]) # We only set the level to INFO for our logger, # to avoid seeing the noisy INFO level logs from the Azure SDKs - logger.setLevel(logging.INFO) + logger.setLevel(logging.DEBUG) load_azd_env() @@ -306,6 +310,7 @@ async def main(strategy: Strategy, setup_index: bool = True): use_gptvision = os.getenv("USE_GPT4V", "").lower() == "true" use_acls = os.getenv("AZURE_ADLS_GEN2_STORAGE_ACCOUNT") is not None dont_use_vectors = os.getenv("USE_VECTORS", "").lower() == "false" + use_content_understanding = os.getenv("USE_MEDIA_DESCRIBER_AZURE_CU", "").lower() == "true" # Use the current user identity to connect to Azure services. See infra/main.bicep for role assignments. 
if tenant_id := os.getenv("AZURE_TENANT_ID"): @@ -403,6 +408,8 @@ async def main(strategy: Strategy, setup_index: bool = True): local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER") == "true", local_html_parser=os.getenv("USE_LOCAL_HTML_PARSER") == "true", search_images=use_gptvision, + use_content_understanding=use_content_understanding, + content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"), ) image_embeddings_service = setup_image_embeddings_service( azure_credential=azd_credential, @@ -421,6 +428,8 @@ async def main(strategy: Strategy, setup_index: bool = True): search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"), use_acls=use_acls, category=args.category, + use_content_understanding=use_content_understanding, + content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"), ) loop.run_until_complete(main(ingestion_strategy, setup_index=not args.remove and not args.removeall)) diff --git a/app/backend/prepdocslib/blobmanager.py b/app/backend/prepdocslib/blobmanager.py index e9f18e795a..e8d01dda52 100644 --- a/app/backend/prepdocslib/blobmanager.py +++ b/app/backend/prepdocslib/blobmanager.py @@ -171,7 +171,7 @@ def sourcepage_from_file_page(cls, filename, page=0) -> str: @classmethod def blob_image_name_from_file_page(cls, filename, page=0) -> str: - return os.path.splitext(os.path.basename(filename))[0] + f"-{page}" + ".png" + return os.path.splitext(os.path.basename(filename))[0] + f"-{page+1}" + ".png" @classmethod def blob_name_from_file_name(cls, filename) -> str: diff --git a/app/backend/prepdocslib/cu_image.py b/app/backend/prepdocslib/cu_image.py new file mode 100644 index 0000000000..7db2b49c7d --- /dev/null +++ b/app/backend/prepdocslib/cu_image.py @@ -0,0 +1,109 @@ +import logging +from typing import Union + +import aiohttp +from azure.core.credentials_async import AsyncTokenCredential +from azure.identity.aio import get_bearer_token_provider +from rich.progress import Progress +from tenacity import 
retry, retry_if_exception_type, stop_after_attempt, wait_fixed + +logger = logging.getLogger("scripts") + +CU_API_VERSION = "2024-12-01-preview" + +PATH_ANALYZER_MANAGEMENT = "/analyzers/{analyzerId}" +PATH_ANALYZER_MANAGEMENT_OPERATION = "/analyzers/{analyzerId}/operations/{operationId}" + +# Define Analyzer inference paths +PATH_ANALYZER_INFERENCE = "/analyzers/{analyzerId}:analyze" +PATH_ANALYZER_INFERENCE_GET_IMAGE = "/analyzers/{analyzerId}/results/{operationId}/images/{imageId}" + +analyzer_name = "image_analyzer" +image_schema = { + "analyzerId": analyzer_name, + "name": "Image understanding", + "description": "Extract detailed structured information from images extracted from documents.", + "baseAnalyzerId": "prebuilt-image", + "scenario": "image", + "config": {"returnDetails": False}, + "fieldSchema": { + "name": "ImageInformation", + "descriptions": "Description of image.", + "fields": { + "Description": { + "type": "string", + "description": "Description of the image. If the image has a title, start with the title. Include a 2-sentence summary. If the image is a chart, diagram, or table, include the underlying data in an HTML table tag, with accurate numbers. If the image is a chart, describe any axis or legends. 
The only allowed HTML tags are the table/thead/tr/td/tbody tags.", + }, + }, + }, +} + + +class ContentUnderstandingManager: + + def __init__(self, endpoint: str, credential: Union[AsyncTokenCredential, str]): + self.endpoint = endpoint + self.credential = credential + + async def poll_api(self, session, poll_url, headers): + + @retry(stop=stop_after_attempt(60), wait=wait_fixed(2), retry=retry_if_exception_type(ValueError)) + async def poll(): + async with session.get(poll_url, headers=headers) as response: + response.raise_for_status() + response_json = await response.json() + if response_json["status"] == "Failed": + raise Exception("Failed") + if response_json["status"] == "Running": + raise ValueError("Running") + return response_json + + return await poll() + + async def create_analyzer(self): + logger.info("Creating analyzer '%s'...", image_schema["analyzerId"]) + + token_provider = get_bearer_token_provider(self.credential, "https://cognitiveservices.azure.com/.default") + token = await token_provider() + headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"} + params = {"api-version": CU_API_VERSION} + analyzer_id = image_schema["analyzerId"] + cu_endpoint = f"{self.endpoint}/contentunderstanding/analyzers/{analyzer_id}" + async with aiohttp.ClientSession() as session: + async with session.put(url=cu_endpoint, params=params, headers=headers, json=image_schema) as response: + if response.status == 409: + logger.info("Analyzer '%s' already exists.", analyzer_id) + return + elif response.status != 201: + data = await response.text() + logger.error("Error creating analyzer: %s", data) + response.raise_for_status() + else: + poll_url = response.headers.get("Operation-Location") + + with Progress() as progress: + progress.add_task("Creating analyzer...", total=None, start=False) + await self.poll_api(session, poll_url, headers) + + async def describe_image(self, image_bytes) -> str: + logger.info("Sending image to Azure Content 
Understanding service...") + async with aiohttp.ClientSession() as session: + token = await self.credential.get_token("https://cognitiveservices.azure.com/.default") + headers = {"Authorization": "Bearer " + token.token} + params = {"api-version": CU_API_VERSION} + + async with session.post( + url=f"{self.endpoint}/contentunderstanding/analyzers/{analyzer_name}:analyze", + params=params, + headers=headers, + data=image_bytes, + ) as response: + response.raise_for_status() + poll_url = response.headers["Operation-Location"] + + with Progress() as progress: + progress.add_task("Processing...", total=None, start=False) + results = await self.poll_api(session, poll_url, headers) + + fields = results["result"]["contents"][0]["fields"] + return fields["DescriptionHTML"]["valueString"] diff --git a/app/backend/prepdocslib/figure_output.json b/app/backend/prepdocslib/figure_output.json new file mode 100644 index 0000000000..52177fd2c8 --- /dev/null +++ b/app/backend/prepdocslib/figure_output.json @@ -0,0 +1,127 @@ +"figures": [ + { + "id": "3.1", + "boundingRegions": [ + { + "pageNumber": 3, + "polygon": [ + 1.4703, + 2.8371, + 5.5409, + 2.8415, + 5.5381, + 6.6022, + 1.4681, + 6.5978 + ] + } + ], + "spans": [ + { + "offset": 801, + "length": 138 + } + ], + "elements": [ + "/paragraphs/6", + "/paragraphs/7" + ], + "caption": { + "content": "Global Financial Market Distribution (2023)", + "boundingRegions": [ + { + "pageNumber": 3, + "polygon": [ + 1.5745, + 2.1414, + 5.3702, + 2.1421, + 5.3701, + 2.7255, + 1.5744, + 2.7248 + ] + } + ], + "spans": [ + { + "offset": 822, + "length": 43 + } + ], + "elements": [ + "/paragraphs/5" + ] + } + }, + + + +!-- PageBreak -->\n\n\n## Introduction to Financial Markets\n\n\n
\n
Global Financial Market Distribution\n(2023)
\n\n\u2612\ + +{ + "@odata.context": "https://gptkb-g6kuptydvtma6.search.windows.net/indexes('gptkbindex')/$metadata#docs(*)", + "@odata.count": 13, + "@search.facets": { + "sourcefile": [ + { + "count": 13, + "value": "Financial Market Analysis Report 2023.pdf" + } + ] + }, + "value": [ + { + "@search.score": 1, + "content": "# Financial Market Analysis Report 2023\n\nAn In-Depth Exploration of Stocks, Cryptocurrencies, and Commodities\nPrepared by: Contoso Financial Analytics\n\n\n\n\n## Executive Summary\n\n. In this comprehensive report, Contoso Financial Analytics provides a\ndeep dive into the financial markets of 2023, focusing on the trends\nand fluctuations within stocks, cryptocurrencies, and commodities.\nOur analysis covers historical patterns, current market conditions, and\nfuture predictions, offering valuable insights for investors, analysts,\nand financial enthusiasts. This report leverages advanced data\nanalytics to present a clear picture of the complex interplay between\ndifferent financial markets and their potential trajectories\n\n\n\n\n## Introduction to Financial Markets\n\n\n
Title: Investment Portfolio Distribution\n\nType: chart\n\nDescription: This pie chart illustrates the distribution of an investment portfolio across four asset classes: Stocks, Bonds, Cryptocurrencies, and Commodities." + }, + { + "@search.score": 1, + "content": "-- PageBreak -->\n\n\n### Interplay Between Different Market Segments\n\n\n
S&P 500NASDAQBitcoinEthereumOilGold
S&P 5001
NASDAQ0.951
Bitcoin0.30.41
Ethereum0.350.450.91
Oil0.60.650.20.251
Gold-0.2-0.15-0.1-0.05-0.31
\n\n\nFinancial markets are interconnected, with movements in one segment often influencing others. This\nsection examines the correlations between stock indices, cryptocurrency prices, and commodity prices,\nrevealing how changes in one market can have ripple effects across the financial ecosystem.\n\n\n\n\n### Impact of Macroeconomic Factors\n\n\n
Title: Financial Market Trends (2018-2023)\n\nType: chart\n\n" + }, + { + "@search.score": 1, + "content": "-- PageBreak -->\n\n\n## Future Predictions and Trends\n\n\n
Title: Price Index Comparison (2024-2028)\n\nType: chart\n\nDescription: This bar chart compares the price indices of Oil, Bitcoin, and the S&P 500 from 2024 to 2028, with 2024 indexed to 100. Each year shows the relative price change for each asset class. Oil, Bitcoin, and the S&P 500 are represented by grey, orange, and blue bars, respectively. The chart indicates that all three assets increase in price over the years, with Bitcoin showing the most significant growth by 2028.\n\n| Year | Oil | Bitcoin | S&P 500 |\n|------|-----|---------|---------|\n| 2024 | 100 | 100 | 100 |\n| 2025 | 105 | 110 | 108 |\n| 2026 | 110 | 120 | 115 |\n| 2027 | 115 | 130 | 120 |\n| 2028 | 120 | 140 | 125 |
\n\n\nBased on historical data, current trends,\nand economic indicators, this section\npresents predictions for the future of\nfinancial markets. We explore potential\ntrajectories for stock indices,\ncryptocurrency values, and commodity\nprices, offering investors and analysts\nforesight into what the coming years\nmight hold." + }, + { + "@search.score": 1, + "content": "-- PageBreak -->\n\n\n### Impact of Macroeconomic Factors\n\n\n
Title: Financial Market Trends (2018-2023)\n\nType: chart\n\nDescription: This line chart illustrates the trends in interest rates, inflation data, and GDP growth from 2018 to 2023. The x-axis represents the years, while the y-axis shows the percentage values ranging from -5 to 5. The chart includes three lines: blue for interest rates, orange for inflation data, and gray for GDP growth. \n\n- **Interest Rates %**: \n - 2018: 2\n - 2019: 2\n - 2020: 1\n - 2021: 1\n - 2022: 2\n - 2023: 2\n\n- **Inflation Data %**: \n - 2018: 2\n - 2019: 1\n - 2020: 2\n - 2021: 3\n - 2022: 3\n - 2023: 2\n\n- **GDP Growth %**: \n - 2018: 3\n - 2019: 2\n - 2020: -4\n - 2021: 3\n - 2022: 2\n - 2023: 2\n\nThe chart shows a significant dip in GDP growth in 2020, likely due to economic disruptions, followed by a recovery in 2021. Inflation data shows a peak in 2021 and 2022, while interest rates remain relatively stable throughout the period.
\n\n\nMacroeconomic factors such as interest\nrates, inflation, and GDP growth play a\npivotal role in shaping financial " + }, + { + "@search.score": 1, + "content": "
\n\n\nMacroeconomic factors such as interest\nrates, inflation, and GDP growth play a\npivotal role in shaping financial markets.\nThis section analyzes how these factors\nhave influenced stock, cryptocurrency,\nand commodity markets over recent\nyears, providing insights into the\ncomplex relationship between the\neconomy and financial market\nperformance.\n\n\n\n\n## Future Predictions and Trends\n\n\n
Title: Price Index Comparison (2024-2028)\n\nType: chart\n\nDescription: This bar chart compares the price indices of Oil, Bitcoin, and the S&P 500 from 2024 to 2028, with 2024 indexed to 100. Each year shows the relative price change for each asset class. Oil, Bitcoin, and the S&P 500 are represented by grey, orange, and blue bars, respectively. The chart indicates that all three assets increase in price over the years, with Bitcoin showing the most significant growth by 2028.\n\n| Year | Oil | Bitcoin | S&P 500 |\n|------|-----|---------|---------|\n| 2024 | 100 | 100 | 100 |\n| 2025 | 105 | 110 | 108 |\n| 2026 | 110 | 120 | 115 |\n| 2027 | 115 | 130 | 120 |\n" + }, + { + "@search.score": 1, + "content": " We explore potential\ntrajectories for stock indices,\ncryptocurrency values, and commodity\nprices, offering investors and analysts\nforesight into what the coming years\nmight hold.\n\n\n\n\n## Conclusions\n\n. In conclusion, this report has traversed the multifaceted landscape of\nfinancial markets, shedding light on the intricate patterns and\ninterdependencies that define their behavior. From the volatility of\ncryptocurrencies to the steadiness of commodities, each segment\ntells a part of the story of our global economy. As Contoso Financial\nAnalytics, we are committed to providing our clients with the most\ncomprehensive and nuanced analysis, empowering them to make\ninformed financial decisions in an ever-evolving market.\n" + }, + { + "@search.score": 1, + "content": "as a new asset\nclass, captivating investors with their potential for\nhigh returns and their role in the future of finance.\nThis section explores the price dynamics of major\ncryptocurrencies like Bitcoin and Ethereum,\nanalyzing the factors driving their volatility and the\nimplications for the broader financial market.\n\n\n
Title: Cryptocurrency Legend\n\nType: diagram\n\nDescription: This image is a legend for a chart or graph, indicating the color coding for two cryptocurrencies. The blue line represents 'Bitconin' and the orange line represents 'Ethereum'. This legend is likely used to differentiate between the two data sets in a visual representation.
\n\n\n\n\n\n### Commodity Market Fluctuations\n\n\n
Title: Commodity Price Changes (2014-2022)\n\nType: chart\n\nDescription: This bar chart displays the annual percentage change in prices for three commodities: Wheat, Gold, and Oil, from 2014 to 2022. The horizontal axis represents the percentage change, ranging from -25% to 35%, while the vertical axis lists the years from 2014 to 2022. Each year has three bars representing Wheat (grey), Gold (orange), and Oil (blue)." + }, + { + "@search.score": 1, + "content": "2016 | ~5% | ~10% | ~-5% |\n| 2015 | ~-5% | ~0% | ~-10% |\n| 2014 | ~0% | ~5% | ~-20% |
\n\n\nCommodities such as oil, gold, and\nwheat are fundamental to the global\neconomy, influencing everything from\nenergy costs to food prices. This section\ndelves into the trends and factors\naffecting commodity prices, including\ngeopolitical events, supply-chain\ndisruptions, and environmental factors,\nproviding a comprehensive view of this\ncrucial market segment.\n\n\n\n\n### Interplay Between Different Market Segments\n\n\n
S&P 500NASDAQBitcoinEthereumOilGold
S&P 5001
NASDAQ0.951
Bitcoin0.30.41
Ethereum0.350.450.91
Oil0.60.650.20.251
Gold-0.2-0.15-0.1-0." + }, + { + "@search.score": 1, + "content": " Each\nsegment plays a crucial role in the overall economy, and their\ninteractions can have profound effects on global financial stability.\nThis section provides an overview of these segments and sets the\nstage for a detailed analysis\n\n\n\n\n## Stock Market Overview\n\n\n
Title: 5-Year Trend of the S&P 500 Index\n\nType: chart\n\nDescription: This line chart illustrates the 5-year trend of the S&P 500 Index from 2018 to 2022. The index shows a steady increase from 2018, peaking in 2021, followed by a decline in 2022. The y-axis represents the index value, ranging from 2000 to 5000, while the x-axis represents the years from 2018 to 2022.\n\n| Year | S&P 500 Index |\n|------|---------------|\n| 2018 | 2500 |\n| 2019 | 3000 |\n| 2020 | 3500 |\n| 2021 | 4500 |\n| 2022 | 4000 |
\n\n\nThe stock market is often considered the economy's\nheartbeat, reflecting corporate health and investor\nsentiment. Over the past five years, the S&P 500 index has\nexperienced significant volatility, with notable peaks and\ntroughs corresponding to various economic " + }, + { + "@search.score": 1, + "content": " Over the past five years, the S&P 500 index has\nexperienced significant volatility, with notable peaks and\ntroughs corresponding to various economic events. This\noverview examines the key factors that have influenced\nthe stock market's performance and what they indicate\nabout the economy's state\n\n\n\n\n## Cryptocurrency Market Dynamics\n\n\n
Title: Monthly Sales and Returns\n\nType: chart\n\nDescription: This line chart displays monthly sales and returns over a year. The blue line represents sales, which start at around 32,500 in January, peak at 42,500 in May, and end at 47,500 in December. The orange line represents returns, which remain relatively stable, starting at 2,500 in January and slightly increasing to 3,000 by December.\n\n| Month | Sales | Returns |\n|-------|--------|---------|\n| Jan | 32500 | 2500 |\n| Feb | 30000 | 2500 |\n| Mar | 35000 | 2500 |\n| Apr | 37500 | 2500 |\n| May | 42500 | 2500 |\n| Jun | 40000 | 2500 |\n| Jul | 37500 | 2500 |\n| Aug | 35000 | 2500 |\n| Sep | 40000 | 2500 |\n| Oct | 42500 | 2500 |\n| Nov | 45000 | 2750 |\n| " + }, + { + "@search.score": 1, + "content": "-- PageBreak -->\n\n\n## Introduction to Financial Markets\n\n\n
Title: Investment Portfolio Distribution\n\nType: chart\n\nDescription: This pie chart illustrates the distribution of an investment portfolio across four asset classes: Stocks, Bonds, Cryptocurrencies, and Commodities. The chart is divided into four colored sections, each representing a different asset class. Stocks are represented in blue, Bonds in orange, Cryptocurrencies in gray, and Commodities in yellow. The chart visually indicates the proportion of each asset class within the portfolio, though specific percentages are not provided.\n\n| Asset Class | Color |\n|-------------------|--------|\n| Stocks | Blue |\n| Bonds | Orange |\n| Cryptocurrencies | Gray |\n| Commodities | Yellow |
\n\n\nThe global financial market is a vast and intricate network of\nexchanges, instruments, and assets, ranging from traditional stocks\nand bonds to modern cryptocurrencies and commodities. Each\nsegment plays a crucial role in the overall economy, and their\ninteractions can have profound effects on global financial stability." + }, + { + "@search.score": 1, + "content": "-- PageBreak -->\n\n\n## Cryptocurrency Market Dynamics\n\n\n
Title: Monthly Sales and Returns\n\nType: chart\n\nDescription: This line chart displays monthly sales and returns over a year. The blue line represents sales, which start at around 32,500 in January, peak at 42,500 in May, and end at 47,500 in December. The orange line represents returns, which remain relatively stable, starting at 2,500 in January and slightly increasing to 3,000 by December.\n\n| Month | Sales | Returns |\n|-------|--------|---------|\n| Jan | 32500 | 2500 |\n| Feb | 30000 | 2500 |\n| Mar | 35000 | 2500 |\n| Apr | 37500 | 2500 |\n| May | 42500 | 2500 |\n| Jun | 40000 | 2500 |\n| Jul | 37500 | 2500 |\n| Aug | 35000 | 2500 |\n| Sep | 40000 | 2500 |\n| Oct | 42500 | 2500 |\n| Nov | 45000 | 2750 |\n| Dec | 47500 | 3000 |
\n\n\nCryptocurrencies have emerged as a new asset\nclass, captivating investors with their potential for\nhigh returns and their role in the future of finance.\nThis section explores the price dynamics of major\ncryptocurrencies like Bitcoin and Ethereum,\n" + }, + { + "@search.score": 1, + "content": "-- PageBreak -->\n\n\n### Commodity Market Fluctuations\n\n\n
Title: Commodity Price Changes (2014-2022)\n\nType: chart\n\nDescription: This bar chart displays the annual percentage change in prices for three commodities: Wheat, Gold, and Oil, from 2014 to 2022. The horizontal axis represents the percentage change, ranging from -25% to 35%, while the vertical axis lists the years from 2014 to 2022. Each year has three bars representing Wheat (grey), Gold (orange), and Oil (blue).\n\n| Year | Wheat (%) | Gold (%) | Oil (%) |\n|------|-----------|----------|---------|\n| 2022 | ~0% | ~5% | ~-5% |\n| 2021 | ~5% | ~-5% | ~30% |\n| 2020 | ~5% | ~5% | ~-20% |\n| 2019 | ~0% | ~15% | ~5% |\n| 2018 | ~-5% | ~0% | ~15% |\n| 2017 | ~0% | ~5% | ~10% |\n| 2016 | ~5% | ~10% | ~-5% |\n| 2015 | ~-5% | ~0% | ~-10% |\n| 2014 | ~0% | ~5% | ~-20% |
\n\n\nCommodities such as oil, gold, and\nwheat are fundamental to the global\neconomy, influencing everything from\nenergy costs to food prices." + } + ] + } diff --git a/app/backend/prepdocslib/filestrategy.py b/app/backend/prepdocslib/filestrategy.py index 55b24b6f3a..5f5a1f44af 100644 --- a/app/backend/prepdocslib/filestrategy.py +++ b/app/backend/prepdocslib/filestrategy.py @@ -2,6 +2,7 @@ from typing import List, Optional from .blobmanager import BlobManager +from .cu_image import ContentUnderstandingManager from .embeddings import ImageEmbeddings, OpenAIEmbeddings from .fileprocessor import FileProcessor from .listfilestrategy import File, ListFileStrategy @@ -50,6 +51,8 @@ def __init__( search_analyzer_name: Optional[str] = None, use_acls: bool = False, category: Optional[str] = None, + use_content_understanding: bool = False, + content_understanding_endpoint: Optional[str] = None, ): self.list_file_strategy = list_file_strategy self.blob_manager = blob_manager @@ -61,6 +64,8 @@ def __init__( self.search_info = search_info self.use_acls = use_acls self.category = category + self.use_content_understanding = use_content_understanding + self.content_understanding_endpoint = content_understanding_endpoint async def setup(self): search_manager = SearchManager( @@ -73,6 +78,10 @@ async def setup(self): ) await search_manager.create_index() + if self.use_content_understanding: + cu_manager = ContentUnderstandingManager(self.content_understanding_endpoint, self.search_info.credential) + await cu_manager.create_analyzer() + async def run(self): search_manager = SearchManager( self.search_info, self.search_analyzer_name, self.use_acls, False, self.embeddings diff --git a/app/backend/prepdocslib/page.py b/app/backend/prepdocslib/page.py index f12fe70b94..857235c571 100644 --- a/app/backend/prepdocslib/page.py +++ b/app/backend/prepdocslib/page.py @@ -3,7 +3,7 @@ class Page: A single page from a document Attributes: - page_num (int): Page number + page_num (int): Page 
number (0-indexed) offset (int): If the text of the entire Document was concatenated into a single string, the index of the first character on the page. For example, if page 1 had the text "hello" and page 2 had the text "world", the offset of page 2 is 5 ("hellow") text (str): The text of the page """ @@ -17,6 +17,10 @@ def __init__(self, page_num: int, offset: int, text: str): class SplitPage: """ A section of a page that has been split into a smaller chunk. + + Attributes: + page_num (int): Page number (0-indexed) + text (str): The text of the section """ def __init__(self, page_num: int, text: str): diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py index 6604110020..d7326004a8 100644 --- a/app/backend/prepdocslib/pdfparser.py +++ b/app/backend/prepdocslib/pdfparser.py @@ -1,13 +1,24 @@ import html +import io +import json import logging +from enum import Enum from typing import IO, AsyncGenerator, Union +import pymupdf from azure.ai.documentintelligence.aio import DocumentIntelligenceClient -from azure.ai.documentintelligence.models import DocumentTable +from azure.ai.documentintelligence.models import ( + AnalyzeDocumentRequest, + AnalyzeResult, + DocumentFigure, + DocumentTable, +) from azure.core.credentials import AzureKeyCredential from azure.core.credentials_async import AsyncTokenCredential +from PIL import Image from pypdf import PdfReader +from .cu_image import ContentUnderstandingManager from .page import Page from .parser import Parser @@ -39,59 +50,139 @@ class DocumentAnalysisParser(Parser): """ def __init__( - self, endpoint: str, credential: Union[AsyncTokenCredential, AzureKeyCredential], model_id="prebuilt-layout" + self, + endpoint: str, + credential: Union[AsyncTokenCredential, AzureKeyCredential], + model_id="prebuilt-layout", + use_content_understanding=True, + content_understanding_endpoint: str = None, ): self.model_id = model_id self.endpoint = endpoint self.credential = credential + 
self.use_content_understanding = use_content_understanding + self.content_understanding_endpoint = content_understanding_endpoint async def parse(self, content: IO) -> AsyncGenerator[Page, None]: logger.info("Extracting text from '%s' using Azure Document Intelligence", content.name) + cu_manager = ContentUnderstandingManager(self.content_understanding_endpoint, self.credential) async with DocumentIntelligenceClient( endpoint=self.endpoint, credential=self.credential ) as document_intelligence_client: - poller = await document_intelligence_client.begin_analyze_document( - model_id=self.model_id, analyze_request=content, content_type="application/octet-stream" - ) - form_recognizer_results = await poller.result() + # turn content into bytes + content_bytes = content.read() + if self.use_content_understanding: + poller = await document_intelligence_client.begin_analyze_document( + model_id="prebuilt-layout", + analyze_request=AnalyzeDocumentRequest(bytes_source=content_bytes), + output=["figures"], + features=["ocrHighResolution"], + output_content_format="markdown", + ) + doc_for_pymupdf = pymupdf.open(stream=io.BytesIO(content_bytes)) + else: + poller = await document_intelligence_client.begin_analyze_document( + model_id=self.model_id, analyze_request=content, content_type="application/octet-stream" + ) + form_recognizer_results: AnalyzeResult = await poller.result() offset = 0 - for page_num, page in enumerate(form_recognizer_results.pages): + pages_json = [] + for page in form_recognizer_results.pages: tables_on_page = [ table for table in (form_recognizer_results.tables or []) - if table.bounding_regions and table.bounding_regions[0].page_number == page_num + 1 + if table.bounding_regions and table.bounding_regions[0].page_number == page.page_number ] + figures_on_page = [] + if self.use_content_understanding: + figures_on_page = [ + figure + for figure in (form_recognizer_results.figures or []) + if figure.bounding_regions and 
figure.bounding_regions[0].page_number == page.page_number + ] + + class ObjectType(Enum): + NONE = -1 + TABLE = 0 + FIGURE = 1 # mark all positions of the table spans in the page page_offset = page.spans[0].offset page_length = page.spans[0].length - table_chars = [-1] * page_length - for table_id, table in enumerate(tables_on_page): + mask_chars = [(ObjectType.NONE, None)] * page_length + for table_idx, table in enumerate(tables_on_page): for span in table.spans: # replace all table spans with "table_id" in table_chars array for i in range(span.length): idx = span.offset - page_offset + i if idx >= 0 and idx < page_length: - table_chars[idx] = table_id + mask_chars[idx] = (ObjectType.TABLE, table_idx) + for figure_idx, figure in enumerate(figures_on_page): + for span in figure.spans: + # replace all figure spans with "figure_id" in figure_chars array + for i in range(span.length): + idx = span.offset - page_offset + i + if idx >= 0 and idx < page_length: + mask_chars[idx] = (ObjectType.FIGURE, figure_idx) # build page text by replacing characters in table spans with table html page_text = "" - added_tables = set() - for idx, table_id in enumerate(table_chars): - if table_id == -1: + added_objects = set() # set of object types todo mypy + for idx, mask_char in enumerate(mask_chars): + object_type, object_idx = mask_char + if object_type == ObjectType.NONE: page_text += form_recognizer_results.content[page_offset + idx] - elif table_id not in added_tables: - page_text += DocumentAnalysisParser.table_to_html(tables_on_page[table_id]) - added_tables.add(table_id) - - yield Page(page_num=page_num, offset=offset, text=page_text) + elif object_type == ObjectType.TABLE: + if mask_char not in added_objects: + page_text += DocumentAnalysisParser.table_to_html(tables_on_page[object_idx]) + added_objects.add(mask_char) + elif object_type == ObjectType.FIGURE: + if mask_char not in added_objects: + figure_html = await DocumentAnalysisParser.figure_to_html( + doc_for_pymupdf, 
cu_manager, figures_on_page[object_idx] + ) + page_text += figure_html + added_objects.add(mask_char) + # We remove these comments since they are not needed and skew the page numbers + page_text = page_text.replace("", "") + # We remove excess newlines at the beginning and end of the page + page_text = page_text.strip() + yield Page(page_num=page.page_number - 1, offset=offset, text=page_text) + # Serialize the page text to a JSON and save it locally + page_json = { + "page_num": page.page_number - 1, + "offset": offset, + "text": page_text, + } + pages_json.append(page_json) offset += len(page_text) - - @classmethod - def table_to_html(cls, table: DocumentTable): - table_html = "" + with open("pages.json", "w") as f: + json.dump(pages_json, f) + + @staticmethod + async def figure_to_html( + doc: pymupdf.Document, cu_manager: ContentUnderstandingManager, figure: DocumentFigure + ) -> str: + logger.info("Describing figure '%s'", figure.id) + for region in figure.bounding_regions: + # To learn more about bounding regions, see https://aka.ms/bounding-region + bounding_box = ( + region.polygon[0], # x0 (left) + region.polygon[1], # y0 (top + region.polygon[4], # x1 (right) + region.polygon[5], # y1 (bottom) + ) + page_number = figure.bounding_regions[0]["pageNumber"] # 1-indexed + cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box) + figure_description = await cu_manager.describe_image(cropped_img) + figure_title = (figure.caption and figure.caption.content) or "" + return f"
{figure_title}
{figure_description}
" + + @staticmethod + def table_to_html(table: DocumentTable): + table_html = "
" rows = [ sorted([cell for cell in table.cells if cell.row_index == i], key=lambda cell: cell.column_index) for i in range(table.row_count) @@ -107,5 +198,28 @@ def table_to_html(cls, table: DocumentTable): cell_spans += f" rowSpan={cell.row_span}" table_html += f"<{tag}{cell_spans}>{html.escape(cell.content)}" table_html += "" - table_html += "
" + table_html += "
" return table_html + + @staticmethod + def crop_image_from_pdf_page(doc: pymupdf.Document, page_number, bounding_box) -> bytes: + """ + Crops a region from a given page in a PDF and returns it as an image. + + :param pdf_path: Path to the PDF file. + :param page_number: The page number to crop from (0-indexed). + :param bounding_box: A tuple of (x0, y0, x1, y1) coordinates for the bounding box. + :return: A PIL Image of the cropped area. + """ + page = doc.load_page(page_number) + + # Cropping the page. The rect requires the coordinates in the format (x0, y0, x1, y1). + bbx = [x * 72 for x in bounding_box] + rect = pymupdf.Rect(bbx) + # 72 is the DPI ? what? explain this from CU + pix = page.get_pixmap(matrix=pymupdf.Matrix(300 / 72, 300 / 72), clip=rect) + + img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) + bytes_io = io.BytesIO() + img.save(bytes_io, format="PNG") + return bytes_io.getvalue() diff --git a/app/backend/prepdocslib/textsplitter.py b/app/backend/prepdocslib/textsplitter.py index 30b0c1ad77..2c39dff850 100644 --- a/app/backend/prepdocslib/textsplitter.py +++ b/app/backend/prepdocslib/textsplitter.py @@ -87,14 +87,13 @@ class SentenceTextSplitter(TextSplitter): Class that splits pages into smaller chunks. 
This is required because embedding models may not be able to analyze an entire page at once """ - def __init__(self, has_image_embeddings: bool, max_tokens_per_section: int = 500): + def __init__(self, max_tokens_per_section: int = 500): self.sentence_endings = STANDARD_SENTENCE_ENDINGS + CJK_SENTENCE_ENDINGS self.word_breaks = STANDARD_WORD_BREAKS + CJK_WORD_BREAKS self.max_section_length = DEFAULT_SECTION_LENGTH self.sentence_search_limit = 100 self.max_tokens_per_section = max_tokens_per_section self.section_overlap = int(self.max_section_length * DEFAULT_OVERLAP_PERCENT / 100) - self.has_image_embeddings = has_image_embeddings def split_page_by_max_tokens(self, page_num: int, text: str) -> Generator[SplitPage, None, None]: """ @@ -192,15 +191,15 @@ def find_page(offset): section_text = all_text[start:end] yield from self.split_page_by_max_tokens(page_num=find_page(start), text=section_text) - last_table_start = section_text.rfind(" 2 * self.sentence_search_limit and last_table_start > section_text.rfind(" 2 * self.sentence_search_limit and last_figure_start > section_text.rfind( + "=1.3.7 numpy>=1,<2.1.0 # Used by openai embeddings.create to optimize embeddings (but not required) tiktoken tenacity -azure-ai-documentintelligence +azure-ai-documentintelligence==1.0.0b4 azure-cognitiveservices-speech azure-cosmos azure-search-documents==11.6.0b6 @@ -31,3 +31,4 @@ types-beautifulsoup4 msgraph-sdk==1.1.0 openai-messages-token-helper python-dotenv +rich diff --git a/app/backend/requirements.txt b/app/backend/requirements.txt index 2efe32b484..c148bb04f8 100644 --- a/app/backend/requirements.txt +++ b/app/backend/requirements.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.11 +# This file is autogenerated by pip-compile with Python 3.12 # by the following command: # # pip-compile requirements.in @@ -24,7 +24,7 @@ asgiref==3.8.1 # via opentelemetry-instrumentation-asgi attrs==24.2.0 # via aiohttp -azure-ai-documentintelligence==1.0.0b3 
+azure-ai-documentintelligence==1.0.0b4 # via -r requirements.in azure-cognitiveservices-speech==1.40.0 # via -r requirements.in @@ -155,11 +155,15 @@ jinja2==3.1.4 # quart jiter==0.5.0 # via openai +markdown-it-py==3.0.0 + # via rich markupsafe==2.1.5 # via # jinja2 # quart # werkzeug +mdurl==0.1.2 + # via markdown-it-py microsoft-kiota-abstractions==1.3.3 # via # microsoft-kiota-authentication-azure @@ -338,6 +342,8 @@ pydantic==2.8.2 # via openai pydantic-core==2.20.1 # via pydantic +pygments==2.18.0 + # via rich pyjwt[crypto]==2.9.0 # via # -r requirements.in @@ -372,6 +378,8 @@ requests==2.32.3 # tiktoken requests-oauthlib==2.0.0 # via msrest +rich==13.9.4 + # via -r requirements.in six==1.16.0 # via # azure-core diff --git a/azure.yaml b/azure.yaml index fd673f48e0..d72dc2ff13 100644 --- a/azure.yaml +++ b/azure.yaml @@ -115,6 +115,7 @@ pipeline: - DEPLOYMENT_TARGET - AZURE_CONTAINER_APPS_WORKLOAD_PROFILE - USE_CHAT_HISTORY_BROWSER + - USE_MEDIA_DESCRIBER_AZURE_CU secrets: - AZURE_SERVER_APP_SECRET - AZURE_CLIENT_APP_SECRET diff --git a/docs/data_ingestion.md b/docs/data_ingestion.md index db64792029..0cf8c16417 100644 --- a/docs/data_ingestion.md +++ b/docs/data_ingestion.md @@ -69,7 +69,7 @@ A [recent change](https://github.com/Azure-Samples/azure-search-openai-demo/pull You may want to remove documents from the index. For example, if you're using the sample data, you may want to remove the documents that are already in the index before adding your own. -To remove all documents, use `scripts/prepdocs.sh --removeall` or `scripts/prepdocs.ps1 --removeall`. +To remove all documents, use `./scripts/prepdocs.sh --removeall` or `./scripts/prepdocs.ps1 --removeall`. You can also remove individual documents by using the `--remove` flag. Open either `scripts/prepdocs.sh` or `scripts/prepdocs.ps1` and replace `/data/*` with `/data/YOUR-DOCUMENT-FILENAME-GOES-HERE.pdf`. Then run `scripts/prepdocs.sh --remove` or `scripts/prepdocs.ps1 --remove`. 
diff --git a/docs/deploy_features.md b/docs/deploy_features.md index b1291a00b4..ea0c7e8288 100644 --- a/docs/deploy_features.md +++ b/docs/deploy_features.md @@ -7,6 +7,7 @@ You should typically enable these features before running `azd up`. Once you've * [Using GPT-4](#using-gpt-4) * [Using text-embedding-3 models](#using-text-embedding-3-models) * [Enabling GPT-4 Turbo with Vision](#enabling-gpt-4-turbo-with-vision) +* [Enabling media description with Azure Content Understanding](#enabling-media-description-with-azure-content-understanding) * [Enabling client-side chat history](#enabling-client-side-chat-history) * [Enabling persistent chat history with Azure Cosmos DB](#enabling-persistent-chat-history-with-azure-cosmos-db) * [Enabling language picker](#enabling-language-picker) @@ -149,8 +150,31 @@ If you have already deployed: ## Enabling GPT-4 Turbo with Vision +⚠️ This feature is not currently compatible with [integrated vectorization](#enabling-integrated-vectorization). + This section covers the integration of GPT-4 Vision with Azure AI Search. Learn how to enhance your search capabilities with the power of image and text indexing, enabling advanced search functionalities over diverse document types. For a detailed guide on setup and usage, visit our [Enabling GPT-4 Turbo with Vision](gpt4v.md) page. +## Enabling media description with Azure Content Understanding + +⚠️ This feature is not currently compatible with [integrated vectorization](#enabling-integrated-vectorization). +It is compatible with [GPT vision integration](./gpt4v.md), but the features provide similar functionality. + +By default, if your documents contain image-like figures, the data ingestion process will ignore those figures, +so users will not be able to ask questions about them. + +You can optionally enable the description of media content using Azure Content Understanding. 
When enabled, the data ingestion process will send figures to Azure Content Understanding and replace the figure with the description in the indexed document. +To learn more about this process and compare it to the gpt-4 vision integration, see [this guide](./data_ingestion.md#media-description). + +To enable media description with Azure Content Understanding, run: + +```shell +azd env set USE_MEDIA_DESCRIBER_AZURE_CU true +``` + +If you have already run `azd up`, you will need to run `azd provision` to create the new Content Understanding service. +If you have already indexed your documents and want to re-index them with the media descriptions, +first [remove the existing documents](./data_ingestion.md#removing-documents) and then [re-ingest the data](./data_ingestion.md#indexing-additional-documents). + ## Enabling client-side chat history This feature allows users to view the chat history of their conversation, stored in the browser using [IndexedDB](https://developer.mozilla.org/docs/Web/API/IndexedDB_API). That means the chat history will be available only on the device where the chat was initiated. To enable browser-stored chat history, run: @@ -215,6 +239,8 @@ azd env set USE_SPEECH_OUTPUT_BROWSER true ## Enabling Integrated Vectorization +⚠️ This feature is not currently compatible with the [GPT vision integration](./gpt4v.md). + Azure AI search recently introduced an [integrated vectorization feature in preview mode](https://techcommunity.microsoft.com/blog/azure-ai-services-blog/announcing-the-public-preview-of-integrated-vectorization-in-azure-ai-search/3960809). This feature is a cloud-based approach to data ingestion, which takes care of document format cracking, data extraction, chunking, vectorization, and indexing, all with Azure technologies. To enable integrated vectorization with this sample: @@ -238,8 +264,6 @@ To enable integrated vectorization with this sample: 4. 
You can view the resources such as the indexer and skillset in Azure Portal and monitor the status of the vectorization process. -⚠️ This feature is not currently compatible with the [GPT vision integration](./gpt4v.md). - ## Enabling authentication By default, the deployed Azure web app will have no authentication or access restrictions enabled, meaning anyone with routable network access to the web app can chat with your indexed data. If you'd like to automatically setup authentication and user login as part of the `azd up` process, see [this guide](./login_and_acl.md). diff --git a/infra/abbreviations.json b/infra/abbreviations.json index 5084711603..3673672a7e 100644 --- a/infra/abbreviations.json +++ b/infra/abbreviations.json @@ -29,6 +29,7 @@ "containerInstanceContainerGroups": "ci", "containerRegistryRegistries": "cr", "containerServiceManagedClusters": "aks-", + "cognitiveServicesContentUnderstanding": "cu-", "databricksWorkspaces": "dbw-", "dataFactoryFactories": "adf-", "dataLakeAnalyticsAccounts": "dla", diff --git a/infra/main.bicep b/infra/main.bicep index 344623df20..4431f6899f 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -117,6 +117,9 @@ param computerVisionResourceGroupName string = '' // Set in main.parameters.json param computerVisionResourceGroupLocation string = '' // Set in main.parameters.json param computerVisionSkuName string // Set in main.parameters.json +param contentUnderstandingServiceName string = '' // Set in main.parameters.json +param contentUnderstandingResourceGroupName string = '' // Set in main.parameters.json + param chatGptModelName string = '' param chatGptDeploymentName string = '' param chatGptDeploymentVersion string = '' @@ -216,6 +219,9 @@ param useVectors bool = false @description('Use Built-in integrated Vectorization feature of AI Search to vectorize and ingest documents') param useIntegratedVectorization bool = false +@description('Use media description feature with Azure Content Understanding during 
ingestion') +param useMediaDescriberAzureCU bool = true + @description('Enable user document upload feature') param useUserUpload bool = false param useLocalPdfParser bool = false @@ -267,6 +273,10 @@ resource computerVisionResourceGroup 'Microsoft.Resources/resourceGroups@2021-04 name: !empty(computerVisionResourceGroupName) ? computerVisionResourceGroupName : resourceGroup.name } +resource contentUnderstandingResourceGroup 'Microsoft.Resources/resourceGroups@2021-04-01' existing = if (!empty(contentUnderstandingResourceGroupName)) { + name: !empty(contentUnderstandingResourceGroupName) ? contentUnderstandingResourceGroupName : resourceGroup.name +} + resource searchServiceResourceGroup 'Microsoft.Resources/resourceGroups@2021-04-01' existing = if (!empty(searchServiceResourceGroupName)) { name: !empty(searchServiceResourceGroupName) ? searchServiceResourceGroupName : resourceGroup.name } @@ -392,6 +402,7 @@ var appEnvVariables = { AZURE_DOCUMENTINTELLIGENCE_SERVICE: documentIntelligence.outputs.name USE_LOCAL_PDF_PARSER: useLocalPdfParser USE_LOCAL_HTML_PARSER: useLocalHtmlParser + USE_MEDIA_DESCRIBER_AZURE_CU: useMediaDescriberAzureCU RUNNING_IN_PRODUCTION: 'true' } @@ -593,6 +604,28 @@ module computerVision 'br/public:avm/res/cognitive-services/account:0.7.2' = if } } + +module contentUnderstanding 'br/public:avm/res/cognitive-services/account:0.7.2' = if (useMediaDescriberAzureCU) { + name: 'content-understanding' + scope: contentUnderstandingResourceGroup + params: { + name: !empty(contentUnderstandingServiceName) + ? contentUnderstandingServiceName + : '${abbrs.cognitiveServicesContentUnderstanding}${resourceToken}' + kind: 'AIServices' + networkAcls: { + defaultAction: 'Allow' + } + customSubDomainName: !empty(contentUnderstandingServiceName) + ? 
contentUnderstandingServiceName + : '${abbrs.cognitiveServicesContentUnderstanding}${resourceToken}' + // Hard-coding to westus for now, due to limited availability and no overlap with Document Intelligence + location: 'westus' + tags: tags + sku: 'S0' + } +} + module speech 'br/public:avm/res/cognitive-services/account:0.7.2' = if (useSpeechOutputAzure) { name: 'speech-service' scope: speechResourceGroup diff --git a/infra/main.parameters.json b/infra/main.parameters.json index a7ba80373e..54541ca8a6 100644 --- a/infra/main.parameters.json +++ b/infra/main.parameters.json @@ -35,6 +35,12 @@ "computerVisionSkuName": { "value": "${AZURE_COMPUTER_VISION_SKU=S1}" }, + "contentUnderstandingServiceName": { + "value": "${AZURE_CONTENT_UNDERSTANDING_SERVICE}" + }, + "contentUnderstandingResourceGroupName": { + "value": "${AZURE_CONTENT_UNDERSTANDING_RESOURCE_GROUP}" + }, "documentIntelligenceServiceName": { "value": "${AZURE_DOCUMENTINTELLIGENCE_SERVICE}" }, @@ -289,6 +295,9 @@ }, "azureContainerAppsWorkloadProfile": { "value": "${AZURE_CONTAINER_APPS_WORKLOAD_PROFILE=Consumption}" + }, + "useMediaDescriberAzureCU": { + "value": "${USE_MEDIA_DESCRIBER_AZURE_CU=false}" } } } diff --git a/scripts/prepdocs.sh b/scripts/prepdocs.sh index b75a5efc96..ae277b149e 100755 --- a/scripts/prepdocs.sh +++ b/scripts/prepdocs.sh @@ -17,4 +17,4 @@ if [ $# -gt 0 ]; then additionalArgs="$@" fi -./.venv/bin/python ./app/backend/prepdocs.py './data/*' --verbose $additionalArgs +./.venv/bin/python ./app/backend/prepdocs.py './data/GPT4V_Examples/Financial Market Analysis Report 2023.pdf' --verbose $additionalArgs diff --git a/tests/snapshots/test_prepdocslib_textsplitter/test_pages_with_figures/split_pages_with_figures.json b/tests/snapshots/test_prepdocslib_textsplitter/test_pages_with_figures/split_pages_with_figures.json new file mode 100644 index 0000000000..72bddc1dbe --- /dev/null +++ b/tests/snapshots/test_prepdocslib_textsplitter/test_pages_with_figures/split_pages_with_figures.json 
@@ -0,0 +1,50 @@ +[ + { + "text": "# Financial Market Analysis Report 2023\n\nAn In-Depth Exploration of Stocks, Cryptocurrencies, and Commodities\nPrepared by: Contoso Financial Analytics## Executive Summary\n\n. In this comprehensive report, Contoso Financial Analytics provides a\ndeep dive into the financial markets of 2023, focusing on the trends\nand fluctuations within stocks, cryptocurrencies, and commodities.\nOur analysis covers historical patterns, current market conditions, and\nfuture predictions, offering valuable insights for investors, analysts,\nand financial enthusiasts. This report leverages advanced data\nanalytics to present a clear picture of the complex interplay between\ndifferent financial markets and their potential trajectories## Introduction to Financial Markets\n\n\n
Global Financial Market Distribution (2023)
This pie chart represents the distribution of investments across four categories: Stocks, Bonds, Cryptocurrencies, and Commodities. The chart is divided into four colored sections, each representing a different category.", + "page_num": 0 + }, + { + "text": "a clear picture of the complex interplay between\ndifferent financial markets and their potential trajectories## Introduction to Financial Markets\n\n\n
Global Financial Market Distribution (2023)
This pie chart represents the distribution of investments across four categories: Stocks, Bonds, Cryptocurrencies, and Commodities. The chart is divided into four colored sections, each representing a different category. Stocks are shown in blue, Bonds in orange, Cryptocurrencies in gray, and Commodities in yellow. The chart visually indicates the proportion of each investment type within a portfolio.

CategoryColor
StocksBlue
BondsOrange
CryptocurrenciesGray
CommoditiesYellow
\n\n\nThe global financial market is a vast and intricate network of\nexchanges, instruments, and assets, ranging from traditional stocks\nand bonds to modern cryptocurrencies and commodities. Each\nsegment plays a crucial role in the overall economy, and their\ninteractions can have profound effects on global financial stability.", + "page_num": 1 + }, + { + "text": " Each\nsegment plays a crucial role in the overall economy, and their\ninteractions can have profound effects on global financial stability.\nThis section provides an overview of these segments and sets the\nstage for a detailed analysis## Stock Market Overview\n\n\n

5-Year Trend of the S&P 500 Index

This line chart shows the trend of the S&P 500 Index over a five-year period from 2018 to 2022. The index starts at around 2500 in 2018, rises steadily to a peak of about 4500 in 2021, and then declines slightly to approximately 4000 in 2022.

YearS&P 500 Index
20182500
20193000
20203500
20214500
20224000
\n\n\nThe stock market is often considered the economy's\nheartbeat, reflecting corporate health and investor\nsentiment. Over the past five years, the S&P 500 index has\nexperienced significant volatility, with notable peaks and\ntroughs corresponding to various economic events.", + "page_num": 2 + }, + { + "text": " Over the past five years, the S&P 500 index has\nexperienced significant volatility, with notable peaks and\ntroughs corresponding to various economic events. This\noverview examines the key factors that have influenced\nthe stock market's performance and what they indicate\nabout the economy's state## Cryptocurrency Market Dynamics\n\n\n
Price Fluctuations of Bitcoin and Ethereum (Last 12 Months)

This line graph shows two data series over the months from January to December. The blue line represents a data series that starts at around 32,500 in January, peaks in May at about 42,500, dips in July, and then rises steadily to approximately 47,500 in December. The orange line represents a much lower data series, remaining relatively flat throughout the year, starting at around 2,500 in January and ending slightly above 2,500 in December.

\n\n\nCryptocurrencies have emerged as a new asset\nclass, captivating investors with their potential for\nhigh returns and their role in the future of finance.\nThis section explores the price dynamics of major\ncryptocurrencies like Bitcoin and Ethereum,\nanalyzing the ", + "page_num": 3 + }, + { + "text": "\nThis section explores the price dynamics of major\ncryptocurrencies like Bitcoin and Ethereum,\nanalyzing the factors driving their volatility and the\nimplications for the broader financial market.\n\n\n

The image shows a legend with two colored lines and labels. A blue line is labeled \"Bitconin\" and an orange line is labeled \"Ethereum.\" This legend is likely used to differentiate between two data sets or categories in a chart or graph, with \"Bitconin\" and \"Ethereum\" representing different entities or variables.
### Commodity Market Fluctuations\n\n\n
Price Changes of Oil, Gold, and Wheat
This is a horizontal bar chart showing the annual percentage change in prices for Wheat, Gold, and Oil from 2014 to 2022. The chart uses different colors to represent each commodity: gray for Wheat, orange for Gold, and blue for Oil. The x-axis represents the percentage change, ranging from -25% to 35%, while the y-axis lists the years from 2014 to 2022.", + "page_num": 4 + }, + { + "text": "
### Commodity Market Fluctuations\n\n\n
Price Changes of Oil, Gold, and Wheat
This is a horizontal bar chart showing the annual percentage change in prices for Wheat, Gold, and Oil from 2014 to 2022. The chart uses different colors to represent each commodity: gray for Wheat, orange for Gold, and blue for Oil. The x-axis represents the percentage change, ranging from -25% to 35%, while the y-axis lists the years from 2014 to 2022.\n\n\n\n\n\n\n\n\n\n\n\n\n
YearWheatGoldOil
20225%2%0%
20213%4%30%
20201%5%-20%
20192%3%10%
20180%1%15%
20174%2%5%
20163%6%-5%
20151%0%10%
20142%5%-10%
\n\n\nCommodities such as oil, gold, and\nwheat are fundamental to the ", + "page_num": 4 + }, + { + "text": "20151%0%10%\n20142%5%-10%\n
\n\n\nCommodities such as oil, gold, and\nwheat are fundamental to the global\neconomy, influencing everything from\nenergy costs to food prices. This section\ndelves into the trends and factors\naffecting commodity prices, including\ngeopolitical events, supply-chain\ndisruptions, and environmental factors,\nproviding a comprehensive view of this\ncrucial market segment.### Interplay Between Different Market Segments\n\n\n
\n\n
S&P 500NASDAQBitcoinEthereumOilGold
S&P 5001
NASDAQ0.951
Bitcoin0.30.41
Ethereum0.350.450.91
Oil0.60.650.20.251
Gold-0.", + "page_num": 5 + }, + { + "text": "### Interplay Between Different Market Segments\n\n\n
S&P 500NASDAQBitcoinEthereumOilGold
S&P 5001
NASDAQ0.951
Bitcoin0.30.41
Ethereum0.350.450.91
Oil0.60.650.20.251
Gold-0.2-0.15-0.1-0.05-0.31
\n\n\nFinancial markets are interconnected, with movements in one segment often influencing others. This\nsection examines the correlations between stock indices, cryptocurrency prices, and commodity prices,\nrevealing how changes in one market can have ripple effects across the financial ecosystem.### Impact of Macroeconomic Factors\n\n\n
Impact of Interest Rates, Inflation, and GDP Growth on Financial ", + "page_num": 6 + }, + { + "text": "### Impact of Macroeconomic Factors\n\n\n
Impact of Interest Rates, Inflation, and GDP Growth on Financial Markets

The image is a line graph titled \"On Financial Markets\" showing the trends of Interest Rates %, Inflation Data %, and GDP Growth % from 2018 to 2023. The graph has three lines representing each of these metrics over the years.

\n\n\n\n\n\n\n\n\n\n\n\n\n\n
YearInterest Rates %Inflation Data %GDP Growth %
2018223
201922.52
202011.5-4
20211.533
202223.52
20232.532.5
\n\n

The graph shows that GDP Growth % experienced a significant drop in 2020, while Inflation Data % and Interest Rates % remained relatively stable with slight fluctuations over the years.", + "page_num": 7 + }, + { + "text": "5

\n\n

The graph shows that GDP Growth % experienced a significant drop in 2020, while Inflation Data % and Interest Rates % remained relatively stable with slight fluctuations over the years.

\n\n\nMacroeconomic factors such as interest\nrates, inflation, and GDP growth play a\npivotal role in shaping financial markets.\nThis section analyzes how these factors\nhave influenced stock, cryptocurrency,\nand commodity markets over recent\nyears, providing insights into the\ncomplex relationship between the\neconomy and financial market\nperformance.## Future Predictions and Trends\n\n\n
Relative Growth Trends for S&P 500, Bitcoin, and Oil Prices (2024 Indexed to 100)

Prices (2024 Indexed to 100)

\n

This bar chart compares the indexed prices of Oil, Bitcoin, and the S&P 500 from 2024 to 2028, with 2024 set as the base year (indexed to 100). The chart shows the relative price changes over the years for each asset.

\n\n\n\n\n\n\n", + "page_num": 7 + }, + { + "text": "## Future Predictions and Trends\n\n\n
Relative Growth Trends for S&P 500, Bitcoin, and Oil Prices (2024 Indexed to 100)

Prices (2024 Indexed to 100)

\n

This bar chart compares the indexed prices of Oil, Bitcoin, and the S&P 500 from 2024 to 2028, with 2024 set as the base year (indexed to 100). The chart shows the relative price changes over the years for each asset.

\n
YearOilBitcoinS&P 500
2024100100100
\n\n\n\n\n\n\n\n\n\n\n
YearOilBitcoinS&P 500
2024100100100
2025105110108
2026110115112
2027115120116
2028120125120
\n\n\nBased on historical data, current trends,\nand economic indicators, this section\npresents predictions for the future of\nfinancial markets. We explore potential\ntrajectories for stock indices,\ncryptocurrency values, and commodity\nprices, offering investors and analysts\n", + "page_num": 8 + }, + { + "text": " We explore potential\ntrajectories for stock indices,\ncryptocurrency values, and commodity\nprices, offering investors and analysts\nforesight into what the coming years\nmight hold.## Conclusions\n\n. In conclusion, this report has traversed the multifaceted landscape of\nfinancial markets, shedding light on the intricate patterns and\ninterdependencies that define their behavior. From the volatility of\ncryptocurrencies to the steadiness of commodities, each segment\ntells a part of the story of our global economy. As Contoso Financial\nAnalytics, we are committed to providing our clients with the most\ncomprehensive and nuanced analysis, empowering them to make\ninformed financial decisions in an ever-evolving market.", + "page_num": 8 + } +] \ No newline at end of file diff --git a/tests/test-data/pages_with_figures.json b/tests/test-data/pages_with_figures.json new file mode 100644 index 0000000000..0b157c7f0c --- /dev/null +++ b/tests/test-data/pages_with_figures.json @@ -0,0 +1 @@ +[{"page_num": 0, "offset": 0, "text": "# Financial Market Analysis Report 2023\n\nAn In-Depth Exploration of Stocks, Cryptocurrencies, and Commodities\nPrepared by: Contoso Financial Analytics"}, {"page_num": 1, "offset": 150, "text": "## Executive Summary\n\n. In this comprehensive report, Contoso Financial Analytics provides a\ndeep dive into the financial markets of 2023, focusing on the trends\nand fluctuations within stocks, cryptocurrencies, and commodities.\nOur analysis covers historical patterns, current market conditions, and\nfuture predictions, offering valuable insights for investors, analysts,\nand financial enthusiasts. 
This report leverages advanced data\nanalytics to present a clear picture of the complex interplay between\ndifferent financial markets and their potential trajectories"}, {"page_num": 2, "offset": 716, "text": "## Introduction to Financial Markets\n\n\n
Global Financial Market Distribution (2023)
This pie chart represents the distribution of investments across four categories: Stocks, Bonds, Cryptocurrencies, and Commodities. The chart is divided into four colored sections, each representing a different category. Stocks are shown in blue, Bonds in orange, Cryptocurrencies in gray, and Commodities in yellow. The chart visually indicates the proportion of each investment type within a portfolio.

CategoryColor
StocksBlue
BondsOrange
CryptocurrenciesGray
CommoditiesYellow
\n\n\nThe global financial market is a vast and intricate network of\nexchanges, instruments, and assets, ranging from traditional stocks\nand bonds to modern cryptocurrencies and commodities. Each\nsegment plays a crucial role in the overall economy, and their\ninteractions can have profound effects on global financial stability.\nThis section provides an overview of these segments and sets the\nstage for a detailed analysis"}, {"page_num": 3, "offset": 1897, "text": "## Stock Market Overview\n\n\n

5-Year Trend of the S&P 500 Index

This line chart shows the trend of the S&P 500 Index over a five-year period from 2018 to 2022. The index starts at around 2500 in 2018, rises steadily to a peak of about 4500 in 2021, and then declines slightly to approximately 4000 in 2022.

YearS&P 500 Index
20182500
20193000
20203500
20214500
20224000
\n\n\nThe stock market is often considered the economy's\nheartbeat, reflecting corporate health and investor\nsentiment. Over the past five years, the S&P 500 index has\nexperienced significant volatility, with notable peaks and\ntroughs corresponding to various economic events. This\noverview examines the key factors that have influenced\nthe stock market's performance and what they indicate\nabout the economy's state"}, {"page_num": 4, "offset": 2937, "text": "## Cryptocurrency Market Dynamics\n\n\n
Price Fluctuations of Bitcoin and Ethereum (Last 12 Months)

This line graph shows two data series over the months from January to December. The blue line represents a data series that starts at around 32,500 in January, peaks in May at about 42,500, dips in July, and then rises steadily to approximately 47,500 in December. The orange line represents a much lower data series, remaining relatively flat throughout the year, starting at around 2,500 in January and ending slightly above 2,500 in December.

\n\n\nCryptocurrencies have emerged as a new asset\nclass, captivating investors with their potential for\nhigh returns and their role in the future of finance.\nThis section explores the price dynamics of major\ncryptocurrencies like Bitcoin and Ethereum,\nanalyzing the factors driving their volatility and the\nimplications for the broader financial market.\n\n\n

The image shows a legend with two colored lines and labels. A blue line is labeled \"Bitconin\" and an orange line is labeled \"Ethereum.\" This legend is likely used to differentiate between two data sets or categories in a chart or graph, with \"Bitconin\" and \"Ethereum\" representing different entities or variables.
"}, {"page_num": 5, "offset": 4243, "text": "### Commodity Market Fluctuations\n\n\n
Price Changes of Oil, Gold, and Wheat
This is a horizontal bar chart showing the annual percentage change in prices for Wheat, Gold, and Oil from 2014 to 2022. The chart uses different colors to represent each commodity: gray for Wheat, orange for Gold, and blue for Oil. The x-axis represents the percentage change, ranging from -25% to 35%, while the y-axis lists the years from 2014 to 2022.\n\n\n\n\n\n\n\n\n\n\n\n\n
YearWheatGoldOil
20225%2%0%
20213%4%30%
20201%5%-20%
20192%3%10%
20180%1%15%
20174%2%5%
20163%6%-5%
20151%0%10%
20142%5%-10%
\n\n\nCommodities such as oil, gold, and\nwheat are fundamental to the global\neconomy, influencing everything from\nenergy costs to food prices. This section\ndelves into the trends and factors\naffecting commodity prices, including\ngeopolitical events, supply-chain\ndisruptions, and environmental factors,\nproviding a comprehensive view of this\ncrucial market segment."}, {"page_num": 6, "offset": 5673, "text": "### Interplay Between Different Market Segments\n\n\n
S&P 500NASDAQBitcoinEthereumOilGold
S&P 5001
NASDAQ0.951
Bitcoin0.30.41
Ethereum0.350.450.91
Oil0.60.650.20.251
Gold-0.2-0.15-0.1-0.05-0.31
\n\n\nFinancial markets are interconnected, with movements in one segment often influencing others. This\nsection examines the correlations between stock indices, cryptocurrency prices, and commodity prices,\nrevealing how changes in one market can have ripple effects across the financial ecosystem."}, {"page_num": 7, "offset": 6695, "text": "### Impact of Macroeconomic Factors\n\n\n
Impact of Interest Rates, Inflation, and GDP Growth on Financial Markets

The image is a line graph titled \"On Financial Markets\" showing the trends of Interest Rates %, Inflation Data %, and GDP Growth % from 2018 to 2023. The graph has three lines representing each of these metrics over the years.

\n\n\n\n\n\n\n\n\n\n\n\n\n\n
YearInterest Rates %Inflation Data %GDP Growth %
2018223
201922.52
202011.5-4
20211.533
202223.52
20232.532.5
\n\n

The graph shows that GDP Growth % experienced a significant drop in 2020, while Inflation Data % and Interest Rates % remained relatively stable with slight fluctuations over the years.

\n\n\nMacroeconomic factors such as interest\nrates, inflation, and GDP growth play a\npivotal role in shaping financial markets.\nThis section analyzes how these factors\nhave influenced stock, cryptocurrency,\nand commodity markets over recent\nyears, providing insights into the\ncomplex relationship between the\neconomy and financial market\nperformance."}, {"page_num": 8, "offset": 8102, "text": "## Future Predictions and Trends\n\n\n
Relative Growth Trends for S&P 500, Bitcoin, and Oil Prices (2024 Indexed to 100)

Prices (2024 Indexed to 100)

\n

This bar chart compares the indexed prices of Oil, Bitcoin, and the S&P 500 from 2024 to 2028, with 2024 set as the base year (indexed to 100). The chart shows the relative price changes over the years for each asset.

\n\n\n\n\n\n\n\n\n\n\n\n
YearOilBitcoinS&P 500
2024100100100
2025105110108
2026110115112
2027115120116
2028120125120
\n\n\nBased on historical data, current trends,\nand economic indicators, this section\npresents predictions for the future of\nfinancial markets. We explore potential\ntrajectories for stock indices,\ncryptocurrency values, and commodity\nprices, offering investors and analysts\nforesight into what the coming years\nmight hold."}, {"page_num": 9, "offset": 9281, "text": "## Conclusions\n\n. In conclusion, this report has traversed the multifaceted landscape of\nfinancial markets, shedding light on the intricate patterns and\ninterdependencies that define their behavior. From the volatility of\ncryptocurrencies to the steadiness of commodities, each segment\ntells a part of the story of our global economy. As Contoso Financial\nAnalytics, we are committed to providing our clients with the most\ncomprehensive and nuanced analysis, empowering them to make\ninformed financial decisions in an ever-evolving market."}] diff --git a/tests/test_prepdocslib_textsplitter.py b/tests/test_prepdocslib_textsplitter.py index 87049d4dad..c71e15c826 100644 --- a/tests/test_prepdocslib_textsplitter.py +++ b/tests/test_prepdocslib_textsplitter.py @@ -17,13 +17,13 @@ def test_sentencetextsplitter_split_empty_pages(): - t = SentenceTextSplitter(has_image_embeddings=False) + t = SentenceTextSplitter() assert list(t.split_pages([])) == [] def test_sentencetextsplitter_split_small_pages(): - t = SentenceTextSplitter(has_image_embeddings=False) + t = SentenceTextSplitter() split_pages = list(t.split_pages(pages=[Page(page_num=0, offset=0, text="Not a large page")])) assert len(split_pages) == 1 @@ -33,7 +33,7 @@ def test_sentencetextsplitter_split_small_pages(): @pytest.mark.asyncio async def test_sentencetextsplitter_list_parse_and_split(tmp_path, snapshot): - text_splitter = SentenceTextSplitter(has_image_embeddings=False) + text_splitter = SentenceTextSplitter() pdf_parser = LocalPdfParser() for pdf in Path("data").glob("*.pdf"): shutil.copy(str(pdf.absolute()), tmp_path) @@ -98,7 +98,7 
@@ def pytest_generate_tests(metafunc): @pytest.mark.asyncio async def test_sentencetextsplitter_multilang(test_doc, tmp_path): - text_splitter = SentenceTextSplitter(has_image_embeddings=False) + text_splitter = SentenceTextSplitter() bpe = tiktoken.encoding_for_model(ENCODING_MODEL) pdf_parser = LocalPdfParser() @@ -133,7 +133,7 @@ async def test_sentencetextsplitter_multilang(test_doc, tmp_path): def test_split_tables(): - t = SentenceTextSplitter(has_image_embeddings=False) + t = SentenceTextSplitter() test_text_without_table = """Contoso Electronics is a leader in the aerospace industry, providing advanced electronic components for both commercial and military aircraft. We specialize in creating cutting- @@ -166,3 +166,23 @@ def test_split_tables(): assert "