diff --git a/DocumentVectorPipelineFunctions/BlobTriggerFunction.cs b/DocumentVectorPipelineFunctions/BlobTriggerFunction.cs index a2df548..fac17c2 100644 --- a/DocumentVectorPipelineFunctions/BlobTriggerFunction.cs +++ b/DocumentVectorPipelineFunctions/BlobTriggerFunction.cs @@ -1,3 +1,5 @@ +using System.ClientModel; +using System.Net; using Azure; using Azure.AI.FormRecognizer.DocumentAnalysis; using Azure.Storage.Blobs; @@ -20,9 +22,11 @@ public class BlobTriggerFunction( private const string AzureOpenAIModelDeploymentDimensionsName = "AzureOpenAIModelDimensions"; private static readonly int DefaultDimensions = 1536; - private static readonly int BufferSize = 4 * 1024 * 1024; // 4MB - private const int MaxBatchSize = 2048; + private const int MaxRetryCount = 100; + private const int RetryDelay = 10 * 1000; // 10 seconds + + private const int MaxBatchSize = 25; [Function("BlobTriggerFunction")] public async Task Run([BlobTrigger("documents/{name}", Connection = "AzureBlobStorageAccConnectionString")] BlobClient blobClient) @@ -46,12 +50,13 @@ private async Task HandleBlobCreateEventAsync(BlobClient blobClient) this._logger.LogInformation("Analyzing document using DocumentAnalyzerService from blobUri: '{blobUri}' using layout: {layout}", blobClient.Name, "prebuilt-read"); - MemoryStream memoryStream = new MemoryStream(); + using MemoryStream memoryStream = new MemoryStream(); await blobClient.DownloadToAsync(memoryStream); + memoryStream.Seek(0, SeekOrigin.Begin); var operation = await documentAnalysisClient.AnalyzeDocumentAsync( WaitUntil.Completed, - "prebuilt-read", + "prebuilt-document", memoryStream); var result = operation.Value; @@ -67,6 +72,7 @@ private async Task HandleBlobCreateEventAsync(BlobClient blobClient) if (batchChunkTexts.Count >= MaxBatchSize) { await this.ProcessCurrentBatchAsync(blobClient, cosmosDBClientWrapper, batchChunkTexts); + batchChunkTexts.Clear(); } } @@ -76,13 +82,20 @@ private async Task HandleBlobCreateEventAsync(BlobClient 
blobClient) await this.ProcessCurrentBatchAsync(blobClient, cosmosDBClientWrapper, batchChunkTexts); } - this._logger.LogInformation("Finished processing blob {0}, total chunks processed {1}.", blobClient.Name, totalChunksCount); + this._logger.LogInformation("Finished processing blob {name}, total chunks processed {count}.", blobClient.Name, totalChunksCount); } private async Task ProcessCurrentBatchAsync(BlobClient blobClient, CosmosDBClientWrapper cosmosDBClientWrapper, List batchChunkTexts) { + this._logger.LogInformation("Generating embeddings for : '{count}'.", batchChunkTexts.Count()); + var embeddings = await this.GenerateEmbeddingsWithRetryAsync(batchChunkTexts); + this._logger.LogInformation("Creating Cosmos DB documents for batch of size {count}", batchChunkTexts.Count); + await cosmosDBClientWrapper.UpsertDocumentsAsync(blobClient.Uri.AbsoluteUri, batchChunkTexts, embeddings); + } + private async Task GenerateEmbeddingsWithRetryAsync(IEnumerable batchChunkTexts) + { int embeddingDimensions = configuration.GetValue(AzureOpenAIModelDeploymentDimensionsName, DefaultDimensions); this._logger.LogInformation("Using OpenAI model dimensions: '{embeddingDimensions}'.", embeddingDimensions); @@ -90,10 +103,35 @@ private async Task ProcessCurrentBatchAsync(BlobClient blobClient, CosmosDBClien { Dimensions = embeddingDimensions }; - var embeddings = await embeddingClient.GenerateEmbeddingsAsync(batchChunkTexts.Select(p => p.Text).ToList(), embeddingGenerationOptions); - await cosmosDBClientWrapper.UpsertDocumentsAsync(blobClient.Uri.AbsoluteUri, batchChunkTexts, embeddings); - batchChunkTexts.Clear(); + int retryCount = 0; + while (retryCount < MaxRetryCount) + { + try + { + return await embeddingClient.GenerateEmbeddingsAsync(batchChunkTexts.Select(p => p.Text).ToList(), embeddingGenerationOptions); + } + catch (ClientResultException ex) + { + if (ex.Status is ((int)HttpStatusCode.TooManyRequests) or ((int)HttpStatusCode.Unauthorized)) + { + if (retryCount >= 
MaxRetryCount) + { + throw new Exception($"Max retry attempts reached generating embeddings with exception: {ex}."); + } + + retryCount++; + + await Task.Delay(RetryDelay); + } + else + { + throw new Exception($"Failed to generate embeddings with error: {ex}."); + } + } + } + + throw new Exception($"Failed to generate embeddings after retrying for ${MaxRetryCount} times."); } private async Task HandleBlobDeleteEventAsync(BlobClient blobClient) diff --git a/DocumentVectorPipelineFunctions/Program.cs b/DocumentVectorPipelineFunctions/Program.cs index bf0afc4..cdd31b3 100644 --- a/DocumentVectorPipelineFunctions/Program.cs +++ b/DocumentVectorPipelineFunctions/Program.cs @@ -27,7 +27,6 @@ .ConfigureFunctionsWorkerDefaults() .ConfigureAppConfiguration(config => { - config.AddEnvironmentVariables(); config.AddUserSecrets(optional: true, reloadOnChange: false); }); diff --git a/deployment/cosmosdb.bicep b/deployment/cosmosdb.bicep index 81b2edb..1a77c59 100644 --- a/deployment/cosmosdb.bicep +++ b/deployment/cosmosdb.bicep @@ -1,7 +1,7 @@ param location string = resourceGroup().location param capabilities array = [ { name: 'EnableServerless' } - { name: 'EnableNoSQLVectorSearch' /*TODO: This doesn't seem to work on account creation.*/} + { name: 'EnableNoSQLVectorSearch' /*TODO: This doesn't seem to work on account creation.*/ } ] // Input parameters @@ -9,7 +9,6 @@ param databaseName string param name string param tags object - // Create cosmosdb account resource cosmosDB 'Microsoft.DocumentDB/databaseAccounts@2024-05-15' = { name: name @@ -38,18 +37,6 @@ resource managedIdentity 'Microsoft.ManagedIdentity/userAssignedIdentities@2023- name: managedIdentityName } -// Assign storage account contributor role to azure function app -param id_roles_arr array = ['b24988ac-6180-42a0-ab88-20f7382dd24c', '230815da-be43-4aae-9cb4-875f7bd000aa'] // Contributor (priviledged role), CosmosDB Operator, Data contributor -resource roleAssignmentFUnctionApp 
'Microsoft.Authorization/roleAssignments@2022-04-01' = [for id_role in id_roles_arr : { - name: guid(resourceGroup().id, '${cosmosDB.name}-funcrole', id_role) - scope: cosmosDB - properties: { - roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', id_role) - principalId: managedIdentity.properties.principalId - } - } -] - // Create database resource database 'Microsoft.DocumentDB/databaseAccounts/sqlDatabases@2024-05-15' = { parent: cosmosDB @@ -62,5 +49,16 @@ resource database 'Microsoft.DocumentDB/databaseAccounts/sqlDatabases@2024-05-15 tags: tags } +param id_role string = '00000000-0000-0000-0000-000000000002' // Built-in data contributor +resource roleAssignmentSqlCosmosDB 'Microsoft.DocumentDB/databaseAccounts/sqlRoleAssignments@2021-10-15' = { + name: guid(resourceGroup().id, '${name}-datacontributorrole', id_role) + parent: cosmosDB + properties: { + principalId: managedIdentity.properties.principalId + roleDefinitionId: resourceId('Microsoft.DocumentDB/databaseAccounts/sqlRoleDefinitions', name, id_role) + scope: cosmosDB.id + } +} + output CosmosDBAccountName string = cosmosDB.name output CosmosDBEndpoint string = cosmosDB.properties.documentEndpoint diff --git a/deployment/documentintelligence.bicep b/deployment/documentintelligence.bicep index 97ef355..e68f237 100644 --- a/deployment/documentintelligence.bicep +++ b/deployment/documentintelligence.bicep @@ -39,14 +39,15 @@ resource documentIntelligence 'Microsoft.CognitiveServices/accounts@2024-04-01-p sku: sku } -param storage_account_id_roles array = ['a97b65f3-24c7-4388-baec-2e87135dc908','a001fd3d-188f-4b5d-821b-7da978bf7442'] //Cognitive service user, openai contributor -resource roleAssignmentFuncStorageAccount 'Microsoft.Authorization/roleAssignments@2022-04-01' = [for id_role in storage_account_id_roles : { - name: guid(resourceGroup().id, '${documentIntelligence.name}-storagerole', id_role) - scope: documentIntelligence - properties: { - roleDefinitionId: 
subscriptionResourceId('Microsoft.Authorization/roleDefinitions', id_role) - principalId: managedIdentity.properties.principalId - } +param storage_account_id_roles array = ['a97b65f3-24c7-4388-baec-2e87135dc908'] //Cognitive service user +resource roleAssignmentDocumentIntelligence 'Microsoft.Authorization/roleAssignments@2022-04-01' = [ + for id_role in storage_account_id_roles: { + name: guid(resourceGroup().id, '${documentIntelligence.name}-storagerole', id_role) + scope: documentIntelligence + properties: { + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', id_role) + principalId: managedIdentity.properties.principalId + } } ] diff --git a/deployment/functionapp.bicep b/deployment/functionapp.bicep index 9fff5e8..6f09c52 100644 --- a/deployment/functionapp.bicep +++ b/deployment/functionapp.bicep @@ -37,7 +37,6 @@ resource storageAccount 'Microsoft.Storage/storageAccounts@2023-05-01' existing } var storageConnectionStringValue = 'DefaultEndpointsProtocol=https;AccountName=${storageAccount.name};EndpointSuffix=${environment().suffixes.storage};AccountKey=${storageAccount.listKeys().keys[0].value}' - // Create webapps storage account to hold webapps related resources resource func_app_storage_account 'Microsoft.Storage/storageAccounts@2023-05-01' = { name: funcAppStorageAccountName @@ -55,10 +54,10 @@ resource func_app_storage_account 'Microsoft.Storage/storageAccounts@2023-05-01' } var funcAppStorageConnectionStringValue = 'DefaultEndpointsProtocol=https;AccountName=${func_app_storage_account.name};EndpointSuffix=${environment().suffixes.storage};AccountKey=${func_app_storage_account.listKeys().keys[0].value}' - // Assign storage account contributor role to func_app_storage_account param storage_account_id_roles array = ['ba92f5b4-2d11-453d-a403-e96b0029c9fe'] // Storage blob data contributor -resource roleAssignmentFuncStorageAccount 'Microsoft.Authorization/roleAssignments@2020-04-01-preview' = [for id_role in 
storage_account_id_roles : { +resource roleAssignmentFuncStorageAccount 'Microsoft.Authorization/roleAssignments@2020-04-01-preview' = [ + for id_role in storage_account_id_roles: { name: guid(resourceGroup().id, '${func_app_storage_account.name}-webjobsrole', id_role) scope: func_app_storage_account properties: { @@ -68,7 +67,6 @@ resource roleAssignmentFuncStorageAccount 'Microsoft.Authorization/roleAssignmen } ] - // Create a new Log Analytics workspace to back the Azure Application Insights instance resource logAnalytics 'Microsoft.OperationalInsights/workspaces@2023-09-01' = { name: logAnalyticsName @@ -110,8 +108,7 @@ resource appservice_plan 'Microsoft.Web/serverfarms@2023-12-01' = { sku: { name: 'Y1' } - properties: { - } + properties: {} } // Deploy the Azure Function app with application @@ -221,7 +218,8 @@ resource funcApp 'Microsoft.Web/sites@2023-12-01' = { // Assign storage account contributor role to azure function app param id_roles_arr array = ['b24988ac-6180-42a0-ab88-20f7382dd24c'] // Contributor (priviledged access) -resource roleAssignmentFUnctionApp 'Microsoft.Authorization/roleAssignments@2022-04-01' = [for id_role in id_roles_arr : { +resource roleAssignmentFunctionApp 'Microsoft.Authorization/roleAssignments@2022-04-01' = [ + for id_role in id_roles_arr: { name: guid(resourceGroup().id, '${func_app_storage_account.name}-funcrole', id_role) scope: funcApp properties: { diff --git a/deployment/main.bicep b/deployment/main.bicep index d04f74a..891cfea 100644 --- a/deployment/main.bicep +++ b/deployment/main.bicep @@ -38,7 +38,6 @@ param document_intelligence_sku object param document_intelligence_publicNetworkAccess string param document_intelligence_disableLocalAuth bool - // User managed identity resource module userManagedIdentity_deployment 'userIdentity.bicep' = { name: 'userManagedIdentity_deployment' @@ -47,7 +46,6 @@ module userManagedIdentity_deployment 'userIdentity.bicep' = { } } - // Storage resource module storage_deployment 
'storage.bicep' = { name: 'storage_deployment' @@ -55,14 +53,13 @@ module storage_deployment 'storage.bicep' = { name: storage_name containers: storage_containers tags: tags - managedIdentityName:managedIdentity_name + managedIdentityName: managedIdentity_name } dependsOn: [ userManagedIdentity_deployment ] } - // CosmosDB resource module cosmosdb_deployment 'cosmosdb.bicep' = { name: 'cosmosdb_deployment' @@ -79,7 +76,6 @@ module cosmosdb_deployment 'cosmosdb.bicep' = { ] } - // Document Intelligence resource module document_intelligence_deployment 'documentintelligence.bicep' = { name: 'document_intelligence_deployment' @@ -97,18 +93,17 @@ module document_intelligence_deployment 'documentintelligence.bicep' = { ] } - // OpenAI Resource module open_ai_deployment 'openai.bicep' = { name: 'open_ai_deployment' params: { deployments: open_ai_deployments - managedIdentityName:managedIdentity_name + managedIdentityName: managedIdentity_name name: open_ai_name format: open_ai_format kind: open_ai_kind sku: open_ai_sku - publicNetworkAccess:open_ai_publicNetworkAccess + publicNetworkAccess: open_ai_publicNetworkAccess tags: tags } dependsOn: [ @@ -116,12 +111,11 @@ module open_ai_deployment 'openai.bicep' = { ] } - // Function App Resource module function_app_deployment 'functionapp.bicep' = { name: 'function_app_deployment' params: { - managedIdentityName:managedIdentity_name + managedIdentityName: managedIdentity_name functionAppName: function_app_name funcAppStorageSkuName: function_app_storageSkuName funcAppStorageAccountName: function_app_storageAccountName @@ -144,7 +138,6 @@ module function_app_deployment 'functionapp.bicep' = { ] } - // Output params // User Managed Identity and KeyVault Output Params output AZURE_USER_MANAGED_IDENTITY_NAME string = userManagedIdentity_deployment.outputs.AzureManagedIdentityName diff --git a/deployment/main.bicepparam b/deployment/main.bicepparam index f53b31b..b1047cc 100644 --- a/deployment/main.bicepparam +++ 
b/deployment/main.bicepparam @@ -20,19 +20,16 @@ param storage_containers = [ } ] - // Function app params param function_app_storageSkuName = 'Standard_LRS' - // CosmosDB params param cosmosdb_databaseName = 'semantic_search_db' param cosmosdb_capabilities = [ { name: 'EnableServerless' } - { name: 'EnableNoSQLVectorSearch'} + { name: 'EnableNoSQLVectorSearch' } ] - // Document Intelligence Params param document_intelligence_sku = { name: 'S0' @@ -40,7 +37,6 @@ param document_intelligence_sku = { param document_intelligence_publicNetworkAccess = 'Enabled' param document_intelligence_disableLocalAuth = false - // Open AI params param modelDeployment = 'text-embedding-3-large' param modelDimensions = '1536' @@ -49,7 +45,7 @@ param open_ai_deployments = [ name: modelDeployment sku: { name: 'Standard' - capacity: 10 + capacity: 50 } model: { name: modelDeployment diff --git a/deployment/openai.bicep b/deployment/openai.bicep index 4210937..4b15a48 100644 --- a/deployment/openai.bicep +++ b/deployment/openai.bicep @@ -1,4 +1,4 @@ -param location string= resourceGroup().location +param location string = resourceGroup().location // Input parameters param deployments array @@ -49,15 +49,17 @@ resource openAiDeployments 'Microsoft.CognitiveServices/accounts/deployments@202 } ] - // Assign user managed identity to openai app. 
param managedIdentityName string resource managedIdentity 'Microsoft.ManagedIdentity/userAssignedIdentities@2023-07-31-preview' existing = { name: managedIdentityName } -param storage_account_id_roles array = ['b24988ac-6180-42a0-ab88-20f7382dd24c', 'a001fd3d-188f-4b5d-821b-7da978bf7442','a97b65f3-24c7-4388-baec-2e87135dc908'] // contributor (priviledged access), Cognitive Services OpenAI Contributor +param storage_account_id_roles array = [ + 'a97b65f3-24c7-4388-baec-2e87135dc908' //Cognitive Services User +] -resource roleAssignmentFuncStorageAccount 'Microsoft.Authorization/roleAssignments@2022-04-01' = [for id_role in storage_account_id_roles : { +resource roleAssignmentOpenAIAccount 'Microsoft.Authorization/roleAssignments@2022-04-01' = [ + for id_role in storage_account_id_roles: { name: guid(resourceGroup().id, '${name}-openairole', id_role) scope: openAi properties: { @@ -67,6 +69,5 @@ resource roleAssignmentFuncStorageAccount 'Microsoft.Authorization/roleAssignmen } ] - output openAIServiceName string = openAi.name output openAIServiceEndpoint string = openAi.properties.endpoint diff --git a/deployment/openai.bicepparam b/deployment/openai.bicepparam index 6a2672e..4f32453 100644 --- a/deployment/openai.bicepparam +++ b/deployment/openai.bicepparam @@ -20,4 +20,4 @@ param sku = 'S0' param kind = 'OpenAI' param format = 'OpenAI' param publicNetworkAccess = 'Enabled' -param tags = {} +param tags = {} diff --git a/deployment/storage.bicep b/deployment/storage.bicep index b5ac60b..f40f3fa 100644 --- a/deployment/storage.bicep +++ b/deployment/storage.bicep @@ -23,10 +23,10 @@ resource blobService 'Microsoft.Storage/storageAccounts/blobServices@2023-05-01' } resource blobContainers 'Microsoft.Storage/storageAccounts/blobServices/containers@2023-05-01' = [ - for container in containers: { - parent: blobService - name: container.name - } + for container in containers: { + parent: blobService + name: container.name + } ] // Assign user identity permissions to 
storage account @@ -35,10 +35,9 @@ resource managedIdentity 'Microsoft.ManagedIdentity/userAssignedIdentities@2023- name: managedIdentityName } - -//param storage_account_id_roles array = ['b24988ac-6180-42a0-ab88-20f7382dd24c','ba92f5b4-2d11-453d-a403-e96b0029c9fe'] // Contributor, Storage blob data contributor param storage_account_id_roles array = ['2a2b9908-6ea1-4ae2-8e65-a410df84e7d1'] // Storage blob data reader -resource roleAssignmentFuncStorageAccount 'Microsoft.Authorization/roleAssignments@2022-04-01' = [for id_role in storage_account_id_roles : { +resource roleAssignmentStorageAccount 'Microsoft.Authorization/roleAssignments@2022-04-01' = [ + for id_role in storage_account_id_roles: { name: guid(resourceGroup().id, '${storage.name}-storagerole', id_role) scope: blobService properties: {