Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
New changes.
Browse files Browse the repository at this point in the history
  • Loading branch information
amisi01 committed Aug 24, 2024
1 parent 297e867 commit b710197
Show file tree
Hide file tree
Showing 10 changed files with 91 additions and 68 deletions.
54 changes: 46 additions & 8 deletions DocumentVectorPipelineFunctions/BlobTriggerFunction.cs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
using System.ClientModel;
using System.Net;
using Azure;
using Azure.AI.FormRecognizer.DocumentAnalysis;
using Azure.Storage.Blobs;
Expand All @@ -20,9 +22,11 @@ public class BlobTriggerFunction(

private const string AzureOpenAIModelDeploymentDimensionsName = "AzureOpenAIModelDimensions";
private static readonly int DefaultDimensions = 1536;
private static readonly int BufferSize = 4 * 1024 * 1024; // 4MB

private const int MaxBatchSize = 2048;
// Upper bound on embedding-generation attempts made by the retry loop.
private const int MaxRetryCount = 100;
// Pause between attempts, in milliseconds (the value is passed to Task.Delay).
private const int RetryDelay = 10 * 1000; // 10 seconds

private const int MaxBatchSize = 25;

[Function("BlobTriggerFunction")]
public async Task Run([BlobTrigger("documents/{name}", Connection = "AzureBlobStorageAccConnectionString")] BlobClient blobClient)
Expand All @@ -46,12 +50,13 @@ private async Task HandleBlobCreateEventAsync(BlobClient blobClient)

this._logger.LogInformation("Analyzing document using DocumentAnalyzerService from blobUri: '{blobUri}' using layout: {layout}", blobClient.Name, "prebuilt-read");

MemoryStream memoryStream = new MemoryStream();
using MemoryStream memoryStream = new MemoryStream();
await blobClient.DownloadToAsync(memoryStream);
memoryStream.Seek(0, SeekOrigin.Begin);

var operation = await documentAnalysisClient.AnalyzeDocumentAsync(
WaitUntil.Completed,
"prebuilt-read",
"prebuilt-document",
memoryStream);

var result = operation.Value;
Expand All @@ -67,6 +72,7 @@ private async Task HandleBlobCreateEventAsync(BlobClient blobClient)
if (batchChunkTexts.Count >= MaxBatchSize)
{
await this.ProcessCurrentBatchAsync(blobClient, cosmosDBClientWrapper, batchChunkTexts);
batchChunkTexts.Clear();
}
}

Expand All @@ -76,24 +82,56 @@ private async Task HandleBlobCreateEventAsync(BlobClient blobClient)
await this.ProcessCurrentBatchAsync(blobClient, cosmosDBClientWrapper, batchChunkTexts);
}

this._logger.LogInformation("Finished processing blob {0}, total chunks processed {1}.", blobClient.Name, totalChunksCount);
this._logger.LogInformation("Finished processing blob {name}, total chunks processed {count}.", blobClient.Name, totalChunksCount);
}

/// <summary>
/// Generates embeddings for the given batch of chunks and upserts the chunks together
/// with their embeddings through the Cosmos DB wrapper.
/// </summary>
/// <param name="blobClient">Client for the source blob; its absolute URI is passed to the upsert call.</param>
/// <param name="cosmosDBClientWrapper">Wrapper used to upsert the resulting documents.</param>
/// <param name="batchChunkTexts">Chunks in the current batch. This method does not clear the list.</param>
private async Task ProcessCurrentBatchAsync(BlobClient blobClient, CosmosDBClientWrapper cosmosDBClientWrapper, List<TextChunk> batchChunkTexts)
{
    // Use the List<T>.Count property rather than the LINQ Count() extension method.
    this._logger.LogInformation("Generating embeddings for : '{count}'.", batchChunkTexts.Count);
    var embeddings = await this.GenerateEmbeddingsWithRetryAsync(batchChunkTexts);

    this._logger.LogInformation("Creating Cosmos DB documents for batch of size {count}", batchChunkTexts.Count);
    await cosmosDBClientWrapper.UpsertDocumentsAsync(blobClient.Uri.AbsoluteUri, batchChunkTexts, embeddings);
}

/// <summary>
/// Generates embeddings for the text of each chunk, retrying when the embedding client
/// fails with HTTP 429 (Too Many Requests) or HTTP 401 (Unauthorized).
/// </summary>
/// <param name="batchChunkTexts">Chunks whose <c>Text</c> values are sent to the embedding client.</param>
/// <returns>The embeddings returned by the embedding client for the batch.</returns>
/// <exception cref="Exception">
/// Thrown when the client fails with any other status, or when all
/// <c>MaxRetryCount</c> attempts have failed. The original
/// <see cref="ClientResultException"/> is attached as the inner exception.
/// </exception>
private async Task<EmbeddingCollection> GenerateEmbeddingsWithRetryAsync(IEnumerable<TextChunk> batchChunkTexts)
{
    int embeddingDimensions = configuration.GetValue<int>(AzureOpenAIModelDeploymentDimensionsName, DefaultDimensions);
    this._logger.LogInformation("Using OpenAI model dimensions: '{embeddingDimensions}'.", embeddingDimensions);

    EmbeddingGenerationOptions embeddingGenerationOptions = new EmbeddingGenerationOptions()
    {
        Dimensions = embeddingDimensions
    };

    // Materialize the inputs once so retries do not re-enumerate the source sequence.
    List<string> inputs = batchChunkTexts.Select(p => p.Text).ToList();

    for (int attempt = 1; ; attempt++)
    {
        try
        {
            return await embeddingClient.GenerateEmbeddingsAsync(inputs, embeddingGenerationOptions);
        }
        // NOTE(review): 401 is retried as well as 429 - presumably to ride out
        // role-assignment propagation for the managed identity; confirm this is intended.
        catch (ClientResultException ex) when (ex.Status is (int)HttpStatusCode.TooManyRequests or (int)HttpStatusCode.Unauthorized)
        {
            if (attempt >= MaxRetryCount)
            {
                // Give up without a further delay and keep the last failure as the inner exception.
                throw new Exception($"Failed to generate embeddings after retrying for {MaxRetryCount} times.", ex);
            }

            this._logger.LogWarning(
                "Embedding generation attempt {attempt} of {maxAttempts} failed with status {status}; retrying.",
                attempt,
                MaxRetryCount,
                ex.Status);

            await Task.Delay(RetryDelay);
        }
        catch (ClientResultException ex)
        {
            // Any other status is not retried; surface it immediately.
            throw new Exception($"Failed to generate embeddings with error: {ex}.", ex);
        }
    }
}

private async Task HandleBlobDeleteEventAsync(BlobClient blobClient)
Expand Down
1 change: 0 additions & 1 deletion DocumentVectorPipelineFunctions/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
.ConfigureFunctionsWorkerDefaults()
.ConfigureAppConfiguration(config =>
{
config.AddEnvironmentVariables();
config.AddUserSecrets<BlobTriggerFunction>(optional: true, reloadOnChange: false);
});

Expand Down
26 changes: 12 additions & 14 deletions deployment/cosmosdb.bicep
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
param location string = resourceGroup().location
param capabilities array = [
{ name: 'EnableServerless' }
{ name: 'EnableNoSQLVectorSearch' /*TODO: This doesn't seem to work on account creation.*/}
{ name: 'EnableNoSQLVectorSearch' /*TODO: This doesn't seem to work on account creation.*/ }
]

// Input parameters
param databaseName string
param name string
param tags object


// Create cosmosdb account
resource cosmosDB 'Microsoft.DocumentDB/databaseAccounts@2024-05-15' = {
name: name
Expand Down Expand Up @@ -38,18 +37,6 @@ resource managedIdentity 'Microsoft.ManagedIdentity/userAssignedIdentities@2023-
name: managedIdentityName
}

// Assign storage account contributor role to azure function app
param id_roles_arr array = ['b24988ac-6180-42a0-ab88-20f7382dd24c', '230815da-be43-4aae-9cb4-875f7bd000aa'] // Contributor (priviledged role), CosmosDB Operator, Data contributor
resource roleAssignmentFUnctionApp 'Microsoft.Authorization/roleAssignments@2022-04-01' = [for id_role in id_roles_arr : {
name: guid(resourceGroup().id, '${cosmosDB.name}-funcrole', id_role)
scope: cosmosDB
properties: {
roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', id_role)
principalId: managedIdentity.properties.principalId
}
}
]

// Create database
resource database 'Microsoft.DocumentDB/databaseAccounts/sqlDatabases@2024-05-15' = {
parent: cosmosDB
Expand All @@ -62,5 +49,16 @@ resource database 'Microsoft.DocumentDB/databaseAccounts/sqlDatabases@2024-05-15
tags: tags
}

// Id of the Cosmos DB SQL role definition granted to the managed identity below.
param id_role string = '00000000-0000-0000-0000-000000000002' // Built-in data contributor
// Cosmos DB SQL role assignment (child of the account) that gives the user-assigned
// managed identity the role above, scoped to the whole account.
resource roleAssignmentSqlCosmosDB 'Microsoft.DocumentDB/databaseAccounts/sqlRoleAssignments@2021-10-15' = {
// Deterministic assignment name derived from the resource group id, the account name and the role id.
name: guid(resourceGroup().id, '${name}-datacontributorrole', id_role)
parent: cosmosDB
properties: {
principalId: managedIdentity.properties.principalId
// Full resource id of the SQL role definition under this account.
roleDefinitionId: resourceId('Microsoft.DocumentDB/databaseAccounts/sqlRoleDefinitions', name, id_role)
scope: cosmosDB.id
}
}

output CosmosDBAccountName string = cosmosDB.name
output CosmosDBEndpoint string = cosmosDB.properties.documentEndpoint
17 changes: 9 additions & 8 deletions deployment/documentintelligence.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,15 @@ resource documentIntelligence 'Microsoft.CognitiveServices/accounts@2024-04-01-p
sku: sku
}

param storage_account_id_roles array = ['a97b65f3-24c7-4388-baec-2e87135dc908','a001fd3d-188f-4b5d-821b-7da978bf7442'] //Cognitive service user, openai contributor
resource roleAssignmentFuncStorageAccount 'Microsoft.Authorization/roleAssignments@2022-04-01' = [for id_role in storage_account_id_roles : {
name: guid(resourceGroup().id, '${documentIntelligence.name}-storagerole', id_role)
scope: documentIntelligence
properties: {
roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', id_role)
principalId: managedIdentity.properties.principalId
}
param storage_account_id_roles array = ['a97b65f3-24c7-4388-baec-2e87135dc908'] //Cognitive service user
resource roleAssignmentDocumentIntelligence 'Microsoft.Authorization/roleAssignments@2022-04-01' = [
for id_role in storage_account_id_roles: {
name: guid(resourceGroup().id, '${documentIntelligence.name}-storagerole', id_role)
scope: documentIntelligence
properties: {
roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', id_role)
principalId: managedIdentity.properties.principalId
}
}
]

Expand Down
12 changes: 5 additions & 7 deletions deployment/functionapp.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ resource storageAccount 'Microsoft.Storage/storageAccounts@2023-05-01' existing
}
var storageConnectionStringValue = 'DefaultEndpointsProtocol=https;AccountName=${storageAccount.name};EndpointSuffix=${environment().suffixes.storage};AccountKey=${storageAccount.listKeys().keys[0].value}'


// Create webapps storage account to hold webapps related resources
resource func_app_storage_account 'Microsoft.Storage/storageAccounts@2023-05-01' = {
name: funcAppStorageAccountName
Expand All @@ -55,10 +54,10 @@ resource func_app_storage_account 'Microsoft.Storage/storageAccounts@2023-05-01'
}
var funcAppStorageConnectionStringValue = 'DefaultEndpointsProtocol=https;AccountName=${func_app_storage_account.name};EndpointSuffix=${environment().suffixes.storage};AccountKey=${func_app_storage_account.listKeys().keys[0].value}'


// Assign storage account contributor role to func_app_storage_account
param storage_account_id_roles array = ['ba92f5b4-2d11-453d-a403-e96b0029c9fe'] // Storage blob data contributor
resource roleAssignmentFuncStorageAccount 'Microsoft.Authorization/roleAssignments@2020-04-01-preview' = [for id_role in storage_account_id_roles : {
resource roleAssignmentFuncStorageAccount 'Microsoft.Authorization/roleAssignments@2020-04-01-preview' = [
for id_role in storage_account_id_roles: {
name: guid(resourceGroup().id, '${func_app_storage_account.name}-webjobsrole', id_role)
scope: func_app_storage_account
properties: {
Expand All @@ -68,7 +67,6 @@ resource roleAssignmentFuncStorageAccount 'Microsoft.Authorization/roleAssignmen
}
]


// Create a new Log Analytics workspace to back the Azure Application Insights instance
resource logAnalytics 'Microsoft.OperationalInsights/workspaces@2023-09-01' = {
name: logAnalyticsName
Expand Down Expand Up @@ -110,8 +108,7 @@ resource appservice_plan 'Microsoft.Web/serverfarms@2023-12-01' = {
sku: {
name: 'Y1'
}
properties: {
}
properties: {}
}

// Deploy the Azure Function app with application
Expand Down Expand Up @@ -221,7 +218,8 @@ resource funcApp 'Microsoft.Web/sites@2023-12-01' = {

// Assign storage account contributor role to azure function app
param id_roles_arr array = ['b24988ac-6180-42a0-ab88-20f7382dd24c'] // Contributor (privileged access)
resource roleAssignmentFUnctionApp 'Microsoft.Authorization/roleAssignments@2022-04-01' = [for id_role in id_roles_arr : {
resource roleAssignmentFunctionApp 'Microsoft.Authorization/roleAssignments@2022-04-01' = [
for id_role in id_roles_arr: {
name: guid(resourceGroup().id, '${func_app_storage_account.name}-funcrole', id_role)
scope: funcApp
properties: {
Expand Down
15 changes: 4 additions & 11 deletions deployment/main.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ param document_intelligence_sku object
param document_intelligence_publicNetworkAccess string
param document_intelligence_disableLocalAuth bool


// User managed identity resource
module userManagedIdentity_deployment 'userIdentity.bicep' = {
name: 'userManagedIdentity_deployment'
Expand All @@ -47,22 +46,20 @@ module userManagedIdentity_deployment 'userIdentity.bicep' = {
}
}


// Storage resource
module storage_deployment 'storage.bicep' = {
name: 'storage_deployment'
params: {
name: storage_name
containers: storage_containers
tags: tags
managedIdentityName:managedIdentity_name
managedIdentityName: managedIdentity_name
}
dependsOn: [
userManagedIdentity_deployment
]
}


// CosmosDB resource
module cosmosdb_deployment 'cosmosdb.bicep' = {
name: 'cosmosdb_deployment'
Expand All @@ -79,7 +76,6 @@ module cosmosdb_deployment 'cosmosdb.bicep' = {
]
}


// Document Intelligence resource
module document_intelligence_deployment 'documentintelligence.bicep' = {
name: 'document_intelligence_deployment'
Expand All @@ -97,31 +93,29 @@ module document_intelligence_deployment 'documentintelligence.bicep' = {
]
}


// OpenAI Resource
module open_ai_deployment 'openai.bicep' = {
name: 'open_ai_deployment'
params: {
deployments: open_ai_deployments
managedIdentityName:managedIdentity_name
managedIdentityName: managedIdentity_name
name: open_ai_name
format: open_ai_format
kind: open_ai_kind
sku: open_ai_sku
publicNetworkAccess:open_ai_publicNetworkAccess
publicNetworkAccess: open_ai_publicNetworkAccess
tags: tags
}
dependsOn: [
userManagedIdentity_deployment
]
}


// Function App Resource
module function_app_deployment 'functionapp.bicep' = {
name: 'function_app_deployment'
params: {
managedIdentityName:managedIdentity_name
managedIdentityName: managedIdentity_name
functionAppName: function_app_name
funcAppStorageSkuName: function_app_storageSkuName
funcAppStorageAccountName: function_app_storageAccountName
Expand All @@ -144,7 +138,6 @@ module function_app_deployment 'functionapp.bicep' = {
]
}


// Output params
// User Managed Identity and KeyVault Output Params
output AZURE_USER_MANAGED_IDENTITY_NAME string = userManagedIdentity_deployment.outputs.AzureManagedIdentityName
Expand Down
8 changes: 2 additions & 6 deletions deployment/main.bicepparam
Original file line number Diff line number Diff line change
Expand Up @@ -20,27 +20,23 @@ param storage_containers = [
}
]


// Function app params
param function_app_storageSkuName = 'Standard_LRS'


// CosmosDB params
param cosmosdb_databaseName = 'semantic_search_db'
param cosmosdb_capabilities = [
{ name: 'EnableServerless' }
{ name: 'EnableNoSQLVectorSearch'}
{ name: 'EnableNoSQLVectorSearch' }
]


// Document Intelligence Params
param document_intelligence_sku = {
name: 'S0'
}
param document_intelligence_publicNetworkAccess = 'Enabled'
param document_intelligence_disableLocalAuth = false


// Open AI params
param modelDeployment = 'text-embedding-3-large'
param modelDimensions = '1536'
Expand All @@ -49,7 +45,7 @@ param open_ai_deployments = [
name: modelDeployment
sku: {
name: 'Standard'
capacity: 10
capacity: 50
}
model: {
name: modelDeployment
Expand Down
11 changes: 6 additions & 5 deletions deployment/openai.bicep
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
param location string= resourceGroup().location
param location string = resourceGroup().location

// Input parameters
param deployments array
Expand Down Expand Up @@ -49,15 +49,17 @@ resource openAiDeployments 'Microsoft.CognitiveServices/accounts/deployments@202
}
]


// Assign user managed identity to openai app.
param managedIdentityName string
resource managedIdentity 'Microsoft.ManagedIdentity/userAssignedIdentities@2023-07-31-preview' existing = {
name: managedIdentityName
}
param storage_account_id_roles array = ['b24988ac-6180-42a0-ab88-20f7382dd24c', 'a001fd3d-188f-4b5d-821b-7da978bf7442','a97b65f3-24c7-4388-baec-2e87135dc908'] // contributor (priviledged access), Cognitive Services OpenAI Contributor
param storage_account_id_roles array = [
'a97b65f3-24c7-4388-baec-2e87135dc908' //Cognitive Services User
]

resource roleAssignmentFuncStorageAccount 'Microsoft.Authorization/roleAssignments@2022-04-01' = [for id_role in storage_account_id_roles : {
resource roleAssignmentOpenAIAccount 'Microsoft.Authorization/roleAssignments@2022-04-01' = [
for id_role in storage_account_id_roles: {
name: guid(resourceGroup().id, '${name}-openairole', id_role)
scope: openAi
properties: {
Expand All @@ -67,6 +69,5 @@ resource roleAssignmentFuncStorageAccount 'Microsoft.Authorization/roleAssignmen
}
]


output openAIServiceName string = openAi.name
output openAIServiceEndpoint string = openAi.properties.endpoint
2 changes: 1 addition & 1 deletion deployment/openai.bicepparam
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ param sku = 'S0'
param kind = 'OpenAI'
param format = 'OpenAI'
param publicNetworkAccess = 'Enabled'
param tags = {}
param tags = {}
Loading

0 comments on commit b710197

Please sign in to comment.