From f3e517e41a33d891aed8329f5170b1d8c263b1cc Mon Sep 17 00:00:00 2001 From: kevindelmont <133667252+kevindelmont@users.noreply.github.com> Date: Sat, 14 Sep 2024 21:03:45 +0200 Subject: [PATCH] add budget metrics by scope (subscriptions/billing account) and budget forecast metric (#102) * add budget metrics by scope * remove unused field in budget prometheus struc * Fix lint error prometheus field unused * re-add MetricsCollectorAzureRmCosts collector * Fix retry with header --------- Co-authored-by: PaulPowershell <116181531+PaulPowershell@users.noreply.github.com> --- config/config.go | 1 + config/config_budget.go | 9 ++ default.yaml | 2 + example.yaml | 19 +++- main.go | 15 +++ metrics_azurerm_budgets.go | 219 +++++++++++++++++++++++++++++++++++++ metrics_azurerm_costs.go | 177 ++++-------------------------- 7 files changed, 285 insertions(+), 157 deletions(-) create mode 100644 config/config_budget.go create mode 100644 metrics_azurerm_budgets.go diff --git a/config/config.go b/config/config.go index e33929f..69df428 100644 --- a/config/config.go +++ b/config/config.go @@ -17,6 +17,7 @@ type ( Iam CollectorBase `yaml:"iam"` Graph CollectorGraph `yaml:"graph"` Costs CollectorCosts `yaml:"costs"` + Budgets CollectorBudgets `yaml:"budgets"` Reservation CollectorReservation `yaml:"reservation"` Portscan CollectorPortscan `yaml:"portscan"` } `yaml:"collectors"` diff --git a/config/config_budget.go b/config/config_budget.go new file mode 100644 index 0000000..c14d072 --- /dev/null +++ b/config/config_budget.go @@ -0,0 +1,9 @@ +package config + +type ( + CollectorBudgets struct { + CollectorBase `yaml:",inline"` + + Scopes []string `yaml:"scopes"` + } +) diff --git a/default.yaml b/default.yaml index eccf64c..a856d7e 100644 --- a/default.yaml +++ b/default.yaml @@ -23,6 +23,8 @@ collectors: costs: {} + budgets: {} + reservation: {} portscan: diff --git a/example.yaml b/example.yaml index abb6da1..76e85bc 100644 --- a/example.yaml +++ b/example.yaml @@ -55,7 +55,7 @@ 
collectors: application: "" servicePrincipal: "" - # Azure cost metrics (cost queries, budgets) + # Azure cost metrics (cost queries) # needs queries below costs: scrapeTime: 60m @@ -104,6 +104,23 @@ collectors: # optional, additional static labels labels: {} + # Azure budget metrics + budgets: + scrapeTime: 1h + + # optional, see https://learn.microsoft.com/en-us/rest/api/cost-management/query/usage?tabs=HTTP + # will disable fetching by subscription and will enable fetching by scope + #scopes: [...] + # '/subscriptions/{subscriptionId}/' for subscription scope + # '/subscriptions/{subscriptionId}/resourceGroups/{resourceGroupName}' for resourceGroup scope + # '/providers/Microsoft.Billing/billingAccounts/{billingAccountId}' for Billing Account scope + # '/providers/Microsoft.Billing/billingAccounts/{billingAccountId}/departments/{departmentId}' for Department scope + # '/providers/Microsoft.Billing/billingAccounts/{billingAccountId}/enrollmentAccounts/{enrollmentAccountId}' for EnrollmentAccount scope + # '/providers/Microsoft.Management/managementGroups/{managementGroupId}' for Management Group scope + # '/providers/Microsoft.Billing/billingAccounts/{billingAccountId}/billingProfiles/{billingProfileId}' for billingProfile scope + # '/providers/Microsoft.Billing/billingAccounts/{billingAccountId}/billingProfiles/{billingProfileId}/invoiceSections/{invoiceSectionId}' for invoiceSection scope + # '/providers/Microsoft.Billing/billingAccounts/{billingAccountId}/customers/{customerId}' specific for partners + reservation: scrapeTime: 1h diff --git a/main.go b/main.go index 2f94161..18936d1 100644 --- a/main.go +++ b/main.go @@ -259,6 +259,21 @@ func initMetricCollector() { logger.With(zap.String("collector", collectorName)).Infof("collector disabled") } + collectorName = "budgets" + if Config.Collectors.Budgets.IsEnabled() { + c := collector.New(collectorName, &MetricsCollectorAzureRmBudgets{}, logger) + c.SetScapeTime(*Config.Collectors.Budgets.ScrapeTime) +
c.SetCache( + Opts.GetCachePath(collectorName+".json"), + collector.BuildCacheTag(cacheTag, Config.Azure, Config.Collectors.Budgets), + ) + if err := c.Start(); err != nil { + logger.Fatal(err.Error()) + } + } else { + logger.With(zap.String("collector", collectorName)).Infof("collector disabled") + } + collectorName = "defender" if Config.Collectors.Defender.IsEnabled() { c := collector.New(collectorName, &MetricsCollectorAzureRmDefender{}, logger) diff --git a/metrics_azurerm_budgets.go b/metrics_azurerm_budgets.go new file mode 100644 index 0000000..96bf8ac --- /dev/null +++ b/metrics_azurerm_budgets.go @@ -0,0 +1,219 @@ +package main + +import ( + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/consumption/armconsumption" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armsubscriptions" + "github.com/prometheus/client_golang/prometheus" + "github.com/webdevops/go-common/azuresdk/armclient" + "github.com/webdevops/go-common/prometheus/collector" + "github.com/webdevops/go-common/utils/to" + "go.uber.org/zap" +) + +// Define MetricsCollectorAzureRmBudgets struct +type MetricsCollectorAzureRmBudgets struct { + collector.Processor + + prometheus struct { + consumptionBudgetInfo *prometheus.GaugeVec + consumptionBudgetLimit *prometheus.GaugeVec + consumptionBudgetCurrent *prometheus.GaugeVec + consumptionBudgetForecast *prometheus.GaugeVec + consumptionBudgetUsage *prometheus.GaugeVec + } +} + +// Setup method to initialize Prometheus metrics +func (m *MetricsCollectorAzureRmBudgets) Setup(collector *collector.Collector) { + m.Processor.Setup(collector) + + // ---------------------------------------------------- + // Budget + m.prometheus.consumptionBudgetInfo = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "azurerm_budgets_info", + Help: "Azure ResourceManager consumption budget info", + }, + []string{ + "scope", + "resourceID", + "subscriptionID", + "budgetName", + "resourceGroup", + "category", + "timeGrain", + }, + ) + 
m.Collector.RegisterMetricList("consumptionBudgetInfo", m.prometheus.consumptionBudgetInfo, true) + + m.prometheus.consumptionBudgetLimit = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "azurerm_budgets_limit", + Help: "Azure ResourceManager consumption budget limit", + }, + []string{ + "scope", + "resourceID", + "subscriptionID", + "resourceGroup", + "budgetName", + }, + ) + m.Collector.RegisterMetricList("consumptionBudgetLimit", m.prometheus.consumptionBudgetLimit, true) + + m.prometheus.consumptionBudgetUsage = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "azurerm_budgets_usage", + Help: "Azure ResourceManager consumption budget usage percentage", + }, + []string{ + "scope", + "resourceID", + "subscriptionID", + "resourceGroup", + "budgetName", + }, + ) + m.Collector.RegisterMetricList("consumptionBudgetUsage", m.prometheus.consumptionBudgetUsage, true) + + m.prometheus.consumptionBudgetCurrent = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "azurerm_budgets_current", + Help: "Azure ResourceManager consumption budget current", + }, + []string{ + "scope", + "resourceID", + "subscriptionID", + "resourceGroup", + "budgetName", + "unit", + }, + ) + m.Collector.RegisterMetricList("consumptionBudgetCurrent", m.prometheus.consumptionBudgetCurrent, true) + + m.prometheus.consumptionBudgetForecast = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "azurerm_budgets_forecast", + Help: "Azure ResourceManager consumption budget forecast", + }, + []string{ + "scope", + "resourceID", + "subscriptionID", + "resourceGroup", + "budgetName", + "unit", + }, + ) + m.Collector.RegisterMetricList("consumptionBudgetForecast", m.prometheus.consumptionBudgetForecast, true) +} + +func (m *MetricsCollectorAzureRmBudgets) Reset() {} + +func (m *MetricsCollectorAzureRmBudgets) Collect(callback chan<- func()) { + if Config.Collectors.Budgets.Scopes != nil && len(Config.Collectors.Budgets.Scopes) > 0 { + for _, scope := range 
Config.Collectors.Budgets.Scopes { + // Run the budget query for the current scope + m.collectBudgetMetrics(logger, scope, callback) + } + } else { + // using subscription iterator + iterator := AzureSubscriptionsIterator + + err := iterator.ForEach(m.Logger(), func(subscription *armsubscriptions.Subscription, logger *zap.SugaredLogger) { + m.collectBudgetMetrics( + logger, + *subscription.ID, + callback, + ) + }) + if err != nil { + m.Logger().Panic(err) + } + } +} + +func (m *MetricsCollectorAzureRmBudgets) collectBudgetMetrics(logger *zap.SugaredLogger, scope string, callback chan<- func()) { + clientFactory, err := armconsumption.NewClientFactory("", AzureClient.GetCred(), AzureClient.NewArmClientOptions()) + if err != nil { + logger.Panic(err) + } + + infoMetric := m.Collector.GetMetricList("consumptionBudgetInfo") + usageMetric := m.Collector.GetMetricList("consumptionBudgetUsage") + limitMetric := m.Collector.GetMetricList("consumptionBudgetLimit") + currentMetric := m.Collector.GetMetricList("consumptionBudgetCurrent") + forecastMetric := m.Collector.GetMetricList("consumptionBudgetForecast") + + pager := clientFactory.NewBudgetsClient().NewListPager(scope, nil) + + for pager.More() { + result, err := pager.NextPage(m.Context()) + if err != nil { + logger.Panic(err) + } + + if result.Value == nil { + continue + } + + for _, budget := range result.Value { + resourceId := to.String(budget.ID) + + azureResource, _ := armclient.ParseResourceId(resourceId) + + infoMetric.AddInfo(prometheus.Labels{ + "scope": scope, + "resourceID": stringToStringLower(resourceId), + "subscriptionID": azureResource.Subscription, + "resourceGroup": azureResource.ResourceGroup, + "budgetName": to.String(budget.Name), + "category": stringToStringLower(string(*budget.Properties.Category)), + "timeGrain": string(*budget.Properties.TimeGrain), + }) + + if budget.Properties.Amount != nil { + limitMetric.Add(prometheus.Labels{ + "scope": scope, + "resourceID": 
stringToStringLower(resourceId), + "subscriptionID": azureResource.Subscription, + "resourceGroup": azureResource.ResourceGroup, + "budgetName": to.String(budget.Name), + }, *budget.Properties.Amount) + } + + if budget.Properties.CurrentSpend != nil { + currentMetric.Add(prometheus.Labels{ + "scope": scope, + "resourceID": stringToStringLower(resourceId), + "subscriptionID": azureResource.Subscription, + "resourceGroup": azureResource.ResourceGroup, + "budgetName": to.String(budget.Name), + "unit": to.StringLower(budget.Properties.CurrentSpend.Unit), + }, *budget.Properties.CurrentSpend.Amount) + } + + if budget.Properties.ForecastSpend != nil { + forecastMetric.Add(prometheus.Labels{ + "scope": scope, + "resourceID": stringToStringLower(resourceId), + "subscriptionID": azureResource.Subscription, + "resourceGroup": azureResource.ResourceGroup, + "budgetName": to.String(budget.Name), + "unit": to.StringLower(budget.Properties.ForecastSpend.Unit), + }, *budget.Properties.ForecastSpend.Amount) + } + + if budget.Properties.Amount != nil && budget.Properties.CurrentSpend != nil { + usageMetric.Add(prometheus.Labels{ + "scope": scope, + "resourceID": stringToStringLower(resourceId), + "subscriptionID": azureResource.Subscription, + "resourceGroup": azureResource.ResourceGroup, + "budgetName": to.String(budget.Name), + }, *budget.Properties.CurrentSpend.Amount / *budget.Properties.Amount) + } + } + } +} diff --git a/metrics_azurerm_costs.go b/metrics_azurerm_costs.go index bc14675..b1e3ae7 100644 --- a/metrics_azurerm_costs.go +++ b/metrics_azurerm_costs.go @@ -11,7 +11,6 @@ import ( armruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm/runtime" "github.com/Azure/azure-sdk-for-go/sdk/azcore/policy" "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" - "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/consumption/armconsumption" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/costmanagement/armcostmanagement" 
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armsubscriptions" "github.com/prometheus/client_golang/prometheus" @@ -31,16 +30,6 @@ const ( type ( MetricsCollectorAzureRmCosts struct { collector.Processor - - prometheus struct { - consumptionBudgetInfo *prometheus.GaugeVec - consumptionBudgetLimit *prometheus.GaugeVec - consumptionBudgetCurrent *prometheus.GaugeVec - consumptionBudgetUsage *prometheus.GaugeVec - - costmanagementOverallUsage *prometheus.GaugeVec - costmanagementOverallActualCost *prometheus.GaugeVec - } } MetricsCollectorAzureRmCostsQuery struct { @@ -67,67 +56,6 @@ type ( func (m *MetricsCollectorAzureRmCosts) Setup(collector *collector.Collector) { m.Processor.Setup(collector) - // ---------------------------------------------------- - // Budget - m.prometheus.consumptionBudgetInfo = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "azurerm_costs_budget_info", - Help: "Azure ResourceManager consumption budget info", - }, - []string{ - "resourceID", - "subscriptionID", - "budgetName", - "resourceGroup", - "category", - "timeGrain", - }, - ) - m.Collector.RegisterMetricList("consumptionBudgetInfo", m.prometheus.consumptionBudgetInfo, true) - - m.prometheus.consumptionBudgetLimit = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "azurerm_costs_budget_limit", - Help: "Azure ResourceManager consumption budget limit", - }, - []string{ - "resourceID", - "subscriptionID", - "resourceGroup", - "budgetName", - }, - ) - m.Collector.RegisterMetricList("consumptionBudgetLimit", m.prometheus.consumptionBudgetLimit, true) - - m.prometheus.consumptionBudgetUsage = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "azurerm_costs_budget_usage", - Help: "Azure ResourceManager consumption budget usage percentage", - }, - []string{ - "resourceID", - "subscriptionID", - "resourceGroup", - "budgetName", - }, - ) - m.Collector.RegisterMetricList("consumptionBudgetUsage", m.prometheus.consumptionBudgetUsage, true) - - 
m.prometheus.consumptionBudgetCurrent = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "azurerm_costs_budget_current", - Help: "Azure ResourceManager consumption budget current", - }, - []string{ - "resourceID", - "subscriptionID", - "resourceGroup", - "budgetName", - "unit", - }, - ) - m.Collector.RegisterMetricList("consumptionBudgetCurrent", m.prometheus.consumptionBudgetCurrent, true) - // ---------------------------------------------------- // Costs (by Query) @@ -194,18 +122,6 @@ func (m *MetricsCollectorAzureRmCosts) Collect(callback chan<- func()) { m.collectRunCostQuery(&query, exportType, callback) } - - // run budget collection - err := AzureSubscriptionsIterator.ForEach(m.Logger(), func(subscription *armsubscriptions.Subscription, logger *zap.SugaredLogger) { - logger.Info(`fetching cost budget report`) - m.collectBudgetMetrics( - logger.With(zap.String("consumption", "Budgets")), - subscription, - ) - }) - if err != nil { - m.Logger().Panic(err) - } } func (m *MetricsCollectorAzureRmCosts) collectRunCostQuery(query *config.CollectorCostsQuery, exportType armcostmanagement.ExportType, callback chan<- func()) { @@ -252,73 +168,6 @@ func (m *MetricsCollectorAzureRmCosts) collectRunCostQuery(query *config.Collect } } -func (m *MetricsCollectorAzureRmCosts) collectBudgetMetrics(logger *zap.SugaredLogger, subscription *armsubscriptions.Subscription) { - client, err := armconsumption.NewBudgetsClient(AzureClient.GetCred(), AzureClient.NewArmClientOptions()) - if err != nil { - logger.Panic(err) - } - - infoMetric := m.Collector.GetMetricList("consumptionBudgetInfo") - usageMetric := m.Collector.GetMetricList("consumptionBudgetUsage") - limitMetric := m.Collector.GetMetricList("consumptionBudgetLimit") - currentMetric := m.Collector.GetMetricList("consumptionBudgetCurrent") - - pager := client.NewListPager(*subscription.ID, nil) - - for pager.More() { - result, err := pager.NextPage(m.Context()) - if err != nil { - logger.Panic(err) - } - - if 
result.Value == nil { - continue - } - - for _, budget := range result.Value { - resourceId := to.String(budget.ID) - azureResource, _ := armclient.ParseResourceId(resourceId) - - infoMetric.AddInfo(prometheus.Labels{ - "resourceID": stringToStringLower(resourceId), - "subscriptionID": azureResource.Subscription, - "resourceGroup": azureResource.ResourceGroup, - "budgetName": to.String(budget.Name), - "category": stringToStringLower(string(*budget.Properties.Category)), - "timeGrain": string(*budget.Properties.TimeGrain), - }) - - if budget.Properties.Amount != nil { - limitMetric.Add(prometheus.Labels{ - "resourceID": stringToStringLower(resourceId), - "subscriptionID": azureResource.Subscription, - "resourceGroup": azureResource.ResourceGroup, - "budgetName": to.String(budget.Name), - }, *budget.Properties.Amount) - } - - if budget.Properties.CurrentSpend != nil { - currentMetric.Add(prometheus.Labels{ - "resourceID": stringToStringLower(resourceId), - "subscriptionID": azureResource.Subscription, - "resourceGroup": azureResource.ResourceGroup, - "budgetName": to.String(budget.Name), - "unit": to.StringLower(budget.Properties.CurrentSpend.Unit), - }, *budget.Properties.CurrentSpend.Amount) - } - - if budget.Properties.Amount != nil && budget.Properties.CurrentSpend != nil { - usageMetric.Add(prometheus.Labels{ - "resourceID": stringToStringLower(resourceId), - "subscriptionID": azureResource.Subscription, - "resourceGroup": azureResource.ResourceGroup, - "budgetName": to.String(budget.Name), - }, *budget.Properties.CurrentSpend.Amount / *budget.Properties.Amount) - } - } - } -} - func (m *MetricsCollectorAzureRmCosts) collectCostManagementMetrics(logger *zap.SugaredLogger, metricList *collector.MetricList, scope string, exportType armcostmanagement.ExportType, query *config.CollectorCostsQuery, timeframe string, subscription *armsubscriptions.Subscription) { logger.Infof(`fetching cost report for query "%v"`, query.Name) @@ -409,7 +258,7 @@ func (m 
*MetricsCollectorAzureRmCosts) collectCostManagementMetrics(logger *zap. params.TimePeriod = &timePeriod } - result, err := m.sendCostQuery(m.Context(), logger, scope, params, nil) + result, err := m.sendCostQuery(m.Context(), logger, scope, params) if err != nil { logger.Panic(err) } @@ -550,10 +399,10 @@ func (m *MetricsCollectorAzureRmCosts) collectCostManagementMetrics(logger *zap. time.Sleep(Config.Collectors.Costs.RequestDelay) } -func (m *MetricsCollectorAzureRmCosts) sendCostQuery(ctx context.Context, logger *zap.SugaredLogger, scope string, parameters armcostmanagement.QueryDefinition, options *armcostmanagement.QueryClientUsageOptions) (armcostmanagement.QueryClientUsageResponse, error) { +func (m *MetricsCollectorAzureRmCosts) sendCostQuery(ctx context.Context, logger *zap.SugaredLogger, scope string, parameters armcostmanagement.QueryDefinition) (armcostmanagement.QueryClientUsageResponse, error) { clientOpts := AzureClient.NewArmClientOptions() - // cost queries should not retry soo fast, we have a strict rate limit on azure side + // Initialize the client with appropriate retry options. clientOpts.Retry = policy.RetryOptions{ MaxRetries: 3, RetryDelay: 30 * time.Second, @@ -571,7 +420,7 @@ func (m *MetricsCollectorAzureRmCosts) sendCostQuery(ctx context.Context, logger logger.Panic(err.Error()) } - // paging + // Set up the pipeline for paging. 
pl, err := armruntime.NewPipeline("azurerm-costs", gitTag, AzureClient.GetCred(), runtime.PipelineOptions{}, AzureClient.NewArmClientOptions()) if err != nil { logger.Panic(err.Error()) @@ -596,6 +445,18 @@ func (m *MetricsCollectorAzureRmCosts) sendCostQuery(ctx context.Context, logger } defer resp.Body.Close() + if resp.StatusCode == http.StatusTooManyRequests { + retryAfterHeader := resp.Header.Get("X-Ms-Ratelimit-Microsoft.costmanagement-Entity-Retry-After") + retryAfter, err := strconv.Atoi(retryAfterHeader) + if err != nil { + logger.Errorf("Unable to parse retry-after header: %v", retryAfterHeader) + return fmt.Errorf("unable to parse retry-after header: %v", retryAfterHeader) + } + logger.Errorf("Received 429 Too Many Requests. Retrying after %d seconds. Headers: %v", retryAfter, resp.Header) + time.Sleep(time.Duration(retryAfter) * time.Second) + return fmt.Errorf("received 429 Too Many Requests, retrying after %d seconds", retryAfter) + } + if runtime.HasStatusCode(resp, http.StatusOK) { pagerResult := armcostmanagement.QueryClientUsageResponse{} if err := runtime.UnmarshalAsJSON(resp, &pagerResult); err == nil { @@ -605,12 +466,16 @@ func (m *MetricsCollectorAzureRmCosts) sendCostQuery(ctx context.Context, logger logger.Panic(err.Error()) } } else { - return fmt.Errorf(`unexpected status code: %v`, resp.StatusCode) + return fmt.Errorf("unexpected status code: %v", resp.StatusCode) } return nil }() if err != nil { + // If we encounter a rate limit error, retry after the specified delay. + if strings.Contains(err.Error(), "received 429 Too Many Requests") { + continue + } return result, err }