From 7f8bb76f810c6865699282c7b8f1f4cf39e3a675 Mon Sep 17 00:00:00 2001 From: Stefan Kurek Date: Mon, 27 Nov 2023 18:26:25 -0500 Subject: [PATCH] MSSQL Integration: Adds query_config_path to allow for custom metrics through custom exporter config file (#5768) * Adds query_config_path to mssql integration to allow for custom metrics through custom exporter config file * Updates query_config_path to agent flow and fixes tests/docs * Adds both query_config and query_config_file to mssql integration * Removes query_config_file from mssql config params --- CHANGELOG.md | 4 +- component/prometheus/exporter/mssql/mssql.go | 21 +- .../prometheus/exporter/mssql/mssql_test.go | 126 ++++++++- .../components/prometheus.exporter.mssql.md | 244 ++++++++++++++++- .../integrations/mssql-config.md | 252 +++++++++++++++++- pkg/integrations/mssql/collector.go | 2 +- pkg/integrations/mssql/sql_exporter.go | 30 ++- pkg/integrations/mssql/sql_exporter_test.go | 78 +++++- 8 files changed, 721 insertions(+), 36 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a60a93219a85..c4d686873671 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -133,7 +133,7 @@ v0.38.0 (2023-11-21) - Make component list sortable in web UI. (@hainenber) - Adds new metrics (`mssql_server_total_memory_bytes`, `mssql_server_target_memory_bytes`, - and `mssql_available_commit_memory_bytes`) for `mssql` integration. + and `mssql_available_commit_memory_bytes`) for `mssql` integration (@StefanKurek). - Grafana Agent Operator: `config-reloader` container no longer runs as root. (@rootmout) @@ -150,6 +150,8 @@ v0.38.0 (2023-11-21) - Allow agent to start with `module.git` config if cached before. (@hainenber) +- Adds new optional config parameter `query_config` to `mssql` integration to allow for custom metrics (@StefanKurek) + ### Bugfixes - Set exit code 1 on grafana-agentctl non-runnable command. 
(@fgouteroux) diff --git a/component/prometheus/exporter/mssql/mssql.go b/component/prometheus/exporter/mssql/mssql.go index 3e9770fb8892..bef73f16a44c 100644 --- a/component/prometheus/exporter/mssql/mssql.go +++ b/component/prometheus/exporter/mssql/mssql.go @@ -2,14 +2,18 @@ package mssql import ( "errors" + "fmt" "time" + "github.com/burningalchemist/sql_exporter/config" "github.com/grafana/agent/component" "github.com/grafana/agent/component/prometheus/exporter" "github.com/grafana/agent/pkg/integrations" "github.com/grafana/agent/pkg/integrations/mssql" + "github.com/grafana/agent/pkg/util" "github.com/grafana/river/rivertypes" config_util "github.com/prometheus/common/config" + "gopkg.in/yaml.v2" ) func init() { @@ -36,10 +40,11 @@ var DefaultArguments = Arguments{ // Arguments controls the mssql exporter. type Arguments struct { - ConnectionString rivertypes.Secret `river:"connection_string,attr"` - MaxIdleConnections int `river:"max_idle_connections,attr,optional"` - MaxOpenConnections int `river:"max_open_connections,attr,optional"` - Timeout time.Duration `river:"timeout,attr,optional"` + ConnectionString rivertypes.Secret `river:"connection_string,attr"` + MaxIdleConnections int `river:"max_idle_connections,attr,optional"` + MaxOpenConnections int `river:"max_open_connections,attr,optional"` + Timeout time.Duration `river:"timeout,attr,optional"` + QueryConfig rivertypes.OptionalSecret `river:"query_config,attr,optional"` } // SetToDefault implements river.Defaulter. 
@@ -60,6 +65,13 @@ func (a *Arguments) Validate() error { if a.Timeout <= 0 { return errors.New("timeout must be positive") } + + var collectorConfig config.CollectorConfig + err := yaml.UnmarshalStrict([]byte(a.QueryConfig.Value), &collectorConfig) + if err != nil { + return fmt.Errorf("invalid query_config: %s", err) + } + return nil } @@ -69,5 +81,6 @@ func (a *Arguments) Convert() *mssql.Config { MaxIdleConnections: a.MaxIdleConnections, MaxOpenConnections: a.MaxOpenConnections, Timeout: a.Timeout, + QueryConfig: util.RawYAML(a.QueryConfig.Value), } } diff --git a/component/prometheus/exporter/mssql/mssql_test.go b/component/prometheus/exporter/mssql/mssql_test.go index b9a47ad3b776..4fad4a819780 100644 --- a/component/prometheus/exporter/mssql/mssql_test.go +++ b/component/prometheus/exporter/mssql/mssql_test.go @@ -4,11 +4,13 @@ import ( "testing" "time" + "github.com/burningalchemist/sql_exporter/config" "github.com/grafana/agent/pkg/integrations/mssql" "github.com/grafana/river" "github.com/grafana/river/rivertypes" config_util "github.com/prometheus/common/config" "github.com/stretchr/testify/require" + "gopkg.in/yaml.v2" ) func TestRiverUnmarshal(t *testing.T) { @@ -16,8 +18,7 @@ func TestRiverUnmarshal(t *testing.T) { connection_string = "sqlserver://user:pass@localhost:1433" max_idle_connections = 3 max_open_connections = 3 - timeout = "10s" - ` + timeout = "10s"` var args Arguments err := river.Unmarshal([]byte(riverConfig), &args) @@ -33,6 +34,64 @@ func TestRiverUnmarshal(t *testing.T) { require.Equal(t, expected, args) } +func TestRiverUnmarshalWithInlineQueryConfig(t *testing.T) { + riverConfig := ` + connection_string = "sqlserver://user:pass@localhost:1433" + max_idle_connections = 3 + max_open_connections = 3 + timeout = "10s" + query_config = "{ collector_name: mssql_standard, metrics: [ { metric_name: mssql_local_time_seconds, type: gauge, help: 'Local time in seconds since epoch (Unix time).', values: [ unix_time ], query: \"SELECT 
DATEDIFF(second, '19700101', GETUTCDATE()) AS unix_time\" } ] }"` + + var args Arguments + err := river.Unmarshal([]byte(riverConfig), &args) + require.NoError(t, err) + var collectorConfig config.CollectorConfig + err = yaml.UnmarshalStrict([]byte(args.QueryConfig.Value), &collectorConfig) + require.NoError(t, err) + + require.Equal(t, rivertypes.Secret("sqlserver://user:pass@localhost:1433"), args.ConnectionString) + require.Equal(t, 3, args.MaxIdleConnections) + require.Equal(t, 3, args.MaxOpenConnections) + require.Equal(t, 10*time.Second, args.Timeout) + require.Equal(t, "mssql_standard", collectorConfig.Name) + require.Equal(t, 1, len(collectorConfig.Metrics)) + require.Equal(t, "mssql_local_time_seconds", collectorConfig.Metrics[0].Name) + require.Equal(t, "gauge", collectorConfig.Metrics[0].TypeString) + require.Equal(t, "Local time in seconds since epoch (Unix time).", collectorConfig.Metrics[0].Help) + require.Equal(t, 1, len(collectorConfig.Metrics[0].Values)) + require.Contains(t, collectorConfig.Metrics[0].Values, "unix_time") + require.Equal(t, "SELECT DATEDIFF(second, '19700101', GETUTCDATE()) AS unix_time", collectorConfig.Metrics[0].QueryLiteral) +} + +func TestRiverUnmarshalWithInlineQueryConfigYaml(t *testing.T) { + riverConfig := ` + connection_string = "sqlserver://user:pass@localhost:1433" + max_idle_connections = 3 + max_open_connections = 3 + timeout = "10s" + query_config = "collector_name: mssql_standard\nmetrics:\n- metric_name: mssql_local_time_seconds\n type: gauge\n help: 'Local time in seconds since epoch (Unix time).'\n values: [unix_time]\n query: \"SELECT DATEDIFF(second, '19700101', GETUTCDATE()) AS unix_time\""` + + var args Arguments + err := river.Unmarshal([]byte(riverConfig), &args) + require.NoError(t, err) + var collectorConfig config.CollectorConfig + err = yaml.UnmarshalStrict([]byte(args.QueryConfig.Value), &collectorConfig) + require.NoError(t, err) + + require.Equal(t, 
rivertypes.Secret("sqlserver://user:pass@localhost:1433"), args.ConnectionString) + require.Equal(t, 3, args.MaxIdleConnections) + require.Equal(t, 3, args.MaxOpenConnections) + require.Equal(t, 10*time.Second, args.Timeout) + require.Equal(t, "mssql_standard", collectorConfig.Name) + require.Equal(t, 1, len(collectorConfig.Metrics)) + require.Equal(t, "mssql_local_time_seconds", collectorConfig.Metrics[0].Name) + require.Equal(t, "gauge", collectorConfig.Metrics[0].TypeString) + require.Equal(t, "Local time in seconds since epoch (Unix time).", collectorConfig.Metrics[0].Help) + require.Equal(t, 1, len(collectorConfig.Metrics[0].Values)) + require.Contains(t, collectorConfig.Metrics[0].Values, "unix_time") + require.Equal(t, "SELECT DATEDIFF(second, '19700101', GETUTCDATE()) AS unix_time", collectorConfig.Metrics[0].QueryLiteral) +} + func TestUnmarshalInvalid(t *testing.T) { invalidRiverConfig := ` connection_string = "sqlserver://user:pass@localhost:1433" @@ -44,6 +103,37 @@ func TestUnmarshalInvalid(t *testing.T) { var invalidArgs Arguments err := river.Unmarshal([]byte(invalidRiverConfig), &invalidArgs) require.Error(t, err) + require.EqualError(t, err, "timeout must be positive") +} + +func TestUnmarshalInvalidQueryConfigYaml(t *testing.T) { + invalidRiverConfig := ` + connection_string = "sqlserver://user:pass@localhost:1433" + max_idle_connections = 1 + max_open_connections = 1 + timeout = "1s" + query_config = "{ collector_name: mssql_standard, metrics: [ { metric_name: mssql_local_time_seconds, type: gauge, help: 'Local time in seconds since epoch (Unix time).', values: [ unix_time ], query: \"SELECT DATEDIFF(second, '19700101', GETUTCDATE()) AS unix_time\" }" + ` + + var invalidArgs Arguments + err := river.Unmarshal([]byte(invalidRiverConfig), &invalidArgs) + require.Error(t, err) + require.EqualError(t, err, "invalid query_config: yaml: line 1: did not find expected ',' or ']'") +} + +func TestUnmarshalInvalidProperty(t *testing.T) { + 
invalidRiverConfig := ` + connection_string = "sqlserver://user:pass@localhost:1433" + max_idle_connections = 1 + max_open_connections = 1 + timeout = "1s" + query_config = "collector_name: mssql_standard\nbad_param: true\nmetrics:\n- metric_name: mssql_local_time_seconds\n type: gauge\n help: 'Local time in seconds since epoch (Unix time).'\n values: [unix_time]\n query: \"SELECT DATEDIFF(second, '19700101', GETUTCDATE()) AS unix_time\"" + ` + + var invalidArgs Arguments + err := river.Unmarshal([]byte(invalidRiverConfig), &invalidArgs) + require.Error(t, err) + require.EqualError(t, err, "invalid query_config: unknown fields in collector: bad_param") } func TestArgumentsValidate(t *testing.T) { @@ -89,6 +179,9 @@ func TestArgumentsValidate(t *testing.T) { MaxIdleConnections: 1, MaxOpenConnections: 1, Timeout: 10 * time.Second, + QueryConfig: rivertypes.OptionalSecret{ + Value: `{ collector_name: mssql_standard, metrics: [ { metric_name: mssql_local_time_seconds, type: gauge, help: 'Local time in seconds since epoch (Unix time).', values: [ unix_time ], query: "SELECT DATEDIFF(second, '19700101', GETUTCDATE()) AS unix_time" } ] }`, + }, }, wantErr: false, }, @@ -107,20 +200,31 @@ func TestArgumentsValidate(t *testing.T) { } func TestConvert(t *testing.T) { - riverConfig := ` - connection_string = "sqlserver://user:pass@localhost:1433" - ` - var args Arguments - err := river.Unmarshal([]byte(riverConfig), &args) - require.NoError(t, err) + strQueryConfig := `collector_name: mssql_standard +metrics: +- metric_name: mssql_local_time_seconds + type: gauge + help: 'Local time in seconds since epoch (Unix time).' 
+ values: [unix_time] + query: "SELECT DATEDIFF(second, '19700101', GETUTCDATE()) AS unix_time"` + args := Arguments{ + ConnectionString: rivertypes.Secret("sqlserver://user:pass@localhost:1433"), + MaxIdleConnections: 1, + MaxOpenConnections: 1, + Timeout: 10 * time.Second, + QueryConfig: rivertypes.OptionalSecret{ + Value: strQueryConfig, + }, + } res := args.Convert() expected := mssql.Config{ ConnectionString: config_util.Secret("sqlserver://user:pass@localhost:1433"), - MaxIdleConnections: DefaultArguments.MaxIdleConnections, - MaxOpenConnections: DefaultArguments.MaxOpenConnections, - Timeout: DefaultArguments.Timeout, + MaxIdleConnections: 1, + MaxOpenConnections: 1, + Timeout: 10 * time.Second, + QueryConfig: []byte(strQueryConfig), } require.Equal(t, expected, *res) } diff --git a/docs/sources/flow/reference/components/prometheus.exporter.mssql.md b/docs/sources/flow/reference/components/prometheus.exporter.mssql.md index 84786ee074a0..93fb305f8a5d 100644 --- a/docs/sources/flow/reference/components/prometheus.exporter.mssql.md +++ b/docs/sources/flow/reference/components/prometheus.exporter.mssql.md @@ -12,7 +12,8 @@ title: prometheus.exporter.mssql # prometheus.exporter.mssql The `prometheus.exporter.mssql` component embeds -[sql_exporter](https://github.com/burningalchemist/sql_exporter) for collecting stats from a Microsoft SQL Server. +[sql_exporter](https://github.com/burningalchemist/sql_exporter) for collecting stats from a Microsoft SQL Server and exposing them as +Prometheus metrics. ## Usage @@ -27,12 +28,13 @@ prometheus.exporter.mssql "LABEL" { The following arguments can be used to configure the exporter's behavior. Omitted fields take their default values. -| Name | Type | Description | Default | Required | -| ---------------------- | ---------- | ----------------------------------------------------------------- | ------- | -------- | -| `connection_string` | `secret` | The connection string used to connect to an Microsoft SQL Server. 
| | yes | -| `max_idle_connections` | `int` | Maximum number of idle connections to any one target. | `3` | no | -| `max_open_connections` | `int` | Maximum number of open connections to any one target. | `3` | no | -| `timeout` | `duration` | The query timeout in seconds. | `"10s"` | no | +| Name | Type | Description | Default | Required | +| ---------------------- | ---------- | ------------------------------------------------------------------- | ------- | -------- | +| `connection_string` | `secret` | The connection string used to connect to a Microsoft SQL Server. | | yes | +| `max_idle_connections` | `int` | Maximum number of idle connections to any one target. | `3` | no | +| `max_open_connections` | `int` | Maximum number of open connections to any one target. | `3` | no | +| `timeout` | `duration` | The query timeout in seconds. | `"10s"` | no | +| `query_config` | `string` | MSSQL query to Prometheus metric configuration as an inline string. | | no | [The sql_exporter examples](https://github.com/burningalchemist/sql_exporter/blob/master/examples/azure-sql-mi/sql_exporter.yml#L21) show the format of the `connection_string` argument: @@ -40,6 +42,15 @@ Omitted fields take their default values. sqlserver://USERNAME_HERE:PASSWORD_HERE@SQLMI_HERE_ENDPOINT.database.windows.net:1433?encrypt=true&hostNameInCertificate=%2A.SQL_MI_DOMAIN_HERE.database.windows.net&trustservercertificate=true ``` +If specified, the `query_config` argument must be a YAML document as a string defining which MSSQL queries map to custom Prometheus metrics. +`query_config` is typically loaded by using the exports of another component. For example: + +- `local.file.LABEL.content` +- `remote.http.LABEL.content` +- `remote.s3.LABEL.content` + +See [sql_exporter](https://github.com/burningalchemist/sql_exporter#collectors) for details on how to create a configuration.
+ ## Blocks The `prometheus.exporter.mssql` component does not support any blocks, and is configured @@ -100,3 +111,222 @@ Replace the following: - `PASSWORD`: The password to use for authentication to the remote_write API. [scrape]: {{< relref "./prometheus.scrape.md" >}} + +## Custom metrics +You can use the optional `query_config` parameter to retrieve custom Prometheus metrics for a MSSQL instance. + +If this is defined, the new configuration will be used to query your MSSQL instance and create whatever Prometheus metrics are defined. +If you want additional metrics on top of the default metrics, the default configuration must be used as a base. + +The default configuration used by this integration is as follows: +``` +collector_name: mssql_standard + +metrics: + - metric_name: mssql_local_time_seconds + type: gauge + help: 'Local time in seconds since epoch (Unix time).' + values: [unix_time] + query: | + SELECT DATEDIFF(second, '19700101', GETUTCDATE()) AS unix_time + - metric_name: mssql_connections + type: gauge + help: 'Number of active connections.' + key_labels: + - db + values: [count] + query: | + SELECT DB_NAME(sp.dbid) AS db, COUNT(sp.spid) AS count + FROM sys.sysprocesses sp + GROUP BY DB_NAME(sp.dbid) + # + # Collected from sys.dm_os_performance_counters + # + - metric_name: mssql_deadlocks_total + type: counter + help: 'Number of lock requests that resulted in a deadlock.' + values: [cntr_value] + query: | + SELECT cntr_value + FROM sys.dm_os_performance_counters WITH (NOLOCK) + WHERE counter_name = 'Number of Deadlocks/sec' AND instance_name = '_Total' + - metric_name: mssql_user_errors_total + type: counter + help: 'Number of user errors.' 
+ values: [cntr_value] + query: | + SELECT cntr_value + FROM sys.dm_os_performance_counters WITH (NOLOCK) + WHERE counter_name = 'Errors/sec' AND instance_name = 'User Errors' + - metric_name: mssql_kill_connection_errors_total + type: counter + help: 'Number of severe errors that caused SQL Server to kill the connection.' + values: [cntr_value] + query: | + SELECT cntr_value + FROM sys.dm_os_performance_counters WITH (NOLOCK) + WHERE counter_name = 'Errors/sec' AND instance_name = 'Kill Connection Errors' + - metric_name: mssql_page_life_expectancy_seconds + type: gauge + help: 'The minimum number of seconds a page will stay in the buffer pool on this node without references.' + values: [cntr_value] + query: | + SELECT top(1) cntr_value + FROM sys.dm_os_performance_counters WITH (NOLOCK) + WHERE counter_name = 'Page life expectancy' + - metric_name: mssql_batch_requests_total + type: counter + help: 'Number of command batches received.' + values: [cntr_value] + query: | + SELECT cntr_value + FROM sys.dm_os_performance_counters WITH (NOLOCK) + WHERE counter_name = 'Batch Requests/sec' + - metric_name: mssql_log_growths_total + type: counter + help: 'Number of times the transaction log has been expanded, per database.' 
+ key_labels: + - db + values: [cntr_value] + query: | + SELECT rtrim(instance_name) AS db, cntr_value + FROM sys.dm_os_performance_counters WITH (NOLOCK) + WHERE counter_name = 'Log Growths' AND instance_name <> '_Total' + - metric_name: mssql_buffer_cache_hit_ratio + type: gauge + help: 'Ratio of requests that hit the buffer cache' + values: [BufferCacheHitRatio] + query: | + SELECT (a.cntr_value * 1.0 / b.cntr_value) * 100.0 as BufferCacheHitRatio + FROM sys.dm_os_performance_counters a + JOIN (SELECT cntr_value, OBJECT_NAME + FROM sys.dm_os_performance_counters + WHERE counter_name = 'Buffer cache hit ratio base' + AND OBJECT_NAME = 'SQLServer:Buffer Manager') b ON a.OBJECT_NAME = b.OBJECT_NAME + WHERE a.counter_name = 'Buffer cache hit ratio' + AND a.OBJECT_NAME = 'SQLServer:Buffer Manager' + + - metric_name: mssql_checkpoint_pages_sec + type: gauge + help: 'Checkpoint Pages Per Second' + values: [cntr_value] + query: | + SELECT cntr_value + FROM sys.dm_os_performance_counters + WHERE [counter_name] = 'Checkpoint pages/sec' + # + # Collected from sys.dm_io_virtual_file_stats + # + - metric_name: mssql_io_stall_seconds_total + type: counter + help: 'Stall time in seconds per database and I/O operation.' + key_labels: + - db + value_label: operation + values: + - read + - write + query_ref: mssql_io_stall + + # + # Collected from sys.dm_os_process_memory + # + - metric_name: mssql_resident_memory_bytes + type: gauge + help: 'SQL Server resident memory size (AKA working set).' + values: [resident_memory_bytes] + query_ref: mssql_process_memory + + - metric_name: mssql_virtual_memory_bytes + type: gauge + help: 'SQL Server committed virtual memory size.' + values: [virtual_memory_bytes] + query_ref: mssql_process_memory + + - metric_name: mssql_available_commit_memory_bytes + type: gauge + help: 'SQL Server available to be committed memory size.' 
+ values: [available_commit_limit_bytes] + query_ref: mssql_process_memory + + - metric_name: mssql_memory_utilization_percentage + type: gauge + help: 'The percentage of committed memory that is in the working set.' + values: [memory_utilization_percentage] + query_ref: mssql_process_memory + + - metric_name: mssql_page_fault_count_total + type: counter + help: 'The number of page faults that were incurred by the SQL Server process.' + values: [page_fault_count] + query_ref: mssql_process_memory + + # + # Collected from sys.dm_os_sys_info + # + - metric_name: mssql_server_total_memory_bytes + type: gauge + help: 'SQL Server committed memory in the memory manager.' + values: [committed_memory_bytes] + query_ref: mssql_os_sys_info + + - metric_name: mssql_server_target_memory_bytes + type: gauge + help: 'SQL Server target committed memory set for the memory manager.' + values: [committed_memory_target_bytes] + query_ref: mssql_os_sys_info + + # + # Collected from sys.dm_os_sys_memory + # + - metric_name: mssql_os_memory + type: gauge + help: 'OS physical memory, used and available.' + value_label: 'state' + values: [used, available] + query: | + SELECT + (total_physical_memory_kb - available_physical_memory_kb) * 1024 AS used, + available_physical_memory_kb * 1024 AS available + FROM sys.dm_os_sys_memory + - metric_name: mssql_os_page_file + type: gauge + help: 'OS page file, used and available.' 
+ value_label: 'state' + values: [used, available] + query: | + SELECT + (total_page_file_kb - available_page_file_kb) * 1024 AS used, + available_page_file_kb * 1024 AS available + FROM sys.dm_os_sys_memory +queries: + # Populates `mssql_io_stall` and `mssql_io_stall_total` + - query_name: mssql_io_stall + query: | + SELECT + cast(DB_Name(a.database_id) as varchar) AS [db], + sum(io_stall_read_ms) / 1000.0 AS [read], + sum(io_stall_write_ms) / 1000.0 AS [write] + FROM + sys.dm_io_virtual_file_stats(null, null) a + INNER JOIN sys.master_files b ON a.database_id = b.database_id AND a.file_id = b.file_id + GROUP BY a.database_id + # Populates `mssql_resident_memory_bytes`, `mssql_virtual_memory_bytes`, mssql_available_commit_memory_bytes, + # and `mssql_memory_utilization_percentage`, and `mssql_page_fault_count_total` + - query_name: mssql_process_memory + query: | + SELECT + physical_memory_in_use_kb * 1024 AS resident_memory_bytes, + virtual_address_space_committed_kb * 1024 AS virtual_memory_bytes, + available_commit_limit_kb * 1024 AS available_commit_limit_bytes, + memory_utilization_percentage, + page_fault_count + FROM sys.dm_os_process_memory + # Populates `mssql_server_total_memory_bytes` and `mssql_server_target_memory_bytes`. 
+ - query_name: mssql_os_sys_info + query: | + SELECT + committed_kb * 1024 AS committed_memory_bytes, + committed_target_kb * 1024 AS committed_memory_target_bytes + FROM sys.dm_os_sys_info +``` diff --git a/docs/sources/static/configuration/integrations/mssql-config.md b/docs/sources/static/configuration/integrations/mssql-config.md index d79b360bfd10..76edb2fae888 100644 --- a/docs/sources/static/configuration/integrations/mssql-config.md +++ b/docs/sources/static/configuration/integrations/mssql-config.md @@ -10,7 +10,7 @@ title: mssql_config # mssql_config -The `mssql_configs` block configures the `mssql` integration, an embedded version of [`sql_exporter`](https://github.com/burningalchemist/sql_exporter) that lets you collect [Microsoft SQL Server](https://www.microsoft.com/en-us/sql-server) metrics. +The `mssql_config` block configures the `mssql` integration, an embedded version of [`sql_exporter`](https://github.com/burningalchemist/sql_exporter) that lets you collect [Microsoft SQL Server](https://www.microsoft.com/en-us/sql-server) metrics. It is recommended that you have a dedicated user set up for monitoring an mssql instance. The user for monitoring must have the following grants in order to populate the metrics: @@ -19,12 +19,24 @@ GRANT VIEW ANY DEFINITION TO GRANT VIEW SERVER STATE TO ``` +## Quick configuration example + +To get started, define the MSSQL connection string in Grafana Agent's integration block: + +```yaml +metrics: + wal_directory: /tmp/wal +integrations: + mssql: + enabled: true + connection_string: "sqlserver://[user]:[pass]@localhost:1433" +``` Full reference of options: ```yaml - # Enables the mssql integration, allowing the Agent to automatically - # collect metrics for the specified mssql instance. + # Enables the MSSQL integration, allowing the Agent to automatically + # collect metrics for the specified MSSQL instance. 
 [enabled: <boolean> | default = false] # Sets an explicit value for the instance label when the integration is @@ -34,7 +46,7 @@ Full reference of options: [instance: <string>] # Automatically collect metrics from this integration. If disabled, - # the mssql integration is run but not scraped and thus not + # the MSSQL integration is run but not scraped and thus not # remote-written. Metrics for the integration are exposed at # /integrations/mssql/metrics and can be scraped by an external # process. @@ -64,17 +76,241 @@ Full reference of options: # Exporter-specific configuration options # - # The connection_string to use to connect to the mssql instance. + # The connection_string to use to connect to the MSSQL instance. # It is specified in the form of: "sqlserver://<USERNAME>:<PASSWORD>@<HOST>:<PORT>" connection_string: <string> - # The maximum number of open database connections to the mssql instance. + # The maximum number of open database connections to the MSSQL instance. [max_open_connections: <int> | default = 3] - # The maximum number of idle database connections to the mssql instance. + # The maximum number of idle database connections to the MSSQL instance. [max_idle_connections: <int> | default = 3] - # The timeout for scraping metrics from the mssql instance. + # The timeout for scraping metrics from the MSSQL instance. [timeout: <duration> | default = "10s"] + # Embedded MSSQL query configuration for specifying custom MSSQL Prometheus metrics. + # See https://github.com/burningalchemist/sql_exporter#collectors for more details on how to specify your metric configurations. + query_config: + [- <metrics> ... ] + [- <queries> ... ]] +``` + +## Custom metrics +You can use the optional `query_config` parameter to retrieve custom Prometheus metrics for a MSSQL instance. + +If this is defined, the new configuration will be used to query your MSSQL instance and create whatever Prometheus metrics are defined. +If you want additional metrics on top of the default metrics, the default configuration must be used as a base.
+ +The default configuration used by this integration is as follows: +``` +collector_name: mssql_standard + +metrics: + - metric_name: mssql_local_time_seconds + type: gauge + help: 'Local time in seconds since epoch (Unix time).' + values: [unix_time] + query: | + SELECT DATEDIFF(second, '19700101', GETUTCDATE()) AS unix_time + - metric_name: mssql_connections + type: gauge + help: 'Number of active connections.' + key_labels: + - db + values: [count] + query: | + SELECT DB_NAME(sp.dbid) AS db, COUNT(sp.spid) AS count + FROM sys.sysprocesses sp + GROUP BY DB_NAME(sp.dbid) + # + # Collected from sys.dm_os_performance_counters + # + - metric_name: mssql_deadlocks_total + type: counter + help: 'Number of lock requests that resulted in a deadlock.' + values: [cntr_value] + query: | + SELECT cntr_value + FROM sys.dm_os_performance_counters WITH (NOLOCK) + WHERE counter_name = 'Number of Deadlocks/sec' AND instance_name = '_Total' + - metric_name: mssql_user_errors_total + type: counter + help: 'Number of user errors.' + values: [cntr_value] + query: | + SELECT cntr_value + FROM sys.dm_os_performance_counters WITH (NOLOCK) + WHERE counter_name = 'Errors/sec' AND instance_name = 'User Errors' + - metric_name: mssql_kill_connection_errors_total + type: counter + help: 'Number of severe errors that caused SQL Server to kill the connection.' + values: [cntr_value] + query: | + SELECT cntr_value + FROM sys.dm_os_performance_counters WITH (NOLOCK) + WHERE counter_name = 'Errors/sec' AND instance_name = 'Kill Connection Errors' + - metric_name: mssql_page_life_expectancy_seconds + type: gauge + help: 'The minimum number of seconds a page will stay in the buffer pool on this node without references.' + values: [cntr_value] + query: | + SELECT top(1) cntr_value + FROM sys.dm_os_performance_counters WITH (NOLOCK) + WHERE counter_name = 'Page life expectancy' + - metric_name: mssql_batch_requests_total + type: counter + help: 'Number of command batches received.' 
+ values: [cntr_value] + query: | + SELECT cntr_value + FROM sys.dm_os_performance_counters WITH (NOLOCK) + WHERE counter_name = 'Batch Requests/sec' + - metric_name: mssql_log_growths_total + type: counter + help: 'Number of times the transaction log has been expanded, per database.' + key_labels: + - db + values: [cntr_value] + query: | + SELECT rtrim(instance_name) AS db, cntr_value + FROM sys.dm_os_performance_counters WITH (NOLOCK) + WHERE counter_name = 'Log Growths' AND instance_name <> '_Total' + - metric_name: mssql_buffer_cache_hit_ratio + type: gauge + help: 'Ratio of requests that hit the buffer cache' + values: [BufferCacheHitRatio] + query: | + SELECT (a.cntr_value * 1.0 / b.cntr_value) * 100.0 as BufferCacheHitRatio + FROM sys.dm_os_performance_counters a + JOIN (SELECT cntr_value, OBJECT_NAME + FROM sys.dm_os_performance_counters + WHERE counter_name = 'Buffer cache hit ratio base' + AND OBJECT_NAME = 'SQLServer:Buffer Manager') b ON a.OBJECT_NAME = b.OBJECT_NAME + WHERE a.counter_name = 'Buffer cache hit ratio' + AND a.OBJECT_NAME = 'SQLServer:Buffer Manager' + + - metric_name: mssql_checkpoint_pages_sec + type: gauge + help: 'Checkpoint Pages Per Second' + values: [cntr_value] + query: | + SELECT cntr_value + FROM sys.dm_os_performance_counters + WHERE [counter_name] = 'Checkpoint pages/sec' + # + # Collected from sys.dm_io_virtual_file_stats + # + - metric_name: mssql_io_stall_seconds_total + type: counter + help: 'Stall time in seconds per database and I/O operation.' + key_labels: + - db + value_label: operation + values: + - read + - write + query_ref: mssql_io_stall + + # + # Collected from sys.dm_os_process_memory + # + - metric_name: mssql_resident_memory_bytes + type: gauge + help: 'SQL Server resident memory size (AKA working set).' + values: [resident_memory_bytes] + query_ref: mssql_process_memory + + - metric_name: mssql_virtual_memory_bytes + type: gauge + help: 'SQL Server committed virtual memory size.' 
+ values: [virtual_memory_bytes] + query_ref: mssql_process_memory + + - metric_name: mssql_available_commit_memory_bytes + type: gauge + help: 'SQL Server available to be committed memory size.' + values: [available_commit_limit_bytes] + query_ref: mssql_process_memory + + - metric_name: mssql_memory_utilization_percentage + type: gauge + help: 'The percentage of committed memory that is in the working set.' + values: [memory_utilization_percentage] + query_ref: mssql_process_memory + + - metric_name: mssql_page_fault_count_total + type: counter + help: 'The number of page faults that were incurred by the SQL Server process.' + values: [page_fault_count] + query_ref: mssql_process_memory + + # + # Collected from sys.dm_os_sys_info + # + - metric_name: mssql_server_total_memory_bytes + type: gauge + help: 'SQL Server committed memory in the memory manager.' + values: [committed_memory_bytes] + query_ref: mssql_os_sys_info + + - metric_name: mssql_server_target_memory_bytes + type: gauge + help: 'SQL Server target committed memory set for the memory manager.' + values: [committed_memory_target_bytes] + query_ref: mssql_os_sys_info + + # + # Collected from sys.dm_os_sys_memory + # + - metric_name: mssql_os_memory + type: gauge + help: 'OS physical memory, used and available.' + value_label: 'state' + values: [used, available] + query: | + SELECT + (total_physical_memory_kb - available_physical_memory_kb) * 1024 AS used, + available_physical_memory_kb * 1024 AS available + FROM sys.dm_os_sys_memory + - metric_name: mssql_os_page_file + type: gauge + help: 'OS page file, used and available.' 
+ value_label: 'state' + values: [used, available] + query: | + SELECT + (total_page_file_kb - available_page_file_kb) * 1024 AS used, + available_page_file_kb * 1024 AS available + FROM sys.dm_os_sys_memory +queries: + # Populates `mssql_io_stall` and `mssql_io_stall_total` + - query_name: mssql_io_stall + query: | + SELECT + cast(DB_Name(a.database_id) as varchar) AS [db], + sum(io_stall_read_ms) / 1000.0 AS [read], + sum(io_stall_write_ms) / 1000.0 AS [write] + FROM + sys.dm_io_virtual_file_stats(null, null) a + INNER JOIN sys.master_files b ON a.database_id = b.database_id AND a.file_id = b.file_id + GROUP BY a.database_id + # Populates `mssql_resident_memory_bytes`, `mssql_virtual_memory_bytes`, mssql_available_commit_memory_bytes, + # and `mssql_memory_utilization_percentage`, and `mssql_page_fault_count_total` + - query_name: mssql_process_memory + query: | + SELECT + physical_memory_in_use_kb * 1024 AS resident_memory_bytes, + virtual_address_space_committed_kb * 1024 AS virtual_memory_bytes, + available_commit_limit_kb * 1024 AS available_commit_limit_bytes, + memory_utilization_percentage, + page_fault_count + FROM sys.dm_os_process_memory + # Populates `mssql_server_total_memory_bytes` and `mssql_server_target_memory_bytes`. + - query_name: mssql_os_sys_info + query: | + SELECT + committed_kb * 1024 AS committed_memory_bytes, + committed_target_kb * 1024 AS committed_memory_target_bytes + FROM sys.dm_os_sys_info ``` diff --git a/pkg/integrations/mssql/collector.go b/pkg/integrations/mssql/collector.go index 90f966df5c29..f9c8066d1dad 100644 --- a/pkg/integrations/mssql/collector.go +++ b/pkg/integrations/mssql/collector.go @@ -11,7 +11,7 @@ import ( "github.com/go-kit/log/level" "github.com/prometheus/client_golang/prometheus" dto "github.com/prometheus/client_model/go" - "gopkg.in/yaml.v2" + "gopkg.in/yaml.v3" ) // Embedded config.CollectorConfig as yaml. 
diff --git a/pkg/integrations/mssql/sql_exporter.go b/pkg/integrations/mssql/sql_exporter.go index 083a26ea3eef..fd9af2278be0 100644 --- a/pkg/integrations/mssql/sql_exporter.go +++ b/pkg/integrations/mssql/sql_exporter.go @@ -9,12 +9,14 @@ import ( "github.com/go-kit/log" "github.com/prometheus/client_golang/prometheus" config_util "github.com/prometheus/common/config" + "gopkg.in/yaml.v3" "github.com/burningalchemist/sql_exporter" "github.com/burningalchemist/sql_exporter/config" "github.com/grafana/agent/pkg/integrations" integrations_v2 "github.com/grafana/agent/pkg/integrations/v2" "github.com/grafana/agent/pkg/integrations/v2/metricsutils" + "github.com/grafana/agent/pkg/util" "github.com/prometheus/common/model" ) @@ -31,6 +33,7 @@ type Config struct { MaxIdleConnections int `yaml:"max_idle_connections,omitempty"` MaxOpenConnections int `yaml:"max_open_connections,omitempty"` Timeout time.Duration `yaml:"timeout,omitempty"` + QueryConfig util.RawYAML `yaml:"query_config,omitempty"` } func (c Config) validate() error { @@ -77,7 +80,13 @@ func (c *Config) UnmarshalYAML(unmarshal func(interface{}) error) error { *c = DefaultConfig type plain Config - return unmarshal((*plain)(c)) + err := unmarshal((*plain)(c)) + if err != nil { + return err + } + + var customQueryConfig config.CollectorConfig + return yaml.Unmarshal(c.QueryConfig, &customQueryConfig) } // Name returns the name of the integration this config is for. 
@@ -96,6 +105,15 @@ func (c *Config) NewIntegration(l log.Logger) (integrations.Integration, error) return nil, fmt.Errorf("failed to validate config: %w", err) } + // Initialize collectorConfig from config params if needed + customCollectorConfig, err := createCollectorConfig(c.QueryConfig) + if err != nil { + return nil, fmt.Errorf("failed to create mssql target: %w", err) + } + if customCollectorConfig != nil { + collectorConfig = *customCollectorConfig + } + t, err := sql_exporter.NewTarget( "mssqlintegration", "", @@ -123,3 +141,13 @@ func (c *Config) NewIntegration(l log.Logger) (integrations.Integration, error) integrations.WithCollectors(col), ), nil } + +func createCollectorConfig(queryConfig util.RawYAML) (*config.CollectorConfig, error) { + var customCollectorConfig *config.CollectorConfig + + if err := yaml.Unmarshal(queryConfig, &customCollectorConfig); err != nil { + return nil, fmt.Errorf("query_config not in correct format: %w", err) + } + + return customCollectorConfig, nil +} diff --git a/pkg/integrations/mssql/sql_exporter_test.go b/pkg/integrations/mssql/sql_exporter_test.go index 361722b2c4da..90b4840baae3 100644 --- a/pkg/integrations/mssql/sql_exporter_test.go +++ b/pkg/integrations/mssql/sql_exporter_test.go @@ -11,6 +11,18 @@ import ( ) func TestConfig_validate(t *testing.T) { + strConfig := `--- +collector_name: mssql_standard + +metrics: +- metric_name: mssql_local_time_seconds + type: gauge + help: 'Local time in seconds since epoch (Unix time).' 
+ values: [unix_time] + query: | + SELECT DATEDIFF(second, '19700101', GETUTCDATE()) AS unix_time + ` + testCases := []struct { name string input Config @@ -84,6 +96,16 @@ func TestConfig_validate(t *testing.T) { }, err: "timeout must be positive", }, + { + name: "good query config", + input: Config{ + ConnectionString: "sqlserver://user:pass@localhost:1433", + MaxIdleConnections: 3, + MaxOpenConnections: 3, + Timeout: 10 * time.Second, + QueryConfig: []byte(strConfig), + }, + }, } for _, tc := range testCases { @@ -98,6 +120,7 @@ func TestConfig_validate(t *testing.T) { }) } } + func TestConfig_UnmarshalYaml(t *testing.T) { t.Run("only required values", func(t *testing.T) { strConfig := `connection_string: "sqlserver://user:pass@localhost:1433"` @@ -115,12 +138,27 @@ func TestConfig_UnmarshalYaml(t *testing.T) { }) t.Run("all values", func(t *testing.T) { - strConfig := ` -connection_string: "sqlserver://user:pass@localhost:1433" + strQueryConfig := `collector_name: mssql_standard +metrics: +- metric_name: mssql_local_time_seconds + help: Local time in seconds since epoch (Unix time). + type: gauge + values: + - unix_time + query: SELECT DATEDIFF(second, '19700101', GETUTCDATE()) AS unix_time +` + strConfig := `connection_string: "sqlserver://user:pass@localhost:1433" max_idle_connections: 5 max_open_connections: 6 timeout: 1m -` +query_config: + collector_name: mssql_standard + metrics: + - metric_name: mssql_local_time_seconds + help: 'Local time in seconds since epoch (Unix time).' 
+ type: "gauge" + values: [unix_time] + query: "SELECT DATEDIFF(second, '19700101', GETUTCDATE()) AS unix_time"` var c Config @@ -131,17 +169,29 @@ timeout: 1m MaxIdleConnections: 5, MaxOpenConnections: 6, Timeout: time.Minute, + QueryConfig: []byte(strQueryConfig), }, c) }) } func TestConfig_NewIntegration(t *testing.T) { t.Run("integration with valid config", func(t *testing.T) { + strQueryConfig := `--- +collector_name: mssql_standard + +metrics: +- metric_name: mssql_local_time_seconds + type: gauge + help: 'Local time in seconds since epoch (Unix time).' + values: [unix_time] + query: SELECT DATEDIFF(second, '19700101', GETUTCDATE()) AS unix_time +` c := &Config{ ConnectionString: "sqlserver://user:pass@localhost:1433", MaxIdleConnections: 3, MaxOpenConnections: 3, Timeout: 10 * time.Second, + QueryConfig: []byte(strQueryConfig), } i, err := c.NewIntegration(log.NewJSONLogger(os.Stdout)) @@ -161,6 +211,28 @@ func TestConfig_NewIntegration(t *testing.T) { require.Nil(t, i) require.ErrorContains(t, err, "failed to validate config:") }) + + t.Run("integration with invalid query config", func(t *testing.T) { + strQueryConfig := `collector_name: mssql_standard + +metrics: +- metric_name: mssql_local_time_seconds + help: 'Local time in seconds since epoch (Unix time).' + values: [unix_time] + query: SELECT DATEDIFF(second, '19700101', GETUTCDATE()) AS unix_time +` + c := &Config{ + ConnectionString: "sqlserver://user:pass@localhost:1433", + MaxIdleConnections: 3, + MaxOpenConnections: 3, + Timeout: 10 * time.Second, + QueryConfig: []byte(strQueryConfig), + } + + i, err := c.NewIntegration(log.NewJSONLogger(os.Stdout)) + require.Nil(t, i) + require.ErrorContains(t, err, "failed to create mssql target: query_config not in correct format: ") + }) } func TestConfig_AgentKey(t *testing.T) {