Skip to content

Commit

Permalink
Expose prometheus agent sharding strategy (#1605)
Browse files Browse the repository at this point in the history
* Expose prometheus agent sharding strategy

Signed-off-by: QuentinBisson <[email protected]>

* Update main.go

Co-authored-by: Marie Roque <[email protected]>

* Update service/controller/resource/monitoring/remotewriteconfig/types.go

Co-authored-by: Marie Roque <[email protected]>
Signed-off-by: QuentinBisson <[email protected]>

* fix: add missing or in helm chart

Signed-off-by: QuentinBisson <[email protected]>

---------

Signed-off-by: QuentinBisson <[email protected]>
Co-authored-by: Marie Roque <[email protected]>
  • Loading branch information
QuentinBisson and marieroque authored May 2, 2024
1 parent 797429d commit 200c0e0
Show file tree
Hide file tree
Showing 15 changed files with 114 additions and 56 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Changed

- Expose prometheus agent sharding strategies as prometheus-meta-operator configuration parameters so we can experiment with the scaling strategies.
-
## [4.73.1] - 2024-05-01

### Fixed
Expand Down
5 changes: 5 additions & 0 deletions flag/service/prometheus/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,8 @@ type Prometheus struct {
ScrapeInterval string
Version string
}

type PrometheusAgent struct {
ShardScaleUpSeriesCount string
ShardScaleDownPercentage string
}
23 changes: 12 additions & 11 deletions flag/service/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,16 @@ import (

// Service is an intermediate data structure for command line configuration flags.
type Service struct {
Grafana grafana.Grafana
Ingress ingress.Ingress
Installation installation.Installation
Kubernetes kubernetes.Kubernetes
Opsgenie opsgenie.Opsgenie
Mimir mimir.Mimir
Prometheus prometheus.Prometheus
Provider provider.Provider
Security security.Security
Slack slack.Slack
Vault vault.Vault
Grafana grafana.Grafana
Ingress ingress.Ingress
Installation installation.Installation
Kubernetes kubernetes.Kubernetes
Opsgenie opsgenie.Opsgenie
Mimir mimir.Mimir
Prometheus prometheus.Prometheus
PrometheusAgent prometheus.PrometheusAgent
Provider provider.Provider
Security security.Security
Slack slack.Slack
Vault vault.Vault
}
9 changes: 9 additions & 0 deletions helm/prometheus-meta-operator/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,15 @@ data:
{{- if .Values.prometheus.version }}
version: {{ .Values.prometheus.version }}
{{- end }}
{{- if or (.Values.prometheusAgent).shardScaleUpSeriesCount (.Values.prometheusAgent).shardScaleDownPercentage }}
prometheusAgent:
{{- if .Values.prometheusAgent.shardScaleUpSeriesCount }}
shardScaleUpSeriesCount: {{ .Values.prometheusAgent.shardScaleUpSeriesCount }}
{{- end }}
{{- if .Values.prometheusAgent.shardScaleDownPercentage }}
shardScaleDownPercentage: {{ .Values.prometheusAgent.shardScaleDownPercentage }}
{{- end }}
{{- end }}
slack:
apiToken: {{ .Values.alertmanager.slack.apiToken }}
apiURL: {{ .Values.alertmanager.slack.apiURL }}
Expand Down
3 changes: 3 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,9 @@ func mainE(ctx context.Context) error {
daemonCommand.PersistentFlags().String(f.Service.Prometheus.ImageRepository, "giantswarm/prometheus", "Prometheus container image repository.")
daemonCommand.PersistentFlags().String(f.Service.Prometheus.Version, "v2.47.1", "Prometheus container image version.")

daemonCommand.PersistentFlags().Float64(f.Service.PrometheusAgent.ShardScaleUpSeriesCount, 1_000_000, "Prometheus agent shard scale up series count to know when to add a shard.")
daemonCommand.PersistentFlags().Float64(f.Service.PrometheusAgent.ShardScaleDownPercentage, 0.20, "Prometheus agent shard scale down percentage to know when to scale down the number of shards.")

daemonCommand.PersistentFlags().String(f.Service.Provider.Kind, "", "Provider of the installation. One of aws, azure, kvm.")
daemonCommand.PersistentFlags().String(f.Service.Provider.Flavor, "", "Provider flavor. One of capi or vintage.")

Expand Down
3 changes: 3 additions & 0 deletions service/controller/clusterapi/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (

"github.com/giantswarm/prometheus-meta-operator/v2/pkg/cluster"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/project"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/monitoring/remotewriteconfig"
)

type ControllerConfig struct {
Expand Down Expand Up @@ -49,6 +50,8 @@ type ControllerConfig struct {
PrometheusImageRepository string
PrometheusVersion string

PrometheusAgentShardingStrategy remotewriteconfig.PrometheusAgentShardingStrategy

RestrictedAccessEnabled bool
WhitelistedSubnets string

Expand Down
4 changes: 4 additions & 0 deletions service/controller/clusterapi/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ type Config struct {
PrometheusImageRepository string
PrometheusVersion string

PrometheusAgentShardingStrategy remotewriteconfig.PrometheusAgentShardingStrategy

RestrictedAccessEnabled bool
WhitelistedSubnets string

Expand Down Expand Up @@ -195,6 +197,8 @@ func New(config Config) ([]resource.Interface, error) {
Provider: config.Provider,
Region: config.Region,
Version: config.PrometheusVersion,

PrometheusAgentShardingStrategy: config.PrometheusAgentShardingStrategy,
}

remoteWriteConfigResource, err = remotewriteconfig.New(c)
Expand Down
3 changes: 3 additions & 0 deletions service/controller/managementcluster/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (

"github.com/giantswarm/prometheus-meta-operator/v2/pkg/cluster"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/project"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/monitoring/remotewriteconfig"
"github.com/giantswarm/prometheus-meta-operator/v2/service/key"
)

Expand Down Expand Up @@ -54,6 +55,8 @@ type ControllerConfig struct {
PrometheusImageRepository string
PrometheusVersion string

PrometheusAgentShardingStrategy remotewriteconfig.PrometheusAgentShardingStrategy

RestrictedAccessEnabled bool
WhitelistedSubnets string

Expand Down
4 changes: 4 additions & 0 deletions service/controller/managementcluster/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ type resourcesConfig struct {
PrometheusImageRepository string
PrometheusVersion string

PrometheusAgentShardingStrategy remotewriteconfig.PrometheusAgentShardingStrategy

RestrictedAccessEnabled bool
WhitelistedSubnets string

Expand Down Expand Up @@ -352,6 +354,8 @@ func newResources(config resourcesConfig) ([]resource.Interface, error) {
Provider: config.Provider,
Region: config.Region,
Version: config.PrometheusVersion,

PrometheusAgentShardingStrategy: config.PrometheusAgentShardingStrategy,
}

remoteWriteConfigResource, err = remotewriteconfig.New(c)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ func (r *Resource) EnsureCreated(ctx context.Context, obj interface{}) error {
return microerror.Mask(err)
}

shards, err := r.getShardsCountForCluster(ctx, cluster, currentShards)
shards, err := r.getShardsCountForCluster(cluster, currentShards)
if err != nil {
return microerror.Mask(err)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ type Config struct {
Provider cluster.Provider
Region string
Version string

PrometheusAgentShardingStrategy PrometheusAgentShardingStrategy
}

type Resource struct {
Expand All @@ -48,6 +50,8 @@ type Resource struct {
provider cluster.Provider
region string
version string

prometheusAgentShardingStrategy PrometheusAgentShardingStrategy
}

func New(config Config) (*Resource, error) {
Expand Down Expand Up @@ -87,6 +91,8 @@ func New(config Config) (*Resource, error) {
provider: config.Provider,
region: config.Region,
version: config.Version,

prometheusAgentShardingStrategy: config.PrometheusAgentShardingStrategy,
}

return r, nil
Expand Down Expand Up @@ -150,22 +156,22 @@ func (r *Resource) desiredConfigMap(ctx context.Context, cluster metav1.Object,
}

// We want to compute the number of shards based on the number of nodes.
func (r *Resource) getShardsCountForCluster(ctx context.Context, cluster metav1.Object, currentShardCount int) (int, error) {
func (r *Resource) getShardsCountForCluster(cluster metav1.Object, currentShardCount int) (int, error) {
headSeries, err := prometheusquerier.QueryTSDBHeadSeries(key.ClusterID(cluster))
if err != nil {
// If prometheus is not accessible (for instance, not running because this is a new cluster, we check if prometheus is accessible)
var dnsError *net.DNSError
if errors.As(err, &dnsError) {
return computeShards(currentShardCount, 1), nil
return r.prometheusAgentShardingStrategy.ComputeShards(currentShardCount, 1), nil
}

return 0, microerror.Mask(err)
}
return computeShards(currentShardCount, headSeries), nil
return r.prometheusAgentShardingStrategy.ComputeShards(currentShardCount, headSeries), nil
}

func (r *Resource) createConfigMap(ctx context.Context, cluster metav1.Object, name string, namespace string, version string) error {
shards, err := r.getShardsCountForCluster(ctx, cluster, 1)
shards, err := r.getShardsCountForCluster(cluster, 1)
if err != nil {
return microerror.Mask(err)
}
Expand Down
28 changes: 0 additions & 28 deletions service/controller/resource/monitoring/remotewriteconfig/shards.go

This file was deleted.

30 changes: 30 additions & 0 deletions service/controller/resource/monitoring/remotewriteconfig/types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package remotewriteconfig

import "math"

type PrometheusAgentShardingStrategy struct {
// Configures the number of series needed to add a new shard. Computation is number of series / ShardScaleUpSeriesCount
ShardScaleUpSeriesCount float64
// Percentage of needed series based on ShardScaleUpSeriesCount to scale down agents
ShardScaleDownPercentage float64
}

// We want to start with 1 prometheus-agent for each 1M time series with a scale down 20% threshold.
func (pass PrometheusAgentShardingStrategy) ComputeShards(currentShardCount int, timeSeries float64) int {
shardScaleDownThreshold := pass.ShardScaleDownPercentage * pass.ShardScaleUpSeriesCount
desiredShardCount := int(math.Ceil(timeSeries / pass.ShardScaleUpSeriesCount))

// Compute Scale Down
if currentShardCount > desiredShardCount {
// We get the rest of a division of timeSeries by shardStep and we compare it with the scale down threshold
if math.Mod(timeSeries, pass.ShardScaleUpSeriesCount) > pass.ShardScaleUpSeriesCount-shardScaleDownThreshold {
desiredShardCount = currentShardCount
}
}

// We always have a minimum of 1 agent, even if there is no worker node
if desiredShardCount <= 0 {
return 1
}
return desiredShardCount
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,61 +8,66 @@ import (
var _ = flag.Bool("update", false, "update the output file")

func TestShardComputationScaleUp(t *testing.T) {
pass := PrometheusAgentShardingStrategy{ShardScaleUpSeriesCount: float64(1_000_000), ShardScaleDownPercentage: float64(0.20)}

expected := 1
result := computeShards(0, float64(1_000_000))
result := pass.ComputeShards(0, float64(1_000_000))
if result != expected {
t.Errorf(`expected computeShards(0, 1_000_000) to be %d, got %d`, expected, result)
}

expected = 2
result = computeShards(0, float64(1_000_001))
result = pass.ComputeShards(0, float64(1_000_001))
if result != expected {
t.Errorf(`expected computeShards(0, 1_000_001) to be %d, got %d`, expected, result)
}

expected = 3
result = computeShards(0, float64(2_000_001))
result = pass.ComputeShards(0, float64(2_000_001))
if result != expected {
t.Errorf(`expected computeShards(0, 2_000_001) to be %d, got %d`, expected, result)
}
}

func TestShardComputationReturnsAtLeast1Shart(t *testing.T) {
pass := PrometheusAgentShardingStrategy{ShardScaleUpSeriesCount: float64(1_000_000), ShardScaleDownPercentage: float64(0.20)}

expected := 1
result := computeShards(0, 0)
result := pass.ComputeShards(0, 0)
if result != expected {
t.Errorf(`expected computeShards(0, 0) to be %d, got %d`, expected, result)
}

expected = 1
result = computeShards(0, -5)
result = pass.ComputeShards(0, -5)
if result != expected {
t.Errorf(`expected computeShards(0, -5) to be %d, got %d`, expected, result)
}
}

func TestShardComputationScaleDown(t *testing.T) {
pass := PrometheusAgentShardingStrategy{ShardScaleUpSeriesCount: float64(1_000_000), ShardScaleDownPercentage: float64(0.20)}
expected := 2
result := computeShards(1, 1_000_001)
result := pass.ComputeShards(1, 1_000_001)
if result != expected {
t.Errorf(`expected computeShards(1, 1_000_001) to be %d, got %d`, expected, result)
}

expected = 2
result = computeShards(2, 999_999)
result = pass.ComputeShards(2, 999_999)
if result != expected {
t.Errorf(`expected computeShards(2, 999_999) to be %d, got %d`, expected, result)
}

expected = 2
result = computeShards(2, 800_001)
result = pass.ComputeShards(2, 800_001)
if result != expected {
t.Errorf(`expected computeShards(2, 800_001) to be %d, got %d`, expected, result)
}

// threshold hit
expected = 1
result = computeShards(2, 800_000)
result = pass.ComputeShards(2, 800_000)
if result != expected {
t.Errorf(`expected computeShards(2, 800_000) to be %d, got %d`, expected, result)
}
Expand Down
15 changes: 12 additions & 3 deletions service/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/clusterapi"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/managementcluster"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/remotewrite"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/monitoring/remotewriteconfig"
)

// Config represents the configuration used to create a new service.
Expand Down Expand Up @@ -147,6 +148,10 @@ func New(config Config) (*Service, error) {
Flavor: config.Viper.GetString(config.Flag.Service.Provider.Flavor),
}

var prometheusAgentShardingStrategy = remotewriteconfig.PrometheusAgentShardingStrategy{
ShardScaleUpSeriesCount: config.Viper.GetFloat64(config.Flag.Service.PrometheusAgent.ShardScaleUpSeriesCount),
ShardScaleDownPercentage: config.Viper.GetFloat64(config.Flag.Service.PrometheusAgent.ShardScaleDownPercentage),
}
var proxyConfig = httpproxy.FromEnvironment()
var clusterapiController *clusterapi.Controller
{
Expand Down Expand Up @@ -180,6 +185,8 @@ func New(config Config) (*Service, error) {
PrometheusImageRepository: config.Viper.GetString(config.Flag.Service.Prometheus.ImageRepository),
PrometheusVersion: config.Viper.GetString(config.Flag.Service.Prometheus.Version),

PrometheusAgentShardingStrategy: prometheusAgentShardingStrategy,

RestrictedAccessEnabled: config.Viper.GetBool(config.Flag.Service.Security.RestrictedAccess.Enabled),
WhitelistedSubnets: config.Viper.GetString(config.Flag.Service.Security.RestrictedAccess.Subnets),

Expand Down Expand Up @@ -225,11 +232,13 @@ func New(config Config) (*Service, error) {
PrometheusEvaluationInterval: config.Viper.GetString(config.Flag.Service.Prometheus.EvaluationInterval),
PrometheusLogLevel: config.Viper.GetString(config.Flag.Service.Prometheus.LogLevel),
PrometheusImageRepository: config.Viper.GetString(config.Flag.Service.Prometheus.ImageRepository),
PrometheusScrapeInterval: config.Viper.GetString(config.Flag.Service.Prometheus.ScrapeInterval),
PrometheusVersion: config.Viper.GetString(config.Flag.Service.Prometheus.Version),

RestrictedAccessEnabled: config.Viper.GetBool(config.Flag.Service.Security.RestrictedAccess.Enabled),
PrometheusScrapeInterval: config.Viper.GetString(config.Flag.Service.Prometheus.ScrapeInterval),
WhitelistedSubnets: config.Viper.GetString(config.Flag.Service.Security.RestrictedAccess.Subnets),
PrometheusAgentShardingStrategy: prometheusAgentShardingStrategy,

RestrictedAccessEnabled: config.Viper.GetBool(config.Flag.Service.Security.RestrictedAccess.Enabled),
WhitelistedSubnets: config.Viper.GetString(config.Flag.Service.Security.RestrictedAccess.Subnets),

ExternalDNS: config.Viper.GetBool(config.Flag.Service.Ingress.ExternalDNS.Enabled),

Expand Down

0 comments on commit 200c0e0

Please sign in to comment.