From 5bddc8b01298d23942deaba242a52c292ce880d9 Mon Sep 17 00:00:00 2001 From: Tim Olow Date: Thu, 14 Mar 2024 09:27:22 -0500 Subject: [PATCH 1/7] Update to latest stable ceph release (#149) Bump 18.2.1 to 18.2.2 latest stable ceph release --- kustomize/rook-cluster/rook-cluster.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kustomize/rook-cluster/rook-cluster.yaml b/kustomize/rook-cluster/rook-cluster.yaml index 632d052a..c95de39f 100644 --- a/kustomize/rook-cluster/rook-cluster.yaml +++ b/kustomize/rook-cluster/rook-cluster.yaml @@ -21,7 +21,7 @@ spec: # versions running within the cluster. See tags available at https://hub.docker.com/r/ceph/ceph/tags/. # If you want to be more precise, you can always use a timestamp tag such as quay.io/ceph/ceph:v17.2.6-20231027 # This tag might not contain a new Ceph version, just security fixes from the underlying operating system, which will reduce vulnerabilities - image: quay.io/ceph/ceph:v18.2.1 + image: quay.io/ceph/ceph:v18.2.2 # Whether to allow unsupported versions of Ceph. Currently `quincy` and `reef` are supported. # Future versions such as `squid` (v19) would require this to be set to `true`. # Do not set to true in production. From ca2c9d5cf7d8f496789d0c149f362d12093c9912 Mon Sep 17 00:00:00 2001 From: Don Norton <5455792+donnorton@users.noreply.github.com> Date: Fri, 15 Mar 2024 13:40:59 -0400 Subject: [PATCH 2/7] Update genestack-getting-started.md (#153) Minor grammatical improvements --- docs/genestack-getting-started.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/genestack-getting-started.md b/docs/genestack-getting-started.md index 70f3ea7b..5b2e80fb 100644 --- a/docs/genestack-getting-started.md +++ b/docs/genestack-getting-started.md @@ -2,11 +2,11 @@ # What is Genestack? -Genestack is a complete operations and deployment ecosystem for Kubernetes and OpenStack. The purpose is of +Genestack is a complete operations and deployment ecosystem for Kubernetes and OpenStack. The purpose of this project is to allow hobbyists, operators, and cloud service providers the ability to build, scale, and leverage Open-Infrastructure in new and exciting ways. -Genestack’s inner workings are a blend dark magic — crafted with [Kustomize](https://kustomize.io) and +Genestack’s inner workings are a blend of dark magic — crafted with [Kustomize](https://kustomize.io) and [Helm](https://helm.sh). It’s like cooking with cloud. Want to spice things up? Tweak the `kustomization.yaml` files or add those extra 'toppings' using Helm's style overrides. However, the platform is ready to go with batteries included. @@ -18,7 +18,7 @@ to manage cloud infrastructure in the way you need it. ## Getting Started -Before you can do anything we need to get the code. Because we've sold our soul to the submodule devil, you're going to need to recursively clone the repo into your location. +Before you can do anything, you need to get the code. Because we've sold our soul to the submodule devil, you're going to need to recursively clone the repo into your location. !!! 
info From 984d125a8db5d9d121d1ca19679e9b051bea4fd4 Mon Sep 17 00:00:00 2001 From: "phillip.toohill" Date: Mon, 18 Mar 2024 09:37:52 -0500 Subject: [PATCH 3/7] Monitoring: Adding postgres exporter (#154) --- ...ustomize-prometheus-postgres-exporter.yaml | 33 +++ docs/prometheus-postgres-exporter.md | 18 ++ docs/prometheus-rabbitmq-exporter.md | 3 +- .../postgresql/postgresql-helm-overrides.yaml | 16 +- .../kustomization.yaml | 8 + .../prometheus-postgres-exporter/values.yaml | 259 ++++++++++++++++++ mkdocs.yml | 3 +- 7 files changed, 329 insertions(+), 11 deletions(-) create mode 100644 .github/workflows/kustomize-prometheus-postgres-exporter.yaml create mode 100644 docs/prometheus-postgres-exporter.md create mode 100644 kustomize/prometheus-postgres-exporter/kustomization.yaml create mode 100644 kustomize/prometheus-postgres-exporter/values.yaml diff --git a/.github/workflows/kustomize-prometheus-postgres-exporter.yaml b/.github/workflows/kustomize-prometheus-postgres-exporter.yaml new file mode 100644 index 00000000..bb9d3025 --- /dev/null +++ b/.github/workflows/kustomize-prometheus-postgres-exporter.yaml @@ -0,0 +1,33 @@ +name: Kustomize GitHub Actions for Prometheus PostgresSQL exporter + +on: + pull_request: + paths: + - kustomize/prometheus-postgres-exporter/** + - .github/workflows/kustomize-prometheus-postgres-exporter.yaml +jobs: + kustomize: + name: Kustomize + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - uses: azure/setup-helm@v3 + with: + version: latest + token: "${{ secrets.GITHUB_TOKEN }}" + id: helm + - name: Kustomize Install + working-directory: /usr/local/bin/ + run: | + if [ ! -f /usr/local/bin/kustomize ]; then + curl -s "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" | sudo bash + fi + - name: Run Kustomize Build + run: | + kustomize build kustomize/prometheus-postgres-exporter/ --enable-helm --helm-command ${{ steps.helm.outputs.helm-path }} > /tmp/rendered.yaml + - name: Return Kustomize Build + uses: actions/upload-artifact@v2 + with: + name: kustomize-prometheus-postgres-exporter-artifact + path: /tmp/rendered.yaml diff --git a/docs/prometheus-postgres-exporter.md b/docs/prometheus-postgres-exporter.md new file mode 100644 index 00000000..d765821b --- /dev/null +++ b/docs/prometheus-postgres-exporter.md @@ -0,0 +1,18 @@ +# PostgresSQL Exporter + +PostgresSQL Exporter is used to expose metrics from a running PostgresSQL deployment. + +!!! note + + To deploy metric exporters you will first need to deploy the Prometheus Operator, see: ([Deploy Prometheus](prometheus.md)). + +## Installation + +Install the PostgresSQL Exporter + +``` shell +kubectl kustomize --enable-helm /opt/genestack/kustomize/prometheus-postgres-exporter | kubectl -n openstack apply -f - +``` + +!!! success + If the installation is successful, you should see the exporter pod in the openstack namespace. diff --git a/docs/prometheus-rabbitmq-exporter.md b/docs/prometheus-rabbitmq-exporter.md index bcec5324..1c2479b5 100644 --- a/docs/prometheus-rabbitmq-exporter.md +++ b/docs/prometheus-rabbitmq-exporter.md @@ -11,8 +11,7 @@ RabbitMQ Exporter is used to expose metrics from a running RabbitMQ deployment. 
Install the RabbitMQ Exporter ``` shell -kubectl kustomize --enable-helm /opt/genestack/kustomize/prometheus-rabbitmq-exporter | \ - kubectl -n openstack apply --server-side -f - +kubectl kustomize --enable-helm /opt/genestack/kustomize/prometheus-rabbitmq-exporter | kubectl -n openstack apply --server-side -f - ``` !!! success diff --git a/helm-configs/postgresql/postgresql-helm-overrides.yaml b/helm-configs/postgresql/postgresql-helm-overrides.yaml index 798aad04..679228c1 100644 --- a/helm-configs/postgresql/postgresql-helm-overrides.yaml +++ b/helm-configs/postgresql/postgresql-helm-overrides.yaml @@ -224,9 +224,9 @@ dependencies: monitoring: prometheus: - enabled: true + enabled: false postgresql_exporter: - scrape: true + scrape: false volume: backup: @@ -478,10 +478,10 @@ manifests: pvc_backup: false monitoring: prometheus: - configmap_bin: true - configmap_etc: true - deployment_exporter: true - job_user_create: true - secret_etc: true - service_exporter: true + configmap_bin: false + configmap_etc: false + deployment_exporter: false + job_user_create: false + secret_etc: false + service_exporter: false ... diff --git a/kustomize/prometheus-postgres-exporter/kustomization.yaml b/kustomize/prometheus-postgres-exporter/kustomization.yaml new file mode 100644 index 00000000..4461e2c4 --- /dev/null +++ b/kustomize/prometheus-postgres-exporter/kustomization.yaml @@ -0,0 +1,8 @@ +helmCharts: + - name: prometheus-postgres-exporter + repo: https://prometheus-community.github.io/helm-charts + releaseName: prometheus-postgres-exporter + namespace: openstack + version: 6.0.0 + includeCRDs: true + valuesFile: values.yaml diff --git a/kustomize/prometheus-postgres-exporter/values.yaml b/kustomize/prometheus-postgres-exporter/values.yaml new file mode 100644 index 00000000..01ebe1b9 --- /dev/null +++ b/kustomize/prometheus-postgres-exporter/values.yaml @@ -0,0 +1,259 @@ +replicaCount: 1 + +image: + registry: quay.io + repository: prometheuscommunity/postgres-exporter + # if not set appVersion field from Chart.yaml is used + tag: "" + pullPolicy: IfNotPresent + + ## Optionally specify an array of imagePullSecrets. + ## Secrets must be manually created in the namespace. + ## ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ + ## + # pullSecrets: + # - myRegistrKeySecretName + +command: [] + +service: + type: ClusterIP + port: 9187 + targetPort: 9187 + name: http + labels: {} + annotations: {} + +automountServiceAccountToken: false + +serviceMonitor: + # When set true then use a ServiceMonitor to configure scraping + enabled: true + # Set the namespace the ServiceMonitor should be deployed + namespace: openstack + # Set how frequently Prometheus should scrape + # interval: 30s + # Set path to cloudwatch-exporter telemtery-path + # telemetryPath: /metrics + # Set labels for the ServiceMonitor, use this to define your scrape label for Prometheus Operator + # labels: + # Set timeout for scrape + # timeout: 10s + # Set of labels to transfer from the Kubernetes Service onto the target + # targetLabels: [] + # MetricRelabelConfigs to apply to samples before ingestion + # metricRelabelings: [] + # Set relabel_configs as per https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config + # relabelings: [] + +prometheusRule: + enabled: false + additionalLabels: {} + namespace: "" + rules: [] + ## These are just examples rules, please adapt them to your needs. 
+ ## Make sure to constraint the rules to the current prometheus-postgres-exporter service. + # - alert: HugeReplicationLag + # expr: pg_replication_lag{service="{{ template "prometheus-postgres-exporter.fullname" . }}"} / 3600 > 1 + # for: 1m + # labels: + # severity: critical + # annotations: + # description: replication for {{ template "prometheus-postgres-exporter.fullname" . }} PostgreSQL is lagging by {{ "{{ $value }}" }} hour(s). + # summary: PostgreSQL replication is lagging by {{ "{{ $value }}" }} hour(s). + +priorityClassName: "" + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m +# memory: 128Mi + +rbac: + # Specifies whether RBAC resources should be created + create: true + +serviceAccount: + # Specifies whether a ServiceAccount should be created + create: true + # The name of the ServiceAccount to use. + # If not set and create is true, a name is generated using the fullname template + name: + # Add annotations to the ServiceAccount, useful for EKS IAM Roles for Service Accounts or Google Workload Identity. + annotations: {} + +# Add a default ingress to allow namespace access to service.targetPort +# Helpful if other NetworkPolicies are configured in the namespace +networkPolicy: + # Specifies whether a NetworkPolicy should be created + enabled: false + # Set labels for the NetworkPolicy + labels: {} + +# The securityContext of the pod. +# See https://kubernetes.io/docs/concepts/policy/security-context/ for more. +podSecurityContext: + runAsGroup: 1001 + runAsUser: 1001 + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + +# The securityContext of the container. +# See https://kubernetes.io/docs/concepts/policy/security-context/ for more. +securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + privileged: false + readOnlyRootFilesystem: true + +hostAliases: [] + # Set Host Aliases as per https://kubernetes.io/docs/tasks/network/customize-hosts-file-for-pods/ + # - ip: "127.0.0.1" + # hostnames: + # - "foo.local" +# - "bar.local" + +config: + ## The datasource properties on config are passed through helm tpl function. + ## ref: https://helm.sh/docs/developing_charts/#using-the-tpl-function + datasource: + # Specify one of both datasource or datasourceSecret + host: postgresql + user: postgres + userSecret: {} + # Secret name + # name: + # User key inside secret + # key: + # Only one of password, passwordFile, passwordSecret and pgpassfile can be specified + password: + # Specify passwordFile if DB password is stored in a file. + # For example, to use with vault-injector from Hashicorp + passwordFile: '' + # Specify passwordSecret if DB password is stored in secret. + passwordSecret: + name: postgresql-db-admin + key: password + # Secret name + # name: + # Password key inside secret + # key: + pgpassfile: '' + # If pgpassfile is set, it is used to initialize the PGPASSFILE environment variable. + # See https://www.postgresql.org/docs/14/libpq-pgpass.html for more info. 
+ port: "5432" + database: '' + sslmode: disable + extraParams: '' + datasourceSecret: {} + # Specifies if datasource should be sourced from secret value in format: postgresql://login:password@hostname:port/dbname?sslmode=disable + # Multiple Postgres databases can be configured by comma separated postgres connection strings + # Secret name + # name: + # Connection string key inside secret + # key: + disableCollectorDatabase: false + disableCollectorBgwriter: false + disableDefaultMetrics: false + disableSettingsMetrics: false + + # possible values debug, info, warn, error, fatal + logLevel: "" + # possible values logfmt, json + logFormat: "" + extraArgs: [] + + # postgres_exporter.yml + postgresExporter: "" + # auth_modules: + # first: + # type: userpass + # userpass: + # username: first + # password: firstpass + # options: + # sslmode: disable + +nodeSelector: {} + +tolerations: [] + +affinity: {} + +annotations: { + prometheus.io/scrape: "true", + prometheus.io/path: "/metrics", + prometheus.io/port: "9187", +} + +podLabels: {} + +# Configurable health checks +livenessProbe: + initialDelaySeconds: 0 + timeoutSeconds: 3 + +readinessProbe: + initialDelaySeconds: 0 + timeoutSeconds: 1 + +# Labels and annotations to attach to the deployment resource +deployment: + labels: {} + annotations: { + prometheus.io/scrape: "true", + prometheus.io/path: "/metrics", + prometheus.io/port: "9187", + } + +# ExtraEnvs +extraEnvs: [] + # - name: EXTRA_ENV + # value: value + # - name: POD_NAMESPACE + # valueFrom: + # fieldRef: +# fieldPath: metadata.namespace + +# Init containers, e. g. for secrets creation before the exporter +initContainers: [] + # - name: + # image: + # volumeMounts: + # - name: creds +# mountPath: /creds + +# Additional sidecar containers, e. g. for a database proxy, such as Google's cloudsql-proxy +extraContainers: [] + +# Additional volumes, e. g. for secrets used in an extraContainer +extraVolumes: [] +# Uncomment for mounting custom ca-certificates +# - name: ssl-certs +# secret: +# defaultMode: 420 +# items: +# - key: ca-certificates.crt +# path: ca-certificates.crt +# secretName: ssl-certs + +# Additional volume mounts +extraVolumeMounts: [] +# Uncomment for mounting custom ca-certificates file into container +# - name: ssl-certs +# mountPath: /etc/ssl/certs/ca-certificates.crt +# subPath: ca-certificates.crt + +podDisruptionBudget: + enabled: false + maxUnavailable: 1 diff --git a/mkdocs.yml b/mkdocs.yml index 0c30a84a..d553449c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -189,7 +189,8 @@ nav: - MySQL Exporter: prometheus-mysql-exporter.md - RabbitMQ Exporter: prometheus-rabbitmq-exporter.md - Memcached Exporter: prometheus-memcached-exporter.md - - Postgres Exporter: prometheus-openstack-metrics-exporter.md + - Postgres Exporter: prometheus-postgres-exporter.md + - Openstack Exporter: prometheus-openstack-metrics-exporter.md - Operational Guide: - Running Genestack Upgrade: genestack-upgrade.md - Running Kubespray Upgrade: k8s-kubespray-upgrade.md From 0df2e227097f945e12ef1adbc8956419341c1490 Mon Sep 17 00:00:00 2001 From: Chris Blumentritt Date: Mon, 18 Mar 2024 11:28:59 -0500 Subject: [PATCH 4/7] Update examples with setting to forget unhealthy ingestors (#157) If one of the loki-write pods moves due to a different node the hash ring can become unhealthy. This will cause logs not to be sent to the backend. This will further lead to the other write pods to start filling up the volumes that they use and eventually cause dropped logs. 
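The fix, shown in the example override diffs below, turns on Loki's ingester auto-forget so unhealthy ring members are dropped automatically instead of blocking writes. The override added to each example file boils down to:

```yaml
loki:
  ingester:
    autoforget_unhealthy: true
```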
Example error ``` ubuntu@overseer01:~$ k -n grafana logs daemonset.apps/loki-logs --tail 2 -f Found 37 pods, using pod/loki-logs-m9xvs ts=2024-03-15T14:42:17.533917001Z caller=client.go:419 level=warn component=logs logs_config=grafana/loki component=client host=loki-gateway.grafana.svc.cluster.local msg="error sending batch, will retry" status=500 tenant= error="server returned HTTP status 500 Internal Server Error (500): at least 2 live replicas required, could only find 1 - unhealthy instances: 10.233.82.56:9095,10.233.82.57:9095" ts=2024-03-15T14:44:14.190670342Z caller=client.go:419 level=warn component=logs logs_config=grafana/loki component=client host=loki-gateway.grafana.svc.cluster.local msg="error sending batch, will retry" status=500 tenant= error="server returned HTTP status 500 Internal Server Error (500): at least 2 live replicas required, could only find 1 - unhealthy instances: 10.233.82.56:9095,10.233.82.54:9095" ts=2024-03-15T14:47:22.746099384Z caller=client.go:419 level=warn component=logs logs_config=grafana/loki component=client host=loki-gateway.grafana.svc.cluster.local msg="error sending batch, will retry" status=500 tenant= error="server returned HTTP status 500 Internal Server Error (500): at least 2 live replicas required, could only find 1 - unhealthy instances: 10.233.82.56:9095,10.233.82.57:9095" ts=2024-03-15T14:47:22.746172806Z caller=client.go:430 level=error component=logs logs_config=grafana/loki component=client host=loki-gateway.grafana.svc.cluster.local msg="final error sending batch" status=500 tenant= error="server returned HTTP status 500 Internal Server Error (500): at least 2 live replicas required, could only find 1 - unhealthy instances: 10.233.82.56:9095,10.233.82.57:9095" ts=2024-03-15T14:47:23.806786166Z caller=client.go:419 level=warn component=logs logs_config=grafana/loki component=client host=loki-gateway.grafana.svc.cluster.local msg="error sending batch, will retry" status=500 tenant= error="server returned HTTP status 500 Internal Server Error (500): at least 2 live replicas required, could only find 1 - unhealthy instances: 10.233.82.56:9095,10.233.82.54:9095" ts=2024-03-15T14:47:24.644865006Z caller=client.go:419 level=warn component=logs logs_config=grafana/loki component=client host=loki-gateway.grafana.svc.cluster.local msg="error sending batch, will retry" status=500 tenant= error="server returned HTTP status 500 Internal Server Error (500): at least 2 live replicas required, could only find 1 - unhealthy instances: 10.233.82.56:9095,10.233.82.57:9095" ts=2024-03-15T14:47:25.886090072Z caller=client.go:419 level=warn component=logs logs_config=grafana/loki component=client host=loki-gateway.grafana.svc.cluster.local msg="error sending batch, will retry" status=500 tenant= error="server returned HTTP status 500 Internal Server Error (500): at least 2 live replicas required, could only find 1 - unhealthy instances: 10.233.82.56:9095,10.233.82.54:9095" ts=2024-03-15T14:47:29.833266958Z caller=client.go:419 level=warn component=logs logs_config=grafana/loki component=client host=loki-gateway.grafana.svc.cluster.local msg="error sending batch, will retry" status=500 tenant= error="server returned HTTP status 500 Internal Server Error (500): at least 2 live replicas required, could only find 1 - unhealthy instances: 10.233.82.56:9095,10.233.82.54:9095" ts=2024-03-15T14:47:34.541167878Z caller=client.go:419 level=warn component=logs logs_config=grafana/loki component=client host=loki-gateway.grafana.svc.cluster.local msg="error 
sending batch, will retry" status=500 tenant= error="server returned HTTP status 500 Internal Server Error (500): at least 2 live replicas required, could only find 1 - unhealthy instances: 10.233.82.56:9095,10.233.82.57:9095" ts=2024-03-15T14:47:44.494616126Z caller=client.go:419 level=warn component=logs logs_config=grafana/loki component=client host=loki-gateway.grafana.svc.cluster.local msg="error sending batch, will retry" status=500 tenant= error="server returned HTTP status 500 Internal Server Error (500): at least 2 live replicas required, could only find 1 - unhealthy instances: 10.233.82.56:9095,10.233.82.54:9095" ts=2024-03-15T14:48:08.686557194Z caller=client.go:419 level=warn component=logs logs_config=grafana/loki component=client host=loki-gateway.grafana.svc.cluster.local msg="error sending batch, will retry" status=500 tenant= error="server returned HTTP status 500 Internal Server Error (500): at least 2 live replicas required, could only find 1 - unhealthy instances: 10.233.82.56:9095,10.233.82.54:9095" ``` Signed-off-by: Chris Blumentritt --- helm-configs/loki/loki-helm-minio-overrides-example.yaml | 2 ++ helm-configs/loki/loki-helm-s3-overrides-example.yaml | 2 ++ helm-configs/loki/loki-helm-swift-overrides-example.yaml | 2 ++ 3 files changed, 6 insertions(+) diff --git a/helm-configs/loki/loki-helm-minio-overrides-example.yaml b/helm-configs/loki/loki-helm-minio-overrides-example.yaml index c77ada3a..83d76f72 100644 --- a/helm-configs/loki/loki-helm-minio-overrides-example.yaml +++ b/helm-configs/loki/loki-helm-minio-overrides-example.yaml @@ -5,3 +5,5 @@ minio: loki: auth_enabled: false configStorageType: Secret + ingester: + autoforget_unhealthy: true diff --git a/helm-configs/loki/loki-helm-s3-overrides-example.yaml b/helm-configs/loki/loki-helm-s3-overrides-example.yaml index 09730acf..e9a95086 100644 --- a/helm-configs/loki/loki-helm-s3-overrides-example.yaml +++ b/helm-configs/loki/loki-helm-s3-overrides-example.yaml @@ -5,6 +5,8 @@ minio: loki: auth_enabled: false configStorageType: Secret + ingester: + autoforget_unhealthy: true storage: bucketNames: chunks: < CHUNKS BUCKET NAME > # TODO: Update with relevant bucket name for chunks diff --git a/helm-configs/loki/loki-helm-swift-overrides-example.yaml b/helm-configs/loki/loki-helm-swift-overrides-example.yaml index 5e4155be..a28ae500 100644 --- a/helm-configs/loki/loki-helm-swift-overrides-example.yaml +++ b/helm-configs/loki/loki-helm-swift-overrides-example.yaml @@ -5,6 +5,8 @@ minio: loki: auth_enabled: false configStorageType: Secret + ingester: + autoforget_unhealthy: true storage: bucketNames: chunks: chunks From 7bd34470aefdcb7caff91233b94ea1ad859c4fa5 Mon Sep 17 00:00:00 2001 From: phillip-toohill Date: Mon, 18 Mar 2024 20:07:07 -0500 Subject: [PATCH 5/7] Updating prometheus with persistent volume claims --- kustomize/prometheus/values.yaml | 37 ++++++++++++++------------------ 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/kustomize/prometheus/values.yaml b/kustomize/prometheus/values.yaml index b4ae552f..8f579718 100644 --- a/kustomize/prometheus/values.yaml +++ b/kustomize/prometheus/values.yaml @@ -742,16 +742,14 @@ alertmanager: ## Storage is the definition of how storage will be used by the Alertmanager instances. 
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/user-guides/storage.md ## - storage: {} - # volumeClaimTemplate: - # spec: - # storageClassName: gluster - # accessModes: ["ReadWriteOnce"] - # resources: - # requests: - # storage: 50Gi - # selector: {} - + storage: + volumeClaimTemplate: + spec: + storageClassName: general + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 15Gi ## The external URL the Alertmanager instances will be available under. This is necessary to generate correct URLs. This is necessary if Alertmanager is not served from root of a DNS name. string false ## @@ -3572,17 +3570,14 @@ prometheus: ## Prometheus StorageSpec for persistent data ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/user-guides/storage.md ## - storageSpec: {} - ## Using PersistentVolumeClaim - ## - # volumeClaimTemplate: - # spec: - # storageClassName: gluster - # accessModes: ["ReadWriteOnce"] - # resources: - # requests: - # storage: 50Gi - # selector: {} + storageSpec: + volumeClaimTemplate: + spec: + storageClassName: general + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 15Gi ## Using tmpfs volume ## From 34a3356fe75ff1f4b02b7b7f2637119cf0ff2170 Mon Sep 17 00:00:00 2001 From: Kevin Carter Date: Mon, 18 Mar 2024 22:45:07 -0500 Subject: [PATCH 6/7] Fix: Upgrade the mariadb operator to 0.0.27 Signed-off-by: Kevin Carter --- kustomize/mariadb-cluster/base/mariadb-galera.yaml | 4 ++-- kustomize/mariadb-operator/kustomization.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kustomize/mariadb-cluster/base/mariadb-galera.yaml b/kustomize/mariadb-cluster/base/mariadb-galera.yaml index 55ea560d..6fc77a9e 100644 --- a/kustomize/mariadb-cluster/base/mariadb-galera.yaml +++ b/kustomize/mariadb-cluster/base/mariadb-galera.yaml @@ -54,7 +54,7 @@ spec: galeraLibPath: /usr/lib/galera/libgalera_smm.so replicaThreads: 1 agent: - image: ghcr.io/mariadb-operator/mariadb-operator:v0.0.26 + image: ghcr.io/mariadb-operator/mariadb-operator:v0.0.27 port: 5555 kubernetesAuth: enabled: true @@ -67,7 +67,7 @@ spec: podRecoveryTimeout: 3m podSyncTimeout: 3m initContainer: - image: ghcr.io/mariadb-operator/mariadb-operator:v0.0.26 + image: ghcr.io/mariadb-operator/mariadb-operator:v0.0.27 config: reuseStorageVolume: false volumeClaimTemplate: diff --git a/kustomize/mariadb-operator/kustomization.yaml b/kustomize/mariadb-operator/kustomization.yaml index 737d7986..4f78600f 100644 --- a/kustomize/mariadb-operator/kustomization.yaml +++ b/kustomize/mariadb-operator/kustomization.yaml @@ -14,5 +14,5 @@ helmCharts: metrics: enabled: true includeCRDs: true - version: 0.26.0 + version: 0.27.0 namespace: mariadb-system From 1fc182cdae268ac64a6025ba024d46085717faf3 Mon Sep 17 00:00:00 2001 From: Luke Repko Date: Tue, 19 Mar 2024 08:50:17 -0500 Subject: [PATCH 7/7] feat: introduce ceilometer helm chart overrides (#128) * feat: introduce ceilometer helm chart overrides This begins to add the overrides for the Ceilometer helm chart. Ceilometer provides metering, monitoring, and alarming capabilities in Openstack for billing, performance, optimization, and capacity planning purposes. * fix: ceilometer-api image is deprecated Trying to pull the wallaby image for ceilometer-api results in a 404 not found now. ceilometer-api is in the base image as of commit cd67930 per the upstream kolla repo, so pull that instead. 
* fix: remove ceilometer-collector from config
ceilometer collector was removed from the ceilometer code base [1]
[1] https://review.openstack.org/504244

* fix: ceilometer-api is fully deprecated so rm it
This disables deployment of the api pod and removes related api configuration as ceilometer no longer has a REST API. It is simply a worker service at this point. The Gnocchi API is preferred over ceilometer.

* fix: set database keys to fake values
The database section is not used, but the base chart still tries to set some sane default values, so to avoid confusion, just override those with a string value that makes it obvious this section is not used. The recommended storage location for meters and events is Gnocchi, which is automatically discovered and used by means of keystone.

* fix: set gnocchi as the publisher
This was explicitly set to notify:// without any context as to what that is or does. The configuration does not list that as a valid value, so let's replace the publisher with the default, `gnocchi`.

* fix: disable the ks-endpoint job
There is no endpoint for ceilometer anymore, so remove the related job that creates a service in keystone for one.

* fix: bump ceilometer images to yoga
This was the newest tagged image that I could find for Ceilometer. We will need to investigate building our own Ceilometer images for a later release of Genestack.

* fix: enable db-sync to init gnocchi resource types
The helm chart has a db_sync job that executes ceilometer-upgrade, whose storage upgrade step initializes the resource types in gnocchi with their attributes.

* fix: add updated event definitions from yoga
The event definitions defined in the helm chart were very dated; update them to match those found in the yoga release.

* fix: update gnocchi resources to yoga
The gnocchi resources were outdated. This updates them to match what was released with Yoga.

* fix: update ceilometer meters to yoga
The existing meters were outdated. This brings them up to date with the yoga release.

* fix: simplify pipeline sinks for now
This removes some complexity that the original helm chart introduced, which defined custom meter sinks relating to instance cpu, disk, and net metrics. We may find ourselves disabling pollsters for individual instances, so let's not inundate the pipeline with unnecessary complexity yet. If we find they are useful or needed, we can re-enable them after verifying their proper operation. The polled metrics will still be stored in Gnocchi, just not transformed according to the defined sinks. If re-introduced, these pipeline sinks may need to be further tweaked to work with the updated event definitions.

* fix: enable postgresql backup jobs

* fix: add gnocchi API replicas & enable daemonsets
This should make Gnocchi more reliable and have better overall performance.

* fix: disable resource limits for ceilometer
We don't enforce pod resource limits in other helm charts, so set this to false as the default.

* fix: remove apache2 config for ceilometer
Ceilometer no longer has a REST API, so let's remove this section from the overrides.

* fix: Add default loglevels to aid troubleshooting
When troubleshooting, it helps to raise or lower the default log levels of specific modules; setting requests-related loggers to DEBUG, for example, can help one diagnose ceilometer CRUD operations.
* doc: add openstack ceilometer installation * fix: set postgresql cron backup to 0015 once a day The default was midnight but a lot of jobs run then; kick this off a little later to help avoid the thundering herd affect. --- docs/openstack-ceilometer.md | 80 + .../ceilometer/ceilometer-helm-overrides.yaml | 2182 +++++++++++++++++ .../gnocchi/gnocchi-helm-overrides.yaml | 6 +- .../postgresql/postgresql-helm-overrides.yaml | 12 +- mkdocs.yml | 1 + 5 files changed, 2272 insertions(+), 9 deletions(-) create mode 100644 docs/openstack-ceilometer.md create mode 100644 helm-configs/ceilometer/ceilometer-helm-overrides.yaml diff --git a/docs/openstack-ceilometer.md b/docs/openstack-ceilometer.md new file mode 100644 index 00000000..d43f1879 --- /dev/null +++ b/docs/openstack-ceilometer.md @@ -0,0 +1,80 @@ +# Deploy Ceilometer + +## Create Secrets + +```shell +kubectl --namespace openstack create secret generic ceilometer-keystone-admin-password \ + --type Opaque \ + --from-literal=password="$(< /dev/urandom tr -dc _A-Za-z0-9 | head -c${1:-32};echo;)" +kubectl --namespace openstack create secret generic ceilometer-keystone-test-password \ + --type Opaque \ + --from-literal=password="$(< /dev/urandom tr -dc _A-Za-z0-9 | head -c${1:-32};echo;)" +kubectl --namespace openstack create secret generic ceilometer-rabbitmq-password \ + --type Opaque \ + --from-literal=password="$(< /dev/urandom tr -dc _A-Za-z0-9 | head -c${1:-32};echo;)" +``` + +## Run the package deployment + +```shell +cd /opt/genestack/submodules/openstack-helm +helm upgrade --install ceilometer ./ceilometer \ + --namespace=openstack \ + --wait \ + --timeout 10m \ + -f /opt/genestack/helm-configs/ceilometer/ceilometer-helm-overrides.yaml \ + --set endpoints.identity.auth.admin.password="$(kubectl --namespace openstack get secret keystone-admin -o jsonpath='{.data.password}' | base64 -d)" \ + --set endpoints.identity.auth.ceilometer.password="$(kubectl --namespace openstack get secret ceilometer-keystone-admin-password -o jsonpath='{.data.password}' | base64 -d)" \ + --set endpoints.identity.auth.test.password="$(kubectl --namespace openstack get secret ceilometer-keystone-test-password -o jsonpath='{.data.password}' | base64 -d)" \ + --set endpoints.oslo_messaging.auth.admin.username="$(kubectl --namespace openstack get secret rabbitmq-default-user -o jsonpath='{.data.username}' | base64 -d)" \ + --set endpoints.oslo_messaging.auth.admin.password="$(kubectl --namespace openstack get secret rabbitmq-default-user -o jsonpath='{.data.password}' | base64 -d)" \ + --set endpoints.oslo_messaging.auth.ceilometer.password="$(kubectl --namespace openstack get secret ceilometer-rabbitmq-password -o jsonpath='{.data.password}' | base64 -d)" \ + --set conf.ceilometer.oslo_messaging_notifications.transport_url="\ +rabbit://ceilometer:$(kubectl --namespace openstack get secret ceilometer-rabbitmq-password -o jsonpath='{.data.password}' | base64 -d)@rabbitmq.openstack.svc.cluster.local:5672/ceilometer"\ + --set conf.ceilometer.notification.messaging_urls.values="{\ +rabbit://ceilometer:$(kubectl --namespace openstack get secret ceilometer-rabbitmq-password -o jsonpath='{.data.password}' | base64 -d)@rabbitmq.openstack.svc.cluster.local:5672/ceilometer,\ +rabbit://cinder:$(kubectl --namespace openstack get secret cinder-rabbitmq-password -o jsonpath='{.data.password}' | base64 -d)@rabbitmq.openstack.svc.cluster.local:5672/cinder,\ +rabbit://glance:$(kubectl --namespace openstack get secret glance-rabbitmq-password -o jsonpath='{.data.password}' | 
base64 -d)@rabbitmq.openstack.svc.cluster.local:5672/glance,\ +rabbit://heat:$(kubectl --namespace openstack get secret heat-rabbitmq-password -o jsonpath='{.data.password}' | base64 -d)@rabbitmq.openstack.svc.cluster.local:5672/heat,\ +rabbit://keystone:$(kubectl --namespace openstack get secret keystone-rabbitmq-password -o jsonpath='{.data.password}' | base64 -d)@rabbitmq.openstack.svc.cluster.local:5672/keystone,\ +rabbit://neutron:$(kubectl --namespace openstack get secret neutron-rabbitmq-password -o jsonpath='{.data.password}' | base64 -d)@rabbitmq.openstack.svc.cluster.local:5672/neutron,\ +rabbit://nova:$(kubectl --namespace openstack get secret nova-rabbitmq-password -o jsonpath='{.data.password}' | base64 -d)@rabbitmq.openstack.svc.cluster.local:5672/nova}" +``` + +!!! tip + + In a production like environment you may need to include production specific files like the example variable file found in `helm-configs/prod-example-openstack-overrides.yaml`. + +## Verify Ceilometer Workers + +As there is no Ceilometer API, we will do a quick validation against the +Gnocchi API via a series of `openstack metric` commands to confirm that +Ceilometer workers are ingesting metric and event data then persisting them +storage. + +### Verify metric resource types exist + +The Ceilomter db-sync job will create the various resource types in Gnocchi. +Without them, metrics can't be stored, so let's verify they exist. The +output should include named resource types and some attributes for resources +like `instance`, `instance_disk`, `network`, `volume`, etc. + +```shell +kubectl exec -it openstack-admin-client -n openstack -- openstack metric resource-type list +``` + +### Verify metric resources + +Confirm that resources are populating in Gnocchi + +```shell +kubectl exec -it openstack-admin-client -n openstack -- openstack metric resource list +``` + +### Verify metrics + +Confirm that metrics can be retrieved from Gnocchi + +```shell +kubectl exec -it openstack-admin-client -n openstack -- openstack metric list +``` diff --git a/helm-configs/ceilometer/ceilometer-helm-overrides.yaml b/helm-configs/ceilometer/ceilometer-helm-overrides.yaml new file mode 100644 index 00000000..952324d7 --- /dev/null +++ b/helm-configs/ceilometer/ceilometer-helm-overrides.yaml @@ -0,0 +1,2182 @@ +--- +release_group: null + +labels: + compute: + node_selector_key: openstack-compute-node + node_selector_value: enabled + central: + node_selector_key: openstack-control-plane + node_selector_value: enabled + ipmi: + node_selector_key: openstack-node + node_selector_value: enabled + notification: + node_selector_key: openstack-control-plane + node_selector_value: enabled + job: + node_selector_key: openstack-control-plane + node_selector_value: enabled + test: + node_selector_key: openstack-control-plane + node_selector_value: enabled + +images: + tags: + test: docker.io/xrally/xrally-openstack:2.0.0 + ceilometer_db_sync: docker.io/kolla/ubuntu-source-ceilometer-base:yoga + rabbit_init: docker.io/rabbitmq:3.7-management + ks_user: docker.io/openstackhelm/heat:wallaby-ubuntu_focal + ks_service: docker.io/openstackhelm/heat:wallaby-ubuntu_focal + ceilometer_central: docker.io/kolla/ubuntu-source-ceilometer-central:yoga + ceilometer_compute: docker.io/kolla/ubuntu-source-ceilometer-compute:yoga + ceilometer_ipmi: docker.io/kolla/ubuntu-source-ceilometer-base:yoga + ceilometer_notification: docker.io/kolla/ubuntu-source-ceilometer-notification:yoga + dep_check: quay.io/airshipit/kubernetes-entrypoint:v1.0.0 + 
image_repo_sync: docker.io/docker:17.07.0 + pull_policy: "IfNotPresent" + local_registry: + active: false + exclude: + - dep_check + - image_repo_sync + +ipmi_device: /dev/ipmi0 + +conf: + ceilometer: + DEFAULT: + debug: "false" +# default_log_levels: >- +# amqp=WARN,amqplib=WARN,boto=WARN,qpid=WARN,sqlalchemy=WARN,suds=INFO,oslo.messaging=INFO, +# oslo_messaging=INFO,iso8601=WARN,requests.packages.urllib3.connectionpool=DEBUG, +# urllib3.connectionpool=DEBUG,websocket=WARN,requests.packages.urllib3.util.retry=DEBUG, +# urllib3.util.retry=DEBUG,keystonemiddleware=WARN,routes.middleware=WARN,stevedore=WARN, +# taskflow=WARN,keystoneauth=WARN,oslo.cache=INFO,oslo_policy=INFO,dogpile.core.dogpile=INFO + event_dispatchers: + type: multistring + values: + - gnocchi + meter_dispatchers: + type: multistring + values: + - gnocchi + api: + aodh_is_enabled: "False" + aodh_url: "NotUsed" + dispatcher_gnocchi: + filter_service_activity: False + archive_policy: low + resources_definition_file: /etc/ceilometer/gnocchi_resources.yaml + database: + connection: "NotUsed" + event_connection: "NotUsed" + metering_connection: "NotUsed" + max_retries: -1 + dispatcher: + archive_policy: low + filter_project: service + keystone_authtoken: + auth_type: password + auth_version: v3 + service_credentials: + auth_type: password + interface: internal + notification: + messaging_urls: + type: multistring + values: + - rabbit://rabbitmq:password@rabbitmq.openstack.svc.cluster.local:5672/ceilometer + - rabbit://rabbitmq:password@rabbitmq.openstack.svc.cluster.local:5672/cinder + - rabbit://rabbitmq:password@rabbitmq.openstack.svc.cluster.local:5672/glance + - rabbit://rabbitmq:password@rabbitmq.openstack.svc.cluster.local:5672/nova + - rabbit://rabbitmq:password@rabbitmq.openstack.svc.cluster.local:5672/keystone + - rabbit://rabbitmq:password@rabbitmq.openstack.svc.cluster.local:5672/neutron + - rabbit://rabbitmq:password@rabbitmq.openstack.svc.cluster.local:5672/heat + oslo_messaging_notifications: + driver: messagingv2 + topics: + - notifications + - profiler + oslo_policy: + policy_file: /etc/ceilometer/policy.yaml + cache: + enabled: true + backend: dogpile.cache.memcached + expiration_time: 86400 + event_definitions: + - event_type: 'compute.instance.*' + traits: &instance_traits + tenant_id: + fields: payload.tenant_id + user_id: + fields: payload.user_id + instance_id: + fields: payload.instance_id + display_name: + fields: payload.display_name + resource_id: + fields: payload.instance_id + cell_name: + fields: payload.cell_name + host: + fields: publisher_id.`split(., 1, 1)` + service: + fields: publisher_id.`split(., 0, -1)` + memory_mb: + type: int + fields: payload.memory_mb + disk_gb: + type: int + fields: payload.disk_gb + root_gb: + type: int + fields: payload.root_gb + ephemeral_gb: + type: int + fields: payload.ephemeral_gb + vcpus: + type: int + fields: payload.vcpus + instance_type_id: + fields: payload.instance_type_id + instance_type: + fields: payload.instance_type + state: + fields: payload.state + os_architecture: + fields: payload.image_meta.'org.openstack__1__architecture' + os_version: + fields: payload.image_meta.'org.openstack__1__os_version' + os_distro: + fields: payload.image_meta.'org.openstack__1__os_distro' + launched_at: + type: datetime + fields: payload.launched_at + deleted_at: + type: datetime + fields: payload.deleted_at + - event_type: compute.instance.create.end + traits: + <<: *instance_traits + availability_zone: + fields: payload.availability_zone + - event_type: 
compute.instance.update + traits: + <<: *instance_traits + old_state: + fields: payload.old_state + - event_type: compute.instance.exists + traits: + <<: *instance_traits + audit_period_beginning: + type: datetime + fields: payload.audit_period_beginning + audit_period_ending: + type: datetime + fields: payload.audit_period_ending + - event_type: ['volume.exists', 'volume.retype', 'volume.create.*', 'volume.delete.*', 'volume.resize.*', 'volume.attach.*', 'volume.detach.*', 'volume.update.*', 'snapshot.exists', 'snapshot.create.*', 'snapshot.delete.*', 'snapshot.update.*', 'volume.transfer.accept.end', 'snapshot.transfer.accept.end'] + traits: &cinder_traits + user_id: + fields: payload.user_id + project_id: + fields: payload.tenant_id + availability_zone: + fields: payload.availability_zone + display_name: + fields: payload.display_name + replication_status: + fields: payload.replication_status + status: + fields: payload.status + created_at: + type: datetime + fields: payload.created_at + image_id: + fields: payload.glance_metadata[?key=image_id].value + instance_id: + fields: payload.volume_attachment[0].server_id + - event_type: ['volume.transfer.*', 'volume.exists', 'volume.retype', 'volume.create.*', 'volume.delete.*', 'volume.resize.*', 'volume.attach.*', 'volume.detach.*', 'volume.update.*', 'snapshot.transfer.accept.end'] + traits: + <<: *cinder_traits + resource_id: + fields: payload.volume_id + host: + fields: payload.host + size: + type: int + fields: payload.size + type: + fields: payload.volume_type + replication_status: + fields: payload.replication_status + - event_type: ['snapshot.transfer.accept.end'] + traits: + <<: *cinder_traits + resource_id: + fields: payload.snapshot_id + project_id: + fields: payload.tenant_id + - event_type: ['share.create.*', 'share.delete.*', 'share.extend.*', 'share.shrink.*'] + traits: &share_traits + share_id: + fields: payload.share_id + user_id: + fields: payload.user_id + project_id: + fields: payload.tenant_id + snapshot_id: + fields: payload.snapshot_id + availability_zone: + fields: payload.availability_zone + status: + fields: payload.status + created_at: + type: datetime + fields: payload.created_at + share_group_id: + fields: payload.share_group_id + size: + type: int + fields: payload.size + name: + fields: payload.name + proto: + fields: payload.proto + is_public: + fields: payload.is_public + description: + fields: payload.description + host: + fields: payload.host + - event_type: ['snapshot.exists', 'snapshot.create.*', 'snapshot.delete.*', 'snapshot.update.*'] + traits: + <<: *cinder_traits + resource_id: + fields: payload.snapshot_id + volume_id: + fields: payload.volume_id + - event_type: ['image_volume_cache.*'] + traits: + image_id: + fields: payload.image_id + host: + fields: payload.host + - event_type: ['image.create', 'image.update', 'image.upload', 'image.delete'] + traits: &glance_crud + project_id: + fields: payload.owner + resource_id: + fields: payload.id + name: + fields: payload.name + status: + fields: payload.status + created_at: + type: datetime + fields: payload.created_at + user_id: + fields: payload.owner + deleted_at: + type: datetime + fields: payload.deleted_at + size: + type: int + fields: payload.size + - event_type: image.send + traits: &glance_send + receiver_project: + fields: payload.receiver_tenant_id + receiver_user: + fields: payload.receiver_user_id + user_id: + fields: payload.owner_id + image_id: + fields: payload.image_id + destination_ip: + fields: payload.destination_ip + bytes_sent: + 
type: int + fields: payload.bytes_sent + - event_type: orchestration.stack.* + traits: &orchestration_crud + project_id: + fields: payload.tenant_id + user_id: + fields: ['ctxt.trustor_user_id', 'ctxt.user_id'] + resource_id: + fields: payload.stack_identity + name: + fields: payload.name + - event_type: sahara.cluster.* + traits: &sahara_crud + project_id: + fields: payload.project_id + user_id: + fields: ctxt.user_id + resource_id: + fields: payload.cluster_id + name: + fields: payload.name + - event_type: sahara.cluster.health + traits: &sahara_health + <<: *sahara_crud + verification_id: + fields: payload.verification_id + health_check_status: + fields: payload.health_check_status + health_check_name: + fields: payload.health_check_name + health_check_description: + fields: payload.health_check_description + created_at: + type: datetime + fields: payload.created_at + updated_at: + type: datetime + fields: payload.updated_at + - event_type: ['identity.user.*', 'identity.project.*', 'identity.group.*', 'identity.role.*', 'identity.OS-TRUST:trust.*', + 'identity.region.*', 'identity.service.*', 'identity.endpoint.*', 'identity.policy.*'] + traits: &identity_crud + resource_id: + fields: payload.resource_info + initiator_id: + fields: payload.initiator.id + project_id: + fields: payload.initiator.project_id + domain_id: + fields: payload.initiator.domain_id + - event_type: identity.role_assignment.* + traits: &identity_role_assignment + role: + fields: payload.role + group: + fields: payload.group + domain: + fields: payload.domain + user: + fields: payload.user + project: + fields: payload.project + - event_type: identity.authenticate + traits: &identity_authenticate + typeURI: + fields: payload.typeURI + id: + fields: payload.id + action: + fields: payload.action + eventType: + fields: payload.eventType + eventTime: + type: datetime + fields: payload.eventTime + outcome: + fields: payload.outcome + initiator_typeURI: + fields: payload.initiator.typeURI + initiator_id: + fields: payload.initiator.id + initiator_name: + fields: payload.initiator.name + initiator_host_agent: + fields: payload.initiator.host.agent + initiator_host_addr: + fields: payload.initiator.host.address + target_typeURI: + fields: payload.target.typeURI + target_id: + fields: payload.target.id + observer_typeURI: + fields: payload.observer.typeURI + observer_id: + fields: payload.observer.id + - event_type: objectstore.http.request + traits: &objectstore_request + typeURI: + fields: payload.typeURI + id: + fields: payload.id + action: + fields: payload.action + eventType: + fields: payload.eventType + eventTime: + type: datetime + fields: payload.eventTime + outcome: + fields: payload.outcome + initiator_typeURI: + fields: payload.initiator.typeURI + initiator_id: + fields: payload.initiator.id + initiator_project_id: + fields: payload.initiator.project_id + target_typeURI: + fields: payload.target.typeURI + target_id: + fields: payload.target.id + target_action: + fields: payload.target.action + target_metadata_path: + fields: payload.target.metadata.path + target_metadata_version: + fields: payload.target.metadata.version + target_metadata_container: + fields: payload.target.metadata.container + target_metadata_object: + fields: payload.target.metadata.object + observer_id: + fields: payload.observer.id + - event_type: ['network.*', 'subnet.*', 'port.*', 'router.*', 'floatingip.*', 'pool.*', 'vip.*', 'member.*', 'health_monitor.*', 'healthmonitor.*', 'listener.*', 'loadbalancer.*', 'firewall.*', 'firewall_policy.*', 
'firewall_rule.*', 'vpnservice.*', 'ipsecpolicy.*', 'ikepolicy.*', 'ipsec_site_connection.*'] + traits: &network_traits + user_id: + fields: ctxt.user_id + project_id: + fields: ctxt.tenant_id + - event_type: network.* + traits: + <<: *network_traits + name: + fields: payload.network.name + resource_id: + fields: ['payload.network.id', 'payload.id'] + - event_type: subnet.* + traits: + <<: *network_traits + name: + fields: payload.subnet.name + resource_id: + fields: ['payload.subnet.id', 'payload.id'] + - event_type: port.* + traits: + <<: *network_traits + name: + fields: payload.port.name + resource_id: + fields: ['payload.port.id', 'payload.id'] + - event_type: router.* + traits: + <<: *network_traits + name: + fields: payload.router.name + resource_id: + fields: ['payload.router.id', 'payload.id'] + - event_type: floatingip.* + traits: + <<: *network_traits + resource_id: + fields: ['payload.floatingip.id', 'payload.id'] + - event_type: pool.* + traits: + <<: *network_traits + name: + fields: payload.pool.name + resource_id: + fields: ['payload.pool.id', 'payload.id'] + - event_type: vip.* + traits: + <<: *network_traits + resource_id: + fields: ['payload.vip.id', 'payload.id'] + - event_type: member.* + traits: + <<: *network_traits + resource_id: + fields: ['payload.member.id', 'payload.id'] + - event_type: health_monitor.* + traits: + <<: *network_traits + name: + fields: payload.health_monitor.name + resource_id: + fields: ['payload.health_monitor.id', 'payload.id'] + - event_type: healthmonitor.* + traits: + <<: *network_traits + name: + fields: payload.healthmonitor.name + resource_id: + fields: ['payload.healthmonitor.id', 'payload.id'] + - event_type: listener.* + traits: + <<: *network_traits + name: + fields: payload.listener.name + resource_id: + fields: ['payload.listener.id', 'payload.id'] + - event_type: loadbalancer.* + traits: + <<: *network_traits + name: + fields: payload.loadbalancer.name + resource_id: + fields: ['payload.loadbalancer.id', 'payload.id'] + - event_type: firewall.* + traits: + <<: *network_traits + name: + fields: payload.firewall.name + resource_id: + fields: ['payload.firewall.id', 'payload.id'] + - event_type: firewall_policy.* + traits: + <<: *network_traits + name: + fields: payload.firewall_policy.name + resource_id: + fields: ['payload.firewall_policy.id', 'payload.id'] + - event_type: firewall_rule.* + traits: + <<: *network_traits + name: + fields: payload.firewall_rule.name + resource_id: + fields: ['payload.firewall_rule.id', 'payload.id'] + - event_type: vpnservice.* + traits: + <<: *network_traits + name: + fields: payload.vpnservice.name + resource_id: + fields: ['payload.vpnservice.id', 'payload.id'] + - event_type: ipsecpolicy.* + traits: + <<: *network_traits + name: + fields: payload.ipsecpolicy.name + resource_id: + fields: ['payload.ipsecpolicy.id', 'payload.id'] + - event_type: ikepolicy.* + traits: + <<: *network_traits + name: + fields: payload.ikepolicy.name + resource_id: + fields: ['payload.ikepolicy.id', 'payload.id'] + - event_type: ipsec_site_connection.* + traits: + <<: *network_traits + resource_id: + fields: ['payload.ipsec_site_connection.id', 'payload.id'] + - event_type: '*http.*' + traits: &http_audit + project_id: + fields: payload.initiator.project_id + user_id: + fields: payload.initiator.id + typeURI: + fields: payload.typeURI + eventType: + fields: payload.eventType + action: + fields: payload.action + outcome: + fields: payload.outcome + id: + fields: payload.id + eventTime: + type: datetime + fields: 
payload.eventTime + requestPath: + fields: payload.requestPath + observer_id: + fields: payload.observer.id + target_id: + fields: payload.target.id + target_typeURI: + fields: payload.target.typeURI + target_name: + fields: payload.target.name + initiator_typeURI: + fields: payload.initiator.typeURI + initiator_id: + fields: payload.initiator.id + initiator_name: + fields: payload.initiator.name + initiator_host_address: + fields: payload.initiator.host.address + - event_type: '*http.response' + traits: + <<: *http_audit + reason_code: + fields: payload.reason.reasonCode + - event_type: ['dns.domain.create', 'dns.domain.update', 'dns.domain.delete'] + traits: &dns_domain_traits + status: + fields: payload.status + retry: + fields: payload.retry + description: + fields: payload.description + expire: + fields: payload.expire + email: + fields: payload.email + ttl: + fields: payload.ttl + action: + fields: payload.action + name: + fields: payload.name + resource_id: + fields: payload.id + created_at: + type: datetime + fields: payload.created_at + updated_at: + type: datetime + fields: payload.updated_at + version: + fields: payload.version + parent_domain_id: + fields: parent_domain_id + serial: + fields: payload.serial + - event_type: dns.domain.exists + traits: + <<: *dns_domain_traits + audit_period_beginning: + type: datetime + fields: payload.audit_period_beginning + audit_period_ending: + type: datetime + fields: payload.audit_period_ending + - event_type: trove.* + traits: &trove_base_traits + instance_type: + fields: payload.instance_type + user_id: + fields: payload.user_id + resource_id: + fields: payload.instance_id + instance_type_id: + fields: payload.instance_type_id + launched_at: + type: datetime + fields: payload.launched_at + instance_name: + fields: payload.instance_name + state: + fields: payload.state + nova_instance_id: + fields: payload.nova_instance_id + service_id: + fields: payload.service_id + created_at: + type: datetime + fields: payload.created_at + region: + fields: payload.region + - event_type: ['trove.instance.create', 'trove.instance.modify_volume', 'trove.instance.modify_flavor', 'trove.instance.delete'] + traits: &trove_common_traits + name: + fields: payload.name + availability_zone: + fields: payload.availability_zone + instance_size: + type: int + fields: payload.instance_size + volume_size: + type: int + fields: payload.volume_size + nova_volume_id: + fields: payload.nova_volume_id + - event_type: trove.instance.create + traits: + <<: [*trove_base_traits, *trove_common_traits] + - event_type: trove.instance.modify_volume + traits: + <<: [*trove_base_traits, *trove_common_traits] + old_volume_size: + type: int + fields: payload.old_volume_size + modify_at: + type: datetime + fields: payload.modify_at + - event_type: trove.instance.modify_flavor + traits: + <<: [*trove_base_traits, *trove_common_traits] + old_instance_size: + type: int + fields: payload.old_instance_size + modify_at: + type: datetime + fields: payload.modify_at + - event_type: trove.instance.delete + traits: + <<: [*trove_base_traits, *trove_common_traits] + deleted_at: + type: datetime + fields: payload.deleted_at + - event_type: trove.instance.exists + traits: + <<: *trove_base_traits + display_name: + fields: payload.display_name + audit_period_beginning: + type: datetime + fields: payload.audit_period_beginning + audit_period_ending: + type: datetime + fields: payload.audit_period_ending + - event_type: profiler.* + traits: + project: + fields: payload.project + service: + fields: 
payload.service + name: + fields: payload.name + base_id: + fields: payload.base_id + trace_id: + fields: payload.trace_id + parent_id: + fields: payload.parent_id + timestamp: + type: datetime + fields: payload.timestamp + host: + fields: payload.info.host + path: + fields: payload.info.request.path + query: + fields: payload.info.request.query + method: + fields: payload.info.request.method + scheme: + fields: payload.info.request.scheme + db.statement: + fields: payload.info.db.statement + db.params: + fields: payload.info.db.params + - event_type: 'magnum.cluster.*' + traits: &magnum_cluster_crud + id: + fields: payload.id + typeURI: + fields: payload.typeURI + eventType: + fields: payload.eventType + eventTime: + type: datetime + fields: payload.eventTime + action: + fields: payload.action + outcome: + fields: payload.outcome + initiator_id: + fields: payload.initiator.id + initiator_typeURI: + fields: payload.initiator.typeURI + initiator_name: + fields: payload.initiator.name + initiator_host_agent: + fields: payload.initiator.host.agent + initiator_host_address: + fields: payload.initiator.host.address + target_id: + fields: payload.target.id + target_typeURI: + fields: payload.target.typeURI + observer_id: + fields: payload.observer.id + observer_typeURI: + fields: payload.observer.typeURI + - event_type: 'alarm.*' + traits: + id: + fields: payload.alarm_id + user_id: + fields: payload.user_id + project_id: + fields: payload.project_id + on_behalf_of: + fields: payload.on_behalf_of + severity: + fields: payload.severity + detail: + fields: payload.detail + type: + fields: payload.type + + gnocchi_resources: + archive_policy_default: ceilometer-low + archive_policies: + # NOTE(sileht): We keep "mean" for now to not break all gating that + # use the current tempest scenario. 
+ - name: ceilometer-low + aggregation_methods: + - mean + back_window: 0 + definition: + - granularity: 5 minutes + timespan: 30 days + - name: ceilometer-low-rate + aggregation_methods: + - mean + - rate:mean + back_window: 0 + definition: + - granularity: 5 minutes + timespan: 30 days + - name: ceilometer-high + aggregation_methods: + - mean + back_window: 0 + definition: + - granularity: 1 second + timespan: 1 hour + - granularity: 1 minute + timespan: 1 day + - granularity: 1 hour + timespan: 365 days + - name: ceilometer-high-rate + aggregation_methods: + - mean + - rate:mean + back_window: 0 + definition: + - granularity: 1 second + timespan: 1 hour + - granularity: 1 minute + timespan: 1 day + - granularity: 1 hour + timespan: 365 days + + resources: + - resource_type: identity + metrics: + identity.authenticate.success: + identity.authenticate.pending: + identity.authenticate.failure: + identity.user.created: + identity.user.deleted: + identity.user.updated: + identity.group.created: + identity.group.deleted: + identity.group.updated: + identity.role.created: + identity.role.deleted: + identity.role.updated: + identity.project.created: + identity.project.deleted: + identity.project.updated: + identity.trust.created: + identity.trust.deleted: + identity.role_assignment.created: + identity.role_assignment.deleted: + + - resource_type: ceph_account + metrics: + radosgw.objects: + radosgw.objects.size: + radosgw.objects.containers: + radosgw.api.request: + radosgw.containers.objects: + radosgw.containers.objects.size: + + - resource_type: instance + metrics: + memory: + memory.usage: + memory.resident: + memory.swap.in: + memory.swap.out: + memory.bandwidth.total: + memory.bandwidth.local: + vcpus: + cpu: + archive_policy_name: ceilometer-low-rate + cpu_l3_cache: + disk.root.size: + disk.ephemeral.size: + disk.latency: + disk.iops: + disk.capacity: + disk.allocation: + disk.usage: + compute.instance.booting.time: + perf.cpu.cycles: + perf.instructions: + perf.cache.references: + perf.cache.misses: + attributes: + host: resource_metadata.(instance_host|host) + image_ref: resource_metadata.image_ref + launched_at: resource_metadata.launched_at + created_at: resource_metadata.created_at + deleted_at: resource_metadata.deleted_at + display_name: resource_metadata.display_name + flavor_id: resource_metadata.(instance_flavor_id|(flavor.id)|flavor_id) + flavor_name: resource_metadata.(instance_type|(flavor.name)|flavor_name) + server_group: resource_metadata.user_metadata.server_group + event_delete: compute.instance.delete.start + event_create: compute.instance.create.end + event_attributes: + id: instance_id + display_name: display_name + host: host + availability_zone: availability_zone + flavor_id: instance_type_id + flavor_name: instance_type + user_id: user_id + project_id: project_id + event_associated_resources: + instance_network_interface: '{"=": {"instance_id": "%s"}}' + instance_disk: '{"=": {"instance_id": "%s"}}' + + - resource_type: instance_network_interface + metrics: + network.outgoing.packets: + archive_policy_name: ceilometer-low-rate + network.incoming.packets: + archive_policy_name: ceilometer-low-rate + network.outgoing.packets.drop: + archive_policy_name: ceilometer-low-rate + network.incoming.packets.drop: + archive_policy_name: ceilometer-low-rate + network.outgoing.packets.error: + archive_policy_name: ceilometer-low-rate + network.incoming.packets.error: + archive_policy_name: ceilometer-low-rate + network.outgoing.bytes: + archive_policy_name: ceilometer-low-rate 
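+          # NOTE: these interface counters are cumulative, which is why they are
+          # pinned to the *-rate policy; the rate:mean aggregate is what lets
+          # Gnocchi answer "bytes/packets per interval" queries directly instead
+          # of requiring consumers to diff raw counter values themselves.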
+ network.incoming.bytes: + archive_policy_name: ceilometer-low-rate + attributes: + name: resource_metadata.vnic_name + instance_id: resource_metadata.instance_id + + - resource_type: instance_disk + metrics: + disk.device.read.requests: + archive_policy_name: ceilometer-low-rate + disk.device.write.requests: + archive_policy_name: ceilometer-low-rate + disk.device.read.bytes: + archive_policy_name: ceilometer-low-rate + disk.device.write.bytes: + archive_policy_name: ceilometer-low-rate + disk.device.latency: + disk.device.read.latency: + disk.device.write.latency: + disk.device.iops: + disk.device.capacity: + disk.device.allocation: + disk.device.usage: + attributes: + name: resource_metadata.disk_name + instance_id: resource_metadata.instance_id + + - resource_type: image + metrics: + image.size: + image.download: + image.serve: + attributes: + name: resource_metadata.name + container_format: resource_metadata.container_format + disk_format: resource_metadata.disk_format + event_delete: image.delete + event_attributes: + id: resource_id + + - resource_type: ipmi + metrics: + hardware.ipmi.node.power: + hardware.ipmi.node.temperature: + hardware.ipmi.node.inlet_temperature: + hardware.ipmi.node.outlet_temperature: + hardware.ipmi.node.fan: + hardware.ipmi.node.current: + hardware.ipmi.node.voltage: + hardware.ipmi.node.airflow: + hardware.ipmi.node.cups: + hardware.ipmi.node.cpu_util: + hardware.ipmi.node.mem_util: + hardware.ipmi.node.io_util: + + - resource_type: ipmi_sensor + metrics: + - 'hardware.ipmi.power' + - 'hardware.ipmi.temperature' + - 'hardware.ipmi.current' + - 'hardware.ipmi.voltage' + attributes: + node: resource_metadata.node + + - resource_type: network + metrics: + bandwidth: + ip.floating: + event_delete: floatingip.delete.end + event_attributes: + id: resource_id + + - resource_type: stack + metrics: + stack.create: + stack.update: + stack.delete: + stack.resume: + stack.suspend: + + - resource_type: swift_account + metrics: + storage.objects.incoming.bytes: + storage.objects.outgoing.bytes: + storage.objects.size: + storage.objects: + storage.objects.containers: + storage.containers.objects: + storage.containers.objects.size: + + - resource_type: volume + metrics: + volume: + volume.size: + snapshot.size: + volume.snapshot.size: + volume.backup.size: + backup.size: + volume.manage_existing.start: + volume.manage_existing.end: + volume.manage_existing_snapshot.start: + volume.manage_existing_snapshot.end: + attributes: + display_name: resource_metadata.(display_name|name) + volume_type: resource_metadata.volume_type + image_id: resource_metadata.image_id + instance_id: resource_metadata.instance_id + event_delete: + - volume.delete.end + - snapshot.delete.end + event_update: + - volume.transfer.accept.end + - snapshot.transfer.accept.end + event_attributes: + id: resource_id + project_id: project_id + + - resource_type: volume_provider + metrics: + volume.provider.capacity.total: + volume.provider.capacity.free: + volume.provider.capacity.allocated: + volume.provider.capacity.provisioned: + volume.provider.capacity.virtual_free: + + - resource_type: volume_provider_pool + metrics: + volume.provider.pool.capacity.total: + volume.provider.pool.capacity.free: + volume.provider.pool.capacity.allocated: + volume.provider.pool.capacity.provisioned: + volume.provider.pool.capacity.virtual_free: + attributes: + provider: resource_metadata.provider + + - resource_type: host + metrics: + hardware.cpu.load.1min: + hardware.cpu.load.5min: + hardware.cpu.load.15min: + 
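+          # NOTE: the hardware.* meters on the host, host_disk and
+          # host_network_interface types come from the SNMP-based hardware
+          # pollsters; they only appear when those pollsters are enabled and
+          # given nodes to poll, otherwise these resource types stay empty.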
hardware.cpu.util: + hardware.cpu.user: + archive_policy_name: ceilometer-low-rate + hardware.cpu.nice: + archive_policy_name: ceilometer-low-rate + hardware.cpu.system: + archive_policy_name: ceilometer-low-rate + hardware.cpu.idle: + archive_policy_name: ceilometer-low-rate + hardware.cpu.wait: + archive_policy_name: ceilometer-low-rate + hardware.cpu.kernel: + archive_policy_name: ceilometer-low-rate + hardware.cpu.interrupt: + archive_policy_name: ceilometer-low-rate + hardware.memory.total: + hardware.memory.used: + hardware.memory.swap.total: + hardware.memory.swap.avail: + hardware.memory.buffer: + hardware.memory.cached: + hardware.network.ip.outgoing.datagrams: + hardware.network.ip.incoming.datagrams: + hardware.system_stats.cpu.idle: + hardware.system_stats.io.outgoing.blocks: + hardware.system_stats.io.incoming.blocks: + attributes: + host_name: resource_metadata.resource_url + + - resource_type: host_disk + metrics: + hardware.disk.size.total: + hardware.disk.size.used: + hardware.disk.read.bytes: + hardware.disk.write.bytes: + hardware.disk.read.requests: + hardware.disk.write.requests: + attributes: + host_name: resource_metadata.resource_url + device_name: resource_metadata.device + + - resource_type: host_network_interface + metrics: + hardware.network.incoming.bytes: + hardware.network.outgoing.bytes: + hardware.network.outgoing.errors: + attributes: + host_name: resource_metadata.resource_url + device_name: resource_metadata.name + + - resource_type: nova_compute + metrics: + compute.node.cpu.frequency: + compute.node.cpu.idle.percent: + compute.node.cpu.idle.time: + compute.node.cpu.iowait.percent: + compute.node.cpu.iowait.time: + compute.node.cpu.kernel.percent: + compute.node.cpu.kernel.time: + compute.node.cpu.percent: + compute.node.cpu.user.percent: + compute.node.cpu.user.time: + attributes: + host_name: resource_metadata.host + + - resource_type: manila_share + metrics: + manila.share.size: + attributes: + name: resource_metadata.name + host: resource_metadata.host + status: resource_metadata.status + availability_zone: resource_metadata.availability_zone + protocol: resource_metadata.protocol + + - resource_type: switch + metrics: + switch: + switch.ports: + attributes: + controller: resource_metadata.controller + + - resource_type: switch_port + metrics: + switch.port: + switch.port.uptime: + switch.port.receive.packets: + switch.port.transmit.packets: + switch.port.receive.bytes: + switch.port.transmit.bytes: + switch.port.receive.drops: + switch.port.transmit.drops: + switch.port.receive.errors: + switch.port.transmit.errors: + switch.port.receive.frame_error: + switch.port.receive.overrun_error: + switch.port.receive.crc_error: + switch.port.collision.count: + attributes: + switch: resource_metadata.switch + port_number_on_switch: resource_metadata.port_number_on_switch + neutron_port_id: resource_metadata.neutron_port_id + controller: resource_metadata.controller + + - resource_type: port + metrics: + port: + port.uptime: + port.receive.packets: + port.transmit.packets: + port.receive.bytes: + port.transmit.bytes: + port.receive.drops: + port.receive.errors: + attributes: + controller: resource_metadata.controller + + - resource_type: switch_table + metrics: + switch.table.active.entries: + attributes: + controller: resource_metadata.controller + switch: resource_metadata.switch + + - resource_type: loadbalancer + metrics: + network.services.lb.outgoing.bytes: + network.services.lb.incoming.bytes: + network.services.lb.pool: + network.services.lb.listener: 
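+          # NOTE: the network.services.lb.* meters date from the neutron-lbaas
+          # pollsters; in an Octavia-based deployment they may never be
+          # populated unless a pollster that understands Octavia is enabled,
+          # so treat this resource type as optional.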
+ network.services.lb.member: + network.services.lb.health_monitor: + network.services.lb.loadbalancer: + network.services.lb.total.connections: + network.services.lb.active.connections: + meters: + metric: + # Image + - name: "image.size" + event_type: + - "image.upload" + - "image.delete" + - "image.update" + type: "gauge" + unit: B + volume: $.payload.size + resource_id: $.payload.id + project_id: $.payload.owner + + - name: "image.download" + event_type: "image.send" + type: "delta" + unit: "B" + volume: $.payload.bytes_sent + resource_id: $.payload.image_id + user_id: $.payload.receiver_user_id + project_id: $.payload.receiver_tenant_id + + - name: "image.serve" + event_type: "image.send" + type: "delta" + unit: "B" + volume: $.payload.bytes_sent + resource_id: $.payload.image_id + project_id: $.payload.owner_id + + - name: 'volume.provider.capacity.total' + event_type: 'capacity.backend.*' + type: 'gauge' + unit: 'GB' + volume: $.payload.total + resource_id: $.payload.name_to_id + + - name: 'volume.provider.capacity.free' + event_type: 'capacity.backend.*' + type: 'gauge' + unit: 'GB' + volume: $.payload.free + resource_id: $.payload.name_to_id + + - name: 'volume.provider.capacity.allocated' + event_type: 'capacity.backend.*' + type: 'gauge' + unit: 'GB' + volume: $.payload.allocated + resource_id: $.payload.name_to_id + + - name: 'volume.provider.capacity.provisioned' + event_type: 'capacity.backend.*' + type: 'gauge' + unit: 'GB' + volume: $.payload.provisioned + resource_id: $.payload.name_to_id + + - name: 'volume.provider.capacity.virtual_free' + event_type: 'capacity.backend.*' + type: 'gauge' + unit: 'GB' + volume: $.payload.virtual_free + resource_id: $.payload.name_to_id + + - name: 'volume.provider.pool.capacity.total' + event_type: 'capacity.pool.*' + type: 'gauge' + unit: 'GB' + volume: $.payload.total + resource_id: $.payload.name_to_id + metadata: &provider_pool_meta + provider: $.payload.name_to_id.`split(#, 0, 1)` + + - name: 'volume.provider.pool.capacity.free' + event_type: 'capacity.pool.*' + type: 'gauge' + unit: 'GB' + volume: $.payload.free + resource_id: $.payload.name_to_id + metadata: + <<: *provider_pool_meta + + - name: 'volume.provider.pool.capacity.allocated' + event_type: 'capacity.pool.*' + type: 'gauge' + unit: 'GB' + volume: $.payload.allocated + resource_id: $.payload.name_to_id + metadata: + <<: *provider_pool_meta + + - name: 'volume.provider.pool.capacity.provisioned' + event_type: 'capacity.pool.*' + type: 'gauge' + unit: 'GB' + volume: $.payload.provisioned + resource_id: $.payload.name_to_id + metadata: + <<: *provider_pool_meta + + - name: 'volume.provider.pool.capacity.virtual_free' + event_type: 'capacity.pool.*' + type: 'gauge' + unit: 'GB' + volume: $.payload.virtual_free + resource_id: $.payload.name_to_id + metadata: + <<: *provider_pool_meta + + - name: 'volume.size' + event_type: + - 'volume.exists' + - 'volume.retype' + - 'volume.create.*' + - 'volume.delete.*' + - 'volume.resize.*' + - 'volume.attach.*' + - 'volume.detach.*' + - 'volume.update.*' + - 'volume.manage.*' + type: 'gauge' + unit: 'GB' + volume: $.payload.size + user_id: $.payload.user_id + project_id: $.payload.tenant_id + resource_id: $.payload.volume_id + metadata: + display_name: $.payload.display_name + volume_type: $.payload.volume_type + image_id: $.payload.glance_metadata[?key=image_id].value + instance_id: $.payload.volume_attachment[0].server_id + + - name: 'snapshot.size' + event_type: + - 'snapshot.exists' + - 'snapshot.create.*' + - 'snapshot.delete.*' + - 
'snapshot.manage.*' + type: 'gauge' + unit: 'GB' + volume: $.payload.volume_size + user_id: $.payload.user_id + project_id: $.payload.tenant_id + resource_id: $.payload.snapshot_id + metadata: + display_name: $.payload.display_name + + - name: 'backup.size' + event_type: + - 'backup.exists' + - 'backup.create.*' + - 'backup.delete.*' + - 'backup.restore.*' + type: 'gauge' + unit: 'GB' + volume: $.payload.size + user_id: $.payload.user_id + project_id: $.payload.tenant_id + resource_id: $.payload.backup_id + metadata: + display_name: $.payload.display_name + + # Magnum + - name: $.payload.metrics.[*].name + event_type: 'magnum.bay.metrics.*' + type: 'gauge' + unit: $.payload.metrics.[*].unit + volume: $.payload.metrics.[*].value + user_id: $.payload.user_id + project_id: $.payload.project_id + resource_id: $.payload.resource_id + lookup: ['name', 'unit', 'volume'] + + # Swift + - name: $.payload.measurements.[*].metric.[*].name + event_type: 'objectstore.http.request' + type: 'delta' + unit: $.payload.measurements.[*].metric.[*].unit + volume: $.payload.measurements.[*].result + resource_id: $.payload.target.id + user_id: $.payload.initiator.id + project_id: $.payload.initiator.project_id + lookup: ['name', 'unit', 'volume'] + + - name: 'memory' + event_type: &instance_events compute.instance.(?!create.start|update).* + type: 'gauge' + unit: 'MB' + volume: $.payload.memory_mb + user_id: $.payload.user_id + project_id: $.payload.tenant_id + resource_id: $.payload.instance_id + user_metadata: $.payload.metadata + metadata: &instance_meta + host: $.payload.host + flavor_id: $.payload.instance_flavor_id + flavor_name: $.payload.instance_type + display_name: $.payload.display_name + image_ref: $.payload.image_meta.base_image_ref + launched_at: $.payload.launched_at + created_at: $.payload.created_at + deleted_at: $.payload.deleted_at + + - name: 'vcpus' + event_type: *instance_events + type: 'gauge' + unit: 'vcpu' + volume: $.payload.vcpus + user_id: $.payload.user_id + project_id: $.payload.tenant_id + resource_id: $.payload.instance_id + user_metadata: $.payload.metadata + metadata: + <<: *instance_meta + + - name: 'compute.instance.booting.time' + event_type: 'compute.instance.create.end' + type: 'gauge' + unit: 'sec' + volume: + fields: [$.payload.created_at, $.payload.launched_at] + plugin: 'timedelta' + project_id: $.payload.tenant_id + resource_id: $.payload.instance_id + user_metadata: $.payload.metadata + metadata: + <<: *instance_meta + + - name: 'disk.root.size' + event_type: *instance_events + type: 'gauge' + unit: 'GB' + volume: $.payload.root_gb + user_id: $.payload.user_id + project_id: $.payload.tenant_id + resource_id: $.payload.instance_id + user_metadata: $.payload.metadata + metadata: + <<: *instance_meta + + - name: 'disk.ephemeral.size' + event_type: *instance_events + type: 'gauge' + unit: 'GB' + volume: $.payload.ephemeral_gb + user_id: $.payload.user_id + project_id: $.payload.tenant_id + resource_id: $.payload.instance_id + user_metadata: $.payload.metadata + metadata: + <<: *instance_meta + + - name: 'bandwidth' + event_type: 'l3.meter' + type: 'delta' + unit: 'B' + volume: $.payload.bytes + project_id: $.payload.tenant_id + resource_id: $.payload.label_id + + - name: 'compute.node.cpu.frequency' + event_type: 'compute.metrics.update' + type: 'gauge' + unit: 'MHz' + volume: $.payload.metrics[?(@.name='cpu.frequency')].value + resource_id: $.payload.host + "_" + $.payload.nodename + timestamp: $.payload.metrics[?(@.name='cpu.frequency')].timestamp + metadata: + 
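+      # NOTE: the compute.node.cpu.* meters in this block are parsed out of
+      # nova's compute.metrics.update notification, which nova only emits when
+      # its compute monitors are enabled (e.g. compute_monitors = cpu.virt_driver
+      # in nova.conf); without that, these definitions never match an event.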
event_type: $.event_type + host: $.publisher_id + source: $.payload.metrics[?(@.name='cpu.frequency')].source + + - name: 'compute.node.cpu.user.time' + event_type: 'compute.metrics.update' + type: 'cumulative' + unit: 'ns' + volume: $.payload.metrics[?(@.name='cpu.user.time')].value + resource_id: $.payload.host + "_" + $.payload.nodename + timestamp: $.payload.metrics[?(@.name='cpu.user.time')].timestamp + metadata: + event_type: $.event_type + host: $.publisher_id + source: $.payload.metrics[?(@.name='cpu.user.time')].source + + - name: 'compute.node.cpu.kernel.time' + event_type: 'compute.metrics.update' + type: 'cumulative' + unit: 'ns' + volume: $.payload.metrics[?(@.name='cpu.kernel.time')].value + resource_id: $.payload.host + "_" + $.payload.nodename + timestamp: $.payload.metrics[?(@.name='cpu.kernel.time')].timestamp + metadata: + event_type: $.event_type + host: $.publisher_id + source: $.payload.metrics[?(@.name='cpu.kernel.time')].source + + - name: 'compute.node.cpu.idle.time' + event_type: 'compute.metrics.update' + type: 'cumulative' + unit: 'ns' + volume: $.payload.metrics[?(@.name='cpu.idle.time')].value + resource_id: $.payload.host + "_" + $.payload.nodename + timestamp: $.payload.metrics[?(@.name='cpu.idle.time')].timestamp + metadata: + event_type: $.event_type + host: $.publisher_id + source: $.payload.metrics[?(@.name='cpu.idle.time')].source + + - name: 'compute.node.cpu.iowait.time' + event_type: 'compute.metrics.update' + type: 'cumulative' + unit: 'ns' + volume: $.payload.metrics[?(@.name='cpu.iowait.time')].value + resource_id: $.payload.host + "_" + $.payload.nodename + timestamp: $.payload.metrics[?(@.name='cpu.iowait.time')].timestamp + metadata: + event_type: $.event_type + host: $.publisher_id + source: $.payload.metrics[?(@.name='cpu.iowait.time')].source + + - name: 'compute.node.cpu.kernel.percent' + event_type: 'compute.metrics.update' + type: 'gauge' + unit: 'percent' + volume: $.payload.metrics[?(@.name='cpu.kernel.percent')].value * 100 + resource_id: $.payload.host + "_" + $.payload.nodename + timestamp: $.payload.metrics[?(@.name='cpu.kernel.percent')].timestamp + metadata: + event_type: $.event_type + host: $.publisher_id + source: $.payload.metrics[?(@.name='cpu.kernel.percent')].source + + - name: 'compute.node.cpu.idle.percent' + event_type: 'compute.metrics.update' + type: 'gauge' + unit: 'percent' + volume: $.payload.metrics[?(@.name='cpu.idle.percent')].value * 100 + resource_id: $.payload.host + "_" + $.payload.nodename + timestamp: $.payload.metrics[?(@.name='cpu.idle.percent')].timestamp + metadata: + event_type: $.event_type + host: $.publisher_id + source: $.payload.metrics[?(@.name='cpu.idle.percent')].source + + - name: 'compute.node.cpu.user.percent' + event_type: 'compute.metrics.update' + type: 'gauge' + unit: 'percent' + volume: $.payload.metrics[?(@.name='cpu.user.percent')].value * 100 + resource_id: $.payload.host + "_" + $.payload.nodename + timestamp: $.payload.metrics[?(@.name='cpu.user.percent')].timestamp + metadata: + event_type: $.event_type + host: $.publisher_id + source: $.payload.metrics[?(@.name='cpu.user.percent')].source + + - name: 'compute.node.cpu.iowait.percent' + event_type: 'compute.metrics.update' + type: 'gauge' + unit: 'percent' + volume: $.payload.metrics[?(@.name='cpu.iowait.percent')].value * 100 + resource_id: $.payload.host + "_" + $.payload.nodename + timestamp: $.payload.metrics[?(@.name='cpu.iowait.percent')].timestamp + metadata: + event_type: $.event_type + host: $.publisher_id + source: 
$.payload.metrics[?(@.name='cpu.iowait.percent')].source + + - name: 'compute.node.cpu.percent' + event_type: 'compute.metrics.update' + type: 'gauge' + unit: 'percent' + volume: $.payload.metrics[?(@.name='cpu.percent')].value * 100 + resource_id: $.payload.host + "_" + $.payload.nodename + timestamp: $.payload.metrics[?(@.name='cpu.percent')].timestamp + metadata: + event_type: $.event_type + host: $.publisher_id + source: $.payload.metrics[?(@.name='cpu.percent')].source + + # Identity + # NOTE(gordc): hack because jsonpath-rw-ext can't concat starting with string. + - name: $.payload.outcome - $.payload.outcome + 'identity.authenticate.' + $.payload.outcome + type: 'delta' + unit: 'user' + volume: 1 + event_type: + - 'identity.authenticate' + resource_id: $.payload.initiator.id + user_id: $.payload.initiator.id + + # DNS + - name: 'dns.domain.exists' + event_type: 'dns.domain.exists' + type: 'cumulative' + unit: 's' + volume: + fields: [$.payload.audit_period_beginning, $.payload.audit_period_ending] + plugin: 'timedelta' + project_id: $.payload.tenant_id + resource_id: $.payload.id + user_id: $.ctxt.user + metadata: + status: $.payload.status + pool_id: $.payload.pool_id + host: $.publisher_id + + # Trove + - name: 'trove.instance.exists' + event_type: 'trove.instance.exists' + type: 'cumulative' + unit: 's' + volume: + fields: [$.payload.audit_period_beginning, $.payload.audit_period_ending] + plugin: 'timedelta' + project_id: $.payload.tenant_id + resource_id: $.payload.instance_id + user_id: $.payload.user_id + metadata: + nova_instance_id: $.payload.nova_instance_id + state: $.payload.state + service_id: $.payload.service_id + instance_type: $.payload.instance_type + instance_type_id: $.payload.instance_type_id + + # Manila + - name: 'manila.share.size' + event_type: + - 'share.create.*' + - 'share.delete.*' + - 'share.extend.*' + - 'share.shrink.*' + type: 'gauge' + unit: 'GB' + volume: $.payload.size + user_id: $.payload.user_id + project_id: $.payload.project_id + resource_id: $.payload.share_id + metadata: + name: $.payload.name + host: $.payload.host + status: $.payload.status + availability_zone: $.payload.availability_zone + protocol: $.payload.proto + + polling: + sources: + - name: all_pollsters + interval: 300 + meters: + - "*" + pipeline: + sources: + - name: meter_source + meters: + - "*" + sinks: + - meter_sink + sinks: + - name: meter_sink + publishers: + - gnocchi + policy: {} + audit_api_map: + DEFAULT: + target_endpoint_type: None + path_keywords: + meters: meter_name + resources: resource_id + statistics: None + samples: sample_id + service_endpoints: + metering: service/metering + rally_tests: + CeilometerStats.create_meter_and_get_stats: + - args: + user_id: user-id + resource_id: resource-id + counter_volume: 1 + counter_unit: '' + counter_type: cumulative + runner: + type: constant + times: 1 + concurrency: 1 + sla: + failure_rate: + max: 0 + CeilometerMeters.list_meters: + - runner: + type: constant + times: 1 + concurrency: 1 + sla: + failure_rate: + max: 0 + context: + ceilometer: + counter_name: benchmark_meter + counter_type: gauge + counter_unit: "%" + counter_volume: 1 + resources_per_tenant: 1 + samples_per_resource: 1 + timestamp_interval: 10 + metadata_list: + - status: active + name: rally benchmark on + deleted: 'false' + - status: terminated + name: rally benchmark off + deleted: 'true' + args: + limit: 5 + metadata_query: + status: terminated + CeilometerQueries.create_and_query_samples: + - args: + filter: + "=": + counter_unit: instance + 
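+        # NOTE: these rally defaults ship with the chart and still target the
+        # legacy ceilometer API and the long-removed cpu_util meter; with
+        # pod_rally_test enabled further down, expect to prune or replace them.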
orderby: + limit: 10 + counter_name: cpu_util + counter_type: gauge + counter_unit: instance + counter_volume: 1 + resource_id: resource_id + runner: + type: constant + times: 1 + concurrency: 1 + sla: + failure_rate: + max: 0 + +dependencies: + dynamic: + common: + local_image_registry: + jobs: + - ceilometer-image-repo-sync + services: + - endpoint: node + service: local_image_registry + static: + central: + jobs: + - ceilometer-db-sync + - ceilometer-rabbit-init + - ceilometer-ks-user + services: + - endpoint: internal + service: identity + - endpoint: internal + service: metric + ipmi: + jobs: + - ceilometer-db-sync + - ceilometer-rabbit-init + - ceilometer-ks-user + services: + - endpoint: internal + service: identity + - endpoint: internal + service: metric + services: + - endpoint: internal + service: identity + - endpoint: internal + service: metric + compute: + jobs: + - ceilometer-db-sync + - ceilometer-rabbit-init + - ceilometer-ks-user + services: + - endpoint: internal + service: identity + - endpoint: internal + service: metric + db_sync: + jobs: [] + services: [] + ks_service: + services: + - endpoint: internal + service: identity + ks_user: + services: + - endpoint: internal + service: identity + rabbit_init: + services: + - service: oslo_messaging + endpoint: internal + notification: + jobs: + - ceilometer-db-sync + - ceilometer-rabbit-init + - ceilometer-ks-user + services: + - endpoint: internal + service: identity + - endpoint: internal + service: metric + tests: + services: + - endpoint: internal + service: identity + - endpoint: internal + service: metering + - endpoint: internal + service: metric + image_repo_sync: + services: + - endpoint: internal + service: local_image_registry + +# Names of secrets used by bootstrap and environmental checks +secrets: + identity: + admin: ceilometer-keystone-admin + ceilometer: ceilometer-keystone-user + test: ceilometer-keystone-test + oslo_messaging: + admin: ceilometer-rabbitmq-admin + ceilometer: ceilometer-rabbitmq-user + oci_image_registry: + ceilometer: ceilometer-oci-image-registry + +bootstrap: + enabled: false + ks_user: ceilometer + script: | + openstack token issue + +# typically overridden by environmental +# values, but should include all endpoints +# required by this chart +endpoints: + cluster_domain_suffix: cluster.local + local_image_registry: + name: docker-registry + namespace: docker-registry + hosts: + default: localhost + internal: docker-registry + node: localhost + host_fqdn_override: + default: null + port: + registry: + node: 5000 + oci_image_registry: + name: oci-image-registry + namespace: oci-image-registry + auth: + enabled: false + ceilometer: + username: ceilometer + password: password + hosts: + default: localhost + host_fqdn_override: + default: null + port: + registry: + default: null + identity: + name: keystone + auth: + admin: + region_name: RegionOne + username: admin + password: password + project_name: admin + user_domain_name: default + project_domain_name: default + ceilometer: + role: admin + region_name: RegionOne + username: ceilometer + password: password + project_name: service + user_domain_name: service + project_domain_name: service + test: + role: admin + region_name: RegionOne + username: ceilometer-test + password: password + project_name: test + user_domain_name: service + project_domain_name: service + hosts: + default: keystone + internal: keystone-api + host_fqdn_override: + default: null + path: + default: /v3 + scheme: + default: 'http' + port: + api: + default: 5000 + 
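+          # NOTE: these follow the usual openstack-helm endpoint lookup rules:
+          # 5000 is the in-cluster keystone API port, while the public entry (80)
+          # is what an ingress in front of keystone would expose; override
+          # host_fqdn_override above if keystone must be reached via an external FQDN.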
public: 80 + internal: 5000 + service: 5000 + metric: + name: gnocchi + hosts: + default: gnocchi-api + public: gnocchi + host_fqdn_override: + default: null + path: + default: null + scheme: + default: 'http' + port: + api: + default: 8041 + public: 80 + internal: 8041 + service: 8041 + alarming: + name: aodh + hosts: + default: aodh-api + public: aodh + host_fqdn_override: + default: null + path: + default: null + scheme: + default: 'http' + port: + api: + default: 8042 + public: 80 + oslo_cache: + auth: + # NOTE(portdirect): this is used to define the value for keystone + # authtoken cache encryption key, if not set it will be populated + # automatically with a random value, but to take advantage of + # this feature all services should be set to use the same key, + # and memcache service. + memcache_secret_key: null + hosts: + default: memcached + host_fqdn_override: + default: null + port: + memcache: + default: 11211 + oslo_messaging: + auth: + admin: + username: rabbitmq + password: password + ceilometer: + username: ceilometer + password: password + statefulset: + replicas: 2 + name: rabbitmq-rabbitmq + hosts: + default: rabbitmq + host_fqdn_override: + default: null + path: /ceilometer + scheme: rabbit + port: + amqp: + default: 5672 + http: + default: 15672 + +pod: + affinity: + anti: + type: + default: preferredDuringSchedulingIgnoredDuringExecution + topologyKey: + default: kubernetes.io/hostname + weight: + default: 10 + tolerations: + ceilometer: + enabled: false + tolerations: + - key: node-role.kubernetes.io/master + operator: Exists + effect: NoSchedule + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + mounts: + ceilometer_tests: + init_container: null + ceilometer_tests: + volumeMounts: + volumes: + ceilometer_compute: + init_container: null + ceilometer_compute: + volumeMounts: + volumes: + ceilometer_central: + init_container: null + ceilometer_central: + volumeMounts: + volumes: + ceilometer_ipmi: + init_container: null + ceilometer_ipmi: + volumeMounts: + volumes: + ceilometer_notification: + init_container: null + ceilometer_notification: + volumeMounts: + volumes: + ceilometer_db_sync: + ceilometer_db_sync: + volumeMounts: + volumes: + replicas: + central: 1 + notification: 1 + lifecycle: + upgrades: + deployments: + revision_history: 3 + pod_replacement_strategy: RollingUpdate + rolling_update: + max_unavailable: 1 + max_surge: 3 + daemonsets: + pod_replacement_strategy: RollingUpdate + compute: + enabled: true + min_ready_seconds: 0 + max_unavailable: 1 + resources: + enabled: false + compute: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "1024Mi" + cpu: "2000m" + notification: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "1024Mi" + cpu: "2000m" + central: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "1024Mi" + cpu: "2000m" + ipmi: + requests: + memory: "124Mi" + cpu: "100m" + limits: + memory: "1024Mi" + cpu: "2000m" + jobs: + db_sync: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "1024Mi" + cpu: "2000m" + rabbit_init: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "1024Mi" + cpu: "2000m" + ks_service: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "1024Mi" + cpu: "2000m" + ks_user: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "1024Mi" + cpu: "2000m" + tests: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "1024Mi" + cpu: "2000m" + image_repo_sync: + requests: + memory: "128Mi" + cpu: 
"100m" + limits: + memory: "1024Mi" + cpu: "2000m" + +network_policy: + ceilometer: + ingress: + - {} + egress: + - {} + +manifests: + configmap_bin: true + configmap_etc: true + deployment_api: false + deployment_central: true + deployment_collector: false + daemonset_compute: true + daemonset_ipmi: false + deployment_notification: true + ingress_api: false + job_bootstrap: true + job_db_drop: false + # using gnocchi so no db init + job_db_init: false + job_db_init_mongodb: false + # runs ceilometer-upgrade which inits resource types in gnocchi! + job_db_sync: true + job_image_repo_sync: true + job_ks_endpoints: false + job_ks_service: true + job_ks_user: true + job_rabbit_init: true + pdb_api: true + pod_rally_test: true + network_policy: false + secret_db: true + secret_keystone: true + secret_mongodb: false + secret_rabbitmq: true + secret_registry: true + service_api: true + service_ingress_api: true +... diff --git a/helm-configs/gnocchi/gnocchi-helm-overrides.yaml b/helm-configs/gnocchi/gnocchi-helm-overrides.yaml index 7ade5b93..db1c37bb 100644 --- a/helm-configs/gnocchi/gnocchi-helm-overrides.yaml +++ b/helm-configs/gnocchi/gnocchi-helm-overrides.yaml @@ -234,7 +234,7 @@ pod: init_container: null gnocchi_tests: replicas: - api: 1 + api: 3 lifecycle: upgrades: deployments: @@ -246,11 +246,11 @@ pod: daemonsets: pod_replacement_strategy: RollingUpdate metricd: - enabled: false + enabled: true min_ready_seconds: 0 max_unavailable: 1 statsd: - enabled: false + enabled: true min_ready_seconds: 0 max_unavailable: 1 disruption_budget: diff --git a/helm-configs/postgresql/postgresql-helm-overrides.yaml b/helm-configs/postgresql/postgresql-helm-overrides.yaml index 679228c1..ad41ea06 100644 --- a/helm-configs/postgresql/postgresql-helm-overrides.yaml +++ b/helm-configs/postgresql/postgresql-helm-overrides.yaml @@ -239,7 +239,7 @@ jobs: # activeDeadlineSeconds == 0 means no deadline activeDeadlineSeconds: 0 backoffLimit: 6 - cron: "0 0 * * *" + cron: "15 0 * * *" history: success: 3 failed: 1 @@ -300,12 +300,12 @@ conf: hba_file: '/tmp/pg_hba.conf' ident_file: '/tmp/pg_ident.conf' backup: - enabled: false + enabled: true base_path: /var/backup days_to_keep: 3 pg_dumpall_options: '--inserts --clean' remote_backup: - enabled: false + enabled: true container_name: postgresql days_to_keep: 14 storage_policy: default-placement @@ -466,7 +466,7 @@ manifests: configmap_etc: true job_image_repo_sync: true network_policy: false - job_ks_user: false + job_ks_user: true secret_admin: true secret_etc: true secret_audit: true @@ -474,8 +474,8 @@ manifests: secret_registry: true service: true statefulset: true - cron_job_postgresql_backup: false - pvc_backup: false + cron_job_postgresql_backup: true + pvc_backup: true monitoring: prometheus: configmap_bin: false diff --git a/mkdocs.yml b/mkdocs.yml index d553449c..ce4435f5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -182,6 +182,7 @@ nav: - skyline: openstack-skyline.md - Octavia: openstack-octavia.md - Gnocchi: openstack-gnocchi.md + - Ceilometer: openstack-ceilometer.md - Monitoring: - Monitoring Overview: prometheus-monitoring-overview.md - Prometheus: prometheus.md