feat: Export Trino metrics to Prometheus and enable autoscaling with KEDA #712

Open · wants to merge 5 commits into main (changes shown from 4 commits)
11 changes: 11 additions & 0 deletions distributed-databases/trino/addons.tf
@@ -153,6 +153,17 @@ module "eks_blueprints_addons" {
],
}

  helm_releases = {
    keda = {
      chart            = "keda"
      chart_version    = "2.16.0"
      repository       = "https://kedacore.github.io/charts"
      description      = "Keda helm Chart deployment"
      namespace        = "keda"
      create_namespace = true
    }
  }

  tags = local.tags
}
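Once this addon change is applied, the KEDA operator should come up in the keda namespace. A minimal sanity check (a sketch; assumes kubectl already points at the cluster):

# Sketch: verify the KEDA operator pods are running and its CRDs are registered
kubectl get pods -n keda
kubectl get crd scaledobjects.keda.sh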

2 changes: 1 addition & 1 deletion distributed-databases/trino/examples/hive-setup.sh
@@ -10,7 +10,7 @@ echo "The name of your bucket is: ${BUCKET}"
echo "Now copying the 2022 NY Taxi data into the S3 bucket..."

## Copy the 2022 NY Taxi data into the S3 bucket
aws s3 cp "s3://nyc-tlc/trip data/" s3://$BUCKET/hive/ --exclude "*" --include "yellow_tripdata_2022*" --recursive
aws s3 cp "s3://aws-data-analytics-workshops/shared_datasets/tripdata/" s3://$BUCKET/hive/ --recursive

sleep 2
echo "Now we create the Glue Database..."
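After re-running the setup script, a quick way to confirm the new dataset landed in the bucket (a sketch; BUCKET is the variable the script already resolves):

# Sketch: list what hive-setup.sh copied into the Hive prefix
aws s3 ls "s3://$BUCKET/hive/" --recursive --human-readable --summarize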
@@ -0,0 +1,53 @@
prometheus:
  serviceAccount:
    create: true
    name: ${amp_sa}
    annotations:
      eks.amazonaws.com/role-arn: ${amp_irsa}
  prometheusSpec:
    remoteWrite:
      - url: ${amp_remotewrite_url}
        sigv4:
          region: ${region}
        queue_config:
          max_samples_per_send: 1000
          max_shards: 200
          capacity: 2500
    retention: 5h
    scrapeInterval: 30s
    evaluationInterval: 30s
    scrapeTimeout: 10s
    storageSpec:
      volumeClaimTemplate:
        metadata:
          name: data
        spec:
          storageClassName: gp2
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 50Gi
    serviceMonitorSelectorNilUsesHelmValues: false
alertmanager:
  enabled: false

grafana:
  enabled: true
  defaultDashboardsEnabled: true
  # Adding AMP datasource to Grafana config
  serviceAccount:
    create: false
    name: ${amp_sa}
  grafana.ini:
    auth:
      sigv4_auth_enabled: true
  additionalDataSources:
    - name: AMP
      editable: true
      jsonData:
        sigV4Auth: true
        sigV4Region: ${region}
      type: prometheus
      isDefault: false
      url: ${amp_url}
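Once these values are rendered and applied, the AMP remote write can be spot-checked. A rough sketch using awscurl; AMP_QUERY_URL here stands in for the workspace query endpoint (".../api/v1/query") that ${amp_url} points at, and the metric name assumes the JMX exporter rules added in trino.yaml below:

# Sketch: ask the AMP workspace for a Trino series to confirm remote write is flowing
awscurl --service aps --region "$AWS_REGION" \
  "${AMP_QUERY_URL}?query=trino_execution_QueryManager_RunningQueries"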
@@ -15,6 +15,7 @@ prometheus:
          resources:
            requests:
              storage: 50Gi
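    # With the chart default (true), Prometheus only selects ServiceMonitors that
    # carry this Helm release's labels; false lets it also pick up the
    # ServiceMonitors created by the Trino chart.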
    serviceMonitorSelectorNilUsesHelmValues: false
alertmanager:
  enabled: false

55 changes: 53 additions & 2 deletions distributed-databases/trino/helm-values/trino.yaml
@@ -11,13 +11,17 @@
# └── Total: 80.6GB < 89GB ✓
---
image:
-  tag: "427"
+  repository: trinodb/trino
+  tag: 447
+  pullPolicy: IfNotPresent
server:
  workers: 3
  exchangeManager:
    name: filesystem
    baseDir: "s3://${exchange_bucket_id}"
  autoscaling:
-    enabled: true
+    enabled: false
    minReplicas: 1
    maxReplicas: 20
    targetCPUUtilizationPercentage: 75
    targetMemoryUtilizationPercentage: 80
@@ -106,7 +110,7 @@
- "query.remote-task.max-error-duration=1m"
- "query.max-hash-partition-count=100" # Updated from query.hash-partition-count
- "spill-enabled=true" # Updated from experimental.spill-enabled
- "spiller-spill-path=/tmp/spill" # Chagne this to SSD mount for faster

- "memory.heap-headroom-per-node=9.6GB"
- "optimizer.join-reordering-strategy=AUTOMATIC" # Updated from join-reordering-strategy
- "query.max-history=100"
@@ -146,3 +150,50 @@
  name: ${sa}
ingress:
  enabled: false
jmx:
  enabled: true
  registryPort: 9080
  serverPort: 9081
  exporter:
    # jmx.exporter.enabled -- Set to true to export JMX Metrics via HTTP for [Prometheus](https://github.com/prometheus/jmx_exporter) consumption
    enabled: true
    image: bitnami/jmx-exporter:latest
    pullPolicy: Always
    port: 5556
    configProperties: |-
      hostPort: localhost:{{- .Values.jmx.registryPort }}
      startDelaySeconds: 0
      ssl: false
      lowercaseOutputName: false
      lowercaseOutputLabelNames: false
      whitelistObjectNames: ["trino.execution:name=QueryManager","trino.execution:name=SqlTaskManager","trino.execution.executor:name=TaskExecutor","trino.memory:name=ClusterMemoryManager","java.lang:type=Runtime","trino.memory:type=ClusterMemoryPool,name=general","java.lang:type=Memory","trino.memory:type=MemoryPool,name=general"]
      autoExcludeObjectNameAttributes: true
      excludeObjectNameAttributes:
        "java.lang:type=OperatingSystem":
          - "ObjectName"
        "java.lang:type=Runtime":
          - "ClassPath"
          - "SystemProperties"
      rules:
        - pattern: ".*"
    resources:
      limits:
        cpu: 200m
        memory: 512Mi
      requests:
        cpu: 200m
        memory: 512Mi

serviceMonitor:
  enabled: true
  labels:
    prometheus: kube-prometheus
  interval: "30s"
  coordinator:
    enabled: true
    labels:
      prometheus: kube-prometheus
  worker:
    enabled: true
    labels:
      prometheus: kube-prometheus
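With jmx.exporter enabled, each Trino pod gets a jmx-exporter sidecar serving metrics on port 5556. A quick scrape check (a sketch; the namespace and label selector are assumptions, adjust to whatever `kubectl get pods --show-labels` reports for the chart version in use):

# Sketch: port-forward one worker pod and confirm the QueryManager beans are exported
POD=$(kubectl -n trino get pod -l app.kubernetes.io/component=worker -o jsonpath='{.items[0].metadata.name}')
kubectl -n trino port-forward "$POD" 5556:5556 &
curl -s localhost:5556/metrics | grep -i trino_execution_QueryManager | head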
42 changes: 42 additions & 0 deletions distributed-databases/trino/trino-keda.yaml
@@ -0,0 +1,42 @@
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: keda-scaler-trino-worker
  namespace: ${trino_namespace}
spec:
  scaleTargetRef:
    name: trino-worker
  minReplicaCount: 3
  maxReplicaCount: 15
  pollingInterval: 30 # Seconds
  cooldownPeriod: 600 # Seconds
  fallback:
    failureThreshold: 3
    replicas: 6
  advanced:
    horizontalPodAutoscalerConfig:
      behavior:
        scaleDown:
          stabilizationWindowSeconds: 600
          policies:
            - type: Pods
              value: 1
              periodSeconds: 300
        scaleUp:
          stabilizationWindowSeconds: 0
          policies:
            - type: Pods
              value: 1
              periodSeconds: 120
  triggers:
    - type: cpu
      metricType: Utilization
      metadata:
        value: '80' # Target CPU utilization percentage
    - type: prometheus
      metricType: Value
      metadata:
        serverAddress: http://kube-prometheus-stack-prometheus.kube-prometheus-stack.svc.cluster.local:9090
        threshold: '1'
        metricName: queued_queries
        query: sum by (job) (avg_over_time(trino_execution_QueryManager_QueuedQueries{job="trino"}[1m]))
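Once this manifest is applied, KEDA materializes an HPA from the ScaledObject. A sketch for checking scaler health (the namespace is assumed to be the rendered ${trino_namespace}, and the HPA name follows KEDA's default keda-hpa-<scaledobject> pattern):

# Sketch: confirm the scaler is READY/ACTIVE and inspect the metrics the HPA acts on
kubectl -n trino get scaledobject keda-scaler-trino-worker
kubectl -n trino describe hpa keda-hpa-keda-scaler-trino-worker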
22 changes: 21 additions & 1 deletion distributed-databases/trino/trino.tf
@@ -149,11 +149,15 @@ resource "aws_iam_policy" "trino_exchange_bucket_policy" {
# Trino Helm Add-on
#---------------------------------------
module "trino_addon" {
  depends_on = [
    module.eks_blueprints_addons,
  ]

  source  = "aws-ia/eks-blueprints-addon/aws"
  version = "~> 1.1.1" # ensure to update this to the latest/desired version

  chart         = "trino"
-  chart_version = "0.33.0"
+  chart_version = "0.34.0"
  repository    = "https://trinodb.github.io/charts"
  description   = "Trino Helm Chart deployment"
  namespace     = local.trino_namespace
@@ -190,3 +194,19 @@ module "trino_addon" {
    }
  }
}


#---------------------------------------------------------------
# KEDA ScaledObject - Trino Prometheus
#---------------------------------------------------------------
resource "kubectl_manifest" "trino_keda" {

  yaml_body = templatefile("${path.module}/trino-keda.yaml", {
    trino_namespace = local.trino_namespace
  })

  depends_on = [
    module.eks_blueprints_addons,
    module.trino_addon
  ]
}
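To preview what the Prometheus trigger will see before load arrives, the same PromQL can be run against the in-cluster Prometheus (a sketch; reuses the serverAddress from trino-keda.yaml via a port-forward, and an empty result simply means no queued queries yet):

# Sketch: dry-run the KEDA trigger query against kube-prometheus-stack
kubectl -n kube-prometheus-stack port-forward svc/kube-prometheus-stack-prometheus 9090:9090 &
curl -s --data-urlencode 'query=sum by (job) (avg_over_time(trino_execution_QueryManager_QueuedQueries{job="trino"}[1m]))' \
  http://localhost:9090/api/v1/query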