Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add logging support to JARK blueprint #709

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions ai-ml/jark-stack/terraform/addons.tf
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,27 @@ module "eks_blueprints_addons" {
values = [templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-values.yaml", {})]
}

#---------------------------------------
# Enable Fluentbit logging
#---------------------------------------
enable_aws_for_fluentbit = true
aws_for_fluentbit_cw_log_group = {
use_name_prefix = false
name = "/${local.name}/aws-fluentbit-logs" # Add-on creates this log group
retention_in_days = 30
}
aws_for_fluentbit = {
s3_bucket_arns = [
module.s3_bucket.s3_bucket_arn,
"${module.s3_bucket.s3_bucket_arn}/*"
]
values = [templatefile("${path.module}/helm-values/aws-for-fluentbit-values.yaml", {
region = local.region,
cloudwatch_log_group = "/${local.name}/aws-fluentbit-logs"
s3_bucket_name = module.s3_bucket.s3_bucket_id
cluster_name = module.eks.cluster_name
})]
}
}

#---------------------------------------------------------------
Expand Down Expand Up @@ -458,3 +479,27 @@ data "aws_iam_policy_document" "karpenter_controller_policy" {
sid = "KarpenterControllerAdditionalPolicy"
}
}

#---------------------------------------------------------------
# S3 bucket for Logs and Example Data
#---------------------------------------------------------------
#tfsec:ignore:*
module "s3_bucket" {
  # Community S3 bucket module, pinned to the 3.x release series.
  source  = "terraform-aws-modules/s3-bucket/aws"
  version = "~> 3.0"

  # A name prefix is used instead of a fixed name so the final bucket name is generated and unique.
  bucket_prefix = "${local.name}-jark-logs-"
  tags          = local.tags

  # For example only - please evaluate for your environment.
  # With this set, destroying the stack also removes a bucket that still holds objects.
  force_destroy = true

  # Default encryption at rest for every object, using S3-managed keys (AES256).
  server_side_encryption_configuration = {
    rule = {
      apply_server_side_encryption_by_default = {
        sse_algorithm = "AES256"
      }
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,8 @@ server:
enabled: true
minReplicas: 1
serviceType: LoadBalancer
serviceAnnotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-scheme: internal
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip

101 changes: 101 additions & 0 deletions ai-ml/jark-stack/terraform/helm-values/aws-for-fluentbit-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
global:

# hostNetwork and dnsPolicy are required so Fluent Bit can read pod metadata from the node's kubelet instead of calling the API server, which is critical on large clusters
# see this link https://docs.fluentbit.io/manual/pipeline/filters/kubernetes#optional-feature-using-kubelet-to-get-metadata
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet

service:
parsersFiles:
- /fluent-bit/parsers/parsers.conf
extraParsers: |
[PARSER]
Name kubernetes
Format regex
Regex ^(?<namespace_name>[^_]+)\.(?<container_name>.+)\.(?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)\.(?<docker_id>[a-z0-9]{64})-$

input:
name: "tail"
enabled: true
tag: "systempods.<namespace_name>.<container_name>.<pod_name>.<docker_id>-"
path: "/var/log/containers/*.log"
db: "/var/log/flb_kube.db"
memBufLimit: 5MB
skipLongLines: "On"
refreshInterval: 10
extraInputs: |
multiline.parser docker, cri
Tag_Regex (?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?<namespace_name>[^_]+)_(?<container_name>.+)-(?<docker_id>[a-z0-9]{64})\.log$


# NOTE: extraFilters config for using Kubelet to get the Metadata instead of talking to API server for large clusters
filter:
name: "kubernetes"
match: "systempods.*"
kubeURL: "https://kubernetes.default.svc.cluster.local:443"
mergeLog: "On"
mergeLogKey: "log_processed"
keepLog: "On"
k8sLoggingParser: "On"
k8sLoggingExclude: "Off"
bufferSize: "0"
extraFilters: |
Kube_Tag_Prefix systempods.
Regex_Parser kubernetes
Labels On
Annotations Off
Use_Kubelet true
Kubelet_Port 10250
Kube_CA_File /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
Kube_Token_File /var/run/secrets/kubernetes.io/serviceaccount/token

# CAUTION: Do not use the `cloudWatch` plugin. This Golang plugin is no longer recommended by AWS; use the C plugin (`cloudWatchLogs`) instead for better performance.
# cloudWatch:
# enabled: false

# This is a new high performance C Plugin for CloudWatchLogs. See docs here https://docs.fluentbit.io/manual/pipeline/outputs/cloudwatch
cloudWatchLogs:
enabled: true
match: "*"
region: ${region}
logGroupName: ${cloudwatch_log_group}
autoCreateGroup: false
extraOutputs: |
log_key log

#----------------------------------------------------------#
# OUTPUT logs to S3
#----------------------------------------------------------#

# This is an example of writing logs to an S3 bucket.
# It writes system pod logs (tag `systempods.*`) under a dedicated `system-pod-logs` prefix.
# This output runs in addition to the CloudWatch Logs output configured above.

additionalOutputs: |
[OUTPUT]
Name s3
Match systempods.*
region ${region}
bucket ${s3_bucket_name}
total_file_size 100M
s3_key_format /${cluster_name}/system-pod-logs/$TAG[1]/$TAG[2]/$TAG[3]/$TAG[3]_%H%M%S_$UUID.log
s3_key_format_tag_delimiters ..
store_dir /home/ec2-user/buffer
upload_timeout 10m
log_key log

# Resource config for large clusters
resources:
limits:
cpu: 1000m
memory: 1500Mi
requests:
cpu: 500m
memory: 500Mi

## Assign a PriorityClassName to pods if set
priorityClassName: system-node-critical

# This toleration allows the DaemonSet pods to be scheduled on any node, regardless of the node's taints.
tolerations:
- operator: Exists
24 changes: 0 additions & 24 deletions ai-ml/jark-stack/terraform/helm-values/kube-prometheus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,27 +22,3 @@ alertmanager:
grafana:
enabled: true
defaultDashboardsEnabled: true
prometheus:
prometheusSpec:
retention: 5h
scrapeInterval: 30s
evaluationInterval: 30s
scrapeTimeout: 10s
serviceMonitorSelectorNilUsesHelmValues: false # This is required to use the serviceMonitorSelector
storageSpec:
volumeClaimTemplate:
metadata:
name: data
spec:
storageClassName: ${storage_class_type}
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 50Gi
alertmanager:
enabled: false

grafana:
enabled: true
defaultDashboardsEnabled: true
2 changes: 1 addition & 1 deletion ai-ml/jark-stack/terraform/versions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = ">= 3.72"
version = ">= 5.76"
}
kubernetes = {
source = "hashicorp/kubernetes"
Expand Down
Loading