From 70405b97320e867f5526bd7db9fdc5a172435177 Mon Sep 17 00:00:00 2001
From: Ruchika Modi <106240341+ruchimo@users.noreply.github.com>
Date: Thu, 2 Nov 2023 17:23:32 +0530
Subject: [PATCH] Adding Module and Example for ECS cluster monitoring with ecs_observer (#211)

* Adding Module and Example for ECS cluster monitoring with ecs_observer
* Adding Module and Example for ECS cluster monitoring with ecs_observer
* Incorporating PR comments
* Restructuring Examples and modules folder for ECS, Added content in main Readme
* Fixing path as per PR comments
* Parameterizing the config files, incorporated PR review comments
* Adding condition for AMP WS and fixing AMP endpoint
* Adding Document for ECS Monitoring and parameterized some variables
* Added sample dashboard
* Adding Document for ECS Monitoring and parameterized some variables
* Fixing failures detected by pre-commit
* Fixing failures detected by pre-commit
* Fixing failures detected by pre-commit
* Pre-commit fixes
* Fixing failures detected by pre-commit
* Fixing failures detected by pre-commit
* Pre-commit
* Fixing HIGH security alerts detected by pre-commit
* Fixing HIGH security alerts detected by pre-commit
* Fixing HIGH security alerts detected by pre-commit, 31stOct
* Add links after merge
* 2ndNov - Added conditional creation for Grafana WS and module versions for AMG, AMP

---------

Co-authored-by: Rodrigue Koffi
---
 README.md                                  |  12 +
 docs/ecs/ecs-monitoring-on-ec2.md          |  62 +++++
 docs/helpers/ecs-cluster-with-vpc.md       |  45 ++++
 examples/ecs-cluster-with-vpc/README.md    |  67 +++++
 examples/ecs-cluster-with-vpc/main.tf      | 246 ++++++++++++++++++
 examples/ecs-cluster-with-vpc/outputs.tf   |   0
 examples/ecs-cluster-with-vpc/variables.tf |   0
 examples/ecs-cluster-with-vpc/versions.tf  |  10 +
 mkdocs.yml                                 |   3 +
 modules/ecs-monitoring/README.md           |  78 ++++++
 modules/ecs-monitoring/configs/config.yaml | 130 +++++++++
 modules/ecs-monitoring/locals.tf           |  32 +++
 modules/ecs-monitoring/main.tf             |  53 ++++
 modules/ecs-monitoring/outputs.tf          |  19 ++
 .../task-definitions/otel_collector.json   |  21 ++
 modules/ecs-monitoring/variables.tf        |  75 ++++++
 modules/ecs-monitoring/versions.tf         |  10 +
 17 files changed, 863 insertions(+)
 create mode 100644 docs/ecs/ecs-monitoring-on-ec2.md
 create mode 100644 docs/helpers/ecs-cluster-with-vpc.md
 create mode 100644 examples/ecs-cluster-with-vpc/README.md
 create mode 100644 examples/ecs-cluster-with-vpc/main.tf
 create mode 100644 examples/ecs-cluster-with-vpc/outputs.tf
 create mode 100644 examples/ecs-cluster-with-vpc/variables.tf
 create mode 100644 examples/ecs-cluster-with-vpc/versions.tf
 create mode 100644 modules/ecs-monitoring/README.md
 create mode 100644 modules/ecs-monitoring/configs/config.yaml
 create mode 100644 modules/ecs-monitoring/locals.tf
 create mode 100644 modules/ecs-monitoring/main.tf
 create mode 100644 modules/ecs-monitoring/outputs.tf
 create mode 100644 modules/ecs-monitoring/task-definitions/otel_collector.json
 create mode 100644 modules/ecs-monitoring/variables.tf
 create mode 100644 modules/ecs-monitoring/versions.tf

diff --git a/README.md b/README.md
index 0e450fe3..78df8d24 100644
--- a/README.md
+++ b/README.md
@@ -127,6 +127,18 @@ module "eks_monitoring" {
 }
 ```
+#### Amazon ECS monitoring
+An ECS cluster on EC2 with a VPC can be created using the example [here](./examples/ecs-cluster-with-vpc).
+
+```hcl
+module "ecs_monitoring" {
+  source = "github.com/aws-observability/terraform-aws-observability-accelerator//modules/ecs-monitoring"
+
+  aws_ecs_cluster_name = module.ecs_cluster.cluster_name
+  task_role_arn        = module.ecs_cluster.task_exec_iam_role_arn
+  execution_role_arn   = module.ecs_cluster.task_exec_iam_role_arn
+}
+```
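+The module also exposes optional inputs (see `modules/ecs-monitoring/variables.tf`), for example to skip creating an Amazon Managed Grafana workspace or to tune the metrics collection interval. A minimal sketch with such overrides (the values shown are illustrative, not defaults):
+
+```hcl
+module "ecs_monitoring" {
+  source = "github.com/aws-observability/terraform-aws-observability-accelerator//modules/ecs-monitoring"
+
+  aws_ecs_cluster_name = module.ecs_cluster.cluster_name
+  task_role_arn        = module.ecs_cluster.task_exec_iam_role_arn
+  execution_role_arn   = module.ecs_cluster.task_exec_iam_role_arn
+
+  # Optional: skip creating an Amazon Managed Grafana workspace (defaults to true)
+  create_managed_grafana_ws = false
+
+  # Optional: how often the collector gathers ECS task-level metrics (defaults to "15s")
+  ecs_metrics_collection_interval = "30s"
+}
+```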
 Grafana Dashboards image
diff --git a/docs/ecs/ecs-monitoring-on-ec2.md b/docs/ecs/ecs-monitoring-on-ec2.md
new file mode 100644
index 00000000..11e50a85
--- /dev/null
+++ b/docs/ecs/ecs-monitoring-on-ec2.md
@@ -0,0 +1,62 @@
+# Amazon ECS on EC2 cluster monitoring
+
+This example demonstrates how to monitor your Amazon Elastic Container Service on EC2
+(Amazon ECS) cluster with the Observability Accelerator's ECS monitoring module.
+
+The module collects Prometheus metrics from tasks running on ECS and sends them to Amazon Managed Service for Prometheus using the AWS Distro for OpenTelemetry (ADOT) Collector.
+You can either run the collector as a sidecar or deploy it as its own ECS service for the entire cluster.
+ECS tasks with Prometheus endpoints are discovered using the
+[ecsobserver](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/extension/observer/ecsobserver/README.md) extension.
+(Unlike EKS, Prometheus has no built-in service discovery for ECS.)
+
+Additionally, you can collect custom Prometheus metrics from your applications running
+on your ECS cluster.
+
+## Prerequisites
+
+!!! note
+    Make sure to complete the [prerequisites section](https://aws-observability.github.io/terraform-aws-observability-accelerator/concepts/#prerequisites) before proceeding.
+
+## Available Samples for various Workloads
+Make sure to update your existing application task definitions based on the workload type; a minimal task definition sketch follows the list below:
+
+#### 1. [Java/JMX workload for ECS Clusters](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/ContainerInsights-Prometheus-Sample-Workloads-ECS-javajmx.html)
+#### 2. [NGINX workload for Amazon ECS clusters](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/ContainerInsights-Prometheus-Setup-nginx-ecs.html)
+#### 3. [App Mesh workload](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/ContainerInsights-Prometheus-Sample-Workloads-ECS-appmesh.html)
+
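+For discovery, the one thing each workload needs is the `ECS_PROMETHEUS_EXPORTER_PORT` docker label that this module's `ecs_observer` configuration looks for. A minimal, hypothetical Terraform sketch of such a task definition (family, container name, image and port are placeholders for your own workload):
+
+```hcl
+resource "aws_ecs_task_definition" "sample_app" {
+  family                   = "sample-app"
+  requires_compatibilities = ["EC2"]
+  network_mode             = "bridge"
+
+  container_definitions = jsonencode([
+    {
+      name         = "sample-app"
+      image        = "public.ecr.aws/your-org/sample-app:latest"
+      memory       = 256
+      essential    = true
+      portMappings = [{ containerPort = 9404, protocol = "tcp" }]
+      dockerLabels = {
+        # Port on which this container serves Prometheus metrics; read by ecs_observer
+        ECS_PROMETHEUS_EXPORTER_PORT = "9404"
+        # Flag listed in the module README for workloads emitting embedded metric format logs
+        Java_EMF_Metrics = "true"
+      }
+    }
+  ])
+}
+```
+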
+## Setup
+
+#### 1. Add the ECS Monitoring Module to your existing ECS Cluster
+
+```
+module "ecs_monitoring" {
+  source               = "../../modules/ecs-monitoring"
+  aws_ecs_cluster_name = module.ecs_cluster.cluster_name
+  task_role_arn        = module.ecs_cluster.task_exec_iam_role_arn
+  execution_role_arn   = module.ecs_cluster.task_exec_iam_role_arn
+
+  depends_on = [
+    module.ecs_cluster
+  ]
+}
+```
+
+## Deploy
+
+Run the following command to deploy the example:
+
+```bash
+terraform apply
+```
+
+## Visualization
+![image](https://github.com/ruchimo/terraform-aws-observability-accelerator/assets/106240341/006c387e-92e8-45c8-ae2e-825900990741)
+
+
+## Cleanup
+
+To clean up your environment, destroy the Terraform example by running:
+
+```sh
+terraform destroy
+```
diff --git a/docs/helpers/ecs-cluster-with-vpc.md b/docs/helpers/ecs-cluster-with-vpc.md
new file mode 100644
index 00000000..a1d44f62
--- /dev/null
+++ b/docs/helpers/ecs-cluster-with-vpc.md
@@ -0,0 +1,45 @@
+# Example Amazon ECS Cluster with VPC
+This example deploys an Amazon ECS cluster with a VPC and also adds the ECS monitoring module.
+
+## Prerequisites
+
+!!! note
+    Make sure to complete the [prerequisites section](https://aws-observability.github.io/terraform-aws-observability-accelerator/concepts/#prerequisites) before proceeding.
+
+## Setup
+#### 1. Download sources and initialize Terraform
+
+```
+git clone https://github.com/aws-observability/terraform-aws-observability-accelerator.git
+cd terraform-aws-observability-accelerator/examples/ecs-cluster-with-vpc
+terraform init
+```
+
+#### 2. AWS Region
+Specify the AWS Region where the resources will be deployed:
+
+```
+export TF_VAR_aws_region=xxx
+```
+
+#### 3. Terraform plan to validate the changes
+
+```
+terraform plan
+```
+
+## Deploy
+
+Run the following command to deploy the example:
+
+```bash
+terraform apply
+```
+
+## Cleanup
+
+To clean up your environment, destroy the Terraform example by running:
+
+```sh
+terraform destroy
+```
diff --git a/examples/ecs-cluster-with-vpc/README.md b/examples/ecs-cluster-with-vpc/README.md
new file mode 100644
index 00000000..224ec350
--- /dev/null
+++ b/examples/ecs-cluster-with-vpc/README.md
@@ -0,0 +1,67 @@
+# ECS Cluster w/ EC2 Autoscaling
+
+Configuration in this directory creates:
+
+- ECS cluster using EC2 autoscaling groups
+- Autoscaling groups with an IAM instance profile to be used by the ECS cluster
+- Example ECS service that utilizes:
+  - A host volume mounted into the container definition
+  - A load balancer target group attachment
+  - A security group for access to the example service
+
+## Usage
+
+To run this example you need to execute:
+
+```bash
+$ terraform init
+$ terraform plan
+$ terraform apply
+```
+
+Note that this example may create resources which will incur monetary charges on your AWS bill. Run `terraform destroy` when you no longer need these resources.
+
+
+## Requirements
+
+| Name | Version |
|------|---------|
| [terraform](#requirement\_terraform) | >= 1.0 |
| [aws](#requirement\_aws) | >= 4.55 |
+
+## Providers
+
+| Name | Version |
|------|---------|
| [aws](#provider\_aws) | >= 4.55 |
+
+## Modules
+
+| Name | Source | Version |
|------|--------|---------|
| [alb\_sg](#module\_alb\_sg) | terraform-aws-modules/security-group/aws | ~> 5.0 |
| [autoscaling](#module\_autoscaling) | terraform-aws-modules/autoscaling/aws | ~> 6.5 |
| [autoscaling\_sg](#module\_autoscaling\_sg) | terraform-aws-modules/security-group/aws | ~> 5.0 |
| [ecs\_cluster](#module\_ecs\_cluster) | terraform-aws-modules/ecs/aws | 5.2.2 |
| [ecs\_monitoring](#module\_ecs\_monitoring) | ../../modules/ecs-monitoring | n/a |
| [vpc](#module\_vpc) | terraform-aws-modules/vpc/aws | ~> 5.0 |
+
+## Resources
+
+| Name | Type |
|------|------|
| [aws_availability_zones.available](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/availability_zones) | data source |
| [aws_ssm_parameter.ecs_optimized_ami](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ssm_parameter) | data source |
+
+## Inputs
+
+No inputs.
+
+## Outputs
+
+No outputs.
+
+
+## License
+
+Apache-2.0 Licensed. See [LICENSE](https://github.com/terraform-aws-modules/terraform-aws-ecs/blob/master/LICENSE).
diff --git a/examples/ecs-cluster-with-vpc/main.tf b/examples/ecs-cluster-with-vpc/main.tf new file mode 100644 index 00000000..a08e0e38 --- /dev/null +++ b/examples/ecs-cluster-with-vpc/main.tf @@ -0,0 +1,246 @@ +provider "aws" { + region = local.region +} + +data "aws_availability_zones" "available" {} + +locals { + region = "us-east-1" + name = "ex-${basename(path.cwd)}" + + vpc_cidr = "10.0.0.0/16" + azs = slice(data.aws_availability_zones.available.names, 0, 3) + + container_name = "ecs-sample" + container_port = 80 + + tags = { + Name = local.name + Example = local.name + Repository = "https://github.com/terraform-aws-modules/terraform-aws-ecs" + } + + network_acls = { + public_inbound = [ + { + rule_number = 100 + rule_action = "allow" + from_port = 80 + to_port = 80 + protocol = "tcp" + cidr_block = "10.0.0.0/16" + }, + { + rule_number = 110 + rule_action = "allow" + from_port = 443 + to_port = 443 + protocol = "tcp" + cidr_block = "10.0.0.0/16" + }, + { + rule_number = 120 + rule_action = "allow" + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_block = "10.0.0.0/16" + } + ] + public_outbound = [ + { + rule_number = 100 + rule_action = "allow" + from_port = 80 + to_port = 80 + protocol = "tcp" + cidr_block = "10.0.0.0/16" + }, + { + rule_number = 110 + rule_action = "allow" + from_port = 443 + to_port = 443 + protocol = "tcp" + cidr_block = "10.0.0.0/16" + } + ] + } +} + +################################################################################ +# Cluster +################################################################################ + +module "ecs_cluster" { + source = "terraform-aws-modules/ecs/aws" + version = "5.2.2" + + cluster_name = local.name + + # Capacity provider - autoscaling groups + default_capacity_provider_use_fargate = false + create_task_exec_iam_role = true + task_exec_iam_role_name = "ecs_monitor_task_exec_role" + task_exec_iam_role_policies = { "module.ecs_cluster.module.cluster.aws_iam_policy.task_exec[0]" : "arn:aws:iam::aws:policy/AmazonPrometheusRemoteWriteAccess" } + autoscaling_capacity_providers = { + # On-demand instances + ex-1 = { + auto_scaling_group_arn = module.autoscaling["ex-1"].autoscaling_group_arn + managed_termination_protection = "ENABLED" + + managed_scaling = { + maximum_scaling_step_size = 5 + minimum_scaling_step_size = 1 + status = "ENABLED" + target_capacity = 60 + } + + default_capacity_provider_strategy = { + weight = 60 + base = 20 + } + } + } + + tags = local.tags +} + + +module "autoscaling" { + source = "terraform-aws-modules/autoscaling/aws" + version = "~> 6.5" + + for_each = { + # On-demand instances + ex-1 = { + instance_type = "t3.large" + use_mixed_instances_policy = false + mixed_instances_policy = {} + user_data = <<-EOT + #!/bin/bash + cat <<'EOF' >> /etc/ecs/ecs.config + ECS_CLUSTER=${local.name} + ECS_LOGLEVEL=debug + ECS_CONTAINER_INSTANCE_TAGS=${jsonencode(local.tags)} + ECS_ENABLE_TASK_IAM_ROLE=true + EOF + EOT + } + } + + name = "${local.name}-${each.key}" + + image_id = jsondecode(data.aws_ssm_parameter.ecs_optimized_ami.value)["image_id"] + instance_type = each.value.instance_type + + security_groups = [module.autoscaling_sg.security_group_id] + user_data = base64encode(each.value.user_data) + ignore_desired_capacity_changes = true + + create_iam_instance_profile = true + iam_role_name = local.name + iam_role_description = "ECS role for ${local.name}" + iam_role_policies = { + AmazonEC2ContainerServiceforEC2Role = "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role" + 
AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + + vpc_zone_identifier = module.vpc.private_subnets + health_check_type = "EC2" + min_size = 1 + max_size = 5 + desired_capacity = 2 + + # https://github.com/hashicorp/terraform-provider-aws/issues/12582 + autoscaling_group_tags = { + AmazonECSManaged = true + } + + # Required for managed_termination_protection = "ENABLED" + protect_from_scale_in = true + + # Spot instances + use_mixed_instances_policy = each.value.use_mixed_instances_policy + mixed_instances_policy = each.value.mixed_instances_policy + + metadata_options = { + http_tokens = "required" + } + + tags = local.tags +} + +module "autoscaling_sg" { + source = "terraform-aws-modules/security-group/aws" + version = "~> 5.0" + + name = local.name + description = "Autoscaling group security group" + vpc_id = module.vpc.vpc_id + + computed_ingress_with_source_security_group_id = [ + { + rule = "http-80-tcp" + source_security_group_id = module.alb_sg.security_group_id + } + ] + number_of_computed_ingress_with_source_security_group_id = 1 + + tags = local.tags +} + +module "vpc" { + source = "terraform-aws-modules/vpc/aws" + version = "~> 5.0" + + name = local.name + cidr = local.vpc_cidr + + azs = local.azs + private_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 4, k)] + public_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k + 48)] + + public_dedicated_network_acl = true + public_inbound_acl_rules = local.network_acls["public_inbound"] + public_outbound_acl_rules = local.network_acls["public_outbound"] + private_dedicated_network_acl = true + private_inbound_acl_rules = local.network_acls["public_inbound"] + private_outbound_acl_rules = local.network_acls["public_outbound"] + + manage_default_network_acl = true + enable_nat_gateway = true + single_nat_gateway = true + + tags = local.tags +} + +data "aws_ssm_parameter" "ecs_optimized_ami" { + name = "/aws/service/ecs/optimized-ami/amazon-linux-2/recommended" +} + +module "alb_sg" { + source = "terraform-aws-modules/security-group/aws" + version = "~> 5.0" + + name = "${local.name}-service" + description = "Service security group" + vpc_id = module.vpc.vpc_id + + ingress_rules = ["http-80-tcp"] + ingress_cidr_blocks = ["10.0.0.0/16"] + egress_cidr_blocks = module.vpc.private_subnets_cidr_blocks + + tags = local.tags +} + +module "ecs_monitoring" { + source = "../../modules/ecs-monitoring" + aws_ecs_cluster_name = module.ecs_cluster.cluster_name + task_role_arn = module.ecs_cluster.task_exec_iam_role_arn + execution_role_arn = module.ecs_cluster.task_exec_iam_role_arn + + depends_on = [ + module.ecs_cluster + ] +} diff --git a/examples/ecs-cluster-with-vpc/outputs.tf b/examples/ecs-cluster-with-vpc/outputs.tf new file mode 100644 index 00000000..e69de29b diff --git a/examples/ecs-cluster-with-vpc/variables.tf b/examples/ecs-cluster-with-vpc/variables.tf new file mode 100644 index 00000000..e69de29b diff --git a/examples/ecs-cluster-with-vpc/versions.tf b/examples/ecs-cluster-with-vpc/versions.tf new file mode 100644 index 00000000..290d2218 --- /dev/null +++ b/examples/ecs-cluster-with-vpc/versions.tf @@ -0,0 +1,10 @@ +terraform { + required_version = ">= 1.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 4.55" + } + } +} diff --git a/mkdocs.yml b/mkdocs.yml index 49b14c07..918978a0 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -44,9 +44,12 @@ nav: - Amazon CloudWatch Container Insights: - Amazon EKS: container-insights/eks.md - 
Monitoring Managed Service for Prometheus Workspaces: workloads/managed-prometheus.md
+  - Amazon ECS:
+      - Cluster Monitoring: ecs/ecs-monitoring-on-ec2.md
   - Supporting Examples:
       - EKS Cluster with VPC: helpers/new-eks-cluster.md
       - Amazon Managed Grafana setup: helpers/managed-grafana.md
+      - ECS Cluster with VPC: helpers/ecs-cluster-with-vpc.md
   - Support & Feedback: support.md
   - Contributors: contributors.md
diff --git a/modules/ecs-monitoring/README.md b/modules/ecs-monitoring/README.md
new file mode 100644
index 00000000..e89f31ab
--- /dev/null
+++ b/modules/ecs-monitoring/README.md
@@ -0,0 +1,78 @@
+# Observability Module for ECS Monitoring using ecs_observer
+
+This module provides ECS cluster monitoring with the following resources:
+
+- Deploys the AWS Distro for OpenTelemetry (ADOT) Collector for metrics and traces
+- Creates an Amazon Managed Grafana workspace for Grafana dashboards
+- Creates an SSM Parameter to store and distribute the ADOT collector configuration
+
+## Pre-requisites
+* An ECS cluster on EC2, for example the one created by `examples/ecs-cluster-with-vpc`
+* A `Prometheus Workspace`, created either from the console or by this module (see the `managed_prometheus_default` module in modules/ecs-monitoring/main.tf)
+* Update your existing application (workload) *ECS Task Definition* to add the label/environment variables below:
+  - Set ***ECS_PROMETHEUS_EXPORTER_PORT*** to point to the containerPort where the Prometheus metrics are exposed
+  - Set ***Java_EMF_Metrics*** to true. The CloudWatch agent uses this flag to generate the embedded metric format in the log event.
+
+This module makes use of the following open source projects:
+* [aws-managed-grafana](https://github.com/terraform-aws-modules/terraform-aws-managed-service-grafana)
+* [aws-managed-prometheus](https://github.com/terraform-aws-modules/terraform-aws-managed-service-prometheus)
+
+See examples using this Terraform module in the **Amazon ECS** section of [this documentation](https://aws-observability.github.io/terraform-aws-observability-accelerator/)
+
+
+## Requirements
+
+| Name | Version |
|------|---------|
| [terraform](#requirement\_terraform) | >= 1.0.0 |
| [aws](#requirement\_aws) | >= 5.0.0 |
+
+## Providers
+
+| Name | Version |
|------|---------|
| [aws](#provider\_aws) | >= 5.0.0 |
+
+## Modules
+
+| Name | Source | Version |
|------|--------|---------|
| [managed\_grafana\_default](#module\_managed\_grafana\_default) | terraform-aws-modules/managed-service-grafana/aws | 2.1.0 |
| [managed\_prometheus\_default](#module\_managed\_prometheus\_default) | terraform-aws-modules/managed-service-prometheus/aws | 2.2.2 |
+
+## Resources
+
+| Name | Type |
|------|------|
| [aws_ecs_service.adot_ecs_prometheus](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/ecs_service) | resource |
| [aws_ecs_task_definition.adot_ecs_prometheus](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/ecs_task_definition) | resource |
| [aws_ssm_parameter.adot_config](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/ssm_parameter) | resource |
| [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source |
+
+## Inputs
+
+| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| [aws\_ecs\_cluster\_name](#input\_aws\_ecs\_cluster\_name) | Name of your ECS cluster | `string` | n/a | yes |
| [container\_name](#input\_container\_name) | Container Name for Adot | `string` | `"adot_new"` | no |
| 
[create\_managed\_grafana\_ws](#input\_create\_managed\_grafana\_ws) | Creates a Workspace for Amazon Managed Grafana | `bool` | `true` | no | +| [create\_managed\_prometheus\_ws](#input\_create\_managed\_prometheus\_ws) | Creates a Workspace for Amazon Managed Prometheus | `bool` | `true` | no | +| [ecs\_adot\_cpu](#input\_ecs\_adot\_cpu) | CPU to be allocated for the ADOT ECS TASK | `string` | `"256"` | no | +| [ecs\_adot\_mem](#input\_ecs\_adot\_mem) | Memory to be allocated for the ADOT ECS TASK | `string` | `"512"` | no | +| [ecs\_metrics\_collection\_interval](#input\_ecs\_metrics\_collection\_interval) | Collection interval for ecs metrics | `string` | `"15s"` | no | +| [execution\_role\_arn](#input\_execution\_role\_arn) | ARN of the IAM Execution Role | `string` | n/a | yes | +| [otel\_image\_ver](#input\_otel\_image\_ver) | Otel Docker Image version | `string` | `"v0.31.0"` | no | +| [otlp\_grpc\_endpoint](#input\_otlp\_grpc\_endpoint) | otlpGrpcEndpoint | `string` | `"0.0.0.0:4317"` | no | +| [otlp\_http\_endpoint](#input\_otlp\_http\_endpoint) | otlpHttpEndpoint | `string` | `"0.0.0.0:4318"` | no | +| [refresh\_interval](#input\_refresh\_interval) | Refresh interval for ecs\_observer | `string` | `"60s"` | no | +| [task\_role\_arn](#input\_task\_role\_arn) | ARN of the IAM Task Role | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [grafana\_workspace\_endpoint](#output\_grafana\_workspace\_endpoint) | The endpoint of the Grafana workspace | +| [grafana\_workspace\_id](#output\_grafana\_workspace\_id) | The ID of the Grafana workspace | +| [prometheus\_workspace\_id](#output\_prometheus\_workspace\_id) | Identifier of the workspace | +| [prometheus\_workspace\_prometheus\_endpoint](#output\_prometheus\_workspace\_prometheus\_endpoint) | Prometheus endpoint available for this workspace | + diff --git a/modules/ecs-monitoring/configs/config.yaml b/modules/ecs-monitoring/configs/config.yaml new file mode 100644 index 00000000..9a46484d --- /dev/null +++ b/modules/ecs-monitoring/configs/config.yaml @@ -0,0 +1,130 @@ +extensions: + sigv4auth: + region: "${aws_region}" + service: "aps" + ecs_observer: # extension type is ecs_observer + cluster_name: "${cluster_name}" # cluster name need to configured manually + cluster_region: "${cluster_region}" # region can be configured directly or use AWS_REGION env var + result_file: "/etc/ecs_sd_targets.yaml" # the directory for file must already exists + refresh_interval: ${refresh_interval} + job_label_name: prometheus_job + # JMX + docker_labels: + - port_label: "ECS_PROMETHEUS_EXPORTER_PORT" + +receivers: + otlp: + protocols: + grpc: + endpoint: ${otlp_grpc_endpoint} + http: + endpoint: ${otlp_http_endpoint} + prometheus: + config: + scrape_configs: + - job_name: "ecssd" + file_sd_configs: + - files: + - "/etc/ecs_sd_targets.yaml" + relabel_configs: + - source_labels: [__meta_ecs_cluster_name] + action: replace + target_label: ClusterName + - source_labels: [__meta_ecs_service_name] + action: replace + target_label: ServiceName + - source_labels: [__meta_ecs_task_definition_family] + action: replace + target_label: TaskDefinitionFamily + - source_labels: [__meta_ecs_task_launch_type] + action: replace + target_label: LaunchType + - source_labels: [__meta_ecs_container_name] + action: replace + target_label: container_name + - action: labelmap + regex: ^__meta_ecs_container_labels_(.+)$ + replacement: "$$1" + awsecscontainermetrics: + collection_interval: ${ecs_metrics_collection_interval} + 
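+# Note: the receivers above feed two pipelines (see the "service" section at the
+# bottom of this file). Prometheus-scraped application metrics go through the
+# "resource" and "metricstransform" processors, while the task-level metrics from
+# "awsecscontainermetrics" are reduced to a core set by the "filter" processor;
+# both streams are remote-written to the Amazon Managed Service for Prometheus workspace.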
+processors: + resource: + attributes: + - key: receiver + value: "prometheus" + action: insert + filter: + metrics: + include: + match_type: strict + metric_names: + - ecs.task.memory.utilized + - ecs.task.memory.reserved + - ecs.task.memory.usage + - ecs.task.cpu.utilized + - ecs.task.cpu.reserved + - ecs.task.cpu.usage.vcpu + - ecs.task.network.rate.rx + - ecs.task.network.rate.tx + - ecs.task.storage.read_bytes + - ecs.task.storage.write_bytes + metricstransform: + transforms: + - include: ".*" + match_type: regexp + action: update + operations: + - label: prometheus_job + new_label: job + action: update_label + - include: ecs.task.memory.utilized + action: update + new_name: MemoryUtilized + - include: ecs.task.memory.reserved + action: update + new_name: MemoryReserved + - include: ecs.task.memory.usage + action: update + new_name: MemoryUsage + - include: ecs.task.cpu.utilized + action: update + new_name: CpuUtilized + - include: ecs.task.cpu.reserved + action: update + new_name: CpuReserved + - include: ecs.task.cpu.usage.vcpu + action: update + new_name: CpuUsage + - include: ecs.task.network.rate.rx + action: update + new_name: NetworkRxBytes + - include: ecs.task.network.rate.tx + action: update + new_name: NetworkTxBytes + - include: ecs.task.storage.read_bytes + action: update + new_name: StorageReadBytes + - include: ecs.task.storage.write_bytes + action: update + new_name: StorageWriteBytes + +exporters: + prometheusremotewrite: + endpoint: "${amp_remote_write_ep}" + auth: + authenticator: sigv4auth + logging: + loglevel: debug + +service: + extensions: [ecs_observer, sigv4auth] + pipelines: + metrics: + receivers: [prometheus] + processors: [resource, metricstransform] + exporters: [prometheusremotewrite] + metrics/ecs: + receivers: [awsecscontainermetrics] + processors: [filter] + exporters: [logging, prometheusremotewrite] diff --git a/modules/ecs-monitoring/locals.tf b/modules/ecs-monitoring/locals.tf new file mode 100644 index 00000000..a06ad308 --- /dev/null +++ b/modules/ecs-monitoring/locals.tf @@ -0,0 +1,32 @@ +data "aws_region" "current" {} + +locals { + region = data.aws_region.current.name + name = "amg-ex-${replace(basename(path.cwd), "_", "-")}" + description = "AWS Managed Grafana service for ${local.name}" + prometheus_ws_endpoint = module.managed_prometheus_default[0].workspace_prometheus_endpoint + + default_otel_values = { + aws_region = data.aws_region.current.name + cluster_name = var.aws_ecs_cluster_name + cluster_region = data.aws_region.current.name + refresh_interval = var.refresh_interval + ecs_metrics_collection_interval = var.ecs_metrics_collection_interval + amp_remote_write_ep = "${local.prometheus_ws_endpoint}api/v1/remote_write" + otlp_grpc_endpoint = var.otlp_grpc_endpoint + otlp_http_endpoint = var.otlp_http_endpoint + } + + ssm_param_value = yamlencode( + templatefile("${path.module}/configs/config.yaml", local.default_otel_values) + ) + + container_def_default_values = { + container_name = var.container_name + otel_image_ver = var.otel_image_ver + aws_region = data.aws_region.current.name + } + + container_definitions = templatefile("${path.module}/task-definitions/otel_collector.json", local.container_def_default_values) + +} diff --git a/modules/ecs-monitoring/main.tf b/modules/ecs-monitoring/main.tf new file mode 100644 index 00000000..537cf87a --- /dev/null +++ b/modules/ecs-monitoring/main.tf @@ -0,0 +1,53 @@ +# SSM Parameter for storing and distrivuting the ADOT config +resource "aws_ssm_parameter" "adot_config" { + name = 
"/terraform-aws-observability/otel_collector_config" + description = "SSM parameter for aws-observability-accelerator/otel-collector-config" + type = "String" + value = local.ssm_param_value + tier = "Intelligent-Tiering" +} + +############################################ +# Managed Grafana and Prometheus Module +############################################ + +module "managed_grafana_default" { + count = var.create_managed_grafana_ws ? 1 : 0 + + source = "terraform-aws-modules/managed-service-grafana/aws" + version = "2.1.0" + name = "${local.name}-default" + associate_license = false +} + +module "managed_prometheus_default" { + count = var.create_managed_prometheus_ws ? 1 : 0 + + source = "terraform-aws-modules/managed-service-prometheus/aws" + version = "2.2.2" + workspace_alias = "${local.name}-default" +} + +########################################### +# Task Definition for ADOT ECS Prometheus +########################################### +resource "aws_ecs_task_definition" "adot_ecs_prometheus" { + family = "adot_prometheus_td" + task_role_arn = var.task_role_arn + execution_role_arn = var.execution_role_arn + network_mode = "bridge" + requires_compatibilities = ["EC2"] + cpu = var.ecs_adot_cpu + memory = var.ecs_adot_mem + container_definitions = local.container_definitions +} + +############################################ +# ECS Service +############################################ +resource "aws_ecs_service" "adot_ecs_prometheus" { + name = "adot_prometheus_svc" + cluster = var.aws_ecs_cluster_name + task_definition = aws_ecs_task_definition.adot_ecs_prometheus.arn + desired_count = 1 +} diff --git a/modules/ecs-monitoring/outputs.tf b/modules/ecs-monitoring/outputs.tf new file mode 100644 index 00000000..9510c032 --- /dev/null +++ b/modules/ecs-monitoring/outputs.tf @@ -0,0 +1,19 @@ +output "grafana_workspace_id" { + description = "The ID of the Grafana workspace" + value = try(module.managed_grafana_default[0].workspace_id, "") +} + +output "grafana_workspace_endpoint" { + description = "The endpoint of the Grafana workspace" + value = try(module.managed_grafana_default[0].workspace_endpoint, "") +} + +output "prometheus_workspace_id" { + description = "Identifier of the workspace" + value = try(module.managed_prometheus_default[0].id, "") +} + +output "prometheus_workspace_prometheus_endpoint" { + description = "Prometheus endpoint available for this workspace" + value = try(module.managed_prometheus_default[0].prometheus_endpoint, "") +} diff --git a/modules/ecs-monitoring/task-definitions/otel_collector.json b/modules/ecs-monitoring/task-definitions/otel_collector.json new file mode 100644 index 00000000..8328fe53 --- /dev/null +++ b/modules/ecs-monitoring/task-definitions/otel_collector.json @@ -0,0 +1,21 @@ +[ + { + "name": "${container_name}", + "image": "amazon/aws-otel-collector:${otel_image_ver}", + "secrets": [ + { + "name": "AOT_CONFIG_CONTENT", + "valueFrom": "/terraform-aws-observability/otel_collector_config" + } + ], + "logConfiguration": { + "logDriver": "awslogs", + "options": { + "awslogs-create-group": "True", + "awslogs-group": "/adot/collector", + "awslogs-region": "${aws_region}", + "awslogs-stream-prefix": "ecs-prometheus" + } + } + } +] diff --git a/modules/ecs-monitoring/variables.tf b/modules/ecs-monitoring/variables.tf new file mode 100644 index 00000000..c0e782ed --- /dev/null +++ b/modules/ecs-monitoring/variables.tf @@ -0,0 +1,75 @@ +variable "aws_ecs_cluster_name" { + description = "Name of your ECS cluster" + type = string +} + +variable 
"task_role_arn" { + description = "ARN of the IAM Task Role" + type = string +} + +variable "execution_role_arn" { + description = "ARN of the IAM Execution Role" + type = string +} + +variable "ecs_adot_cpu" { + description = "CPU to be allocated for the ADOT ECS TASK" + type = string + default = "256" +} + +variable "ecs_adot_mem" { + description = "Memory to be allocated for the ADOT ECS TASK" + type = string + default = "512" +} + +variable "create_managed_grafana_ws" { + description = "Creates a Workspace for Amazon Managed Grafana" + type = bool + default = true +} + +variable "create_managed_prometheus_ws" { + description = "Creates a Workspace for Amazon Managed Prometheus" + type = bool + default = true +} + +variable "refresh_interval" { + description = "Refresh interval for ecs_observer" + type = string + default = "60s" +} + +variable "ecs_metrics_collection_interval" { + description = "Collection interval for ecs metrics" + type = string + default = "15s" +} + +variable "otlp_grpc_endpoint" { + description = "otlpGrpcEndpoint" + type = string + default = "0.0.0.0:4317" +} + + +variable "otlp_http_endpoint" { + description = "otlpHttpEndpoint" + type = string + default = "0.0.0.0:4318" +} + +variable "container_name" { + description = "Container Name for Adot" + type = string + default = "adot_new" +} + +variable "otel_image_ver" { + description = "Otel Docker Image version" + type = string + default = "v0.31.0" +} diff --git a/modules/ecs-monitoring/versions.tf b/modules/ecs-monitoring/versions.tf new file mode 100644 index 00000000..45dce904 --- /dev/null +++ b/modules/ecs-monitoring/versions.tf @@ -0,0 +1,10 @@ +terraform { + required_version = ">= 1.0.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.0.0" + } + } +}