Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Full PRs of cost optimization topic #144

Merged
merged 26 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
305232c
Allow to enable lifecycle for snapshot bucket, load balancer logs buc…
owl-king Aug 7, 2024
42ddef9
Allow to enable/disable multi_az option on master node and remove rea…
owl-king Aug 8, 2024
b25fdc0
Reduce fullnode, fullnode_snapshot, and fullnode_backup storage to 1T…
owl-king Aug 8, 2024
0588f12
Change root_block_device_delete_on_termination = true to avoid unatta…
owl-king Aug 8, 2024
1bab89a
Set assign_public_ip = false as all the tasks are in private subnet (…
owl-king Aug 8, 2024
14fd9a4
Reduce msk storage of dev env to 500 (#117)
owl-king Aug 8, 2024
59c3b26
Reduce socks memory to 8192 (#118)
owl-king Aug 8, 2024
74e8fbb
Allow to change ecs tasks architecture, lambda function. Default val…
owl-king Aug 8, 2024
ea1ba0f
Set default retention for s3_rds_snapshot and load_balancer_logs to 14…
owl-king Aug 9, 2024
39b0eb9
Change full_node_root_block_device_size default value to 1000 GB (#122)
owl-king Aug 9, 2024
e69975b
Add new variable: msk_storage_size to set storage of kafka (#123)
owl-king Aug 9, 2024
d6b3644
Allow to disable backup_full_node (#124)
owl-king Aug 9, 2024
18b88b5
Fix image filter value to find image based on cpu architecture (#125)
owl-king Aug 9, 2024
10ff202
Change default value of create_backup_full_node to false (#127)
owl-king Aug 12, 2024
7e171e6
Change msk storage to msk_storage_size
owl-king Aug 25, 2024
8167399
Change default storage of full_node_snapshot to 1000
owl-king Aug 28, 2024
13822e4
Remove apne-1a az (#132)
roy-dydx Sep 18, 2024
bd21af5
Revert "Remove apne-1a az (#132)" (#133)
roy-dydx Sep 18, 2024
30ef8a6
[OTE-821] Add roundtable monitors for update affiliate info and updat…
jerryfan01234 Sep 23, 2024
b2c4e32
Add AWS_REGION envvar to services that connect to kafka (#136)
roy-dydx Sep 27, 2024
29a24e2
Add read replica storage variable (#137)
dydxwill Sep 30, 2024
6e058af
Remove Indexer dashboards from terraform (#139)
dydxwill Sep 30, 2024
106d9e9
Add variable for full_node root_block_device_size (#141)
roy-dydx Sep 30, 2024
105efce
set log retention hours to 120 (#142)
dydxwill Oct 2, 2024
239a52a
add default (#143)
dydxwill Oct 3, 2024
38c9680
Resolve conflict from main
owl-king Oct 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions indexer/backup_full_node_ap_northeast_1.tf
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
module "backup_full_node_ap_northeast_1" {
source = "../modules/validator"
count = var.create_backup_full_node ? 1 : 0

environment = var.environment

Expand Down Expand Up @@ -37,7 +38,16 @@ module "backup_full_node_ap_northeast_1" {

use_persistent_docker_volume = var.full_node_use_persistent_docker_volume

root_block_device_size = var.full_node_root_block_device_size
root_block_device_delete_on_termination = true
ecs_task_cpu_architecture = var.fullnode_ecs_task_cpu_architecture

providers = {
aws = aws.ap_northeast_1
}
}

# Maps the module's previous, un-indexed address to index [0], matching the
# `count` argument now set on module "backup_full_node_ap_northeast_1".
# NOTE(review): Terraform may infer this move automatically when `count` is
# added to a previously un-counted module — confirm whether the explicit block is needed.
moved {
from = module.backup_full_node_ap_northeast_1
to = module.backup_full_node_ap_northeast_1[0]
}
3 changes: 2 additions & 1 deletion indexer/ecs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ resource "aws_ecs_service" "main" {
aws_subnet.private_subnets[subnet_name].id
] : [for subnet in aws_subnet.private_subnets : subnet.id]
security_groups = [aws_security_group.services[each.key].id]
assign_public_ip = true
assign_public_ip = false
}

dynamic "load_balancer" {
Expand Down Expand Up @@ -162,6 +162,7 @@ resource "aws_ecs_task_definition" "main" {

runtime_platform {
operating_system_family = "LINUX"
cpu_architecture = var.indexer_ecs_task_cpu_architecture
}

tags = {
Expand Down
4 changes: 4 additions & 0 deletions indexer/full_node_ap_northeast_1.tf
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ module "full_node_ap_northeast_1" {

use_persistent_docker_volume = var.full_node_use_persistent_docker_volume

root_block_device_size = var.full_node_root_block_device_size
root_block_device_delete_on_termination = true
ecs_task_cpu_architecture = var.fullnode_ecs_task_cpu_architecture

providers = {
aws = aws.ap_northeast_1
}
Expand Down
4 changes: 2 additions & 2 deletions indexer/lambda.tf
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ resource "aws_lambda_function" "main" {
package_type = "Image"
function_name = "${each.key}_lambda_function"
role = aws_iam_role.lambda_services[each.key].arn
architectures = ["x86_64"]
timeout = 120
architectures = [lower(var.lambda_cpu_architecture)]
timeout = 300

environment {
variables = merge(
Expand Down
20 changes: 16 additions & 4 deletions indexer/locals.tf
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ locals {
should_deploy_in_rds_subnet : true,
ecs_environment_variables : flatten(
[
{
name : "AWS_REGION",
value : var.region,
},
{
name : "PG_POOL_MAX",
value : "30"
Expand All @@ -51,7 +55,7 @@ locals {
),
},
"${local.service_names["comlink"]}" : {
ecs_desired_count : 5,
ecs_desired_count : var.comlink_ecs_desired_count,
task_definition_memory : 4096,
task_definition_cpu : 2048,
is_public_facing : true,
Expand Down Expand Up @@ -97,8 +101,8 @@ locals {
),
},
"${local.service_names["socks"]}" : {
ecs_desired_count : 5,
task_definition_memory : 20480,
ecs_desired_count : var.socks_ecs_desired_count,
task_definition_memory : 8192,
task_definition_cpu : 4096,
is_public_facing : true,
ports : [8080, 8000],
Expand All @@ -109,6 +113,10 @@ locals {
should_deploy_in_rds_subnet : false,
ecs_environment_variables : flatten(
[
{
name : "AWS_REGION",
value : var.region,
},
{
name : "COMLINK_URL",
value : aws_lb.public.dns_name,
Expand Down Expand Up @@ -195,7 +203,7 @@ locals {
),
},
"${local.service_names["vulcan"]}" : {
ecs_desired_count : 5,
ecs_desired_count : var.vulcan_ecs_desired_count,
task_definition_memory : 8192,
task_definition_cpu : 4096,
is_public_facing : false,
Expand All @@ -207,6 +215,10 @@ locals {
should_deploy_in_rds_subnet : false,
ecs_environment_variables : flatten(
[
{
name : "AWS_REGION",
value : var.region,
},
{
name : "PG_POOL_MAX",
value : "2"
Expand Down
6 changes: 4 additions & 2 deletions indexer/msk.tf
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ resource "aws_msk_configuration" "main" {
message.max.bytes=4194304
unclean.leader.election.enable=true
zookeeper.session.timeout.ms=6000
replica.selector.class = org.apache.kafka.common.replica.RackAwareReplicaSelector
log.retention.hours = 120
PROPERTIES

lifecycle {
Expand All @@ -36,7 +38,7 @@ resource "aws_msk_cluster" "main" {
instance_type = var.msk_instance_type
storage_info {
ebs_storage_info {
volume_size = var.environment == "mainnet" ? 2000 : 1000 # in GB
volume_size = var.msk_storage_size
}
}
client_subnets = [
Expand All @@ -56,4 +58,4 @@ resource "aws_msk_cluster" "main" {
arn = aws_msk_configuration.main.arn
revision = aws_msk_configuration.main.latest_revision
}
}
}
5 changes: 4 additions & 1 deletion indexer/rds.tf
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ resource "aws_db_instance" "main" {
performance_insights_enabled = true
performance_insights_retention_period = 31
auto_minor_version_upgrade = false
multi_az = true
multi_az = var.enable_rds_main_multiaz

tags = {
Name = local.aws_db_instance_main_name
Expand All @@ -231,6 +231,7 @@ resource "aws_db_instance" "read_replica" {
# be specified for a replica, and will match the properties on the source db.
vpc_security_group_ids = [aws_security_group.rds.id]
parameter_group_name = aws_db_parameter_group.main.name
allocated_storage = var.rds_read_replica_db_allocated_storage_gb
publicly_accessible = false
# Set to true if any planned changes need to be applied before the next maintenance window.
apply_immediately = false
Expand All @@ -250,12 +251,14 @@ resource "aws_db_instance" "read_replica" {

# Read replica 2
resource "aws_db_instance" "read_replica_2" {
count = var.create_read_replica_2 ? 1 : 0
identifier = "${local.aws_db_instance_main_name}-read-replica-2"
instance_class = var.rds_db_instance_class
# engine, engine_version, name, username, db_subnet_group_name, allocated_storage do not have to
# be specified for a replica, and will match the properties on the source db.
vpc_security_group_ids = [aws_security_group.rds.id]
parameter_group_name = aws_db_parameter_group.main.name
allocated_storage = var.rds_read_replica_db_allocated_storage_gb
publicly_accessible = false
# Set to true if any planned changes need to be applied before the next maintenance window.
apply_immediately = false
Expand Down
3 changes: 2 additions & 1 deletion indexer/route53.tf
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@ resource "aws_route53_record" "read_replica_1" {
}

resource "aws_route53_record" "read_replica_2" {
count = var.create_read_replica_2 ? 1 : 0
zone_id = aws_route53_zone.main.zone_id
name = "postgres-main-rr.dydx-indexer.private"
type = "CNAME"
ttl = "30"
records = ["${aws_db_instance.read_replica_2.address}"]
records = ["${aws_db_instance.read_replica_2[count.index].address}"]
weighted_routing_policy {
weight = 1
}
Expand Down
9 changes: 5 additions & 4 deletions indexer/route_table.tf
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,10 @@ resource "aws_route" "full_node_route_to_indexer" {
# NOTE: This is not an individual AWS resource, but rather an attachment to the route table, and so
# no tags are added.
resource "aws_route" "backup_full_node_route_to_indexer" {
route_table_id = module.backup_full_node_ap_northeast_1.route_table_id
count = var.create_backup_full_node ? 1 : 0
route_table_id = module.backup_full_node_ap_northeast_1[0].route_table_id
destination_cidr_block = var.indexers[var.region].vpc_cidr_block
vpc_peering_connection_id = aws_vpc_peering_connection.backup_full_node_peer.id
vpc_peering_connection_id = aws_vpc_peering_connection.backup_full_node_peer[0].id
}

# Route from the Indexer's private subnets to the full node's VPC. Needed so that the full node can
Expand All @@ -88,9 +89,9 @@ resource "aws_route" "indexer_route_to_full_node" {
}

resource "aws_route" "indexer_route_to_backup_full_node" {
for_each = aws_route_table.private
for_each = var.create_backup_full_node ? aws_route_table.private : {}

route_table_id = each.value.id
destination_cidr_block = var.backup_full_node_cidr_vpc
vpc_peering_connection_id = aws_vpc_peering_connection.backup_full_node_peer.id
vpc_peering_connection_id = aws_vpc_peering_connection.backup_full_node_peer[0].id
}
43 changes: 43 additions & 0 deletions indexer/s3_bucket.tf
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,20 @@ resource "aws_s3_bucket" "load_balancer" {
}
}

# Optional lifecycle configuration for the load balancer bucket.
# Created only when var.enable_s3_load_balancer_logs_lifecycle is true
# (count evaluates to 1, otherwise 0).
resource "aws_s3_bucket_lifecycle_configuration" "load_balancer" {
count = var.enable_s3_load_balancer_logs_lifecycle ? 1 : 0
bucket = aws_s3_bucket.load_balancer.id

rule {
id = "expire-old-logs"
status = "Enabled"

# Expiration age in days is taken from var.s3_load_balancer_logs_expiration_days.
# NOTE(review): this rule declares no `filter` (or `prefix`) — confirm the AWS
# provider version in use accepts that and applies the rule to the whole bucket.
expiration {
days = var.s3_load_balancer_logs_expiration_days
}
}
}

# TODO: refactor snapshotting full node into a separate module
# AWS S3 bucket to store all Indexer full node snapshots
resource "aws_s3_bucket" "indexer_full_node_snapshots" {
Expand All @@ -22,6 +36,21 @@ resource "aws_s3_bucket" "indexer_full_node_snapshots" {
}
}

# Optional lifecycle configuration for the Indexer full node snapshots bucket.
# Created only when var.enable_s3_snapshot_lifecycle is true
# (count evaluates to 1, otherwise 0).
resource "aws_s3_bucket_lifecycle_configuration" "indexer_full_node_snapshots" {
count = var.enable_s3_snapshot_lifecycle ? 1 : 0
bucket = aws_s3_bucket.indexer_full_node_snapshots.id

rule {
id = "expire-old-snapshots"
status = "Enabled"

# Expiration age in days is taken from var.s3_snapshot_expiration_days.
# NOTE(review): this rule declares no `filter` (or `prefix`) — confirm the AWS
# provider version in use accepts that and applies the rule to the whole bucket.
expiration {
days = var.s3_snapshot_expiration_days
}
}
}


# Enable S3 bucket metrics to be sent to Datadog for monitoring
resource "aws_s3_bucket_metric" "indexer_full_node_snapshots" {
bucket = aws_s3_bucket.indexer_full_node_snapshots.id
Expand Down Expand Up @@ -64,3 +93,17 @@ resource "aws_s3_bucket" "athena_rds_snapshots" {
Environment = var.environment
}
}

# Optional lifecycle configuration for the athena_rds_snapshots bucket.
# Created only when var.enable_s3_rds_snapshot_lifecycle is true
# (count evaluates to 1, otherwise 0).
resource "aws_s3_bucket_lifecycle_configuration" "athena_rds_snapshots" {
count = var.enable_s3_rds_snapshot_lifecycle ? 1 : 0
bucket = aws_s3_bucket.athena_rds_snapshots.id

rule {
id = "expire-old-snapshots"
status = "Enabled"

# Expiration age in days is taken from var.s3_rds_snapshot_expiration_days.
# NOTE(review): this rule declares no `filter` (or `prefix`) — confirm the AWS
# provider version in use accepts that and applies the rule to the whole bucket.
expiration {
days = var.s3_rds_snapshot_expiration_days
}
}
}
2 changes: 1 addition & 1 deletion indexer/security_group.tf
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ resource "aws_security_group" "msk" {
security_groups = flatten([
aws_security_group.devbox.id,
module.full_node_ap_northeast_1.aws_security_group_id,
module.backup_full_node_ap_northeast_1.aws_security_group_id,
var.create_backup_full_node ? [module.backup_full_node_ap_northeast_1[0].aws_security_group_id] : [],
# Lambda Services
[
for service in keys(local.lambda_services) :
Expand Down
4 changes: 3 additions & 1 deletion indexer/snapshot_full_node_ap_northeast_1.tf
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ module "full_node_snapshot_ap_northeast_1" {

datadog_env = "snapshot-${var.environment}"

root_block_device_size = var.full_node_snapshot_ebs_volume_size
root_block_device_size = var.full_node_snapshot_ebs_volume_size
root_block_device_delete_on_termination = true
ecs_task_cpu_architecture = var.fullnode_ecs_task_cpu_architecture

entry_point = [
"sh",
Expand Down
Loading