From ec947526db3baa37ed2e38d85c1c00bacd135d4e Mon Sep 17 00:00:00 2001
From: FinnIckler
Date: Wed, 6 Nov 2024 11:34:54 +0100
Subject: [PATCH] Create Shoryuken Worker configuration (#10058)

* Create Shoryuken config
* example infra
* add queue URL to env variables
* update gemfile.lock
* add sqs worker service
* add building the sqs image to github action
* set desired count to 0 for prod
* monkeypatch enqueue_after_transaction_commit? for ShoryukenAdapter
* also install aws-sdk-sqs
* correct monkey_patches.rb
* fix worker using the wrong task definition
* add init script to set region
* fix whitespace
---
 .github/actions/build-environment/action.yaml |   8 ++
 Dockerfile                                    |   6 +
 Gemfile                                       |   2 +
 Gemfile.lock                                  |   9 ++
 app/jobs/add_registration_job.rb              |   2 +
 bin/docker-entrypoint-shoryuken               |   2 +
 config/initializers/monkey_patches.rb         |   9 ++
 config/initializers/shoryuken.rb              |  17 +++
 config/shoryuken.yml                          |   4 +
 env_config.rb                                 |   3 +
 infra/wca_on_rails/production/rails.tf        |  15 +++
 infra/wca_on_rails/production/sqs.tf          |  15 +++
 infra/wca_on_rails/production/worker.tf       | 103 ++++++++++++++++++
 infra/wca_on_rails/staging/rails.tf           |  15 +++
 infra/wca_on_rails/staging/sqs.tf             |  15 +++
 infra/wca_on_rails/staging/worker.tf          | 103 ++++++++++++++++++
 16 files changed, 328 insertions(+)
 create mode 100755 bin/docker-entrypoint-shoryuken
 create mode 100644 config/initializers/shoryuken.rb
 create mode 100644 config/shoryuken.yml
 create mode 100644 infra/wca_on_rails/production/sqs.tf
 create mode 100644 infra/wca_on_rails/production/worker.tf
 create mode 100644 infra/wca_on_rails/staging/sqs.tf
 create mode 100644 infra/wca_on_rails/staging/worker.tf

diff --git a/.github/actions/build-environment/action.yaml b/.github/actions/build-environment/action.yaml
index 3554678023..8b34aa4360 100644
--- a/.github/actions/build-environment/action.yaml
+++ b/.github/actions/build-environment/action.yaml
@@ -47,6 +47,14 @@ runs:
         tag: ${{inputs.environment}}-api
         build_tag: ${{ inputs.build_tag }}
         environment: ${{ inputs.environment }}
+    - name: Build and push SQS Worker
+      uses: ./.github/actions/build-image
+      with:
+        target: shoryuken
+        registry: ${{ inputs.registry }}
+        tag: ${{inputs.environment}}-sqs-worker
+        build_tag: ${{ inputs.build_tag }}
+        environment: ${{ inputs.environment }}
     - name: Build and push Image
       uses: ./.github/actions/build-image
       with:
diff --git a/Dockerfile b/Dockerfile
index 6cd6d04326..03ecf64aa6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -87,6 +87,12 @@ RUN gem install mailcatcher
 
 ENTRYPOINT ["/rails/bin/docker-entrypoint-sidekiq"]
 
+FROM runtime AS shoryuken
+
+USER rails:rails
+
+ENTRYPOINT ["/rails/bin/docker-entrypoint-shoryuken"]
+
 FROM runtime AS monolith
 
 EXPOSE 3000
diff --git a/Gemfile b/Gemfile
index 9401dfe3c8..99478a2fcc 100644
--- a/Gemfile
+++ b/Gemfile
@@ -65,6 +65,7 @@ gem 'ostruct'
 gem 'selectize-rails', github: 'jfly/selectize-rails'
 
 gem 'aws-sdk-s3'
+gem 'aws-sdk-sqs'
 gem 'aws-sdk-rds'
 gem 'aws-sdk-cloudfront'
 
@@ -155,4 +156,5 @@ group :production do
   gem 'rack'
   gem 'newrelic_rpm'
   gem 'wkhtmltopdf-binary-ng'
+  gem 'shoryuken'
 end
diff --git a/Gemfile.lock b/Gemfile.lock
index 512e5391a7..c0b8fa940f 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -177,6 +177,9 @@ GEM
       aws-sdk-core (~> 3, >= 3.210.0)
       aws-sdk-kms (~> 1)
       aws-sigv4 (~> 1.5)
+    aws-sdk-sqs (1.87.0)
+      aws-sdk-core (~> 3, >= 3.210.0)
+      aws-sigv4 (~> 1.5)
     aws-sigv4 (1.10.1)
       aws-eventstream (~> 1, >= 1.0.2)
     babel-source (5.8.35)
@@ -705,6 +708,10 @@ GEM
       railties (>= 5.2)
       semantic_range (>= 2.3.0)
     shellany (0.0.1)
+    shoryuken (6.2.1)
+      aws-sdk-core (>= 2)
+      concurrent-ruby
+      thor
     sidekiq (7.3.4)
       connection_pool (>= 2.3.0)
       logger
@@ -825,6 +832,7 @@ DEPENDENCIES
   aws-sdk-cloudfront
   aws-sdk-rds
   aws-sdk-s3
+  aws-sdk-sqs
   better_errors
   binding_of_caller
   blocks
@@ -911,6 +919,7 @@ DEPENDENCIES
   seedbank
   selectize-rails!
   shakapacker (= 8.0.2)
+  shoryuken
   sidekiq
   sidekiq-cron!
   simple_form
diff --git a/app/jobs/add_registration_job.rb b/app/jobs/add_registration_job.rb
index f587d2e767..49a3fe6c31 100644
--- a/app/jobs/add_registration_job.rb
+++ b/app/jobs/add_registration_job.rb
@@ -1,6 +1,8 @@
 # frozen_string_literal: true
 
 class AddRegistrationJob < ApplicationJob
+  self.queue_adapter = :shoryuken if Rails.env.production?
+
   before_enqueue do |job|
     _, competition_id, user_id = job.arguments
     Rails.cache.write(CacheAccess.registration_processing_cache_key(competition_id, user_id), true)
diff --git a/bin/docker-entrypoint-shoryuken b/bin/docker-entrypoint-shoryuken
new file mode 100755
index 0000000000..1b8a5102eb
--- /dev/null
+++ b/bin/docker-entrypoint-shoryuken
@@ -0,0 +1,2 @@
+#!/bin/bash -e
+exec bundle exec shoryuken -R -C config/shoryuken.yml
diff --git a/config/initializers/monkey_patches.rb b/config/initializers/monkey_patches.rb
index 1e91f85705..b0ec1cd5b4 100644
--- a/config/initializers/monkey_patches.rb
+++ b/config/initializers/monkey_patches.rb
@@ -102,4 +102,13 @@ def self.migration_table_name
       end
     end
   end
+  # Temporary fix until https://github.com/ruby-shoryuken/shoryuken/pull/777 or
+  # https://github.com/rails/rails/pull/53336 is merged
+  if Rails.env.production?
+    ActiveJob::QueueAdapters::ShoryukenAdapter.class_eval do
+      def enqueue_after_transaction_commit?
+        true
+      end
+    end
+  end
 end
diff --git a/config/initializers/shoryuken.rb b/config/initializers/shoryuken.rb
new file mode 100644
index 0000000000..c677046af1
--- /dev/null
+++ b/config/initializers/shoryuken.rb
@@ -0,0 +1,17 @@
+# frozen_string_literal: true
+
+if Rails.env.production?
+  Shoryuken.configure_client do |config|
+    config.sqs_client = Aws::SQS::Client.new(
+      region: EnvConfig.DATABASE_AWS_REGION,
+      credentials: Aws::ECSCredentials.new,
+    )
+  end
+
+  Shoryuken.configure_server do |config|
+    config.sqs_client = Aws::SQS::Client.new(
+      region: EnvConfig.DATABASE_AWS_REGION,
+      credentials: Aws::ECSCredentials.new,
+    )
+  end
+end
diff --git a/config/shoryuken.yml b/config/shoryuken.yml
new file mode 100644
index 0000000000..91a32b9f12
--- /dev/null
+++ b/config/shoryuken.yml
@@ -0,0 +1,4 @@
+queues:
+  - <%= EnvConfig.REGISTRATION_QUEUE %>
+delay: 1
+concurrency: 2
diff --git a/env_config.rb b/env_config.rb
index ca39933a82..0662b27bf2 100644
--- a/env_config.rb
+++ b/env_config.rb
@@ -32,6 +32,7 @@
     mandatory :WCA_REGISTRATIONS_URL, :string
     mandatory :ASSET_HOST, :string
     mandatory :CDN_ASSETS_DISTRIBUTION_ID, :string
+    mandatory :REGISTRATION_QUEUE, :string
 
     if is_compiling_assets
       mandatory :V2_REGISTRATIONS_POLL_URL, :string
@@ -57,6 +58,8 @@
     optional :WCA_REGISTRATIONS_POLL_URL, :string, ''
     optional :PAYPAL_BASE_URL, :string, ''
     optional :WRC_WEBHOOK_URL, :string, ''
+    optional :REGISTRATION_QUEUE, :string, ''
+
     optional :V2_REGISTRATIONS_POLL_URL, :string, ''
     optional :V3_REGISTRATIONS_POLL_URL, :string, ''
 
diff --git a/infra/wca_on_rails/production/rails.tf b/infra/wca_on_rails/production/rails.tf
index 53291d955f..674c687840 100644
--- a/infra/wca_on_rails/production/rails.tf
+++ b/infra/wca_on_rails/production/rails.tf
@@ -116,6 +116,10 @@ locals {
       name  = "VAULT_ADDR"
       value = var.VAULT_ADDR
     },
+    {
+      name  = "REGISTRATION_QUEUE"
+      value = aws_sqs_queue.this.url
+    },
     {
       name  = "VAULT_APPLICATION"
       value = var.VAULT_APPLICATION
@@ -205,6 +209,17 @@ data "aws_iam_policy_document" "task_policy" {
     ]
     resources = ["arn:aws:rds-db:${var.region}:${var.shared.account_id}:dbuser:${var.rds_iam_identifier}/${var.DATABASE_WRT_USER}"]
   }
+  statement {
+    effect = "Allow"
+    actions = [
+      "sqs:SendMessage",
+      "sqs:ReceiveMessage",
+      "sqs:DeleteMessage",
+      "sqs:GetQueueAttributes",
+      "sqs:GetQueueUrl"
+    ]
+    resources = [aws_sqs_queue.this.arn]
+  }
 }
 
 resource "aws_iam_role_policy" "task_policy" {
diff --git a/infra/wca_on_rails/production/sqs.tf b/infra/wca_on_rails/production/sqs.tf
new file mode 100644
index 0000000000..288467cc71
--- /dev/null
+++ b/infra/wca_on_rails/production/sqs.tf
@@ -0,0 +1,15 @@
+# Define the SQS FIFO queue
+resource "aws_sqs_queue" "this" {
+  name                        = "registrations-monolith.fifo"
+  fifo_queue                  = true
+  content_based_deduplication = true
+  deduplication_scope         = "queue"
+  delay_seconds               = 0
+  max_message_size            = 262144
+  message_retention_seconds   = 345600
+  receive_wait_time_seconds   = 0
+  visibility_timeout_seconds  = 60
+  tags = {
+    Env = "Production"
+  }
+}
diff --git a/infra/wca_on_rails/production/worker.tf b/infra/wca_on_rails/production/worker.tf
new file mode 100644
index 0000000000..0e5928a960
--- /dev/null
+++ b/infra/wca_on_rails/production/worker.tf
@@ -0,0 +1,103 @@
+resource "aws_cloudwatch_log_group" "worker" {
+  name = "${var.name_prefix}-sqs-worker"
+}
+
+resource "aws_ecs_task_definition" "worker" {
+  family = "${var.name_prefix}-sqs-worker"
+
+  network_mode             = "awsvpc"
+  requires_compatibilities = ["EC2"]
+
+  # We configure the roles to allow `aws ecs execute-command` into a task,
+  # as in https://aws.amazon.com/blogs/containers/new-using-amazon-ecs-exec-access-your-containers-fargate-ec2
+  execution_role_arn = aws_iam_role.task_execution_role.arn
+  task_role_arn      = aws_iam_role.task_role.arn
+
+  cpu    = "256"
+  memory = "256"
+
+  container_definitions = jsonencode([
+
+    {
+      name         = "sqs-worker-production"
+      image        = "${var.shared.ecr_repository.repository_url}:production-sqs-worker"
+      cpu          = 256
+      memory       = 256
+      portMappings = []
+      logConfiguration = {
+        logDriver = "awslogs"
+        options = {
+          awslogs-group         = aws_cloudwatch_log_group.worker.name
+          awslogs-region        = var.region
+          awslogs-stream-prefix = var.name_prefix
+        }
+      }
+      environment = local.rails_environment
+      healthCheck = {
+        command     = ["CMD-SHELL", "pgrep ruby || exit 1"]
+        interval    = 30
+        retries     = 3
+        startPeriod = 60
+        timeout     = 5
+      }
+    },
+  ])
+
+  tags = {
+    Name = var.name_prefix
+  }
+}
+
+data "aws_ecs_task_definition" "worker" {
+  task_definition = aws_ecs_task_definition.worker.family
+}
+
+resource "aws_ecs_service" "worker" {
+  name    = "${var.name_prefix}-sqs-worker"
+  cluster = var.shared.ecs_cluster.id
+  # During deployment a new task revision is created with modified
+  # container image, so we want to use data.aws_ecs_task_definition to
+  # always point to the active task definition
+  task_definition                    = data.aws_ecs_task_definition.worker.arn
+  desired_count                      = 0
+  scheduling_strategy                = "REPLICA"
+  deployment_maximum_percent         = 200
+  deployment_minimum_healthy_percent = 50
+  health_check_grace_period_seconds  = 0
+
+  capacity_provider_strategy {
+    capacity_provider = var.shared.t3_capacity_provider.name
+    weight            = 1
+  }
+
+  enable_execute_command = true
+
+  deployment_circuit_breaker {
+    enable   = true
+    rollback = false
+  }
+
+  ordered_placement_strategy {
+    type  = "spread"
+    field = "attribute:ecs.availability-zone"
+  }
+
+  ordered_placement_strategy {
+    type  = "spread"
+    field = "instanceId"
+  }
+
+  network_configuration {
+    security_groups = [var.shared.cluster_security.id]
+    subnets         = var.shared.private_subnets[*].id
+  }
+
+  deployment_controller {
+    type = "ECS"
+  }
+
+  tags = {
+    Name = var.name_prefix
+  }
+
+}
diff --git a/infra/wca_on_rails/staging/rails.tf b/infra/wca_on_rails/staging/rails.tf
index 6dd59b74d6..e5325e7a98 100644
--- a/infra/wca_on_rails/staging/rails.tf
+++ b/infra/wca_on_rails/staging/rails.tf
@@ -52,6 +52,10 @@ locals {
       name  = "STORAGE_AWS_BUCKET"
       value = aws_s3_bucket.storage-bucket.id
     },
+    {
+      name  = "REGISTRATION_QUEUE"
+      value = aws_sqs_queue.this.url
+    },
     {
       name  = "STORAGE_AWS_REGION"
       value = var.region
@@ -209,6 +213,17 @@ data "aws_iam_policy_document" "task_policy" {
     ]
     resources = ["arn:aws:rds-db:${var.region}:${var.shared.account_id}:dbuser:${var.rds_iam_identifier}/${var.DATABASE_WRT_USER}"]
   }
+  statement {
+    effect = "Allow"
+    actions = [
+      "sqs:SendMessage",
+      "sqs:ReceiveMessage",
+      "sqs:DeleteMessage",
+      "sqs:GetQueueAttributes",
+      "sqs:GetQueueUrl"
+    ]
+    resources = [aws_sqs_queue.this.arn]
+  }
 }
 
 resource "aws_iam_role_policy" "task_policy" {
diff --git a/infra/wca_on_rails/staging/sqs.tf b/infra/wca_on_rails/staging/sqs.tf
new file mode 100644
index 0000000000..c95b2e10e7
--- /dev/null
+++ b/infra/wca_on_rails/staging/sqs.tf
@@ -0,0 +1,15 @@
+# Define the SQS FIFO queue
+resource "aws_sqs_queue" "this" {
+  name                        = "registrations-monolith-staging.fifo"
+  fifo_queue                  = true
+  content_based_deduplication = true
+  deduplication_scope         = "queue"
+  delay_seconds               = 0
+  max_message_size            = 262144
+  message_retention_seconds   = 345600
+  receive_wait_time_seconds   = 0
+  visibility_timeout_seconds  = 60
+  tags = {
+    Env = "staging"
+  }
+}
diff --git a/infra/wca_on_rails/staging/worker.tf b/infra/wca_on_rails/staging/worker.tf
new file mode 100644
index 0000000000..e295482911
--- /dev/null
+++ b/infra/wca_on_rails/staging/worker.tf
@@ -0,0 +1,103 @@
+resource "aws_cloudwatch_log_group" "worker" {
+  name = "${var.name_prefix}-sqs-worker"
+}
+
+resource "aws_ecs_task_definition" "worker" {
+  family = "${var.name_prefix}-sqs-worker"
+
+  network_mode             = "awsvpc"
+  requires_compatibilities = ["EC2"]
+
+  # We configure the roles to allow `aws ecs execute-command` into a task,
+  # as in https://aws.amazon.com/blogs/containers/new-using-amazon-ecs-exec-access-your-containers-fargate-ec2
+  execution_role_arn = aws_iam_role.task_execution_role.arn
+  task_role_arn      = aws_iam_role.task_role.arn
+
+  cpu    = "256"
+  memory = "256"
+
+  container_definitions = jsonencode([
+
+    {
+      name         = "sqs-worker-staging"
+      image        = "${var.shared.ecr_repository.repository_url}:staging-sqs-worker"
+      cpu          = 256
+      memory       = 256
+      portMappings = []
+      logConfiguration = {
+        logDriver = "awslogs"
+        options = {
+          awslogs-group         = aws_cloudwatch_log_group.worker.name
+          awslogs-region        = var.region
+          awslogs-stream-prefix = var.name_prefix
+        }
+      }
+      environment = local.rails_environment
+      healthCheck = {
+        command     = ["CMD-SHELL", "pgrep ruby || exit 1"]
+        interval    = 30
+        retries     = 3
+        startPeriod = 60
+        timeout     = 5
+      }
+    },
+  ])
+
+  tags = {
+    Name = var.name_prefix
+  }
+}
+
+data "aws_ecs_task_definition" "worker" {
+  task_definition = aws_ecs_task_definition.worker.family
+}
+
+resource "aws_ecs_service" "worker" {
+  name    = "${var.name_prefix}-sqs-worker"
+  cluster = var.shared.ecs_cluster.id
+  # During deployment a new task revision is created with modified
+  # container image, so we want to use data.aws_ecs_task_definition to
+  # always point to the active task definition
+  task_definition                    = data.aws_ecs_task_definition.worker.arn
+  desired_count                      = 1
+  scheduling_strategy                = "REPLICA"
+  deployment_maximum_percent         = 200
+  deployment_minimum_healthy_percent = 50
+  health_check_grace_period_seconds  = 0
+
+  capacity_provider_strategy {
+    capacity_provider = var.shared.t3_capacity_provider.name
+    weight            = 1
+  }
+
+  enable_execute_command = true
+
+  deployment_circuit_breaker {
+    enable   = true
+    rollback = false
+  }
+
+  ordered_placement_strategy {
+    type  = "spread"
+    field = "attribute:ecs.availability-zone"
+  }
+
+  ordered_placement_strategy {
+    type  = "spread"
+    field = "instanceId"
+  }
+
+  network_configuration {
+    security_groups = [var.shared.cluster_security.id]
+    subnets         = var.shared.private_subnets[*].id
+  }
+
+  deployment_controller {
+    type = "ECS"
+  }
+
+  tags = {
+    Name = var.name_prefix
+  }
+
+}