From 87e256159c448d16f70fba2d334194dce38881b9 Mon Sep 17 00:00:00 2001 From: Johnathan Martin Date: Thu, 30 Mar 2023 22:06:10 -0700 Subject: [PATCH 1/3] okcomputer: check that sidekiq process and thread counts are as expected --- config/initializers/okcomputer.rb | 100 ++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/config/initializers/okcomputer.rb b/config/initializers/okcomputer.rb index 79a37f5a6..675f2098a 100644 --- a/config/initializers/okcomputer.rb +++ b/config/initializers/okcomputer.rb @@ -70,6 +70,106 @@ def check OkComputer::Registry.register 'ruby_version', OkComputer::RubyVersionCheck.new +# confirm that the expected number of sidekiq worker processes and threads are running +class SidekiqWorkerCountCheck < OkComputer::Check + class ExpectedEnvVarMissing < StandardError; end + + def check + actual_local_sidekiq_processes = fetch_local_sidekiq_processes + actual_local_sidekiq_process_count = actual_local_sidekiq_processes.size + actual_local_total_concurrency = actual_local_sidekiq_processes.sum { |process| process['concurrency'] } + + error_list = calculate_error_list(actual_local_sidekiq_process_count: actual_local_sidekiq_process_count, + actual_local_total_concurrency: actual_local_total_concurrency) + + if error_list.empty? + mark_message "Sidekiq worker counts as expected on this VM: #{actual_local_sidekiq_process_count} worker " \ + "processes, #{actual_local_total_concurrency} concurrent worker threads total." + else + mark_message error_list.join(' ') + mark_failure + end + rescue ExpectedEnvVarMissing => e + mark_message e.message + mark_failure + end + + private + + # @return [Array] one Sidekiq::Process object for each worker management + # process currently running on _this_ VM + def fetch_local_sidekiq_processes + fetch_global_sidekiq_process_list.select do |p| + p['hostname'] == Socket.gethostname + end + end + + # @return [Array] one Sidekiq::Process object for each worker management + # process currently running on _all_ worker VMs + def fetch_global_sidekiq_process_list + # Sidekiq::ProcessSet#each doesn't return an Enumerator, it just loops and calls the block it's passed + [].tap do |pset_list| + Sidekiq::ProcessSet.new.each { |process| pset_list << process } + end + end + + # the number of concurrent Sidekiq worker threads per process is set in config/sidekiq.yml + def expected_sidekiq_proc_concurrency(proc_num: nil) + config_filename = proc_num.present? ? "../../shared/config/sidekiq#{proc_num}.yml" : 'config/sidekiq.yml' + sidekiq_config = YAML.safe_load(Rails.root.join(config_filename).read, permitted_classes: [Symbol]) + sidekiq_config[:concurrency] + end + + # puppet runs a number of sidekiq processes using systemd, exposing the expected process count via env var + def expected_sidekiq_process_count + @expected_sidekiq_process_count ||= Integer(ENV.fetch('EXPECTED_SIDEKIQ_PROC_COUNT')) + rescue StandardError => e + err_description = 'Error retrieving EXPECTED_SIDEKIQ_PROC_COUNT and parsing to int. ' \ + "ENV['EXPECTED_SIDEKIQ_PROC_COUNT']=#{ENV.fetch('EXPECTED_SIDEKIQ_PROC_COUNT', nil)}" + Rails.logger.error("#{err_description} -- #{e.message} -- #{e.backtrace}") + raise ExpectedEnvVarMissing, err_description + end + + def expected_local_total_concurrency + # Existence of config/sidekiq.yml indicates a single config for all sidekiq processes. otherwise, each of + # the sidekiq processes, 1 through EXPECTED_SIDEKIQ_PROC_COUNT, will have its own config file. + # The number of sidekiq[N].yml files may not match the number of sidekiq processes if custom_execstart=false + # in puppet config. + @expected_local_total_concurrency ||= + if File.exist?('config/sidekiq.yml') + expected_sidekiq_process_count * expected_sidekiq_proc_concurrency + else + (1..expected_sidekiq_process_count).sum { |n| expected_sidekiq_proc_concurrency(proc_num: n) } + end + end + + def calculate_error_list(actual_local_sidekiq_process_count:, actual_local_total_concurrency:) + error_list = [] + + if actual_local_sidekiq_process_count > expected_sidekiq_process_count + error_list << <<~ERR_TXT + Actual Sidekiq worker process count (#{actual_local_sidekiq_process_count}) on this VM is greater than \ + expected (#{expected_sidekiq_process_count}). Check for stale Sidekiq processes (e.g. from old deployments). \ + It's also possible that some worker threads are finishing WIP that started before a Sidekiq restart, e.g. as \ + happens when long running job spans app deployment. Use your judgement when deciding whether to kill an old process. + ERR_TXT + end + if actual_local_sidekiq_process_count < expected_sidekiq_process_count + error_list << "Actual Sidekiq worker management process count (#{actual_local_sidekiq_process_count}) on " \ + "this VM is less than expected (#{expected_sidekiq_process_count})." + end + if actual_local_total_concurrency != expected_local_total_concurrency + error_list << "Actual worker thread count on this VM is #{actual_local_total_concurrency}, but " \ + "expected local total Sidekiq concurrency is #{expected_local_total_concurrency}." + end + + error_list + end +end + +# we don't want to register this on non-worker boxes, because it only tracks local worker processes +OkComputer::Registry.register 'sidekiq_worker_count', SidekiqWorkerCountCheck.new if worker_host? + # ------------------------------------------------------------------------------ # NON-CRUCIAL (Optional) checks, avail at /status/ From 85568ee3c534da9724f75543fdc404c901d2e0bf Mon Sep 17 00:00:00 2001 From: Johnathan Martin Date: Thu, 30 Mar 2023 22:34:10 -0700 Subject: [PATCH 2/3] config cruft cleanup --- config/settings.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/config/settings.yml b/config/settings.yml index e1bc37972..4f94391b9 100644 --- a/config/settings.yml +++ b/config/settings.yml @@ -40,7 +40,6 @@ zip_storage: '/tmp' # override in #{RAILS_ENV}.yml replication: audit_should_backfill: false -total_worker_count: 117 # for okcomputer endpoint minimum_subfolder_count: 1 # for okcomputer pres-cat mount check, verifies the storage_trunk has at least this many subfolders # NOTE: when null, no minimum check is performed, and this can be overriden per environment as needed worker_hostnames: # used for OK computer checks From c49af281de75c16524aa0954b5326e70a5e66170 Mon Sep 17 00:00:00 2001 From: Johnathan Martin Date: Tue, 20 Jun 2023 16:18:49 -0700 Subject: [PATCH 3/3] docker-compose.yml: use same version of redis as deployed envs --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index d830eaf6f..b6c4c29b9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -27,7 +27,7 @@ services: volumes: - postgres-data:/var/lib/postgresql/data redis: - image: redis:3 + image: redis:7 command: redis-server ports: - 6379:6379