Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

okcomputer: Sidekiq worker count check #2216

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions config/initializers/okcomputer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,106 @@ def check

OkComputer::Registry.register 'ruby_version', OkComputer::RubyVersionCheck.new

# confirm that the expected number of sidekiq worker processes and threads are running
class SidekiqWorkerCountCheck < OkComputer::Check
class ExpectedEnvVarMissing < StandardError; end

def check
actual_local_sidekiq_processes = fetch_local_sidekiq_processes
actual_local_sidekiq_process_count = actual_local_sidekiq_processes.size
actual_local_total_concurrency = actual_local_sidekiq_processes.sum { |process| process['concurrency'] }

error_list = calculate_error_list(actual_local_sidekiq_process_count: actual_local_sidekiq_process_count,
actual_local_total_concurrency: actual_local_total_concurrency)

if error_list.empty?
mark_message "Sidekiq worker counts as expected on this VM: #{actual_local_sidekiq_process_count} worker " \
"processes, #{actual_local_total_concurrency} concurrent worker threads total."
else
mark_message error_list.join(' ')
mark_failure
end
rescue ExpectedEnvVarMissing => e
mark_message e.message
mark_failure
end

private

# @return [Array<Sidekiq::Process>] one Sidekiq::Process object for each worker management
# process currently running on _this_ VM
def fetch_local_sidekiq_processes
fetch_global_sidekiq_process_list.select do |p|
p['hostname'] == Socket.gethostname
end
end

# @return [Array<Sidekiq::Process>] one Sidekiq::Process object for each worker management
# process currently running on _all_ worker VMs
def fetch_global_sidekiq_process_list
# Sidekiq::ProcessSet#each doesn't return an Enumerator, it just loops and calls the block it's passed
[].tap do |pset_list|
Sidekiq::ProcessSet.new.each { |process| pset_list << process }
end
end

# the number of concurrent Sidekiq worker threads per process is set in config/sidekiq.yml
def expected_sidekiq_proc_concurrency(proc_num: nil)
config_filename = proc_num.present? ? "../../shared/config/sidekiq#{proc_num}.yml" : 'config/sidekiq.yml'
sidekiq_config = YAML.safe_load(Rails.root.join(config_filename).read, permitted_classes: [Symbol])
sidekiq_config[:concurrency]
end

# puppet runs a number of sidekiq processes using systemd, exposing the expected process count via env var
def expected_sidekiq_process_count
@expected_sidekiq_process_count ||= Integer(ENV.fetch('EXPECTED_SIDEKIQ_PROC_COUNT'))
rescue StandardError => e
err_description = 'Error retrieving EXPECTED_SIDEKIQ_PROC_COUNT and parsing to int. ' \
"ENV['EXPECTED_SIDEKIQ_PROC_COUNT']=#{ENV.fetch('EXPECTED_SIDEKIQ_PROC_COUNT', nil)}"
Rails.logger.error("#{err_description} -- #{e.message} -- #{e.backtrace}")
raise ExpectedEnvVarMissing, err_description
end

def expected_local_total_concurrency
# Existence of config/sidekiq.yml indicates a single config for all sidekiq processes. otherwise, each of
# the sidekiq processes, 1 through EXPECTED_SIDEKIQ_PROC_COUNT, will have its own config file.
# The number of sidekiq[N].yml files may not match the number of sidekiq processes if custom_execstart=false
# in puppet config.
@expected_local_total_concurrency ||=
if File.exist?('config/sidekiq.yml')
expected_sidekiq_process_count * expected_sidekiq_proc_concurrency
else
(1..expected_sidekiq_process_count).sum { |n| expected_sidekiq_proc_concurrency(proc_num: n) }
end
end

def calculate_error_list(actual_local_sidekiq_process_count:, actual_local_total_concurrency:)
error_list = []

if actual_local_sidekiq_process_count > expected_sidekiq_process_count
error_list << <<~ERR_TXT
Actual Sidekiq worker process count (#{actual_local_sidekiq_process_count}) on this VM is greater than \
expected (#{expected_sidekiq_process_count}). Check for stale Sidekiq processes (e.g. from old deployments). \
It's also possible that some worker threads are finishing WIP that started before a Sidekiq restart, e.g. as \
happens when long running job spans app deployment. Use your judgement when deciding whether to kill an old process.
ERR_TXT
end
if actual_local_sidekiq_process_count < expected_sidekiq_process_count
error_list << "Actual Sidekiq worker management process count (#{actual_local_sidekiq_process_count}) on " \
"this VM is less than expected (#{expected_sidekiq_process_count})."
end
if actual_local_total_concurrency != expected_local_total_concurrency
error_list << "Actual worker thread count on this VM is #{actual_local_total_concurrency}, but " \
"expected local total Sidekiq concurrency is #{expected_local_total_concurrency}."
end

error_list
end
end

# we don't want to register this on non-worker boxes, because it only tracks local worker processes
OkComputer::Registry.register 'sidekiq_worker_count', SidekiqWorkerCountCheck.new if worker_host?

# ------------------------------------------------------------------------------

# NON-CRUCIAL (Optional) checks, avail at /status/<name-of-check>
Expand Down
1 change: 0 additions & 1 deletion config/settings.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ zip_storage: '/tmp' # override in #{RAILS_ENV}.yml
replication:
audit_should_backfill: false

total_worker_count: 117 # for okcomputer endpoint
minimum_subfolder_count: 1 # for okcomputer pres-cat mount check, verifies the storage_trunk has at least this many subfolders
# NOTE: when null, no minimum check is performed, and this can be overriden per environment as needed
worker_hostnames: # used for OK computer checks
Expand Down
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ services:
volumes:
- postgres-data:/var/lib/postgresql/data
redis:
image: redis:3
image: redis:7
command: redis-server
ports:
- 6379:6379
Expand Down