diff --git a/lib/prometheus_exporter/instrumentation/delayed_job.rb b/lib/prometheus_exporter/instrumentation/delayed_job.rb index 01501ba..2634459 100644 --- a/lib/prometheus_exporter/instrumentation/delayed_job.rb +++ b/lib/prometheus_exporter/instrumentation/delayed_job.rb @@ -13,9 +13,11 @@ def register_plugin(client: nil) callbacks do |lifecycle| lifecycle.around(:invoke_job) do |job, *args, &block| max_attempts = Delayed::Worker.max_attempts + failed_count = Delayed::Job.where('queue = ? AND last_error is not null', job.queue).count + max_failed_count = Delayed::Job.where('queue = ? AND last_error is not null and attempts >= ?', job.queue, max_attempts).count enqueued_count = Delayed::Job.where(queue: job.queue).count pending_count = Delayed::Job.where(attempts: 0, locked_at: nil, queue: job.queue).count - instrumenter.call(job, max_attempts, enqueued_count, pending_count, *args, &block) + instrumenter.call(job, max_attempts, enqueued_count, pending_count, failed_count, max_failed_count, *args, &block) end end end @@ -28,7 +30,7 @@ def initialize(client: nil) @client = client || PrometheusExporter::Client.default end - def call(job, max_attempts, enqueued_count, pending_count, *args, &block) + def call(job, max_attempts, enqueued_count, pending_count, failed_count, max_failed_count, *args, &block) success = false start = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC) attempts = job.attempts + 1 # Increment because we're adding the current attempt @@ -46,6 +48,8 @@ def call(job, max_attempts, enqueued_count, pending_count, *args, &block) duration: duration, attempts: attempts, max_attempts: max_attempts, + failed: failed_count, + max_failed: max_failed_count, enqueued: enqueued_count, pending: pending_count ) diff --git a/lib/prometheus_exporter/server/delayed_job_collector.rb b/lib/prometheus_exporter/server/delayed_job_collector.rb index 854c7db..625f860 100644 --- a/lib/prometheus_exporter/server/delayed_job_collector.rb +++ b/lib/prometheus_exporter/server/delayed_job_collector.rb @@ -12,6 +12,8 @@ def initialize @delayed_job_attempts_summary = nil @delayed_jobs_enqueued = nil @delayed_jobs_pending = nil + @delayed_jobs_failed = nil + @delayed_jobs_max_failed = nil end def type @@ -34,13 +36,15 @@ def collect(obj) @delayed_job_attempts_summary.observe(obj["attempts"], counter_labels) if obj["success"] @delayed_jobs_enqueued.observe(obj["enqueued"], gauge_labels) @delayed_jobs_pending.observe(obj["pending"], gauge_labels) + @delayed_jobs_failed.observe(obj["failed"], labels) + @delayed_jobs_max_failed.observe(obj["max_failed"], labels) end def metrics if @delayed_jobs_total [@delayed_job_duration_seconds, @delayed_jobs_total, @delayed_failed_jobs_total, @delayed_jobs_max_attempts_reached_total, @delayed_job_duration_seconds_summary, @delayed_job_attempts_summary, - @delayed_jobs_enqueued, @delayed_jobs_pending] + @delayed_jobs_enqueued, @delayed_jobs_pending, @delayed_jobs_failed, @delayed_jobs_max_failed] else [] end @@ -67,6 +71,14 @@ def ensure_delayed_job_metrics PrometheusExporter::Metric::Gauge.new( "delayed_jobs_pending", "Number of pending delayed jobs.") + @delayed_jobs_failed = + PrometheusExporter::Metric::Gauge.new( + "delayed_jobs_failed", "Number of failed delayed jobs.") + + @delayed_jobs_max_failed = + PrometheusExporter::Metric::Gauge.new( + "delayed_jobs_max_failed", "Number of failed delayed jobs without more retries.") + @delayed_failed_jobs_total = PrometheusExporter::Metric::Counter.new( "delayed_failed_jobs_total", "Total number failed delayed jobs executed.")