From 380c876e8cb7bdf2042a2d614ac23f70ca03d57e Mon Sep 17 00:00:00 2001 From: Mike Date: Wed, 8 May 2024 09:39:52 -0400 Subject: [PATCH] Added stuck uploaded check --- lib/periodic_jobs.rb | 2 +- .../vba_documents/slack_expired_notifier.rb | 72 --------- .../vba_documents/slack_status_notifier.rb | 144 ++++++++++++++++++ .../sidekiq/vba_documents/upload_scanner.rb | 10 -- .../sidekiq/slack_expired_notifier_spec.rb | 68 ++++++++- 5 files changed, 209 insertions(+), 87 deletions(-) delete mode 100644 modules/vba_documents/app/sidekiq/vba_documents/slack_expired_notifier.rb create mode 100644 modules/vba_documents/app/sidekiq/vba_documents/slack_status_notifier.rb diff --git a/lib/periodic_jobs.rb b/lib/periodic_jobs.rb index 70bb5c94a32..a002182405c 100644 --- a/lib/periodic_jobs.rb +++ b/lib/periodic_jobs.rb @@ -152,7 +152,7 @@ # Monthly report of benefits intake submissions mgr.register('0 2,8,9,11,13,15,17,19,21 * * 1-5', 'VBADocuments::SlackNotifier') # Notifies slack channel if BI uploads start to expire with no S3 file uploads provided - mgr.register('15 * * * *', 'VBADocuments::SlackExpiredNotifier') + mgr.register('15 * * * *', 'VBADocuments::SlackStatusNotifier') # Notifies slack channel if certain benefits states get stuck mgr.register('0 2,9,16 * * 1-5', 'VBADocuments::FlipperStatusAlert') # Checks status of Flipper features expected to be enabled and alerts to Slack if any are not enabled diff --git a/modules/vba_documents/app/sidekiq/vba_documents/slack_expired_notifier.rb b/modules/vba_documents/app/sidekiq/vba_documents/slack_expired_notifier.rb deleted file mode 100644 index 036b85ef4a3..00000000000 --- a/modules/vba_documents/app/sidekiq/vba_documents/slack_expired_notifier.rb +++ /dev/null @@ -1,72 +0,0 @@ -# frozen_string_literal: true - -require 'sidekiq' - -module VBADocuments - class SlackExpiredNotifier - include Sidekiq::Job - include ActionView::Helpers::DateHelper - - # Only retry for ~30 minutes since the job is run every hour - sidekiq_options retry: 5, unique_for: 1.hour - - # Number of hours in the past to query for expired uploads - LOOKBACK_HOURS = 1 - - # Expired threshold in percent to trigger notification - EXPIRED_THRESHOLD = 3.0 - - # A large number of expired uploads could indicate an s3 issue, for example in the past a misconfigured s3 key - # allowed consumers to successfully request s3 upload urls and tracking GUIDs, but they not able to push their - # files to s3 resulting in a large number of expired upload submissions - def perform - return unless Settings.vba_documents.slack.enabled - - created_at_range = LOOKBACK_HOURS.hours.ago..DateTime.now - new_count = VBADocuments::UploadSubmission.where(created_at: created_at_range).count - expired_count = VBADocuments::UploadSubmission.where(status: 'expired', - created_at: created_at_range).count - percent_expired = (expired_count.to_f / new_count) * 100.0 - - if percent_expired > EXPIRED_THRESHOLD - message_time = created_at_range.first.in_time_zone('America/New_York') - .strftime('%Y-%m-%d %I:%M:%S %p %Z') - fail_rate = ActiveSupport::NumberHelper.number_to_percentage(percent_expired, precision: 1) - message = "#{expired_count}(#{fail_rate}) " \ - "out of #{new_count} Benefits Intake uploads created since #{message_time} " \ - 'have expired with no consumer uploads to S3' \ - "\nThis could indicate an S3 issue impacting consumers." - - notify_slack(message, consumer_details(created_at_range)) - end - end - - def consumer_details(created_at_range) - # break out expired rates by consumer - consumer_all_counts = VBADocuments::UploadSubmission.where(created_at: created_at_range) - .group(:consumer_name).count - consumer_exp_counts = VBADocuments::UploadSubmission.where(status: 'expired', created_at: created_at_range) - .group(:consumer_name).count - - # calc expired rate% by consumer - exp_rate = consumer_all_counts.map { |name, count| [name, (consumer_exp_counts[name].to_f / count) * 100] } - - # sort by Consumer expired Rate, build slack reporting string - slack_details = "\n\t(Consumer, Expired Rate)\n" - exp_rate.sort_by { |e| -e[1] }.each do |cr| - slack_details += "\t#{cr[0]}: #{ActiveSupport::NumberHelper.number_to_percentage(cr[1], precision: 1)}\n" - end - - slack_details - end - - def notify_slack(message, results) - slack_details = { - class: self.class.name, - alert: message, - details: results - } - VBADocuments::Slack::Messenger.new(slack_details).notify! - end - end -end diff --git a/modules/vba_documents/app/sidekiq/vba_documents/slack_status_notifier.rb b/modules/vba_documents/app/sidekiq/vba_documents/slack_status_notifier.rb new file mode 100644 index 00000000000..c7ee0eadcbc --- /dev/null +++ b/modules/vba_documents/app/sidekiq/vba_documents/slack_status_notifier.rb @@ -0,0 +1,144 @@ +# frozen_string_literal: true + +require 'sidekiq' + +# Triggers slack notification for Upload Submissions stuck in specific statuses if the counts exceed +# thresholds that could indicate an issue. Not really intended to track individual Upload Submissions, +# but instead is intended to find wide scale problems potentially impacting all consumers. + +module VBADocuments + class SlackStatusNotifier + include Sidekiq::Job + include ActionView::Helpers::DateHelper + + # Only retry for ~30 minutes since the job is run every hour + sidekiq_options retry: 5, unique_for: 1.hour + + # Number of hours in the past to query for expired uploads, expired is a + # terminitating status that never clears and we only want to report on recents + EXPIRED_LOOKBACK_HOURS = 1 + + # Expired threshold in percent to trigger notification + EXPIRED_THRESHOLD = 3.0 + + # Upload submisions in uploaded status for more than 100 minutes are considered stuck + # Between UploadScanner and RunUnsuccessfulSubmissions, 100 should give us ~3 submission attempts + # 15 minutes for consumer to push file + # 2 minutes for UploadScanner to pickup s3 upload + # 5 minutes run time for attempt #1 + # 35 minutes wait\run for upload processor sidekiq retry 1 + # 35 minutes wait\run for upload processor sidekiq retry 2 + UPLOADED_AGE_THRESHOLD_MINUTES = 100 + + # Consumer's who could have their Submission'ss upload to EMMS\CM delayed + DELAYED_EVIDENCE_CONSUMERS = %w[appeals_api_nod_evidence_submission appeals_api_sc_evidence_submission].freeze + + def perform + return unless Settings.vba_documents.slack.enabled + + report_expired + report_uploaded + end + + def report_expired + # A large number of expired uploads could indicate an s3 issue, for example in the past a misconfigured s3 key + # allowed consumers to successfully request s3 upload urls and tracking GUIDs, but they not able to push their + # files to s3 resulting in a large number of expired upload submissions + created_at_range = EXPIRED_LOOKBACK_HOURS.hours.ago..DateTime.now + new_count = VBADocuments::UploadSubmission.where(created_at: created_at_range).count + expired_count = VBADocuments::UploadSubmission.where(status: 'expired', + created_at: created_at_range).count + percent_expired = (expired_count.to_f / new_count) * 100.0 + + if percent_expired > EXPIRED_THRESHOLD + message_time = created_at_range.first.in_time_zone('America/New_York') + .strftime('%Y-%m-%d %I:%M:%S %p %Z') + fail_rate = ActiveSupport::NumberHelper.number_to_percentage(percent_expired, precision: 1) + message = "#{expired_count}(#{fail_rate}) " \ + "out of #{new_count} Benefits Intake uploads created since #{message_time} " \ + 'have expired with no consumer uploads to S3' \ + "\nThis could indicate an S3 issue impacting consumers." + + notify_slack(message, expired_details_rate_by_consumer(created_at_range)) + end + rescue => e + notify_slack("'Expired' status notifier exception: #{e.class}", e.message) + raise e + end + + def expired_details_rate_by_consumer(created_at_range) + # break out expired rates by consumer + consumer_all_counts = VBADocuments::UploadSubmission.where(created_at: created_at_range) + .group(:consumer_name).count + consumer_exp_counts = VBADocuments::UploadSubmission.where(created_at: created_at_range, status: 'expired') + .group(:consumer_name).count + + # calc expired rate% by consumer + exp_rate = consumer_all_counts.map { |name, count| [name, (consumer_exp_counts[name].to_f / count) * 100] } + + # sort by Consumer expired Rate, build slack reporting string + slack_details = "\n\t(Consumer, Expired Rate)\n" + exp_rate.sort_by { |e| -e[1] }.each do |cr| + slack_details += "\t#{cr[0]}: #{ActiveSupport::NumberHelper.number_to_percentage(cr[1], precision: 1)}\n" + end + + slack_details + end + + def report_uploaded + # Upload Submisions stuck in uploaded status indicate the consumers are able to push their + # file to s3 but we are unable to process it. + uss = VBADocuments::UploadSubmission.where(status: 'uploaded', + created_at: ..UPLOADED_AGE_THRESHOLD_MINUTES.minutes.ago) + .order(:created_at) + if Flipper.enabled?(:decision_review_delay_evidence) + # if delay appeal evidence is on(max delay of 24 hours), look for any appeal's evidence + # uploads that have exceeded the max delay of 24 hours plus the normal time we allow + # the UploadScanner Sidekiq job to pickup the submission and upload to EMMS\CM + max_age = (24.hours + UPLOADED_AGE_THRESHOLD_MINUTES.minutes).ago + uss = uss.where.not(consumer_name: DELAYED_EVIDENCE_CONSUMERS) + uss = uss.or(VBADocuments::UploadSubmission.where(consumer_name: DELAYED_EVIDENCE_CONSUMERS, + status: 'uploaded', + created_at: ..max_age)) + end + + if uss.size.positive? + message = "#{uss.size} Benefits Intake Submissions have been in the uploaded status " \ + 'for longer than expected. This could indicate an issue with Benefits Intake or Central Mail' + notify_slack(message, uploaded_details(uss)) + end + rescue => e + notify_slack("'Uploaded' status notifier exception: #{e.class}", e.message) + raise e + end + + def uploaded_details(upload_submissions) + # sort by age in uploaded and kick out some details + uss_sorted = upload_submissions.map do |us| + upload_age = Time.zone.now - us.created_at + [us.guid, upload_age, us.metadata['upload_timeout_error_count'], us.metadata['size'], us.detail] + end + + # gather details and report + slack_details = "Oldest 20 Stuck Upload Submissions\n" + slack_details += "\n\t(Guid, Age(Hours:Minutes), upload retry count\n, upload size), detail\n" + uss_sorted.first(50).each do |us| + slack_details += "\t#{us[0]} " \ + "#{us[1].to_i / 3600}:#{format('%02d', (us[1] / 60 % 60).to_i)} " \ + "#{us[2] || 0} " \ + "#{ActiveSupport::NumberHelper.number_to_delimited(us[3])} " \ + "#{us[4]}\n" + end + slack_details + end + + def notify_slack(message, results) + slack_details = { + class: self.class.name, + alert: message, + details: results + } + VBADocuments::Slack::Messenger.new(slack_details).notify! + end + end +end diff --git a/modules/vba_documents/app/sidekiq/vba_documents/upload_scanner.rb b/modules/vba_documents/app/sidekiq/vba_documents/upload_scanner.rb index 412090b313a..aeffa9f9c3c 100644 --- a/modules/vba_documents/app/sidekiq/vba_documents/upload_scanner.rb +++ b/modules/vba_documents/app/sidekiq/vba_documents/upload_scanner.rb @@ -63,13 +63,3 @@ def log_details(upload) end end end -# def again -# load('./modules/vba_documents/app/sidekiq/vba_documents/upload_processor.rb') -# load('./modules/vba_documents/app/sidekiq/vba_documents/upload_scanner.rb') -# g = "0f35783e-4b33-4e22-b64f-9a50eb391b49" -# -# u = UploadSubmission.find_by_guid(@g) -# u.status = 'pending' -# u.save -# UploadScanner.new.perform -# end diff --git a/modules/vba_documents/spec/sidekiq/slack_expired_notifier_spec.rb b/modules/vba_documents/spec/sidekiq/slack_expired_notifier_spec.rb index 5c8eab31613..a29a8303e40 100644 --- a/modules/vba_documents/spec/sidekiq/slack_expired_notifier_spec.rb +++ b/modules/vba_documents/spec/sidekiq/slack_expired_notifier_spec.rb @@ -1,9 +1,9 @@ # frozen_string_literal: true require 'rails_helper' -require './modules/vba_documents/app/sidekiq/vba_documents/slack_expired_notifier' +require './modules/vba_documents/app/sidekiq/vba_documents/slack_status_notifier' -RSpec.describe 'VBADocuments::SlackExpiredNotifier', type: :job do +RSpec.describe 'VBADocuments::SlackStatusNotifier', type: :job do let(:slack_messenger) { instance_double(VBADocuments::Slack::Messenger.class_name) } let(:slack_enabled) { true } @@ -11,7 +11,7 @@ allow(VBADocuments::Slack::Messenger).to receive(:new).and_return(slack_messenger) allow(slack_messenger).to receive(:notify!) allow(Settings.vba_documents.slack).to receive(:enabled).and_return(slack_enabled) - @job = VBADocuments::SlackExpiredNotifier.new + @job = VBADocuments::SlackStatusNotifier.new @results = nil end @@ -56,7 +56,7 @@ expect(slack_messenger).to have_received(:notify!).once expect(VBADocuments::Slack::Messenger).to have_received(:new).with( { - class: 'VBADocuments::SlackExpiredNotifier', + class: 'VBADocuments::SlackStatusNotifier', alert: "1(50.0%) out of 2 Benefits Intake uploads created since #{t} " \ 'have expired with no consumer uploads to S3' \ "\nThis could indicate an S3 issue impacting consumers.", @@ -65,4 +65,64 @@ ) end end + + context 'when no new uploaded uploads were found' do + before do + VBADocuments::UploadSubmission.new(status: 'pending', consumer_name: 'sometech2').save! + VBADocuments::UploadSubmission.new(status: 'uploaded', consumer_name: 'sometech').save! + VBADocuments::UploadSubmission.new(status: 'received', consumer_name: 'sometech').save! + VBADocuments::UploadSubmission.new(status: 'processing', consumer_name: 'sometech').save! + VBADocuments::UploadSubmission.new(status: 'success', consumer_name: 'sometech').save! + VBADocuments::UploadSubmission.new(status: 'vbms', consumer_name: 'sometech').save! + VBADocuments::UploadSubmission.new(status: 'error', consumer_name: 'sometech').save! + VBADocuments::UploadSubmission.new(status: 'expired', consumer_name: 'sometech', created_at: 5.days.ago).save! + end + + it 'does nothing' do + @results = @job.perform + expect(slack_messenger).not_to have_received(:notify!) + end + end + + context 'when uploaded uploads are found' do + before do + # should pick up since its 5.5 hours old + @us_uploaded = VBADocuments::UploadSubmission.new(status: 'uploaded', consumer_name: 'sometech2', + created_at: 5.5.hours.ago) + @us_uploaded.save! + + # should not pick up wrong status + VBADocuments::UploadSubmission.new(status: 'vbms', consumer_name: 'vagov').save! + + # should not pick up, uploaded status but not old enough yet + VBADocuments::UploadSubmission.new(status: 'uploaded', consumer_name: 'sometech2').save! + + # should not pick up, uploaded status, but since appeal evidense sub are not considered + # stuck until they hit their potential max delay of 24 hours plus 100 minutes + VBADocuments::UploadSubmission.new(status: 'uploaded', + consumer_name: 'appeals_api_nod_evidence_submission', + created_at: 5.5.hours.ago).save! + + @es_us_uploaded = VBADocuments::UploadSubmission.new(status: 'uploaded', + consumer_name: 'appeals_api_nod_evidence_submission', + created_at: 27.hours.ago) + @es_us_uploaded.save! + end + + it 'warns using slack' do + @results = @job.perform + expect(slack_messenger).to have_received(:notify!).once + expect(VBADocuments::Slack::Messenger).to have_received(:new).with( + { + class: 'VBADocuments::SlackStatusNotifier', + alert: '2 Benefits Intake Submissions have been in the uploaded status for longer than expected. ' \ + 'This could indicate an issue with Benefits Intake or Central Mail', + details: "Oldest 20 Stuck Upload Submissions\n\n\t(Guid, Age(Hours:Minutes), " \ + "upload retry count\n, upload size), detail\n" \ + "\t#{@es_us_uploaded.guid} 27:00 0 \n" \ + "\t#{@us_uploaded.guid} 5:30 0 \n" + } + ) + end + end end