From 76e6e4c8dca0028a402bd0ff6318e4972ddf1642 Mon Sep 17 00:00:00 2001 From: benk-gc Date: Thu, 24 Oct 2024 22:00:38 +0100 Subject: [PATCH] Remove dead connections from the pool on connection errors. With the introduction of Yugabyte, there is now the possibility that connections in the pool may become "dead" when new nodes are rotated into a cluster, since they end up pointing at a node which no longer exists. In this case, we can proactively remove these connections from the connection pool when we see the corresponding exceptions. As long as jobs are retryable the work will still eventually be processed - this merely prevents us from being stuck in a pathological case where the pool contains dead connections and nothing is automatically clearing them out (e.g. a deployment killing the pod). --- .rubocop.yml | 4 ++++ lib/que/adapters/active_record.rb | 22 ++++++++++++++++++++-- spec/lib/que/worker_spec.rb | 25 +++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 2 deletions(-) diff --git a/.rubocop.yml b/.rubocop.yml index 1758b32..9b4d464 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -25,3 +25,7 @@ RSpec/IndexedLet: RSpec/NestedGroups: Max: 5 + +Sequel/IrreversibleMigration: + Exclude: + - "**/*_spec.rb" diff --git a/lib/que/adapters/active_record.rb b/lib/que/adapters/active_record.rb index 6a4b02f..aea1c86 100644 --- a/lib/que/adapters/active_record.rb +++ b/lib/que/adapters/active_record.rb @@ -85,8 +85,26 @@ def add_to_transaction private - def checkout_activerecord_adapter(&block) - ::ActiveRecord::Base.connection_pool.with_connection(&block) + def checkout_activerecord_adapter + ::ActiveRecord::Base.connection_pool.with_connection do |conn| + yield conn + rescue ::PG::Error, ::ActiveRecord::StatementInvalid => e + remove_dead_connections(e) + raise + end + end + + def remove_dead_connections(exception) + # Cater for errors both from a raw connection or a connection adapter, + # since the calling code could use either. + cause = exception.is_a?(::PG::Error) ? exception : exception.cause + + return unless cause.instance_of?(::PG::UnableToSend) || + cause.instance_of?(::PG::ConnectionBad) + + ::ActiveRecord::Base.connection_pool.connections. + find { |conn| conn.owner == ActiveSupport::IsolatedExecutionState.context }. + then { |failed| failed.pool.remove(failed) } end end end diff --git a/spec/lib/que/worker_spec.rb b/spec/lib/que/worker_spec.rb index 400ec0f..8f45fc1 100644 --- a/spec/lib/que/worker_spec.rb +++ b/spec/lib/que/worker_spec.rb @@ -216,6 +216,31 @@ end end + context "when postgres raises a bad connection error while processing a job" do + before do + allow(Que).to receive(:execute). + with(:lock_job, ["default", 0]). + and_raise(PG::ConnectionBad) + + # Ensure we don't have any currently leased connections, since in a thread + # using with_connection this would never be the case (but in specs it + # sometimes is). + pool.disconnect! + end + + let(:pool) { ActiveRecord::Base.connection_pool } + + it "rescues it and returns an error" do + FakeJob.enqueue(1) + + expect(work).to eq(:postgres_error) + end + + it "removes the connection from the connection pool" do + expect { work }.to_not change { pool.connections.count }.from(0) + end + end + context "when we time out checking out a new connection" do it "rescues it and returns an error" do FakeJob.enqueue(1)