From 3a3240d0296d6365c6c2822d33b8a85b719056dd Mon Sep 17 00:00:00 2001 From: ricolin Date: Tue, 12 Nov 2024 13:06:51 +0800 Subject: [PATCH] [ATMOSPHERE-570] [stable/zed] allow failover on ERROR Amphora Add health_manager.failover_on_error config to allow running failover when an Amphora has expired heartbeats and is in ERROR status. ref: https://review.opendev.org/c/openstack/octavia/+/934638 --- images/octavia/Dockerfile | 2 + ...dd-config-to-allow-failover-on-error.patch | 109 ++++++++++++++++++ roles/octavia/vars/main.yml | 1 + 3 files changed, 112 insertions(+) create mode 100644 images/octavia/patches/octavia/0001-Add-config-to-allow-failover-on-error.patch diff --git a/images/octavia/Dockerfile b/images/octavia/Dockerfile index df52fb3ff..156584a93 100644 --- a/images/octavia/Dockerfile +++ b/images/octavia/Dockerfile @@ -7,6 +7,8 @@ FROM registry.atmosphere.dev/library/openstack-venv-builder:${RELEASE} AS build ARG OCTAVIA_GIT_REF=000b577f3e9c9ff7cb893e9f6e635753017a78c6 ADD --keep-git-dir=true https://opendev.org/openstack/octavia.git#${OCTAVIA_GIT_REF} /src/octavia RUN git -C /src/octavia fetch --unshallow +COPY patches/octavia /patches/octavia +RUN git -C /src/octavia apply --verbose /patches/octavia/* ADD --keep-git-dir=true https://opendev.org/openstack/ovn-octavia-provider.git#unmaintained/zed /src/ovn-octavia-provider RUN git -C /src/ovn-octavia-provider fetch --unshallow RUN --mount=type=cache,mode=0755,target=/root/.cache/pip,sharing=private < +Date: Tue, 12 Nov 2024 14:48:52 +0800 +Subject: [PATCH] Add config to allow failover on error + +Add config `failover_on_error` to allow Amphora failover with `ERROR` +status. + +Many Amphora `ERROR` states can be resolved by running failover on the LB. +Without changing the default behavior, this new config allows +environments to do failover automatically for Amphora in ERROR status. 
+ +Change-Id: Icff02f7a621cc13a8a0383e1b322f96027c421a6 +--- + octavia/common/config.py | 6 ++++++ + octavia/db/repositories.py | 13 ++++++++++--- + octavia/tests/functional/db/test_repositories.py | 16 ++++++++++++++++ + ...-error-amphora-failover-ab882982adc05f01.yaml | 9 +++++++++ + 4 files changed, 41 insertions(+), 3 deletions(-) + create mode 100644 releasenotes/notes/allow-error-amphora-failover-ab882982adc05f01.yaml + +diff --git a/octavia/common/config.py b/octavia/common/config.py +index 5fde9bd41..5ed7cf4cb 100644 +--- a/octavia/common/config.py ++++ b/octavia/common/config.py +@@ -324,6 +324,12 @@ health_manager_opts = [ + deprecated_for_removal=True, + deprecated_reason=_('This driver interface was removed.'), + deprecated_since='Victoria'), ++ cfg.BoolOpt('failover_on_error', default=False, ++ help=_('Set this to True to allow failover when amphora ' ++ 'status in Error. Beware that, try to performing ' ++ 'failover when ERROR status might not help to solve ' ++ 'the ERROR status for Amphora. ' ++ 'So use this option with caution.')), + ] + + oslo_messaging_opts = [ +diff --git a/octavia/db/repositories.py b/octavia/db/repositories.py +index b0ed8a6ab..7009e5e42 100644 +--- a/octavia/db/repositories.py ++++ b/octavia/db/repositories.py +@@ -1579,11 +1579,18 @@ class AmphoraHealthRepository(BaseRepository): + # We don't want to attempt to failover amphora that are not + # currently in the ALLOCATED or FAILOVER_STOPPED state. + # i.e. Not DELETED, PENDING_*, etc. ++ # If CONF.health_manager.failover_on_error is set, we will allow ++ # performing failover when Amphora on error status. 
++ allow_stats = [ ++ consts.AMPHORA_ALLOCATED, ++ consts.AMPHORA_FAILOVER_STOPPED ++ ] ++ if CONF.health_manager.failover_on_error: ++ allow_stats.append(consts.ERROR) ++ + allocated_amp_ids_subquery = ( + select(models.Amphora.id).where( +- models.Amphora.status.in_( +- [consts.AMPHORA_ALLOCATED, +- consts.AMPHORA_FAILOVER_STOPPED]))) ++ models.Amphora.status.in_(allow_stats))) + + # Pick one expired amphora for automatic failover + amp_health = lock_session.query( +diff --git a/octavia/tests/functional/db/test_repositories.py b/octavia/tests/functional/db/test_repositories.py +index 15d3063a5..0ed408335 100644 +--- a/octavia/tests/functional/db/test_repositories.py ++++ b/octavia/tests/functional/db/test_repositories.py +@@ -3957,6 +3957,22 @@ class AmphoraHealthRepositoryTest(BaseRepositoryTest): + self.session) + self.assertEqual(uuid, stale_amphora.amphora_id) + ++ def test_get_stale_error_amphora(self): ++ conf = self.useFixture(oslo_fixture.Config(cfg.CONF)) ++ conf.config(group='health_manager', failover_on_error=True) ++ stale_amphora = self.amphora_health_repo.get_stale_amphora( ++ self.session) ++ self.assertIsNone(stale_amphora) ++ ++ uuid = uuidutils.generate_uuid() ++ self.create_amphora(uuid) ++ self.amphora_repo.update(self.session, uuid, ++ status=constants.ERROR) ++ self.create_amphora_health(uuid) ++ stale_amphora = self.amphora_health_repo.get_stale_amphora( ++ self.session) ++ self.assertEqual(uuid, stale_amphora.amphora_id) ++ + def test_get_stale_amphora_past_threshold(self): + conf = self.useFixture(oslo_fixture.Config(cfg.CONF)) + conf.config(group='health_manager', failover_threshold=3) +diff --git a/releasenotes/notes/allow-error-amphora-failover-ab882982adc05f01.yaml b/releasenotes/notes/allow-error-amphora-failover-ab882982adc05f01.yaml +new file mode 100644 +index 000000000..abbe0dd3d +--- /dev/null ++++ b/releasenotes/notes/allow-error-amphora-failover-ab882982adc05f01.yaml +@@ -0,0 +1,9 @@ ++--- ++fixes: ++ - | ++ Add config 
`[health_manager]/failover_on_error` (disabled by default) to ++ allow the health manager to pick up any Amphora that is in ERROR ++ status and has expired heartbeats when looking for a stale Amphora to ++ fail over. Beware that performing a failover in ERROR status ++ might not help to resolve the ERROR status of the Amphora. ++ Use this option with caution. +-- +2.25.1 + diff --git a/roles/octavia/vars/main.yml b/roles/octavia/vars/main.yml index 8423aba61..ac0b3c90b 100644 --- a/roles/octavia/vars/main.yml +++ b/roles/octavia/vars/main.yml @@ -118,6 +118,7 @@ _octavia_helm_values: health_manager: controller_ip_port_list: "{{ _octavia_controller_ip_port_list | sort | join(',') }}" heartbeat_key: "{{ octavia_heartbeat_key }}" + failover_on_error: true oslo_messaging_notifications: driver: noop neutron: