fix: use rabbit quorum queues in lieu of ha

We define the use of quorum queues via kustomize as the default queue type for the named vhosts, but the oslo_messaging_rabbit config opt of `rabbit_ha_queues: true` was set, taking precedence. We actually do not want to use HA queues, as they are being deprecated, and will be removed in newer versions of RMQ (4.x being released EOY 2024). The use of HA queues in genestack up to this point was the result of sane but no longer ideal defaults set by openstack-helm that were carried forth. This explicitly disables rabbit_ha_queues, and then enables rabbit_quorum_queue. Removing the related rabbit vhost is required for this change prior to re-deploying a given openstack service. Example of re-deploying nova when making this change; note how we remove the queue, vhost, and user: ``` kubectl -n openstack delete queues.rabbitmq.com nova-queue kubectl -n openstack delete vhosts.rabbitmq.com nova-vhost kubectl -n openstack delete users.rabbitmq.com nova helm upgrade --install nova ./nova ``` **NOTE**: Several helm upgrades may be required due to a race condition with the operator removing the vhost. Uninstalling first may be easier, but do so carefully. Other changes: - add: `rabbit_transient_quorum_queue` which is newly available in 2024.1. We will want to begin using this to make transient queues reliable - add: `use_queue_manager` which is newly available in 2024.1. We will want to begin using this when available to de-obfuscate named queues in rabbit - add: `rabbit_interval_max` to reconnect faster after a node outage - fix: send heartbeats more frequently; clients should mark a given node as down about 30s more quickly (default was 60s) - fix: set `kombu_reconnect_delay` lower to help avoid multiple code paths not being traversed when an RMQ node goes down
rackerlabs · Jun 4, 2024 · ec0ff9e · ec0ff9e
1 parent 17fcc37
commit ec0ff9e
Show file tree

Hide file tree

Showing 7 changed files with 639 additions and 84 deletions.
diff --git a/helm-configs/cinder/cinder-helm-overrides.yaml b/helm-configs/cinder/cinder-helm-overrides.yaml
@@ -813,7 +813,25 @@ conf:
     oslo_middleware:
       enable_proxy_headers_parsing: true
     oslo_messaging_rabbit:
-      rabbit_ha_queues: true
+      # We define use of quorum queues via kustomize but this was enabling HA queues instead
+      # ha_queues are deprecated, explicitly set to false and set quorum_queue true
+      rabbit_ha_queues: false
+      rabbit_quorum_queue: true
+      # TODO: Not available until 2024.1, but once it is, we want to enable these!
+      # new feature ref; https://docs.openstack.org/releasenotes/oslo.messaging/2024.1.html
+      # rabbit_transient_quorum_queue: true
+      # use_queue_manager: true
+      # Reconnect after a node outage more quickly
+      rabbit_interval_max: 10
+      # Send more frequent heartbeats and fail unhealthy nodes faster
+      # heartbeat_timeout / heartbeat_rate / 2.0 = 30 / 3 / 2.0 = 5
+      # https://opendev.org/openstack/oslo.messaging/commit/36fb5bceabe08a982ebd52e4a8f005cd26fdf6b8
+      heartbeat_rate: 3
+      heartbeat_timeout_threshold: 30
+      # Setting lower kombu_reconnect_delay should resolve issue with HA failing when one node is down
+      # https://lists.openstack.org/pipermail/openstack-discuss/2023-April/033314.html
+      # https://review.opendev.org/c/openstack/oslo.messaging/+/866617
+      kombu_reconnect_delay: 0.5
     coordination:
       backend_url: file:///var/lib/cinder/coordination
     service_user:
@@ -880,19 +898,7 @@ conf:
       format: "%(message)s"
       datefmt: "%Y-%m-%d %H:%M:%S"
   rabbitmq:
-    # NOTE(rk760n): adding rmq policy to mirror messages from notification queues and set expiration time for the ones
-    policies:
-      - vhost: "cinder"
-        name: "ha_ttl_cinder"
-        definition:
-          # mirror messges to other nodes in rmq cluster
-          ha-mode: "all"
-          ha-sync-mode: "automatic"
-          # 70s
-          message-ttl: 70000
-        priority: 0
-        apply-to: all
-        pattern: '^(?!(amq\.|reply_)).*'
+    policies: []
   backends:
     # Those options will be written to backends.conf as-is.
     lvmdriver-1:

diff --git a/helm-configs/glance/glance-helm-overrides.yaml b/helm-configs/glance/glance-helm-overrides.yaml
@@ -264,7 +264,25 @@ conf:
     oslo_messaging_notifications:
       driver: messagingv2
     oslo_messaging_rabbit:
-      rabbit_ha_queues: true
+      # We define use of quorum queues via kustomize but this was enabling HA queues instead
+      # ha_queues are deprecated, explicitly set to false and set quorum_queue true
+      rabbit_ha_queues: false
+      rabbit_quorum_queue: true
+      # TODO: Not available until 2024.1, but once it is, we want to enable these!
+      # new feature ref; https://docs.openstack.org/releasenotes/oslo.messaging/2024.1.html
+      # rabbit_transient_quorum_queue: true
+      # use_queue_manager: true
+      # Reconnect after a node outage more quickly
+      rabbit_interval_max: 10
+      # Send more frequent heartbeats and fail unhealthy nodes faster
+      # heartbeat_timeout / heartbeat_rate / 2.0 = 30 / 3 / 2.0 = 5
+      # https://opendev.org/openstack/oslo.messaging/commit/36fb5bceabe08a982ebd52e4a8f005cd26fdf6b8
+      heartbeat_rate: 3
+      heartbeat_timeout_threshold: 30
+      # Setting lower kombu_reconnect_delay should resolve issue with HA failing when one node is down
+      # https://lists.openstack.org/pipermail/openstack-discuss/2023-April/033314.html
+      # https://review.opendev.org/c/openstack/oslo.messaging/+/866617
+      kombu_reconnect_delay: 0.5
     oslo_policy:
       policy_file: /etc/glance/policy.yaml
     cors: {}
@@ -358,19 +376,7 @@ conf:
     user_domain_id =
     {{- end -}}
   rabbitmq:
-    # NOTE(rk760n): adding rmq policy to mirror messages from notification queues and set expiration time for the ones
-    policies:
-      - vhost: "glance"
-        name: "ha_ttl_glance"
-        definition:
-          # mirror messges to other nodes in rmq cluster
-          ha-mode: "all"
-          ha-sync-mode: "automatic"
-          # 70s
-          message-ttl: 70000
-        priority: 0
-        apply-to: all
-        pattern: '^(?!(amq\.|reply_)).*'
+    policies: []
 
 network:
   api:

diff --git a/helm-configs/heat/heat-helm-overrides.yaml b/helm-configs/heat/heat-helm-overrides.yaml
@@ -368,7 +368,25 @@ conf:
     oslo_middleware:
       enable_proxy_headers_parsing: true
     oslo_messaging_rabbit:
-      rabbit_ha_queues: True
+      # We define use of quorum queues via kustomize but this was enabling HA queues instead
+      # ha_queues are deprecated, explicitly set to false and set quorum_queue true
+      rabbit_ha_queues: false
+      rabbit_quorum_queue: true
+      # TODO: Not available until 2024.1, but once it is, we want to enable these!
+      # new feature ref; https://docs.openstack.org/releasenotes/oslo.messaging/2024.1.html
+      # rabbit_transient_quorum_queue: true
+      # use_queue_manager: true
+      # Reconnect after a node outage more quickly
+      rabbit_interval_max: 10
+      # Send more frequent heartbeats and fail unhealthy nodes faster
+      # heartbeat_timeout / heartbeat_rate / 2.0 = 30 / 3 / 2.0 = 5
+      # https://opendev.org/openstack/oslo.messaging/commit/36fb5bceabe08a982ebd52e4a8f005cd26fdf6b8
+      heartbeat_rate: 3
+      heartbeat_timeout_threshold: 30
+      # Setting lower kombu_reconnect_delay should resolve issue with HA failing when one node is down
+      # https://lists.openstack.org/pipermail/openstack-discuss/2023-April/033314.html
+      # https://review.opendev.org/c/openstack/oslo.messaging/+/866617
+      kombu_reconnect_delay: 0.5
     oslo_policy:
       policy_file: /etc/heat/policy.yaml
   api_audit_map:
@@ -460,19 +478,7 @@ conf:
       datefmt: "%Y-%m-%d %H:%M:%S"
 
   rabbitmq:
-    # NOTE(rk760n): adding rmq policy to mirror messages from notification queues and set expiration time for the ones
-    policies:
-      - vhost: "heat"
-        name: "ha_ttl_heat"
-        definition:
-          # mirror messges to other nodes in rmq cluster
-          ha-mode: "all"
-          ha-sync-mode: "automatic"
-          # 70s
-          message-ttl: 70000
-        priority: 0
-        apply-to: all
-        pattern: '^(?!(amq\.|reply_)).*'
+    policies: []
 
 network:
   api:

diff --git a/helm-configs/keystone/keystone-helm-overrides.yaml b/helm-configs/keystone/keystone-helm-overrides.yaml
@@ -520,7 +520,25 @@ conf:
     oslo_messaging_notifications:
       driver: messagingv2
     oslo_messaging_rabbit:
-      rabbit_ha_queues: true
+      # We define use of quorum queues via kustomize but this was enabling HA queues instead
+      # ha_queues are deprecated, explicitly set to false and set quorum_queue true
+      rabbit_ha_queues: false
+      rabbit_quorum_queue: true
+      # TODO: Not available until 2024.1, but once it is, we want to enable these!
+      # new feature ref; https://docs.openstack.org/releasenotes/oslo.messaging/2024.1.html
+      # rabbit_transient_quorum_queue: true
+      # use_queue_manager: true
+      # Reconnect after a node outage more quickly
+      rabbit_interval_max: 10
+      # Send more frequent heartbeats and fail unhealthy nodes faster
+      # heartbeat_timeout / heartbeat_rate / 2.0 = 30 / 3 / 2.0 = 5
+      # https://opendev.org/openstack/oslo.messaging/commit/36fb5bceabe08a982ebd52e4a8f005cd26fdf6b8
+      heartbeat_rate: 3
+      # Must be set explicitly: a bare key parses as null, not the 30 used in the formula above
+      heartbeat_timeout_threshold: 30
+      # Setting lower kombu_reconnect_delay should resolve issue with HA failing when one node is down
+      # https://lists.openstack.org/pipermail/openstack-discuss/2023-April/033314.html
+      # https://review.opendev.org/c/openstack/oslo.messaging/+/866617
+      kombu_reconnect_delay: 0.5
     oslo_middleware:
       enable_proxy_headers_parsing: true
     oslo_policy:
@@ -543,19 +561,7 @@ conf:
   policy: {}
   access_rules: {}
   rabbitmq:
-    # NOTE(rk760n): adding rmq policy to mirror messages from notification queues and set expiration time for the ones
-    policies:
-      - vhost: "keystone"
-        name: "ha_ttl_keystone"
-        definition:
-          # mirror messges to other nodes in rmq cluster
-          ha-mode: "all"
-          ha-sync-mode: "automatic"
-          # 70s
-          message-ttl: 70000
-        priority: 0
-        apply-to: all
-        pattern: '^(?!(amq\.|reply_)).*'
+    policies: []
   rally_tests:
     run_tempest: false
     tests: