[rescheduling] Add mutex #188

Draft · wants to merge 1 commit into stable/yoga-m3
19 changes: 14 additions & 5 deletions octavia_f5/api/drivers/f5_driver/arbiter.py
@@ -14,15 +14,15 @@

import abc

from octavia.common import base_taskflow
from oslo_config import cfg
from oslo_log import log as logging
from taskflow import flow
from taskflow.listeners import logging as tf_logging
from taskflow.patterns import linear_flow, unordered_flow

from octavia.common import base_taskflow
from octavia_f5.controller.worker.tasks import network_tasks
from octavia_f5.api.drivers.f5_driver.tasks import reschedule_tasks
from octavia_f5.controller.worker.tasks import network_tasks
from octavia_f5.utils import driver_utils

CONF = cfg.CONF
@@ -83,13 +83,22 @@ def get_reschedule_flow(self) -> flow.Flow:
update_vip_sub_flow = linear_flow.Flow("update-vip-sub-flow")
update_vip_sub_flow.add(get_vip_port_task, update_vip_task, all_selfips_task, update_aap_task)

# update loadbalancer, amphora and vip and invalidate cache can be run parallelized
# update load balancer, amphora and vip and invalidate cache can be run in parallel
update_database_flow = unordered_flow.Flow("database-update-flow")
update_database_flow.add(rewrite_loadbalancer_task, rewrite_amphora_task, update_vip_sub_flow,
invalidate_cache_task)

# We want to hold a lock between adding/removing the load balancer and updating the database, so that the
# load balancer won't be seen as an orphan (because it's on the wrong worker according to the DB) and cleaned
# up by the worker loop.
loadbalancer_lock_task = reschedule_tasks.LockLoadBalancer()
loadbalancer_release_task = reschedule_tasks.ReleaseLoadBalancer()

switchover_load_balancer_flow = linear_flow.Flow('switchover-flow')
switchover_load_balancer_flow.add(loadbalancer_lock_task, add_remove_loadbalancer_flow,
update_database_flow, loadbalancer_release_task)

reschedule_flow = linear_flow.Flow('reschedule-flow')
reschedule_flow.add(get_loadbalancer_task, get_old_agent_task, create_selfips_task,
wait_for_selfip_task, add_remove_loadbalancer_flow,
update_database_flow)
wait_for_selfip_task, switchover_load_balancer_flow)
return reschedule_flow
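
The switchover sub-flow above brackets the add/remove and database work between LockLoadBalancer and ReleaseLoadBalancer, relying on taskflow to call revert() on the lock task when anything downstream fails. A minimal standalone sketch of that bracketing pattern, independent of the PR's code (Lock, Work, Release and lb_id are illustrative names only):

from taskflow import engines, task
from taskflow.patterns import linear_flow


class Lock(task.Task):
    def execute(self, lb_id):
        print("acquire lock for %s" % lb_id)

    def revert(self, lb_id, **kwargs):
        # Called automatically if any later task in the flow fails.
        print("release lock for %s (revert)" % lb_id)


class Work(task.Task):
    def execute(self, lb_id):
        raise RuntimeError("simulated switchover failure")


class Release(task.Task):
    def execute(self, lb_id):
        print("release lock for %s" % lb_id)


flow = linear_flow.Flow("switchover-flow").add(Lock(), Work(), Release())
try:
    # The store supplies the 'lb_id' requirement to every task in the flow.
    engines.run(flow, store={"lb_id": "lb-1"})
except RuntimeError:
    # Work failed, so Release never ran, but the engine invoked Lock.revert().
    pass

Because Work raises, Release never runs and the engine reverts Lock, which mirrors how both ReleaseLoadBalancer.execute() and LockLoadBalancer.revert() in this PR clear vrrp_interface again.
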
30 changes: 27 additions & 3 deletions octavia_f5/api/drivers/f5_driver/tasks/reschedule_tasks.py
@@ -14,14 +14,14 @@

from abc import ABCMeta

from octavia.common import data_models as models
from octavia.db import api as db_apis
from oslo_config import cfg
from oslo_log import log as logging
from taskflow import task
from taskflow.types import failure

from octavia.common import constants
from octavia.common import data_models as models
from octavia.db import api as db_apis
from octavia_f5.common import constants as f5_const
from octavia_f5.db import repositories as repo

LOG = logging.getLogger(__name__)
@@ -37,6 +37,30 @@ def __init__(self, **kwargs):
self._amphora_repo = repo.AmphoraRepository()


class LockLoadBalancer(RescheduleTasks):
def execute(self, loadbalancer_id):
LOG.debug("Locking load balancer: %s ", loadbalancer_id)
# Fail if LB already locked. We probably don't want to wait until it's unlocked, since that would mean we're
# doing two migrations back-to-back, which we most definitely don't want.
if self._amphora_repo.get(db_apis.get_session(), id=loadbalancer_id) \
.vrrp_interface == f5_const.RESCHEDULING_LOCK_STRING:
return failure.Failure(causes=["Cannot acquire lock for load balancer {}. It's already being rescheduled."])
self._amphora_repo.update(db_apis.get_session(), loadbalancer_id,
vrrp_interface=f5_const.RESCHEDULING_LOCK_STRING)
# TODO temp - Testing revert
return failure.Failure(causes=["Testing revert"])

def revert(self, loadbalancer_id, **kwargs):
self._amphora_repo.update(db_apis.get_session(), loadbalancer_id, vrrp_interface=None)


class ReleaseLoadBalancer(RescheduleTasks):
def execute(self, loadbalancer_id):
LOG.debug("Releasing load balancer: %s ", loadbalancer_id)
self._amphora_repo.update(db_apis.get_session(), loadbalancer_id, vrrp_interface=None)
# TODO error handling without rolling back the whole migration


class GetLoadBalancerByID(RescheduleTasks):
default_provides = 'load_balancer'

2 changes: 2 additions & 0 deletions octavia_f5/common/constants.py
@@ -60,6 +60,8 @@
ROLE_MASTER = 'MASTER'
ROLE_BACKUP = 'BACKUP'

RESCHEDULING_LOCK_STRING = 'RESCHEDULING'

SEGMENT = 'segment'
VIF_TYPE = 'f5'
ESD = 'esd'
3 changes: 3 additions & 0 deletions octavia_f5/controller/worker/sync_manager.py
@@ -183,6 +183,9 @@ def tenant_update(self, network_id, device=None, selfips=None, loadbalancers=None
if not loadbalancers:
raise exceptions.AS3Exception("No loadbalancers specified for tenant_update")

# If any of the load balancers is currently being rescheduled, we want to hold back the whole declaration,
# because we don't know whether the load balancer is supposed to exist on this device or not.
pass # TODO

Comment on lines +186 to +188
Collaborator:

Any reason not to use the provisioning_status?

Collaborator Author:

So far my reasoning for not using provisioning_status was that it is bound to specific values. There are in fact possible values that aren't used for load balancers so far (e.g. ALLOCATED). However, we can't use that field at all, for the following reason:
No matter what locking mechanism we use, it has to be used by the controller_worker as well when updating a load balancer. It calls status_manager.update_status after syncing the LBs. That function determines the status to set by looking at the current value of provisioning_status. If we let the controller worker change that field, the value it was set to before would be lost. The controller worker could remember the value of that field for every load balancer, but this would not be crash-safe.

It turns out there is a simple solution however: I'll use the amphora status field (which is bound to the same range of values as provisioning_status) and set it to ALLOCATED (that value is so far only used for amphora entries representing devices) while the associated load balancer is locked by either the rescheduling arbiter or the controller worker. update_status would then reset that field (unlock the LB). It can be made crash-resistant by having workers reset the field to the value of the associated LB's provisioning_status at startup, if compute_flavor matches the host this worker is assigned to.

Collaborator:

This seems pretty over-engineered to me, so you want to introduce locks to the status_manager as well as controller_worker? The status is considered a user facing property, nothing to act on.

Collaborator Author:

It's not overengineered, it's basically exactly what you proposed, just with all necessary considerations laid out.

Collaborator Author:

Unless I misunderstood your initial proposal (the one with provisioning_status)

decl = self._declaration_manager.get_declaration({network_id: loadbalancers}, self_ips)
if CONF.f5_agent.dry_run:
decl.set_action('dry-run')
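
The tenant_update hunk above leaves the rescheduling check as a bare TODO, and the review thread debates whether the lock marker should live in provisioning_status or the amphora status field instead. As a hedged sketch only, assuming the marker stays in vrrp_interface as LockLoadBalancer currently writes it, the hold-back could look roughly like this (the helper name, the early return, and how the sync manager obtains an AmphoraRepository are assumptions, not part of the PR):

from octavia.db import api as db_apis

from octavia_f5.common import constants as f5_const
from octavia_f5.db import repositories as repo


def rescheduling_in_progress(loadbalancer_ids, amphora_repo=None):
    """Return True if any of the given load balancers is locked for rescheduling.

    Checks for the same marker that LockLoadBalancer writes into the
    amphora entry's vrrp_interface column.
    """
    amphora_repo = amphora_repo or repo.AmphoraRepository()
    session = db_apis.get_session()
    for lb_id in loadbalancer_ids:
        amphora = amphora_repo.get(session, id=lb_id)
        if amphora and amphora.vrrp_interface == f5_const.RESCHEDULING_LOCK_STRING:
            return True
    return False


# Hypothetical use inside tenant_update, replacing the bare `pass  # TODO`:
#
#     if rescheduling_in_progress([lb.id for lb in loadbalancers]):
#         LOG.warning("Holding back declaration for network %s: "
#                     "a load balancer is being rescheduled", network_id)
#         return

Whether to return early, retry later, or raise at that point is exactly the design question the TODO and the review thread leave open.
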