Skip to content
This repository has been archived by the owner on Jul 25, 2022. It is now read-only.

Commit

Permalink
Updates to fix adding and removing a worker client node in both
Browse files Browse the repository at this point in the history
monitored and managed mode.

Fixes #1860.
Fixes #1858.
Fixes #1859.

Signed-off-by: johnsonw <[email protected]>
  • Loading branch information
johnsonw committed Jun 10, 2020
1 parent 7b6af3f commit c4bad3f
Show file tree
Hide file tree
Showing 10 changed files with 148 additions and 50 deletions.
16 changes: 14 additions & 2 deletions chroma_core/lib/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,13 @@ def satisfied(self):

class DependOn(Dependable):
def __init__(
self, stateful_object, preferred_state, acceptable_states=None, unacceptable_states=None, fix_state=None
self,
stateful_object,
preferred_state,
acceptable_states=None,
unacceptable_states=None,
fix_state=None,
skip_if_satisfied=False,
):
"""preferred_state: what we will try to put the dependency into if
it is not already in one of acceptable_states.
Expand Down Expand Up @@ -80,6 +86,8 @@ def __init__(
self.fix_state = fix_state
self.stateful_object = stateful_object

self.skip_if_satisfied = skip_if_satisfied

def __str__(self):
return "%s %s %s %s" % (self.stateful_object, self.preferred_state, self.acceptable_states, self.fix_state)

Expand All @@ -92,7 +100,11 @@ def satisfied(self):
except:
self.stateful_object.__class__._base_manager.get(pk=self.stateful_object.pk)

satisfied = depended_object.state in self.acceptable_states
if self.skip_if_satisfied:
satisfied = depended_object.state in self.stateful_object.get_current_route()
else:
satisfied = depended_object.state in self.acceptable_states

if not satisfied:
job_log.warning(
"DependOn not satisfied: %s in state %s, not one of %s (preferred %s)"
Expand Down
16 changes: 15 additions & 1 deletion chroma_core/models/client_mount.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ class LustreClientMount(DeletableStatefulObject):

states = ["unmounted", "mounted", "removed"]
initial_state = "unmounted"
skip_if_satisfied = True

def __str__(self):
return self.get_label()
Expand All @@ -42,6 +43,10 @@ def get_deps(self, state=None):
state = self.state

deps = []

if self.host.immutable_state:
return DependAll(deps)

if state == "mounted":
# Depend on this mount's host having LNet up. If LNet is stopped
# on the host, this filesystem will be unmounted first.
Expand Down Expand Up @@ -140,6 +145,7 @@ class MountLustreClientJob(StateChangeJob):
stateful_object = "lustre_client_mount"
lustre_client_mount = models.ForeignKey(LustreClientMount, on_delete=CASCADE)
state_verb = None
skip_if_satisfied = True

@classmethod
def long_description(cls, stateful_object):
Expand Down Expand Up @@ -179,6 +185,7 @@ class UnmountLustreClientMountJob(StateChangeJob):
stateful_object = "lustre_client_mount"
lustre_client_mount = models.ForeignKey(LustreClientMount, on_delete=CASCADE)
state_verb = None
skip_if_satisfied = True

@classmethod
def long_description(cls, stateful_object):
Expand Down Expand Up @@ -215,6 +222,7 @@ class RemoveLustreClientJob(StateChangeJob):
stateful_object = "lustre_client_mount"
lustre_client_mount = models.ForeignKey(LustreClientMount, on_delete=CASCADE)
state_verb = None
skip_if_satisfied = True

@classmethod
def long_description(cls, stateful_object):
Expand Down Expand Up @@ -353,5 +361,11 @@ def description(self):
def get_steps(self):
search = lambda cm: (cm.host == self.host and cm.state == "mounted")
mounted = ObjectCache.get(LustreClientMount, search)
args = dict(host=self.host, filesystems=[(m.filesystem.mount_path(), m.mountpoint) for m in mounted])
args = dict(
host=self.host,
filesystems=[
(ObjectCache.get_one(ManagedFilesystem, lambda mf, mtd=m: mf.name == mtd.filesystem).mount_path(), m.mountpoint)
for m in mounted
],
)
return [(UnmountLustreFilesystemsStep, args)]
39 changes: 21 additions & 18 deletions chroma_core/models/host.py
Original file line number Diff line number Diff line change
Expand Up @@ -797,7 +797,7 @@ class BaseSetupHostJob(NullStateChangeJob):
class Meta:
abstract = True

def _common_deps(self, lnet_state_required, lnet_acceptable_states, lnet_unacceptable_states):
def _common_deps(self):
# It really does not feel right that this is in here, but it does sort of work. These are the things
# it is dependent on so create them. Also I can't work out with today's state machine anywhere else to
# put them that works.
Expand Down Expand Up @@ -826,23 +826,13 @@ def _common_deps(self, lnet_state_required, lnet_acceptable_states, lnet_unaccep

deps = []

if self.target_object.lnet_configuration:
deps.append(
DependOn(
self.target_object.lnet_configuration,
lnet_state_required,
lnet_acceptable_states,
lnet_unacceptable_states,
)
)

if self.target_object.pacemaker_configuration:
deps.append(DependOn(self.target_object.pacemaker_configuration, "started"))

if self.target_object.ntp_configuration:
deps.append(DependOn(self.target_object.ntp_configuration, "configured"))

return DependAll(deps)
return deps


class InitialiseBlockDeviceDriversStep(Step):
Expand Down Expand Up @@ -871,7 +861,12 @@ def description(self):
return help_text["setup_managed_host_on"] % self.target_object

def get_deps(self):
return self._common_deps("lnet_up", None, None)
deps = self._common_deps()

if self.target_object.lnet_configuration:
deps.append(DependOn(self.target_object.lnet_configuration, "lnet_up"))

return DependAll(deps)

def get_steps(self):
return [(InitialiseBlockDeviceDriversStep, {"host": self.target_object})]
Expand All @@ -891,9 +886,9 @@ class Meta:
ordering = ["id"]

def get_deps(self):
# Moving out of unconfigured into lnet_unloaded will mean that lnet will start monitoring and responding to
# the state. Once we start monitoring any state other than unconfigured is acceptable.
return self._common_deps("lnet_unloaded", None, ["unconfigured"])
deps = self._common_deps()

return DependAll(deps)

def description(self):
return help_text["setup_monitored_host_on"] % self.target_object
Expand All @@ -913,14 +908,19 @@ class Meta:
ordering = ["id"]

def get_deps(self):
return self._common_deps("lnet_up", None, None)
deps = self._common_deps()

if self.target_object.lnet_configuration and not self.target_object.immutable_state:
deps.append(DependOn(self.target_object.lnet_configuration, "lnet_up"))

return DependAll(deps)

def description(self):
return help_text["setup_worker_host_on"] % self.target_object

@classmethod
def can_run(cls, host):
return host.is_managed and host.is_worker and (host.state != "unconfigured")
return host.is_worker and (host.state != "unconfigured")


class DetectTargetsStep(Step):
Expand Down Expand Up @@ -1174,6 +1174,9 @@ def description(self):
def get_deps(self):
deps = []

if self.host.immutable_state:
return DependAll(deps)

if self.host.lnet_configuration:
deps.append(DependOn(self.host.lnet_configuration, "unconfigured"))

Expand Down
29 changes: 27 additions & 2 deletions chroma_core/models/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ class Meta:
route_map = None
transition_map = None
job_class_map = None
skip_if_satisfied = False
_begin_state = None
_end_state = None

reverse_deps = {}

Expand All @@ -61,6 +64,12 @@ def set_state(self, state, intentional=False):
self.state = state
self.state_modified_at = now()

def set_begin_state(self, begin_state):
self._begin_state = begin_state

def set_end_state(self, end_state):
self._end_state = end_state

def not_state(self, state):
return list(set(self.states) - set([state]))

Expand Down Expand Up @@ -190,6 +199,12 @@ def get_route(cls, begin_state, end_state):
except KeyError:
raise SchedulingError("%s->%s not legal state transition for %s" % (begin_state, end_state, cls))

def get_current_route(self):
if self._begin_state and self._end_state:
return self.get_route(self._begin_state, self._end_state)

return []

def get_available_states(self, begin_state):
"""States which should be advertised externally (i.e. exclude states which
are used internally but don't make sense when requested externally, for example
Expand Down Expand Up @@ -385,6 +400,8 @@ def get_confirmation_string(self):
#: Whether the job can be safely cancelled
cancellable = True

skip_if_satisfied = False

class Meta:
app_label = "chroma_core"
ordering = ["id"]
Expand Down Expand Up @@ -422,13 +439,21 @@ def all_deps(self, dep_cache):
dependent_dependency.stateful_object == stateful_object
and not new_state in dependent_dependency.acceptable_states
):
dependent_deps.append(DependOn(dependent, dependent_dependency.fix_state))
if dependent.state != dependent_dependency.fix_state:
dependent.set_begin_state(dependent.state)
dependent.set_end_state(dependent_dependency.fix_state)

dependent_deps.append(
DependOn(
dependent, dependent_dependency.fix_state, skip_if_satisfied=dependent.skip_if_satisfied
)
)

return DependAll(
DependAll(dependent_deps),
dep_cache.get(self),
dep_cache.get(stateful_object, new_state),
DependOn(stateful_object, self.old_state),
DependOn(stateful_object, self.old_state, skip_if_satisfied=self.skip_if_satisfied),
)
else:
return dep_cache.get(self)
Expand Down
3 changes: 3 additions & 0 deletions chroma_core/models/lnet_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,7 @@ def get_steps(self):
class EnableLNetJob(NullStateChangeJob):
target_object = models.ForeignKey(LNetConfiguration, on_delete=CASCADE)
state_transition = StateChangeJob.StateTransition(LNetConfiguration, "unconfigured", "lnet_unloaded")
skip_if_satisfied = True

class Meta:
app_label = "chroma_core"
Expand Down Expand Up @@ -320,6 +321,7 @@ class LoadLNetJob(LNetStateChangeJob):
stateful_object = "lnet_configuration"
lnet_configuration = models.ForeignKey(LNetConfiguration, on_delete=CASCADE)
state_verb = "Load LNet"
skip_if_satisfied = True

display_group = Job.JOB_GROUPS.COMMON
display_order = 30
Expand Down Expand Up @@ -359,6 +361,7 @@ class StartLNetJob(LNetStateChangeJob):
stateful_object = "lnet_configuration"
lnet_configuration = models.ForeignKey(LNetConfiguration, on_delete=CASCADE)
state_verb = "Start LNet"
skip_if_satisfied = True

display_group = Job.JOB_GROUPS.COMMON
display_order = 40
Expand Down
2 changes: 1 addition & 1 deletion chroma_core/models/stratagem.py
Original file line number Diff line number Diff line change
Expand Up @@ -679,7 +679,7 @@ def get_steps(self):
client_host = ManagedHost.objects.get(
Q(server_profile_id="stratagem_client") | Q(server_profile_id="stratagem_existing_client")
)
client_mount = LustreClientMount.objects.get(host_id=client_host.id, filesystem=filesystem.name)
client_mount = LustreClientMount.objects.get(host_id=client_host.id, filesystem=self.filesystem.name)

return [
(
Expand Down
Loading

0 comments on commit c4bad3f

Please sign in to comment.