Skip to content

Commit

Permalink
Add a CRD for managing Blazar leases (#53)
Browse files Browse the repository at this point in the history
* Add initial code to call Blazar

* Improve the test coverage a little

* Lease CRD working

* Go into the Error phase on 400 errors

TODO: notify caas about the type of error
(cloud full or out of credit)

* Lease creation and deletion working + non-Blazar case

* Fix issues with tox

* Add map of size names for CAPI

* Wait until starts_at before setting non-Blazar leases to ACTIVE

* Run black

* Allow Blazar support to be disabled, even when Blazar is available

* Add metrics for the lease CRD

* Add starts_at metric for leases

* Add alert for lease phase

* Add debugging step to workflow

* Add functional tests for lease CRD

* Move tmate step earlier in job

* Small tweaks to functional test

* Move tmate step before k3s

* Allow tmate to be used at multiple steps

* Use non-conflicting CIDRs for k3s

* Move log outputting to signal handler

* Fix handling of path prefix for catalog URL

* Add tmate step back in

* Fix OpenStack client prefix calculation

* Fix catalog URL

* Fix typo in catalog URL calculation

* Use correct error message

* Fix broken tox tests

* Add unit tests for the lease CRD

---------

Co-authored-by: John Garbutt <[email protected]>
  • Loading branch information
mkjpryor and JohnGarbutt authored Aug 23, 2024
1 parent 316ea4e commit d74953a
Show file tree
Hide file tree
Showing 20 changed files with 2,905 additions and 30 deletions.
38 changes: 26 additions & 12 deletions .github/workflows/functional.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,35 @@ on:

jobs:
functional_test:
name: Operator functional tests via tox
timeout-minutes: 10
name: Operator functional tests
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v4

- name: Set up Helm
uses: azure/setup-helm@v4
with:
version: v3.11.3
- name: Deploy devstack
uses: EmilienM/[email protected]

- name: Create k8s Kind Cluster
uses: helm/[email protected]
- name: Install k3s
run: |
set -eo pipefail
curl -sfL https://get.k3s.io | \
bash -s - \
--disable traefik \
--cluster-cidr 172.30.0.0/16 \
--service-cidr 172.31.0.0/16
mkdir $HOME/.kube
sudo cp /etc/rancher/k3s/k3s.yaml $HOME/.kube/config
sudo chown $USER $HOME/.kube/config
- name: Run test
timeout-minutes: 10
run: tools/functional_test.sh
- name: Install gomplate
run: |
GOBIN=/usr/local/bin \
go install github.com/hairyhenderson/gomplate/v4/cmd/gomplate@latest
gomplate --version
- name: Run functional tests
timeout-minutes: 15
run: |
source devstack/openrc demo demo
tools/functional_test.sh
2 changes: 1 addition & 1 deletion azimuth_schedule_operator/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ async def main():
# This import is required to pick up the operator handlers
from . import operator # noqa

kopf.configure()
kopf.configure(log_prefix=True)
tasks = await kopf.spawn_tasks(
clusterwide=True, liveness_endpoint="http://0.0.0.0:8000/healthz"
)
Expand Down
50 changes: 50 additions & 0 deletions azimuth_schedule_operator/metrics.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import asyncio
import datetime
import functools

from aiohttp import web
Expand Down Expand Up @@ -72,6 +73,51 @@ def value(self, obj):
return 1 if obj.get("status", {}).get("refDeleteTriggered", False) else 0


class LeaseMetric(Metric):
    """Common base for metrics that are reported per lease object."""

    prefix = "azimuth_lease"

    def labels(self, obj):
        """Return the labels identifying the lease: its namespace and name."""
        metadata = obj.metadata
        lease_labels = {
            "lease_namespace": metadata.namespace,
            "lease_name": metadata.name,
        }
        return lease_labels


class LeasePhase(LeaseMetric):
    """Metric exposing the current phase of a lease as a label."""

    suffix = "phase"
    description = "The phase of the lease"

    def labels(self, obj):
        """Return the base lease labels extended with a ``phase`` label.

        The phase is read from ``status.phase`` and reported as ``"Unknown"``
        when the object has no status or no phase yet.
        """
        base_labels = super().labels(obj)
        status = obj.get("status", {})
        current_phase = status.get("phase", "Unknown")
        return {**base_labels, "phase": current_phase}


# NOTE(review): the METRICS mapping visible in this change registers only
# LeasePhase and LeaseEndsAt under "leases"; confirm whether this metric is
# meant to be registered there as well.
class LeaseStartsAt(LeaseMetric):
    """Gauge reporting the start time of a lease as a Unix timestamp."""

    suffix = "starts_at"
    type = "gauge"
    description = "The start time of the lease"

    def value(self, obj):
        """Return the lease start time in seconds since the epoch.

        ``spec.startsAt`` is used when it is set to a non-empty value;
        otherwise the object's ``creationTimestamp`` is used instead.
        """
        created_at = obj.metadata["creationTimestamp"]
        # Fall back for a missing key as well as for an explicit null/empty value
        starts_at = obj.get("spec", {}).get("startsAt") or created_at
        # RFC 3339 timestamps may end in "Z" for UTC, which
        # datetime.fromisoformat only accepts from Python 3.11 onwards
        if starts_at.endswith("Z"):
            starts_at = f"{starts_at[:-1]}+00:00"
        return datetime.datetime.fromisoformat(starts_at).timestamp()


class LeaseEndsAt(LeaseMetric):
    """Gauge reporting the end time of a lease as a Unix timestamp."""

    suffix = "ends_at"
    type = "gauge"
    description = "The end time of the lease"

    def value(self, obj):
        """Return the lease end time in seconds since the epoch.

        Returns ``-1`` when ``spec.endsAt`` is missing or empty.
        """
        ends_at = obj.get("spec", {}).get("endsAt")
        if not ends_at:
            return -1
        # RFC 3339 timestamps may end in "Z" for UTC, which
        # datetime.fromisoformat only accepts from Python 3.11 onwards
        if ends_at.endswith("Z"):
            ends_at = f"{ends_at[:-1]}+00:00"
        return datetime.datetime.fromisoformat(ends_at).timestamp()


# Maps each character that needs escaping to its escaped form so that the
# whole string can be rewritten in a single pass
_ESCAPE_TABLE = str.maketrans({"\\": r"\\", "\n": r"\n", '"': r"\""})


def escape(content):
    """Return content with backslashes, newlines and double quotes escaped."""
    return content.translate(_ESCAPE_TABLE)
Expand Down Expand Up @@ -116,6 +162,10 @@ def render_openmetrics(*metrics):

METRICS = {
registry.API_GROUP: {
"leases": [
LeasePhase,
LeaseEndsAt,
],
"schedules": [
ScheduleRefFound,
ScheduleDeleteTriggered,
Expand Down
3 changes: 2 additions & 1 deletion azimuth_schedule_operator/models/registry.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import kube_custom_resource as crd

from azimuth_schedule_operator.models.v1alpha1 import schedule
from azimuth_schedule_operator.models.v1alpha1 import lease, schedule

API_GROUP = "scheduling.azimuth.stackhpc.com"
API_VERSION = API_GROUP + "/v1alpha1"
Expand All @@ -9,6 +9,7 @@

def get_registry():
    """Create the custom resource registry and populate it with the models.

    Models are discovered from the lease module first and then from the
    schedule module.
    """
    model_registry = crd.CustomResourceRegistry(API_GROUP, CATEGORIES)
    for models_module in (lease, schedule):
        model_registry.discover_models(models_module)
    return model_registry

Expand Down
127 changes: 127 additions & 0 deletions azimuth_schedule_operator/models/v1alpha1/lease.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import datetime as dt
import typing as t

from pydantic import Field

from kube_custom_resource import CustomResource, schema


class Machine(schema.BaseModel):
    """Represents a reservation for a machine."""

    # Both fields are required; Ellipsis as the default marks them as such
    size_id: schema.constr(min_length=1) = Field(
        default=...,
        description="The ID of the size for the machine.",
    )
    count: schema.conint(gt=0) = Field(
        default=...,
        description="The number of machines of this size to reserve.",
    )


class ResourcesSpec(schema.BaseModel):
    """The resources that a lease is reserving."""

    # Defaults to an empty list when no machines are given
    machines: t.List[Machine] = Field(
        description="Machines that should be reserved by the lease.",
        default_factory=list,
    )


class LeaseSpec(schema.BaseModel):
    """The spec of a lease."""

    # Required fields use Ellipsis as the default; optional ones default to None
    cloud_credentials_secret_name: schema.constr(min_length=1) = Field(
        default=...,
        description="The name of the secret containing the cloud credentials.",
    )
    starts_at: schema.Optional[dt.datetime] = Field(
        default=None,
        description=(
            "The start time for the lease. "
            "If no start time is given, it is assumed to start immediately."
        ),
    )
    ends_at: schema.Optional[dt.datetime] = Field(
        default=None,
        description=(
            "The end time for the lease. "
            "If no end time is given, the lease is assumed to be infinite."
        ),
    )
    grace_period: schema.Optional[schema.conint(ge=0)] = Field(
        default=None,
        description=(
            "The grace period before the end of the lease that the platform "
            "will be given to shut down gracefully. "
            "If not given, the operator default grace period will be used."
        ),
    )
    resources: ResourcesSpec = Field(
        default=...,
        description="The resources that the lease is reserving.",
    )


class LeasePhase(str, schema.Enum):
    """The phase of a lease."""

    # Members also subclass str, so each compares equal to its string value.

    # Stable phases
    PENDING = "Pending"
    ACTIVE = "Active"
    TERMINATED = "Terminated"
    ERROR = "Error"
    # Transitional phases
    CREATING = "Creating"
    STARTING = "Starting"
    UPDATING = "Updating"
    TERMINATING = "Terminating"
    DELETING = "Deleting"
    # Used as the default value of LeaseStatus.phase
    UNKNOWN = "Unknown"


class LeaseStatus(schema.BaseModel, extra="allow"):
    """The status of a lease."""

    phase: LeasePhase = Field(LeasePhase.UNKNOWN, description="The phase of the lease.")
    error_message: str = Field(
        "", description="The error message for the lease, if known."
    )
    size_map: schema.Dict[str, str] = Field(
        default_factory=dict,
        description="Mapping of original size ID to reserved size ID.",
    )
    size_name_map: schema.Dict[str, str] = Field(
        default_factory=dict,
        description="Mapping of original size name to reserved size name.",
    )

    def set_phase(self, phase: LeasePhase, error_message: t.Optional[str] = None):
        """Set the phase of the lease, along with an optional error message.

        The message is only kept when the phase is ``LeasePhase.ERROR``; any
        other phase resets it to the empty string.
        """
        self.phase = phase
        # error_message is declared as str, so store "" rather than None when
        # the ERROR phase is set without a message
        if phase == LeasePhase.ERROR and error_message:
            self.error_message = error_message
        else:
            self.error_message = ""


# Printer columns passed to the Lease custom resource definition
# NOTE(review): presumably rendered as additional printer columns on the
# generated CRD; confirm against kube_custom_resource
_LEASE_PRINTER_COLUMNS = [
    {
        "name": "Starts At",
        "type": "string",
        "format": "date-time",
        "jsonPath": ".spec.startsAt",
    },
    {
        "name": "Ends At",
        "type": "string",
        "format": "date-time",
        "jsonPath": ".spec.endsAt",
    },
    {
        "name": "phase",
        "type": "string",
        "jsonPath": ".status.phase",
    },
]


class Lease(
    CustomResource,
    subresources={"status": {}},
    printer_columns=_LEASE_PRINTER_COLUMNS,
):
    """A lease consisting of one or more reserved resources."""

    spec: LeaseSpec
    # A fresh, default status is created when none is supplied
    status: LeaseStatus = Field(default_factory=LeaseStatus)
Loading

0 comments on commit d74953a

Please sign in to comment.