Skip to content

Commit

Permalink
🎨Autoscaling monitoring tool: compatible with TIP, small improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
sanderegg authored Sep 6, 2024
1 parent b7098eb commit 99de33d
Show file tree
Hide file tree
Showing 7 changed files with 67 additions and 53 deletions.
33 changes: 21 additions & 12 deletions scripts/maintenance/computational-clusters/Makefile
Original file line number Diff line number Diff line change
@@ -1,25 +1,34 @@
.DEFAULT_GOAL := install
.DEFAULT_GOAL := help

SHELL := /bin/bash

PYTHON_VERSION := $(or $(PYTHON), 3.11)


.PHONY: hel%
# thanks to https://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
hel%:
@echo "usage: make [target] ..."
@echo ""
@echo "Targets for '$(notdir $(CURDIR))':"
@echo ""
@awk --posix 'BEGIN {FS = ":.*?## "} /^[[:alpha:][:space:]_-]+:.*?## / {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)
@echo ""

.venv:
# creating python virtual environment
@uv venv .venv
# installing python dependencies
@uv pip install --upgrade pip setuptools wheel

@uv venv --python=$(PYTHON_VERSION)

install: .venv
# activating python virtual environment
@source .venv/bin/activate
install: .venv ## installs using $PYTHON_VERSION or uses defaults
# installing package
@uv pip install .
@uv pip install --python=$(PYTHON_VERSION) .
# now you can call the maintenance scripts
# source .venv/bin/activate
# autoscaled-monitor --deploy-config PATH/TO/REPO.CONFIG summary

install-dev: .venv
# activating python virtual environment
@source .venv/bin/activate
install-dev: .venv ## installs in devel mode using PYTHON_VERSION or uses defaults
# installing package
@uv pip install -e .
# now you can call the maintenance scripts
# source .venv/bin/activate
# autoscaled-monitor --deploy-config PATH/TO/REPO.CONFIG summary
Original file line number Diff line number Diff line change
Expand Up @@ -33,25 +33,14 @@

def _parse_environment(deploy_config: Path) -> dict[str, str | None]:
repo_config = deploy_config / "repo.config"
assert repo_config.exists()
environment = dotenv_values(repo_config)
if environment["AUTOSCALING_EC2_ACCESS_KEY_ID"] == "":
if not repo_config.exists():
rich.print(
"Terraform variables detected, looking for repo.config.frozen as alternative."
" TIP: you are responsible for them being up to date!!"
f"[red]{repo_config} does not exist! Please run OPS code to generate it[/red]"
)
repo_config = deploy_config / "repo.config.frozen"
assert repo_config.exists()
environment = dotenv_values(repo_config)

if environment["AUTOSCALING_EC2_ACCESS_KEY_ID"] == "":
error_msg = (
"Terraform is necessary in order to check into that deployment!\n"
f"install terraform (check README.md in {state.deploy_config} for instructions)"
"then run make repo.config.frozen, then re-run this code"
)
rich.print(error_msg)
raise typer.Abort(error_msg)
raise typer.Exit(1)

environment = dotenv_values(repo_config)

assert environment
return environment

Expand All @@ -77,28 +66,21 @@ def main(
assert state.environment["EC2_INSTANCES_KEY_NAME"]
dynamic_pattern = f"{state.environment['EC2_INSTANCES_NAME_PREFIX']}-{{key_name}}"
state.dynamic_parser = parse.compile(dynamic_pattern)
rich.print(f"using dynamic-naming-regex: {dynamic_pattern}")
if state.environment["CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX"]:
state.computational_parser_primary = parse.compile(
f"{state.environment['CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX'].strip('-')}-{DEFAULT_COMPUTATIONAL_EC2_FORMAT}",
{"wallet_id_spec", wallet_id_spec},
rf"{state.environment['CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX'].strip('-')}-{DEFAULT_COMPUTATIONAL_EC2_FORMAT}",
{"wallet_id_spec": wallet_id_spec},
)
state.computational_parser_workers = parse.compile(
f"{state.environment['CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX'].strip('-')}-{DEFAULT_COMPUTATIONAL_EC2_FORMAT_WORKERS}",
{"wallet_id_spec", wallet_id_spec},
rf"{state.environment['CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX'].strip('-')}-{DEFAULT_COMPUTATIONAL_EC2_FORMAT_WORKERS}",
{"wallet_id_spec": wallet_id_spec},
)
rich.print(
f"compuational-primary-naming-regex: {state.computational_parser_primary._expression}" # noqa: SLF001
)
rich.print(
f"compuational-workers-naming-regex: {state.computational_parser_workers._expression}" # noqa: SLF001
)

# locate ssh key path
for file_path in deploy_config.glob("**/*.pem"):
if "license" in file_path.name:
if any(_ in file_path.name for _ in ["license", "pkcs8"]):
continue
# very bad HACK
# very bad HACK where the license file contains openssh in the name
if (
any(_ in f"{file_path}" for _ in ("sim4life.io", "osparc-master"))
and "openssh" not in f"{file_path}"
Expand All @@ -112,6 +94,11 @@ def main(
)
state.ssh_key_path = file_path
break
if not state.ssh_key_path:
rich.print(
f"[red]could not find ssh key in {deploy_config}! Please run OPS code to generate it[/red]"
)
raise typer.Exit(1)


@app.command()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ def wallet_id_spec(text) -> None | int:
str
] = r"osparc-computational-cluster-{role}-{swarm_stack_name}-user_id:{user_id:d}-wallet_id:{wallet_id:wallet_id_spec}-{key_name}"
DEFAULT_DYNAMIC_EC2_FORMAT: Final[str] = r"osparc-dynamic-autoscaled-worker-{key_name}"
DEPLOY_SSH_KEY_PARSER: Final[parse.Parser] = parse.compile(r"osparc-{random_name}.pem")
DEPLOY_SSH_KEY_PARSER: Final[parse.Parser] = parse.compile(
r"{prefix}-{random_name}.pem"
)

MINUTE: Final[int] = 60
HOUR: Final[int] = 60 * MINUTE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,6 @@ def _parse_computational(
or state.computational_parser_primary.parse(name)
):
assert isinstance(result, parse.Result)
# special handling for optional wallet
rich.print(result.named)

last_heartbeat = utils.get_last_heartbeat(instance)
return ComputationalInstance(
Expand Down Expand Up @@ -450,7 +448,7 @@ async def summary(state: AppState, user_id: int | None, wallet_id: int | None) -

def _print_computational_tasks(
user_id: int,
wallet_id: int,
wallet_id: int | None,
tasks: list[tuple[ComputationalTask | None, DaskTask | None]],
) -> None:
table = Table(
Expand Down Expand Up @@ -489,7 +487,7 @@ def _print_computational_tasks(


async def _list_computational_clusters(
state: AppState, user_id: int, wallet_id: int
state: AppState, user_id: int, wallet_id: int | None
) -> list[ComputationalCluster]:
assert state.ec2_resource_clusters_keeper
computational_instances = await ec2.list_computational_instances_from_ec2(
Expand All @@ -501,7 +499,7 @@ async def _list_computational_clusters(


async def cancel_jobs( # noqa: C901, PLR0912
state: AppState, user_id: int, wallet_id: int, *, force: bool
state: AppState, user_id: int, wallet_id: int | None, *, force: bool
) -> None:
# get the theory
computational_tasks = await db.list_computational_tasks_from_db(state, user_id)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import distributed
import rich
import typer
from mypy_boto3_ec2.service_resource import Instance
from pydantic import AnyUrl

Expand Down Expand Up @@ -63,6 +64,25 @@ async def dask_client(
f"{url}", security=security, timeout="5", asynchronous=True
)
)
versions = await _wrap_dask_async_call(client.get_versions())
if versions["client"]["python"] != versions["scheduler"]["python"]:
rich.print(
f"[red]python versions do not match! TIP: install the correct version {versions['scheduler']['python']}[/red]"
)
raise typer.Exit(1)
if (
versions["client"]["distributed"]
!= versions["scheduler"]["distributed"]
):
rich.print(
f"[red]distributed versions do not match! TIP: install the correct version {versions['scheduler']['distributed']}[/red]"
)
raise typer.Exit(1)
if versions["client"]["dask"] != versions["scheduler"]["dask"]:
rich.print(
f"[red]dask versions do not match! TIP: install the correct version {versions['scheduler']['dask']}[/red]"
)
raise typer.Exit(1)
yield client

finally:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,9 @@ async def get_dask_ip(
if exit_status != 0:
error_message = stderr.read().decode().strip()
_logger.error(
"Command failed with exit status %s: %s", exit_status, error_message
"Inspecting dask IP Command failed with exit status %s: %s",
exit_status,
error_message,
)
return "Not Found / Drained / Not Ready"

Expand Down
4 changes: 0 additions & 4 deletions scripts/maintenance/computational-clusters/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
dependencies = [
"arrow",
Expand Down

0 comments on commit 99de33d

Please sign in to comment.