diff --git a/scripts/maintenance/computational-clusters/Makefile b/scripts/maintenance/computational-clusters/Makefile index c9aa61316d9..fcb32639566 100644 --- a/scripts/maintenance/computational-clusters/Makefile +++ b/scripts/maintenance/computational-clusters/Makefile @@ -1,25 +1,34 @@ -.DEFAULT_GOAL := install +.DEFAULT_GOAL := help SHELL := /bin/bash +PYTHON_VERSION := $(or $(PYTHON), 3.11) + + +.PHONY: hel% +# thanks to https://marmelab.com/blog/2016/02/29/auto-documented-makefile.html +hel%: + @echo "usage: make [target] ..." + @echo "" + @echo "Targets for '$(notdir $(CURDIR))':" + @echo "" + @awk --posix 'BEGIN {FS = ":.*?## "} /^[[:alpha:][:space:]_-]+:.*?## / {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST) + @echo "" + .venv: # creating python virtual environment - @uv venv .venv - # installing python dependencies - @uv pip install --upgrade pip setuptools wheel - + @uv venv --python=$(PYTHON_VERSION) -install: .venv - # activating python virtual environment - @source .venv/bin/activate +install: .venv ## installs using $PYTHON_VERSION or uses defaults # installing package - @uv pip install . + @uv pip install --python=$(PYTHON_VERSION) . # now you can call the maintenance scripts # source .venv/bin/activate # autoscaled-monitor --deploy-config PATH/TO/REPO.CONFIG summary -install-dev: .venv - # activating python virtual environment - @source .venv/bin/activate +install-dev: .venv ## installs in devel mode using PYTHON_VERSION or uses defaults # installing package @uv pip install -e . 
+ # now you can call the maintenance scripts + # source .venv/bin/activate + # autoscaled-monitor --deploy-config PATH/TO/REPO.CONFIG summary diff --git a/scripts/maintenance/computational-clusters/autoscaled_monitor/cli.py b/scripts/maintenance/computational-clusters/autoscaled_monitor/cli.py index dd306e09220..189dde5c0df 100644 --- a/scripts/maintenance/computational-clusters/autoscaled_monitor/cli.py +++ b/scripts/maintenance/computational-clusters/autoscaled_monitor/cli.py @@ -33,25 +33,14 @@ def _parse_environment(deploy_config: Path) -> dict[str, str | None]: repo_config = deploy_config / "repo.config" - assert repo_config.exists() - environment = dotenv_values(repo_config) - if environment["AUTOSCALING_EC2_ACCESS_KEY_ID"] == "": + if not repo_config.exists(): rich.print( - "Terraform variables detected, looking for repo.config.frozen as alternative." - " TIP: you are responsible for them being up to date!!" + f"[red]{repo_config} does not exist! Please run OPS code to generate it[/red]" ) - repo_config = deploy_config / "repo.config.frozen" - assert repo_config.exists() - environment = dotenv_values(repo_config) - - if environment["AUTOSCALING_EC2_ACCESS_KEY_ID"] == "": - error_msg = ( - "Terraform is necessary in order to check into that deployment!\n" - f"install terraform (check README.md in {state.deploy_config} for instructions)" - "then run make repo.config.frozen, then re-run this code" - ) - rich.print(error_msg) - raise typer.Abort(error_msg) + raise typer.Exit(1) + + environment = dotenv_values(repo_config) + assert environment return environment @@ -77,28 +66,21 @@ def main( assert state.environment["EC2_INSTANCES_KEY_NAME"] dynamic_pattern = f"{state.environment['EC2_INSTANCES_NAME_PREFIX']}-{{key_name}}" state.dynamic_parser = parse.compile(dynamic_pattern) - rich.print(f"using dynamic-naming-regex: {dynamic_pattern}") if state.environment["CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX"]: state.computational_parser_primary = parse.compile( - 
f"{state.environment['CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX'].strip('-')}-{DEFAULT_COMPUTATIONAL_EC2_FORMAT}", - {"wallet_id_spec", wallet_id_spec}, + rf"{state.environment['CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX'].strip('-')}-{DEFAULT_COMPUTATIONAL_EC2_FORMAT}", + {"wallet_id_spec": wallet_id_spec}, ) state.computational_parser_workers = parse.compile( - f"{state.environment['CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX'].strip('-')}-{DEFAULT_COMPUTATIONAL_EC2_FORMAT_WORKERS}", - {"wallet_id_spec", wallet_id_spec}, + rf"{state.environment['CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX'].strip('-')}-{DEFAULT_COMPUTATIONAL_EC2_FORMAT_WORKERS}", + {"wallet_id_spec": wallet_id_spec}, ) - rich.print( - f"compuational-primary-naming-regex: {state.computational_parser_primary._expression}" # noqa: SLF001 - ) - rich.print( - f"compuational-workers-naming-regex: {state.computational_parser_workers._expression}" # noqa: SLF001 - ) # locate ssh key path for file_path in deploy_config.glob("**/*.pem"): - if "license" in file_path.name: + if any(_ in file_path.name for _ in ["license", "pkcs8"]): continue - # very bad HACK + # very bad HACK where the license file contains openssh in the name if ( any(_ in f"{file_path}" for _ in ("sim4life.io", "osparc-master")) and "openssh" not in f"{file_path}" @@ -112,6 +94,11 @@ def main( ) state.ssh_key_path = file_path break + if not state.ssh_key_path: + rich.print( + f"[red]could not find ssh key in {deploy_config}! 
Please run OPS code to generate it[/red]" + ) + raise typer.Exit(1) @app.command() diff --git a/scripts/maintenance/computational-clusters/autoscaled_monitor/constants.py b/scripts/maintenance/computational-clusters/autoscaled_monitor/constants.py index b730a1672e0..661039bed39 100644 --- a/scripts/maintenance/computational-clusters/autoscaled_monitor/constants.py +++ b/scripts/maintenance/computational-clusters/autoscaled_monitor/constants.py @@ -19,7 +19,9 @@ def wallet_id_spec(text) -> None | int: str ] = r"osparc-computational-cluster-{role}-{swarm_stack_name}-user_id:{user_id:d}-wallet_id:{wallet_id:wallet_id_spec}-{key_name}" DEFAULT_DYNAMIC_EC2_FORMAT: Final[str] = r"osparc-dynamic-autoscaled-worker-{key_name}" -DEPLOY_SSH_KEY_PARSER: Final[parse.Parser] = parse.compile(r"osparc-{random_name}.pem") +DEPLOY_SSH_KEY_PARSER: Final[parse.Parser] = parse.compile( + r"{prefix}-{random_name}.pem" +) MINUTE: Final[int] = 60 HOUR: Final[int] = 60 * MINUTE diff --git a/scripts/maintenance/computational-clusters/autoscaled_monitor/core.py b/scripts/maintenance/computational-clusters/autoscaled_monitor/core.py index 03611a94ea3..c0c4ba7bed6 100755 --- a/scripts/maintenance/computational-clusters/autoscaled_monitor/core.py +++ b/scripts/maintenance/computational-clusters/autoscaled_monitor/core.py @@ -43,8 +43,6 @@ def _parse_computational( or state.computational_parser_primary.parse(name) ): assert isinstance(result, parse.Result) - # special handling for optional wallet - rich.print(result.named) last_heartbeat = utils.get_last_heartbeat(instance) return ComputationalInstance( @@ -450,7 +448,7 @@ async def summary(state: AppState, user_id: int | None, wallet_id: int | None) - def _print_computational_tasks( user_id: int, - wallet_id: int, + wallet_id: int | None, tasks: list[tuple[ComputationalTask | None, DaskTask | None]], ) -> None: table = Table( @@ -489,7 +487,7 @@ def _print_computational_tasks( async def _list_computational_clusters( - state: AppState, user_id: 
int, wallet_id: int + state: AppState, user_id: int, wallet_id: int | None ) -> list[ComputationalCluster]: assert state.ec2_resource_clusters_keeper computational_instances = await ec2.list_computational_instances_from_ec2( @@ -501,7 +499,7 @@ async def _list_computational_clusters( async def cancel_jobs( # noqa: C901, PLR0912 - state: AppState, user_id: int, wallet_id: int, *, force: bool + state: AppState, user_id: int, wallet_id: int | None, *, force: bool ) -> None: # get the theory computational_tasks = await db.list_computational_tasks_from_db(state, user_id) diff --git a/scripts/maintenance/computational-clusters/autoscaled_monitor/dask.py b/scripts/maintenance/computational-clusters/autoscaled_monitor/dask.py index e18c2beb831..d6e8859869a 100644 --- a/scripts/maintenance/computational-clusters/autoscaled_monitor/dask.py +++ b/scripts/maintenance/computational-clusters/autoscaled_monitor/dask.py @@ -4,6 +4,7 @@ import distributed import rich +import typer from mypy_boto3_ec2.service_resource import Instance from pydantic import AnyUrl @@ -63,6 +64,25 @@ async def dask_client( f"{url}", security=security, timeout="5", asynchronous=True ) ) + versions = await _wrap_dask_async_call(client.get_versions()) + if versions["client"]["python"] != versions["scheduler"]["python"]: + rich.print( + f"[red]python versions do not match! TIP: install the correct version {versions['scheduler']['python']}[/red]" + ) + raise typer.Exit(1) + if ( + versions["client"]["distributed"] + != versions["scheduler"]["distributed"] + ): + rich.print( + f"[red]distributed versions do not match! TIP: install the correct version {versions['scheduler']['distributed']}[/red]" + ) + raise typer.Exit(1) + if versions["client"]["dask"] != versions["scheduler"]["dask"]: + rich.print( + f"[red]dask versions do not match! 
TIP: install the correct version {versions['scheduler']['dask']}[/red]" + ) + raise typer.Exit(1) yield client finally: diff --git a/scripts/maintenance/computational-clusters/autoscaled_monitor/ssh.py b/scripts/maintenance/computational-clusters/autoscaled_monitor/ssh.py index a19f4be0992..3caee6e3cb0 100644 --- a/scripts/maintenance/computational-clusters/autoscaled_monitor/ssh.py +++ b/scripts/maintenance/computational-clusters/autoscaled_monitor/ssh.py @@ -164,7 +164,9 @@ async def get_dask_ip( if exit_status != 0: error_message = stderr.read().decode().strip() _logger.error( - "Command failed with exit status %s: %s", exit_status, error_message + "Inspecting dask IP Command failed with exit status %s: %s", + exit_status, + error_message, ) return "Not Found / Drained / Not Ready" diff --git a/scripts/maintenance/computational-clusters/pyproject.toml b/scripts/maintenance/computational-clusters/pyproject.toml index e98f7f89049..b3db20eb921 100644 --- a/scripts/maintenance/computational-clusters/pyproject.toml +++ b/scripts/maintenance/computational-clusters/pyproject.toml @@ -1,7 +1,3 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - [project] dependencies = [ "arrow",