CLI: Add ability to create CVAT agents based on AA functions (cvat-ai…
SpecLad authored Dec 25, 2024
1 parent 7ce704d commit b1d0a6c
Showing 12 changed files with 710 additions and 82 deletions.
4 changes: 4 additions & 0 deletions changelog.d/20241212_193004_roman_cli_agent.md
@@ -0,0 +1,4 @@
### Added

- \[CLI\] Added commands for working with native functions
(<https://github.com/cvat-ai/cvat/pull/8821>)
5 changes: 5 additions & 0 deletions cvat-cli/README.md
@@ -22,6 +22,11 @@ The following subcommands are supported:
 - `backup` - back up a task
 - `auto-annotate` - automatically annotate a task using a local function
+
+- Functions (Enterprise/Cloud only):
+  - `create-native` - create a function that can be powered by an agent
+  - `delete` - delete a function
+  - `run-agent` - process requests for a native function
 
 ## Installation
 
 `pip install cvat-cli`
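
For reference, the agent code in this PR loads the same kind of detection-function module that the SDK's auto-annotation framework defines: a Python module exposing `spec` and `detect`. A minimal sketch (the module name, label name, and label ID are illustrative):

```python
# my_function.py - a minimal detection function module (illustrative).
import PIL.Image

import cvat_sdk.auto_annotation as cvataa
import cvat_sdk.models as models

# Declare the labels the function can produce.
spec = cvataa.DetectionFunctionSpec(
    labels=[cvataa.label_spec("cat", 0)],
)

# Detect objects in one image; a fixed rectangle stands in for a real model.
def detect(
    context: cvataa.DetectionFunctionContext, image: PIL.Image.Image
) -> list[models.LabeledShapeRequest]:
    return [cvataa.rectangle(0, [10.0, 10.0, 100.0, 100.0])]
```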
4 changes: 3 additions & 1 deletion cvat-cli/requirements/base.txt
@@ -1,3 +1,5 @@
-cvat-sdk~=2.24.1
+cvat-sdk==2.24.1
+
+attrs>=24.2.0
 Pillow>=10.3.0
 setuptools>=70.0.0 # not directly required, pinned by Snyk to avoid a vulnerability
9 changes: 7 additions & 2 deletions cvat-cli/src/cvat_cli/__main__.py
@@ -11,7 +11,12 @@
 from cvat_sdk import exceptions
 
 from ._internal.commands_all import COMMANDS
-from ._internal.common import build_client, configure_common_arguments, configure_logger
+from ._internal.common import (
+    CriticalError,
+    build_client,
+    configure_common_arguments,
+    configure_logger,
+)
 from ._internal.utils import popattr
 
 logger = logging.getLogger(__name__)
@@ -29,7 +34,7 @@ def main(args: list[str] = None):
     try:
         with build_client(parsed_args, logger=logger) as client:
             popattr(parsed_args, "_executor")(client, **vars(parsed_args))
-    except (exceptions.ApiException, urllib3.exceptions.HTTPError) as e:
+    except (exceptions.ApiException, urllib3.exceptions.HTTPError, CriticalError) as e:
         logger.critical(e)
         return 1
 
351 changes: 351 additions & 0 deletions cvat-cli/src/cvat_cli/_internal/agent.py
@@ -0,0 +1,351 @@
# Copyright (C) 2024 CVAT.ai Corporation
#
# SPDX-License-Identifier: MIT

import concurrent.futures
import json
import multiprocessing
import random
import secrets
import shutil
import tempfile
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Optional

import cvat_sdk.auto_annotation as cvataa
import cvat_sdk.datasets as cvatds
import urllib3.exceptions
from cvat_sdk import Client, models
from cvat_sdk.auto_annotation.driver import (
_AnnotationMapper,
_DetectionFunctionContextImpl,
_LabelNameMapping,
_SpecNameMapping,
)
from cvat_sdk.exceptions import ApiException

from .common import CriticalError, FunctionLoader

FUNCTION_PROVIDER_NATIVE = "native"
FUNCTION_KIND_DETECTOR = "detector"

# Mean delay between queue polls; each poll is offset by a random amount of up
# to _POLLING_INTERVAL_MAX_OFFSET in either direction (see _wait_between_polls).
_POLLING_INTERVAL_MEAN = timedelta(seconds=60)
_POLLING_INTERVAL_MAX_OFFSET = timedelta(seconds=10)

# Minimum delay between successive progress updates for one annotation request.
_UPDATE_INTERVAL = timedelta(seconds=30)


class _RecoverableExecutor:
# A wrapper around ProcessPoolExecutor that recreates the underlying
# executor when a worker crashes.
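    #
    # Retrieval goes through result(): if the worker process died, it swaps in
    # a fresh pool and re-raises BrokenExecutor so the caller can resubmit:
    #
    #     future = executor.submit(job, arg)  # job, arg: illustrative
    #     try:
    #         value = executor.result(future)
    #     except concurrent.futures.BrokenExecutor:
    #         ...  # pool already recreated; resubmit if appropriate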
def __init__(self, initializer, initargs):
self._mp_context = multiprocessing.get_context("spawn")
self._initializer = initializer
self._initargs = initargs
self._executor = self._new_executor()

def _new_executor(self):
return concurrent.futures.ProcessPoolExecutor(
max_workers=1,
mp_context=self._mp_context,
initializer=self._initializer,
initargs=self._initargs,
)

def __enter__(self):
return self

def __exit__(self, exc_type, exc_value, traceback):
self._executor.shutdown()

def submit(self, func, /, *args, **kwargs):
return self._executor.submit(func, *args, **kwargs)

def result(self, future: concurrent.futures.Future):
try:
return future.result()
except concurrent.futures.BrokenExecutor:
self._executor.shutdown()
self._executor = self._new_executor()
raise


_current_function: cvataa.DetectionFunction
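# (set per worker process by _worker_init below; the _worker_job_* functions
# run inside the worker and read the loaded function from this global)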


def _worker_init(function_loader: FunctionLoader):
global _current_function
_current_function = function_loader.load()


def _worker_job_get_function_spec():
return _current_function.spec


def _worker_job_detect(context, image):
return _current_function.detect(context, image)


class _Agent:
def __init__(self, client: Client, executor: _RecoverableExecutor, function_id: int):
self._rng = random.Random() # nosec

self._client = client
self._executor = executor
self._function_id = function_id
self._function_spec = self._executor.result(
self._executor.submit(_worker_job_get_function_spec)
)

_, response = self._client.api_client.call_api(
"/api/functions/{function_id}",
"GET",
path_params={"function_id": self._function_id},
)

remote_function = json.loads(response.data)

self._validate_function_compatibility(remote_function)

        # A random token identifying this agent instance to the server; it is
        # sent with every queue request (acquire/update/complete/fail).
        self._agent_id = secrets.token_hex(16)
self._client.logger.info("Agent starting with ID %r", self._agent_id)

self._cached_task_id = None

def _validate_function_compatibility(self, remote_function: dict) -> None:
function_id = remote_function["id"]

if remote_function["provider"] != FUNCTION_PROVIDER_NATIVE:
raise CriticalError(
f"Function #{function_id} has provider {remote_function['provider']!r}. "
f"Agents can only be run for functions with provider {FUNCTION_PROVIDER_NATIVE!r}."
)

if isinstance(self._function_spec, cvataa.DetectionFunctionSpec):
self._validate_detection_function_compatibility(remote_function)
self._calculate_result_for_ar = self._calculate_result_for_detection_ar
else:
raise CriticalError(
f"Unsupported function spec type: {type(self._function_spec).__name__}"
)

def _validate_detection_function_compatibility(self, remote_function: dict) -> None:
incompatible_msg = (
f"Function #{remote_function['id']} is incompatible with function object: "
)

if remote_function["kind"] != FUNCTION_KIND_DETECTOR:
raise CriticalError(
incompatible_msg
+ f"kind is {remote_function['kind']!r} (expected {FUNCTION_KIND_DETECTOR!r})."
)

labels_by_name = {label.name: label for label in self._function_spec.labels}

for remote_label in remote_function["labels_v2"]:
label = labels_by_name.get(remote_label["name"])

if not label:
raise CriticalError(
incompatible_msg + f"label {remote_label['name']!r} is not supported."
)

if (
remote_label["type"] not in {"any", "unknown"}
and remote_label["type"] != label.type
):
raise CriticalError(
incompatible_msg
+ f"label {remote_label['name']!r} has type {remote_label['type']!r}, "
f"but the function object expects type {label.type!r}."
)

if remote_label["attributes"]:
raise CriticalError(
incompatible_msg
+ f"label {remote_label['name']!r} has attributes, which is not supported."
)

def _wait_between_polls(self):
# offset the interval randomly to avoid synchronization between workers
max_offset_sec = _POLLING_INTERVAL_MAX_OFFSET.total_seconds()
offset_sec = self._rng.uniform(-max_offset_sec, max_offset_sec)
time.sleep(_POLLING_INTERVAL_MEAN.total_seconds() + offset_sec)

    # In burst mode, drain the queue and exit; otherwise poll forever,
    # sleeping a jittered interval whenever the queue is empty.
    def run(self, *, burst: bool) -> None:
if burst:
while ar_assignment := self._poll_for_ar():
self._process_ar(ar_assignment)
self._client.logger.info("No annotation requests left in queue; exiting.")
else:
while True:
if ar_assignment := self._poll_for_ar():
self._process_ar(ar_assignment)
else:
self._wait_between_polls()

def _process_ar(self, ar_assignment: dict) -> None:
self._client.logger.info("Got annotation request assignment: %r", ar_assignment)

ar_id = ar_assignment["ar_id"]

try:
result = self._calculate_result_for_ar(ar_id, ar_assignment["ar_params"])

self._client.logger.info("Submitting result for AR %r...", ar_id)
self._client.api_client.call_api(
"/api/functions/queues/{queue_id}/requests/{request_id}/complete",
"POST",
path_params={"queue_id": f"function:{self._function_id}", "request_id": ar_id},
body={"agent_id": self._agent_id, "annotations": result},
)
self._client.logger.info("AR %r completed", ar_id)
except Exception as ex:
self._client.logger.error("Failed to process AR %r", ar_id, exc_info=True)

# Arbitrary exceptions may contain details of the client's system or code, which
# shouldn't be exposed to the server (and to users of the function).
# Therefore, we only produce a limited amount of detail, and only in known failure cases.
error_message = "Unknown error"

if isinstance(ex, ApiException):
if ex.status:
error_message = f"Received HTTP status {ex.status}"
else:
error_message = "Failed an API call"
elif isinstance(ex, urllib3.exceptions.RequestError):
if isinstance(ex, urllib3.exceptions.MaxRetryError):
ex_type = type(ex.reason)
else:
ex_type = type(ex)

error_message = f"Failed to make an HTTP request to {ex.url} ({ex_type.__name__})"
elif isinstance(ex, urllib3.exceptions.HTTPError):
error_message = "Failed to make an HTTP request"
elif isinstance(ex, cvataa.BadFunctionError):
error_message = "Underlying function returned incorrect result: " + str(ex)
elif isinstance(ex, concurrent.futures.BrokenExecutor):
error_message = "Worker process crashed"

try:
self._client.api_client.call_api(
"/api/functions/queues/{queue_id}/requests/{request_id}/fail",
"POST",
path_params={
"queue_id": f"function:{self._function_id}",
"request_id": ar_id,
},
body={"agent_id": self._agent_id, "exc_info": error_message},
)
except Exception:
self._client.logger.error("Couldn't fail AR %r", ar_id, exc_info=True)
else:
self._client.logger.info("AR %r failed", ar_id)

def _poll_for_ar(self) -> Optional[dict]:
while True:
self._client.logger.info("Trying to acquire an annotation request...")
try:
_, response = self._client.api_client.call_api(
"/api/functions/queues/{queue_id}/requests/acquire",
"POST",
path_params={"queue_id": f"function:{self._function_id}"},
body={"agent_id": self._agent_id, "request_category": "batch"},
)
break
except (urllib3.exceptions.HTTPError, ApiException) as ex:
if isinstance(ex, ApiException) and ex.status and 400 <= ex.status < 500:
# We did something wrong; no point in retrying.
raise

self._client.logger.error("Acquire request failed; will retry", exc_info=True)
self._wait_between_polls()

response_data = json.loads(response.data)
return response_data["ar_assignment"]

def _calculate_result_for_detection_ar(
self, ar_id: str, ar_params
) -> models.PatchedLabeledDataRequest:
if ar_params["type"] != "annotate_task":
raise RuntimeError(f"Unsupported AR type: {ar_params['type']!r}")

if ar_params["task"] != self._cached_task_id:
# To avoid uncontrolled disk usage,
# we'll only keep one task in the cache at a time.
self._client.logger.info("Switched to a new task; clearing the cache...")
if self._client.config.cache_dir.exists():
shutil.rmtree(self._client.config.cache_dir)

ds = cvatds.TaskDataset(self._client, ar_params["task"], load_annotations=False)

self._cached_task_id = ar_params["task"]

# Fetching the dataset might take a while, so do a progress update to let the server
# know we're still alive.
self._update_ar(ar_id, 0)
last_update_timestamp = datetime.now(tz=timezone.utc)

mapping = ar_params["mapping"]
conv_mask_to_poly = ar_params["conv_mask_to_poly"]

spec_nm = _SpecNameMapping(
labels={k: _LabelNameMapping(v["name"]) for k, v in mapping.items()}
)

mapper = _AnnotationMapper(
self._client.logger,
self._function_spec.labels,
ds.labels,
allow_unmatched_labels=False,
spec_nm=spec_nm,
conv_mask_to_poly=conv_mask_to_poly,
)

all_annotations = models.PatchedLabeledDataRequest(shapes=[])

for sample_index, sample in enumerate(ds.samples):
context = _DetectionFunctionContextImpl(
frame_name=sample.frame_name,
conf_threshold=ar_params["threshold"],
conv_mask_to_poly=conv_mask_to_poly,
)
shapes = self._executor.result(
self._executor.submit(_worker_job_detect, context, sample.media.load_image())
)

mapper.validate_and_remap(shapes, sample.frame_index)
all_annotations.shapes.extend(shapes)

current_timestamp = datetime.now(tz=timezone.utc)

if current_timestamp >= last_update_timestamp + _UPDATE_INTERVAL:
self._update_ar(ar_id, (sample_index + 1) / len(ds.samples))
last_update_timestamp = current_timestamp

return all_annotations

def _update_ar(self, ar_id: str, progress: float) -> None:
self._client.logger.info("Updating AR %r progress to %.2f%%", ar_id, progress * 100)
self._client.api_client.call_api(
"/api/functions/queues/{queue_id}/requests/{request_id}/update",
"POST",
path_params={"queue_id": f"function:{self._function_id}", "request_id": ar_id},
body={"agent_id": self._agent_id, "progress": progress},
)


def run_agent(
client: Client, function_loader: FunctionLoader, function_id: int, *, burst: bool
) -> None:
with (
_RecoverableExecutor(initializer=_worker_init, initargs=[function_loader]) as executor,
tempfile.TemporaryDirectory() as cache_dir,
):
client.config.cache_dir = Path(cache_dir, "cache")
client.logger.info("Will store cache at %s", client.config.cache_dir)

agent = _Agent(client, executor, function_id)
agent.run(burst=burst)
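
# Illustrative call (values made up); the CLI's `run-agent` command is
# expected to invoke this entry point after building a Client and a
# FunctionLoader:
#
#     run_agent(client, function_loader, function_id=42, burst=False)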