Add ML Profile APIs (opensearch-project#787)

* Added ML get profile APIs. Signed-off-by: Nathalie Jonathan <[email protected]> * Modified response payload state from RUNNING to COMPLETED, updated PR link in CHANGELOG.md. Signed-off-by: Nathalie Jonathan <[email protected]> * Rebase, changed native memory threshold settings to JVM heap memory threshold, fixed vale errors. Signed-off-by: Nathalie Jonathan <[email protected]> * Renamed files to follow the naming convention. Signed-off-by: Nathalie Jonathan <[email protected]> --------- Signed-off-by: Nathalie Jonathan <[email protected]>
dblock · Jan 14, 2025 · 806b25f · 806b25f
1 parent 5e22b8b
commit 806b25f
Show file tree

Hide file tree

Showing 6 changed files with 455 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -49,6 +49,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 - Added support for evaluating response payloads in prologues and epilogues ([#772](https://github.com/opensearch-project/opensearch-api-specification/pull/772))
 - Added `GET /_plugins/_ml/models/{model_id}`, `POST /_plugins/_ml/models/_search`, `POST /_plugins/_ml/models/_unload`, `_undeploy`, `_upload`, `meta`, `_register_meta`, `POST /_plugins/_ml/models/{model_id}/_load`, `_predict`, `_unload`, `chunk/{chunk_number}`, `upload_chunk/{chunk_number}`, and `PUT /_plugins/_ml/models/{model_id}` ([#733](https://github.com/opensearch-project/opensearch-api-specification/pull/733))
 - Added `GET`, `POST`, `PUT`, `DELETE /_plugins/_ml/controllers/{model_id}` ([#779](https://github.com/opensearch-project/opensearch-api-specification/pull/779))
+- Added `GET /_plugins/_ml/profile`, `GET /_plugins/_ml/profile/models`, `models/{model_id}`, `tasks`, `tasks/{task_id}` ([#787](https://github.com/opensearch-project/opensearch-api-specification/pull/787))
 
 ### Removed
 - Removed unsupported `_common.mapping:SourceField`'s `mode` field and associated `_common.mapping:SourceFieldMode` enum ([#652](https://github.com/opensearch-project/opensearch-api-specification/pull/652))

diff --git a/spec/namespaces/ml.yaml b/spec/namespaces/ml.yaml
@@ -630,6 +630,65 @@ paths:
       responses:
         '200':
           $ref: '#/components/responses/ml.delete_controller@200'
+  /_plugins/_ml/profile:  # Profile endpoint without path parameters.
+    get:
+      operationId: ml.get_profile.0
+      x-operation-group: ml.get_profile
+      x-version-added: '2.4'
+      description: Get a profile.
+      requestBody:  # This GET operation declares a request body.
+        $ref: '#/components/requestBodies/ml.get_profile'
+      responses:
+        '200':
+          $ref: '#/components/responses/ml.get_profile@200'
+  /_plugins/_ml/profile/models:  # Model profiles; no path parameters.
+    get:
+      operationId: ml.get_profile_models.0
+      x-operation-group: ml.get_profile_models
+      x-version-added: '2.4'
+      description: Get profile information for models.
+      requestBody:  # Same request body component as the {model_id} variant of this group.
+        $ref: '#/components/requestBodies/ml.get_profile_models'
+      responses:
+        '200':
+          $ref: '#/components/responses/ml.get_profile_models@200'
+  /_plugins/_ml/profile/models/{model_id}:  # Same operation group, addressed by a model_id path parameter.
+    get:
+      operationId: ml.get_profile_models.1
+      x-operation-group: ml.get_profile_models
+      x-version-added: '2.11'
+      description: Get profile information for models.  # Kept identical to ml.get_profile_models.0.
+      requestBody:
+        $ref: '#/components/requestBodies/ml.get_profile_models'
+      parameters:
+        - $ref: '#/components/parameters/ml.get_profile_models::path.model_id'
+      responses:
+        '200':
+          $ref: '#/components/responses/ml.get_profile_models@200'
+  /_plugins/_ml/profile/tasks:  # Task profiles; no path parameters.
+    get:
+      operationId: ml.get_profile_tasks.0
+      x-operation-group: ml.get_profile_tasks
+      x-version-added: '2.4'
+      description: Get profile information for tasks.
+      requestBody:  # Same request body component as the {task_id} variant of this group.
+        $ref: '#/components/requestBodies/ml.get_profile_tasks'
+      responses:
+        '200':
+          $ref: '#/components/responses/ml.get_profile_tasks@200'
+  /_plugins/_ml/profile/tasks/{task_id}:  # Same operation group, addressed by a task_id path parameter.
+    get:
+      operationId: ml.get_profile_tasks.1
+      x-operation-group: ml.get_profile_tasks
+      x-version-added: '2.11'
+      description: Get profile information for tasks.  # Kept identical to ml.get_profile_tasks.0.
+      requestBody:
+        $ref: '#/components/requestBodies/ml.get_profile_tasks'
+      parameters:
+        - $ref: '#/components/parameters/ml.get_profile_tasks::path.task_id'
+      responses:
+        '200':
+          $ref: '#/components/responses/ml.get_profile_tasks@200'
 components:
   requestBodies:
     ml.register_model_group:
@@ -1234,6 +1293,21 @@ components:
                 $ref: '../schemas/ml._common.yaml#/components/schemas/UserRateLimiter'
               model_id:
                 $ref: '../schemas/_common.yaml#/components/schemas/Name'
+    ml.get_profile:  # Request body for the ml.get_profile operation group.
+      content:
+        application/json:
+          schema:
+            $ref: '../schemas/ml._common.yaml#/components/schemas/ProfileRequest'
+    ml.get_profile_models:  # References the same ProfileRequest schema as ml.get_profile.
+      content:
+        application/json:
+          schema:
+            $ref: '../schemas/ml._common.yaml#/components/schemas/ProfileRequest'
+    ml.get_profile_tasks:  # References the same ProfileRequest schema as ml.get_profile.
+      content:
+        application/json:
+          schema:
+            $ref: '../schemas/ml._common.yaml#/components/schemas/ProfileRequest'
   responses:
     ml.register_model_group@200:
       content:
@@ -1571,6 +1645,21 @@ components:
         application/json:
           schema:
             $ref: '../schemas/_common.yaml#/components/schemas/WriteResponseBase'
+    ml.get_profile@200:  # 200 response for the ml.get_profile operation group.
+      content:
+        application/json:
+          schema:
+            $ref: '../schemas/ml._common.yaml#/components/schemas/GetProfileResponse'
+    ml.get_profile_models@200:  # References the same GetProfileResponse schema as ml.get_profile@200.
+      content:
+        application/json:
+          schema:
+            $ref: '../schemas/ml._common.yaml#/components/schemas/GetProfileResponse'
+    ml.get_profile_tasks@200:  # References the same GetProfileResponse schema as ml.get_profile@200.
+      content:
+        application/json:
+          schema:
+            $ref: '../schemas/ml._common.yaml#/components/schemas/GetProfileResponse'
   parameters:
     ml.get_model_group::path.model_group_id:
       name: model_group_id
@@ -1800,5 +1889,17 @@ components:
       name: model_id
       in: path
       required: true
+      schema:
+        type: string
+    ml.get_profile_models::path.model_id:  # Path parameter used by /_plugins/_ml/profile/models/{model_id}.
+      name: model_id
+      in: path
+      required: true
+      schema:
+        type: string
+    ml.get_profile_tasks::path.task_id:  # Path parameter used by /_plugins/_ml/profile/tasks/{task_id}.
+      name: task_id
+      in: path
+      required: true
       schema:
         type: string
diff --git a/spec/schemas/ml._common.yaml b/spec/schemas/ml._common.yaml
@@ -1256,4 +1256,115 @@ components:
             - MILLISECONDS
             - MINUTES
             - NANOSECONDS
-            - SECONDS
+            - SECONDS
+    ProfileRequest:  # Request payload for the profile APIs; no property is marked required.
+      type: object
+      properties:
+        node_ids:  # List of Id values naming nodes.
+          type: array
+          items:
+            $ref: '_common.yaml#/components/schemas/Id'
+        model_ids:  # List of Id values naming models.
+          type: array
+          items:
+            $ref: '_common.yaml#/components/schemas/Id'
+        task_ids:  # List of Id values naming tasks.
+          type: array
+          items:
+            $ref: '_common.yaml#/components/schemas/Id'
+        return_all_tasks:
+          type: boolean
+          description: Whether to return all tasks.
+        return_all_models:
+          type: boolean
+          description: Whether to return all models.
+    GetProfileResponse:  # Response payload for the profile APIs.
+      type: object
+      properties:
+        nodes:  # Per-node profile data; see the Nodes schema.
+          $ref: '#/components/schemas/Nodes'
+    Nodes:  # Map with arbitrary keys (presumably node IDs; confirm) whose values are Node objects.
+      type: object
+      additionalProperties:
+        $ref: '#/components/schemas/Node'
+    Node:  # Profile data for one entry of Nodes: its tasks and its models.
+      type: object
+      properties:
+        tasks:
+          $ref: '#/components/schemas/Tasks'
+        models:
+          $ref: '#/components/schemas/Models'
+    Models:  # Map with arbitrary keys (presumably model IDs; confirm) whose values are ModelProfile objects.
+      type: object
+      additionalProperties:
+        $ref: '#/components/schemas/ModelProfile'
+    ModelProfile:  # Profile entry for a single model, as referenced from Models.
+      type: object
+      properties:
+        model_state:
+          type: string
+          description: The model state.
+          enum:
+            - DEPLOYED
+            - DEPLOYING
+            - DEPLOY_FAILED
+            - PARTIALLY_DEPLOYED
+            - REGISTERED
+            - REGISTERING
+            - UNDEPLOYED
+        predictor:
+          type: string
+          description: The predictor.
+        worker_nodes:
+          type: array
+          items:  # NOTE(review): items reference the plural NodeIds schema; confirm it is not itself a list type.
+            $ref: '_common.yaml#/components/schemas/NodeIds'
+        predict_request_stats:  # Request count and latency statistics; see PredictRequestStats.
+          $ref: '#/components/schemas/PredictRequestStats'
+        target_worker_nodes:
+          type: array
+          items:  # NOTE(review): same NodeIds item reference as worker_nodes; confirm.
+            $ref: '_common.yaml#/components/schemas/NodeIds'
+        memory_size_estimation_cpu:
+          type: integer
+          format: int64
+          description: The estimated memory size in CPU.
+        memory_size_estimation_gpu:
+          type: integer
+          format: int64
+          description: The estimated memory size in GPU.
+    PredictRequestStats:  # Predict request count plus latency statistics, referenced by ModelProfile.
+      type: object
+      properties:
+        count:  # The only integer field; the remaining fields are double-precision latencies.
+          type: integer
+          format: int64
+          description: The total predict requests on this node.
+        max:
+          type: number
+          format: double
+          description: The maximum latency in milliseconds.
+        min:
+          type: number
+          format: double
+          description: The minimum latency in milliseconds.
+        average:
+          type: number
+          format: double
+          description: The average latency in milliseconds.
+        p50:
+          type: number
+          format: double
+          description: The 50th percentile latency in milliseconds.
+        p90:
+          type: number
+          format: double
+          description: The 90th percentile latency in milliseconds.
+        p99:
+          type: number
+          format: double
+          description: The 99th percentile latency in milliseconds.
+    Tasks:  # Map with arbitrary keys (presumably task IDs; confirm) whose values are Task objects.
+      type: object
+      additionalProperties:  # The Task schema is defined outside this hunk.
+        $ref: '#/components/schemas/Task'
diff --git a/tests/plugins/ml/ml/profile.yaml b/tests/plugins/ml/ml/profile.yaml
@@ -0,0 +1,59 @@
+$schema: ../../../../json_schemas/test_story.schema.yaml
+
+description: Test the retrieval of runtime information about ML tasks and models.
+version: '>= 2.7'
+prologues:
+  - path: /_cluster/settings  # Sets the ML Commons JVM heap memory threshold setting to 100.
+    method: PUT
+    request:
+      payload:
+        persistent:
+          plugins.ml_commons.jvm_heap_memory_threshold: 100
+  - path: /_plugins/_ml/models/_register  # Registers a model and captures the returned task ID.
+    id: register_model
+    method: POST
+    request:
+      payload:
+        name: huggingface/sentence-transformers/msmarco-distilbert-base-tas-b
+        version: 1.0.1
+        model_format: TORCH_SCRIPT
+    output:
+      task_id: payload.task_id
+  - path: /_plugins/_ml/tasks/{task_id}  # Fetches the registration task, expecting state COMPLETED.
+    id: get_completed_task
+    method: GET
+    parameters:
+      task_id: ${register_model.task_id}
+    retry:
+      count: 3
+      wait: 10000  # Presumably milliseconds; confirm against the test story schema.
+    response:
+      status: 200
+      payload:
+        state: COMPLETED
+    output:
+      model_id: payload.model_id
+      node_id: payload.worker_node[0]  # First worker node; used as node_ids in the chapter below.
+epilogues:
+  - path: /_plugins/_ml/models/{model_id}  # Cleanup: deletes the registered model.
+    parameters:
+      model_id: ${get_completed_task.model_id}
+    method: DELETE
+    status: [200, 404]  # Both 200 and 404 are listed as accepted statuses.
+  - path: /_plugins/_ml/tasks/{task_id}  # Cleanup: deletes the registration task.
+    parameters:
+      task_id: ${register_model.task_id}
+    method: DELETE
+    status: [200, 404]
+chapters:
+  - synopsis: Get runtime information of all tasks and models on a specific node.
+    path: /_plugins/_ml/profile
+    method: GET
+    request:
+      payload:
+        node_ids:
+          - ${get_completed_task.node_id}
+        return_all_tasks: true
+        return_all_models: true
+    response:
+      status: 200  # Only the status code is asserted; no response payload is checked.