From c674c866548eab4cd9fe4112d753b3ffdc086435 Mon Sep 17 00:00:00 2001
From: Rivindu W
Date: Wed, 28 Aug 2024 12:20:10 +1200
Subject: [PATCH 1/3] add start_evaluation

---
 arcee/__init__.py       |  2 ++
 arcee/api.py            | 70 ++++++++++++++++++++++++++++++++++++++++-
 arcee/schemas/routes.py |  1 +
 3 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/arcee/__init__.py b/arcee/__init__.py
index a9e04de..4ad17e5 100644
--- a/arcee/__init__.py
+++ b/arcee/__init__.py
@@ -17,6 +17,7 @@
     retrieve,
     start_alignment,
     start_deployment,
+    start_evaluation,
     start_pretraining,
     start_retriever_training,
     stop_deployment,
@@ -59,4 +60,5 @@
     "deployment_status",
     "merging_status",
     "alignment_status",
+    "start_evaluation",
 ]
diff --git a/arcee/api.py b/arcee/api.py
index 6207d05..aacf2b7 100644
--- a/arcee/api.py
+++ b/arcee/api.py
@@ -346,7 +346,6 @@ def corpus_status(corpus: str) -> Dict[str, str]:
     return make_request("post", Route.pretraining + "/corpus/status", data)


-
 def start_alignment(
     alignment_name: str,
     qa_set: Optional[str] = None,
@@ -533,3 +532,72 @@ def download_weights(type: model_weight_types, id_or_name: str) -> Response:
     """
     route = type_to_weights_route[type].format(id_or_name=id_or_name)
     return nonjson_request("get", route, stream=True)
+
+
+def start_evaluation(
+    evaluations_name: str,
+    eval_type: Optional[str] = None,
+    qa_set_name: Optional[str] = None,
+    deployment_model: Optional[Dict[str, Optional[str]]] = None,
+    reference_model: Optional[Dict[str, Optional[str]]] = None,
+    judge_model: Optional[Dict[str, Optional[str]]] = None,
+    model_type: Optional[str] = "arcee",
+    model_args: Optional[str] = None,
+    tasks_list: Optional[List[str]] = None,
+    target_compute: Optional[str] = None,
+    capacity_id: Optional[str] = None,
+    batch_size: Optional[int] = None
+) -> Dict[str, str]:
+    """
+    Start an evaluation job.
+
+    LLM as a judge model config should be in a format similar to:
+    deployment_model = {
+        "model_name": "arcee-model-name"
+    }
+    reference_model = {
+        "model_name": "claude-3-5-sonnet-20240620",
+        "base_url": "https://api.anthropic.com/v1",
+        "api_key": anthropic_api_key
+    }
+    judge_model = {
+        "model_name": "gpt-4o",
+        "base_url": "https://api.openai.com/v1/",
+        "api_key": openai_api_key,
+        "custom_prompt": "Evaluate which response better adheres to factual accuracy, clarity, and relevance."
+    }
+
+    Args:
+        evaluations_name (str): The name of the evaluation job.
+        eval_type (Optional[str]): The type of evaluation, e.g., "llm_as_a_judge" or "lm-eval".
+        qa_set_name (Optional[str]): The name of the QA set being evaluated.
+        deployment_model (Optional[Dict[str, Optional[str]]]): Configuration for the deployment model.
+        reference_model (Optional[Dict[str, Optional[str]]]): Configuration for the reference model.
+        judge_model (Optional[Dict[str, Optional[str]]]): Configuration for the judge model.
+        model_type (Optional[str]): The type of the model, default is "arcee".
+        model_args (Optional[str]): Model arguments to be fed into lm-eval harness.
+        tasks_list (Optional[List[str]]): List of tasks for the evaluation. See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/README.md for more information.
+        target_compute (Optional[str]): The name of the compute instance to use, default is "p3.2xlarge".
+        capacity_id (Optional[str]): The name of the capacity block ID to use.
+        batch_size (Optional[int]): Batch size for evaluation.
+    """
+
+    data = {
+        "evaluations_name": evaluations_name,
+        "model_type": model_type,
+        "model_args": model_args,
+        "tasks_list": tasks_list if tasks_list else [],
+        "target_compute": target_compute,
+        "capacity_id": capacity_id,
+        "batch_size": batch_size,
+        "eval_type": eval_type,
+        "qa_set_name": qa_set_name,
+        "deployment_model": deployment_model,
+        "reference_model": reference_model,
+        "judge_model": judge_model,
+    }
+
+    # Remove any keys with None values
+    data = {k: v for k, v in data.items() if v is not None}
+
+    return make_request("post", Route.evaluation + "/evaluation/start", data)
diff --git a/arcee/schemas/routes.py b/arcee/schemas/routes.py
index 4be2caa..f2e6222 100644
--- a/arcee/schemas/routes.py
+++ b/arcee/schemas/routes.py
@@ -15,3 +15,4 @@ class Route(StrEnum):
     deployment = "deployment"
     merging = "merging"
     retriever = "models"
+    evaluation = "evaluation"

From fab2931231f10771242a38f07832beafb771ae05 Mon Sep 17 00:00:00 2001
From: Rivindu W
Date: Mon, 7 Oct 2024 09:32:11 +0700
Subject: [PATCH 2/3] debug

---
 arcee/api.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/arcee/api.py b/arcee/api.py
index aacf2b7..0ea145d 100644
--- a/arcee/api.py
+++ b/arcee/api.py
@@ -551,9 +551,20 @@ def start_evaluation(
     """
     Start an evaluation job.

+    To run lm-eval-harness benchmarks, the input should be in a format similar to:
+    {
+        "evaluations_name": "my_lm_eval_harness_test",
+        "eval_type": "lm-eval-harness",
+        "model_type": "arcee", # or "hf" depending on your model
+        "model_args": "pretrained=my_model_name,use_accelerate=True", # Adjust based on https://github.com/EleutherAI/lm-evaluation-harness
+        "tasks_list": ["hellaswag", "mmlu_stem"], # List of tasks to evaluate
+    }
+
     LLM as a judge model config should be in a format similar to:
     deployment_model = {
-        "model_name": "arcee-model-name"
+        "model_name": "arcee_model_name",
+        "base_url": "https://app.arcee.ai/api/v2",
+        "api_key": f"{os.environ['ARCEE_API_KEY']}"
     }
     reference_model = {
         "model_name": "claude-3-5-sonnet-20240620",
         "base_url": "https://api.anthropic.com/v1",
         "api_key": anthropic_api_key
     }
     judge_model = {
         "model_name": "gpt-4o",
-        "base_url": "https://api.openai.com/v1/",
+        "base_url": "https://api.openai.com/v1",
         "api_key": openai_api_key,
         "custom_prompt": "Evaluate which response better adheres to factual accuracy, clarity, and relevance."
     }
@@ -583,6 +594,7 @@ def start_evaluation(
     """

     data = {
+        "action": "start",
         "evaluations_name": evaluations_name,
         "model_type": model_type,
         "model_args": model_args,
         "tasks_list": tasks_list if tasks_list else [],
         "target_compute": target_compute,
         "capacity_id": capacity_id,
         "batch_size": batch_size,
@@ -600,4 +612,4 @@ def start_evaluation(
     # Remove any keys with None values
     data = {k: v for k, v in data.items() if v is not None}

-    return make_request("post", Route.evaluation + "/evaluation/start", data)
+    return make_request("post", Route.evaluation + "/start", data)
\ No newline at end of file

From e9a88efd66daa25a6ad6173c80ac1c248ab47f1f Mon Sep 17 00:00:00 2001
From: Rivindu W
Date: Mon, 7 Oct 2024 15:45:03 +0700
Subject: [PATCH 3/3] add status and fix linting errors

---
 arcee/__init__.py |  2 ++
 arcee/api.py      | 52 +++++++++++++++++++++++++++++++++++++----------
 2 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/arcee/__init__.py b/arcee/__init__.py
index 4ad17e5..2939ad7 100644
--- a/arcee/__init__.py
+++ b/arcee/__init__.py
@@ -9,6 +9,7 @@
     delete_corpus,
     deployment_status,
     generate,
+    get_evaluation_status,
     get_retriever_status,
     list_pretrainings,
     mergekit_evolve,
@@ -61,4 +62,5 @@
     "merging_status",
     "alignment_status",
     "start_evaluation",
+    "get_evaluation_status",
 ]
diff --git a/arcee/api.py b/arcee/api.py
index 0ea145d..d901f1e 100644
--- a/arcee/api.py
+++ b/arcee/api.py
@@ -346,6 +346,7 @@ def corpus_status(corpus: str) -> Dict[str, str]:
     return make_request("post", Route.pretraining + "/corpus/status", data)


+
 def start_alignment(
     alignment_name: str,
     qa_set: Optional[str] = None,
@@ -538,15 +539,15 @@ def start_evaluation(
     evaluations_name: str,
     eval_type: Optional[str] = None,
     qa_set_name: Optional[str] = None,
-    deployment_model: Optional[Dict[str, Optional[str]]] = None,
-    reference_model: Optional[Dict[str, Optional[str]]] = None,
-    judge_model: Optional[Dict[str, Optional[str]]] = None,
+    deployment_model: Optional[Dict[str, Any]] = None,
+    reference_model: Optional[Dict[str, Any]] = None,
+    judge_model: Optional[Dict[str, Any]] = None,
     model_type: Optional[str] = "arcee",
     model_args: Optional[str] = None,
     tasks_list: Optional[List[str]] = None,
     target_compute: Optional[str] = None,
     capacity_id: Optional[str] = None,
-    batch_size: Optional[int] = None
+    batch_size: Optional[int] = None,
 ) -> Dict[str, str]:
     """
     Start an evaluation job.
@@ -582,23 +583,27 @@ def start_evaluation(
         evaluations_name (str): The name of the evaluation job.
         eval_type (Optional[str]): The type of evaluation, e.g., "llm_as_a_judge" or "lm-eval".
         qa_set_name (Optional[str]): The name of the QA set being evaluated.
-        deployment_model (Optional[Dict[str, Optional[str]]]): Configuration for the deployment model.
-        reference_model (Optional[Dict[str, Optional[str]]]): Configuration for the reference model.
-        judge_model (Optional[Dict[str, Optional[str]]]): Configuration for the judge model.
+        deployment_model (Optional[Dict[str, Any]]): Configuration for the deployment model.
+        reference_model (Optional[Dict[str, Any]]): Configuration for the reference model.
+        judge_model (Optional[Dict[str, Any]]): Configuration for the judge model.
         model_type (Optional[str]): The type of the model, default is "arcee".
         model_args (Optional[str]): Model arguments to be fed into lm-eval harness.
-        tasks_list (Optional[List[str]]): List of tasks for the evaluation. See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/README.md for more information.
+        tasks_list (Optional[List[str]]): List of tasks for the evaluation.
+ See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/README.md + for more information. target_compute (Optional[str]): The name of the compute instance to use, default is "p3.2xlarge". capacity_id (Optional[str]): The name of the capacity block ID to use. batch_size (Optional[int]): Batch size for evaluation. + Returns: + Dict[str, str]: A dictionary containing a success message, evaluations name, and evaluations ID. """ - data = { + data: Dict[str, Any] = { "action": "start", "evaluations_name": evaluations_name, "model_type": model_type, "model_args": model_args, - "tasks_list": tasks_list if tasks_list else [], + "tasks_list": tasks_list or [], "target_compute": target_compute, "capacity_id": capacity_id, "batch_size": batch_size, @@ -612,4 +617,29 @@ def start_evaluation( # Remove any keys with None values data = {k: v for k, v in data.items() if v is not None} - return make_request("post", Route.evaluation + "/start", data) \ No newline at end of file + response = make_request("post", Route.evaluation + "/start", data) + return { + "message": response.get("message", "Starting evals..."), + "evaluations_name": response.get("evaluations_name", ""), + "evaluations_id": response.get("evaluations_id", ""), + } + + +def get_evaluation_status(evaluation_id_or_name: str) -> Dict[str, Any]: + """ + Retrieve the status and results of an evaluation job. + + Args: + evaluation_id_or_name (str): The ID or name of the evaluation job. + + Returns: + Dict[str, Any]: A dictionary containing the evaluation status and results. + """ + response = make_request("get", f"{Route.evaluation}/{evaluation_id_or_name}") + return { + "name": response.get("name"), + "id": response.get("id"), + "processing_state": response.get("processing_state", ""), + "status": response.get("status", ""), + "evaluation_data": response.get("evaluation_data", {}), + }
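
A minimal usage sketch of the two endpoints this series adds. Only the function signatures and the returned dictionary keys are taken from the patches above; the job name, QA set, judge credentials, environment variable, and the polling loop are illustrative assumptions.

import os
import time

import arcee

# Start an LLM-as-a-judge evaluation of an Arcee deployment.
job = arcee.start_evaluation(
    evaluations_name="my-judge-eval",                      # hypothetical job name
    eval_type="llm_as_a_judge",
    qa_set_name="my-qa-set",                               # hypothetical QA set
    deployment_model={"model_name": "arcee-model-name"},   # hypothetical deployment
    judge_model={
        "model_name": "gpt-4o",
        "base_url": "https://api.openai.com/v1",
        "api_key": os.environ["OPENAI_API_KEY"],
    },
)
print(job["message"], job["evaluations_id"])

# Poll for completion. The terminal state names below are assumptions;
# check the actual processing_state values returned by the API.
status = arcee.get_evaluation_status(job["evaluations_id"])
while status["processing_state"] not in ("completed", "failed"):
    time.sleep(30)
    status = arcee.get_evaluation_status(job["evaluations_id"])

print(status["status"])
print(status["evaluation_data"])

For lm-eval-harness runs, eval_type, model_args, and tasks_list replace the QA-set and judge configuration, as in the docstring example added in patch 2.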