From c674c866548eab4cd9fe4112d753b3ffdc086435 Mon Sep 17 00:00:00 2001
From: Rivindu W
Date: Wed, 28 Aug 2024 12:20:10 +1200
Subject: [PATCH 1/3] add start_evaluation

---
 arcee/__init__.py       |  2 ++
 arcee/api.py            | 70 ++++++++++++++++++++++++++++++++++++++++-
 arcee/schemas/routes.py |  1 +
 3 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/arcee/__init__.py b/arcee/__init__.py
index a9e04de..4ad17e5 100644
--- a/arcee/__init__.py
+++ b/arcee/__init__.py
@@ -17,6 +17,7 @@
     retrieve,
     start_alignment,
     start_deployment,
+    start_evaluation,
     start_pretraining,
     start_retriever_training,
     stop_deployment,
@@ -59,4 +60,5 @@
     "deployment_status",
     "merging_status",
     "alignment_status",
+    "start_evaluation",
 ]
diff --git a/arcee/api.py b/arcee/api.py
index 6207d05..aacf2b7 100644
--- a/arcee/api.py
+++ b/arcee/api.py
@@ -346,7 +346,6 @@ def corpus_status(corpus: str) -> Dict[str, str]:
     return make_request("post", Route.pretraining + "/corpus/status", data)


-
 def start_alignment(
     alignment_name: str,
     qa_set: Optional[str] = None,
@@ -533,3 +532,72 @@ def download_weights(type: model_weight_types, id_or_name: str) -> Response:
     """
     route = type_to_weights_route[type].format(id_or_name=id_or_name)
     return nonjson_request("get", route, stream=True)
+
+
+def start_evaluation(
+    evaluations_name: str,
+    eval_type: Optional[str] = None,
+    qa_set_name: Optional[str] = None,
+    deployment_model: Optional[Dict[str, Optional[str]]] = None,
+    reference_model: Optional[Dict[str, Optional[str]]] = None,
+    judge_model: Optional[Dict[str, Optional[str]]] = None,
+    model_type: Optional[str] = "arcee",
+    model_args: Optional[str] = None,
+    tasks_list: Optional[List[str]] = None,
+    target_compute: Optional[str] = None,
+    capacity_id: Optional[str] = None,
+    batch_size: Optional[int] = None
+) -> Dict[str, str]:
+    """
+    Start an evaluation job.
+
+    LLM as a judge model config should be in a format similar to:
+    deployment_model = {
+        "model_name": "arcee-model-name"
+    }
+    reference_model = {
+        "model_name": "claude-3-5-sonnet-20240620",
+        "base_url": "https://api.anthropic.com/v1",
+        "api_key": anthropic_api_key
+    }
+    judge_model = {
+        "model_name": "gpt-4o",
+        "base_url": "https://api.openai.com/v1/",
+        "api_key": openai_api_key,
+        "custom_prompt": "Evaluate which response better adheres to factual accuracy, clarity, and relevance."
+    }
+
+    Args:
+        evaluations_name (str): The name of the evaluation job.
+        eval_type (Optional[str]): The type of evaluation, e.g., "llm_as_a_judge" or "lm-eval".
+        qa_set_name (Optional[str]): The name of the QA set being evaluated.
+        deployment_model (Optional[Dict[str, Optional[str]]]): Configuration for the deployment model.
+        reference_model (Optional[Dict[str, Optional[str]]]): Configuration for the reference model.
+        judge_model (Optional[Dict[str, Optional[str]]]): Configuration for the judge model.
+        model_type (Optional[str]): The type of the model, default is "arcee".
+        model_args (Optional[str]): Model arguments to be fed into lm-eval harness.
+        tasks_list (Optional[List[str]]): List of tasks for the evaluation. See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/README.md for more information.
+        target_compute (Optional[str]): The name of the compute instance to use, default is "p3.2xlarge".
+        capacity_id (Optional[str]): The name of the capacity block ID to use.
+        batch_size (Optional[int]): Batch size for evaluation.
+    """
+
+    data = {
+        "evaluations_name": evaluations_name,
+        "model_type": model_type,
+        "model_args": model_args,
+        "tasks_list": tasks_list if tasks_list else [],
+        "target_compute": target_compute,
+        "capacity_id": capacity_id,
+        "batch_size": batch_size,
+        "eval_type": eval_type,
+        "qa_set_name": qa_set_name,
+        "deployment_model": deployment_model,
+        "reference_model": reference_model,
+        "judge_model": judge_model,
+    }
+
+    # Remove any keys with None values
+    data = {k: v for k, v in data.items() if v is not None}
+
+    return make_request("post", Route.evaluation + "/evaluation/start", data)
diff --git a/arcee/schemas/routes.py b/arcee/schemas/routes.py
index 4be2caa..f2e6222 100644
--- a/arcee/schemas/routes.py
+++ b/arcee/schemas/routes.py
@@ -15,3 +15,4 @@ class Route(StrEnum):
     deployment = "deployment"
     merging = "merging"
     retriever = "models"
+    evaluation = "evaluation"

From fab2931231f10771242a38f07832beafb771ae05 Mon Sep 17 00:00:00 2001
From: Rivindu W
Date: Mon, 7 Oct 2024 09:32:11 +0700
Subject: [PATCH 2/3] debug

---
 arcee/api.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/arcee/api.py b/arcee/api.py
index aacf2b7..0ea145d 100644
--- a/arcee/api.py
+++ b/arcee/api.py
@@ -551,9 +551,20 @@ def start_evaluation(
     """
     Start an evaluation job.

+    To run lm-eval-harness benchmarks, the input should be in a format similar to:
+    {
+        "evaluations_name": "my_lm_eval_harness_test",
+        "eval_type": "lm-eval-harness",
+        "model_type": "arcee", # or "hf" depending on your model
+        "model_args": "pretrained=my_model_name,use_accelerate=True", # Adjust based on https://github.com/EleutherAI/lm-evaluation-harness
+        "tasks_list": ["hellaswag", "mmlu_stem"], # List of tasks to evaluate
+    }
+
     LLM as a judge model config should be in a format similar to:
     deployment_model = {
-        "model_name": "arcee-model-name"
+        "model_name": "arcee_model_name",
+        "base_url": "https://app.arcee.ai/api/v2",
+        "api_key": f"{os.environ['ARCEE_API_KEY']}"
     }
     reference_model = {
         "model_name": "claude-3-5-sonnet-20240620",
         "base_url": "https://api.anthropic.com/v1",
         "api_key": anthropic_api_key
     }
     judge_model = {
         "model_name": "gpt-4o",
-        "base_url": "https://api.openai.com/v1/",
+        "base_url": "https://api.openai.com/v1",
         "api_key": openai_api_key,
         "custom_prompt": "Evaluate which response better adheres to factual accuracy, clarity, and relevance."
     }
@@ -583,6 +594,7 @@ def start_evaluation(
     """

     data = {
+        "action": "start",
         "evaluations_name": evaluations_name,
         "model_type": model_type,
         "model_args": model_args,
         "tasks_list": tasks_list if tasks_list else [],
         "target_compute": target_compute,
         "capacity_id": capacity_id,
         "batch_size": batch_size,
@@ -600,4 +612,4 @@ def start_evaluation(
     # Remove any keys with None values
     data = {k: v for k, v in data.items() if v is not None}

-    return make_request("post", Route.evaluation + "/evaluation/start", data)
+    return make_request("post", Route.evaluation + "/start", data)
\ No newline at end of file

From e9a88efd66daa25a6ad6173c80ac1c248ab47f1f Mon Sep 17 00:00:00 2001
From: Rivindu W
Date: Mon, 7 Oct 2024 15:45:03 +0700
Subject: [PATCH 3/3] add status and fix linting errors

---
 arcee/__init__.py |  2 ++
 arcee/api.py      | 52 +++++++++++++++++++++++++++++++++++++----------
 2 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/arcee/__init__.py b/arcee/__init__.py
index 4ad17e5..2939ad7 100644
--- a/arcee/__init__.py
+++ b/arcee/__init__.py
@@ -9,6 +9,7 @@
     delete_corpus,
     deployment_status,
     generate,
+    get_evaluation_status,
     get_retriever_status,
     list_pretrainings,
     mergekit_evolve,
@@ -61,4 +62,5 @@
     "merging_status",
     "alignment_status",
     "start_evaluation",
+    "get_evaluation_status",
 ]
diff --git a/arcee/api.py b/arcee/api.py
index 0ea145d..d901f1e 100644
--- a/arcee/api.py
+++ b/arcee/api.py
@@ -346,6 +346,7 @@ def corpus_status(corpus: str) -> Dict[str, str]:
     return make_request("post", Route.pretraining + "/corpus/status", data)


+
 def start_alignment(
     alignment_name: str,
     qa_set: Optional[str] = None,
@@ -538,15 +539,15 @@ def start_evaluation(
     evaluations_name: str,
     eval_type: Optional[str] = None,
     qa_set_name: Optional[str] = None,
-    deployment_model: Optional[Dict[str, Optional[str]]] = None,
-    reference_model: Optional[Dict[str, Optional[str]]] = None,
-    judge_model: Optional[Dict[str, Optional[str]]] = None,
+    deployment_model: Optional[Dict[str, Any]] = None,
+    reference_model: Optional[Dict[str, Any]] = None,
+    judge_model: Optional[Dict[str, Any]] = None,
     model_type: Optional[str] = "arcee",
     model_args: Optional[str] = None,
     tasks_list: Optional[List[str]] = None,
     target_compute: Optional[str] = None,
     capacity_id: Optional[str] = None,
-    batch_size: Optional[int] = None
+    batch_size: Optional[int] = None,
 ) -> Dict[str, str]:
     """
     Start an evaluation job.
@@ -582,23 +583,27 @@ def start_evaluation(
         evaluations_name (str): The name of the evaluation job.
         eval_type (Optional[str]): The type of evaluation, e.g., "llm_as_a_judge" or "lm-eval".
         qa_set_name (Optional[str]): The name of the QA set being evaluated.
-        deployment_model (Optional[Dict[str, Optional[str]]]): Configuration for the deployment model.
-        reference_model (Optional[Dict[str, Optional[str]]]): Configuration for the reference model.
-        judge_model (Optional[Dict[str, Optional[str]]]): Configuration for the judge model.
+        deployment_model (Optional[Dict[str, Any]]): Configuration for the deployment model.
+        reference_model (Optional[Dict[str, Any]]): Configuration for the reference model.
+        judge_model (Optional[Dict[str, Any]]): Configuration for the judge model.
         model_type (Optional[str]): The type of the model, default is "arcee".
         model_args (Optional[str]): Model arguments to be fed into lm-eval harness.
-        tasks_list (Optional[List[str]]): List of tasks for the evaluation. See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/README.md for more information.
+        tasks_list (Optional[List[str]]): List of tasks for the evaluation.
+ See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/README.md + for more information. target_compute (Optional[str]): The name of the compute instance to use, default is "p3.2xlarge". capacity_id (Optional[str]): The name of the capacity block ID to use. batch_size (Optional[int]): Batch size for evaluation. + Returns: + Dict[str, str]: A dictionary containing a success message, evaluations name, and evaluations ID. """ - data = { + data: Dict[str, Any] = { "action": "start", "evaluations_name": evaluations_name, "model_type": model_type, "model_args": model_args, - "tasks_list": tasks_list if tasks_list else [], + "tasks_list": tasks_list or [], "target_compute": target_compute, "capacity_id": capacity_id, "batch_size": batch_size, @@ -612,4 +617,29 @@ def start_evaluation( # Remove any keys with None values data = {k: v for k, v in data.items() if v is not None} - return make_request("post", Route.evaluation + "/start", data) \ No newline at end of file + response = make_request("post", Route.evaluation + "/start", data) + return { + "message": response.get("message", "Starting evals..."), + "evaluations_name": response.get("evaluations_name", ""), + "evaluations_id": response.get("evaluations_id", ""), + } + + +def get_evaluation_status(evaluation_id_or_name: str) -> Dict[str, Any]: + """ + Retrieve the status and results of an evaluation job. + + Args: + evaluation_id_or_name (str): The ID or name of the evaluation job. + + Returns: + Dict[str, Any]: A dictionary containing the evaluation status and results. + """ + response = make_request("get", f"{Route.evaluation}/{evaluation_id_or_name}") + return { + "name": response.get("name"), + "id": response.get("id"), + "processing_state": response.get("processing_state", ""), + "status": response.get("status", ""), + "evaluation_data": response.get("evaluation_data", {}), + }
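
A minimal usage sketch of the two endpoints this series adds. Only the function signatures and the returned dictionary keys are taken from the patches above; the job name, QA set, judge credentials, environment variable, and the polling loop are illustrative assumptions.

import os
import time

import arcee

# Start an LLM-as-a-judge evaluation of an Arcee deployment.
job = arcee.start_evaluation(
    evaluations_name="my-judge-eval",                      # hypothetical job name
    eval_type="llm_as_a_judge",
    qa_set_name="my-qa-set",                               # hypothetical QA set
    deployment_model={"model_name": "arcee-model-name"},   # hypothetical deployment
    judge_model={
        "model_name": "gpt-4o",
        "base_url": "https://api.openai.com/v1",
        "api_key": os.environ["OPENAI_API_KEY"],
    },
)
print(job["message"], job["evaluations_id"])

# Poll for completion. The terminal state names below are assumptions;
# check the actual processing_state values returned by the API.
status = arcee.get_evaluation_status(job["evaluations_id"])
while status["processing_state"] not in ("completed", "failed"):
    time.sleep(30)
    status = arcee.get_evaluation_status(job["evaluations_id"])

print(status["status"])
print(status["evaluation_data"])

For lm-eval-harness runs, eval_type, model_args, and tasks_list replace the QA-set and judge configuration, as in the docstring example added in patch 2.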