diff --git a/README.md b/README.md index db5d55e..280c405 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ python -m explainaboard_client.cli.evaluate_system \ --email $EB_EMAIL --api_key $EB_API_KEY \ --task [TASK_ID] \ --system_name [MODEL_NAME] \ - --system_output [SYSTEM_OUTPUT] --output_file_type [FILE_TYPE] \ + --system_output_file [SYSTEM_OUTPUT_FILE] --system_output_file_type [FILE_TYPE] \ --dataset [DATASET] --sub_dataset [SUB_DATASET] --split [SPLIT] \ --source_language [SOURCE] --target_language [TARGET] \ [--public] @@ -53,7 +53,7 @@ python -m explainaboard_client.cli.evaluate_system \ You will need to fill in all the settings appropriately, for example: * `[TASK_ID]` is the ID of the task you want to perform. A full list is [here](https://github.com/neulab/explainaboard_web/blob/main/backend/src/impl/tasks.py). * `[MODEL_NAME]` is whatever name you want to give to your model. -* `[SYSTEM_OUTPUT]` is the file that you want to evaluate. +* `[SYSTEM_OUTPUT_FILE]` is the file that you want to evaluate. * `[FILE_TYPE]` is the type of the file, "text", "tsv", "csv", "conll", or "json". * `[DATASET]`, `[SUB_DATASET]` and `[SPLIT]` indicate which dataset you're evaluating a system output for. 
@@ -71,8 +71,8 @@ python -m explainaboard_client.cli.evaluate_system \ --email $EB_EMAIL --api_key $EB_API_KEY \ --task [TASK_ID] \ --system_name [MODEL_NAME] \ - --system_output [SYSTEM_OUTPUT] --output_file_type [FILE_TYPE] \ - --custom_dataset [CUSTOM_DATASET] --custom_dataset_file_type [FILE_TYPE] \ + --system_output_file [SYSTEM_OUTPUT] --system_output_file_type [FILE_TYPE] \ + --custom_dataset_file [CUSTOM_DATASET] --custom_dataset_file_type [FILE_TYPE] \ --source_language [SOURCE] --target_language [TARGET] ``` diff --git a/explainaboard_client/cli/evaluate_system.py b/explainaboard_client/cli/evaluate_system.py index 374df1e..dc9057a 100644 --- a/explainaboard_client/cli/evaluate_system.py +++ b/explainaboard_client/cli/evaluate_system.py @@ -1,18 +1,7 @@ import argparse -import json -from explainaboard_api_client.model.system import System -from explainaboard_api_client.model.system_create_props import SystemCreateProps -from explainaboard_api_client.model.system_metadata import SystemMetadata -from explainaboard_api_client.model.system_output_props import SystemOutputProps from explainaboard_client import Config, ExplainaboardClient -from explainaboard_client.tasks import ( - DEFAULT_METRICS, - FileType, - infer_file_type, - TaskType, -) -from explainaboard_client.utils import generate_dataset_id +from explainaboard_client.tasks import FileType, TaskType def main(): @@ -30,14 +19,6 @@ def main(): help="Email address used to sign in to ExplainaBoard", ) parser.add_argument("--api_key", type=str, required=True, help="Your API key") - parser.add_argument( - "--server", - type=str, - required=False, - default="main", - choices=["main", "staging", "local"], - help='Which server to use, "main" should be sufficient', - ) # ---- System info parser.add_argument( "--task", @@ -53,13 +34,13 @@ def main(): help="Name of the system that you are evaluating", ) parser.add_argument( - "--system_output", + "--system_output_file", type=str, required=True, help="Path to the 
system output file", ) parser.add_argument( - "--output_file_type", + "--system_output_file_type", type=str, choices=FileType.list(), help="File type of the system output (eg text/json/tsv/conll)", @@ -82,7 +63,7 @@ def main(): help="The name of the dataset split to process", ) dataset_group.add_argument( - "--custom_dataset", type=str, help="The path to a custom dataset file" + "--custom_dataset_file", type=str, help="The path to a custom dataset file" ) parser.add_argument( "--custom_dataset_file_type", @@ -105,7 +86,7 @@ def main(): "--target_language", type=str, help="The language on the output side" ) parser.add_argument( - "--system_details", type=str, help="File of system details in JSON format" + "--system_details_file", type=str, help="File of system details in JSON format" ) parser.add_argument( "--public", action="store_true", help="Make the evaluation results public" @@ -113,66 +94,15 @@ def main(): parser.add_argument( "--shared_users", type=str, nargs="+", help="Emails of users to share with" ) - args = parser.parse_args() - - # Sanity checks - if not (args.source_language or args.target_language): - raise ValueError("You must specify source and/or target language") - - # Infer missing values - task = TaskType(args.task) - metric_names = args.metric_names or DEFAULT_METRICS[args.task] - source_language = args.source_language or args.target_language - target_language = args.target_language or args.source_language - output_file_type = args.output_file_type or infer_file_type( - args.system_output, task - ) - custom_dataset_file_type = args.custom_dataset_file_type or infer_file_type( - args.custom_dataset_file_type, task - ) - shared_users = args.shared_users or [] - - # Read system details file - system_details = {} - if args.system_details: - with open(args.system_details, "r") as fin: - system_details = json.load(fin) - - # Do the actual upload - system_output = SystemOutputProps( - data=args.system_output, - file_type=output_file_type, - ) - 
metadata = SystemMetadata( - task=args.task, - is_private=not args.public, - system_name=args.system_name, - metric_names=metric_names, - source_language=source_language, - target_language=target_language, - dataset_split=args.split, - shared_users=shared_users, - system_details=system_details, - ) - custom_dataset = None - if args.custom_dataset: - custom_dataset = SystemOutputProps( - data=args.custom_dataset, - file_type=custom_dataset_file_type, - ) - else: - metadata.dataset_metadata_id = generate_dataset_id( - args.dataset, args.sub_dataset - ) - create_props = ( - SystemCreateProps( - metadata=metadata, - system_output=system_output, - custom_dataset=custom_dataset, - ) - if custom_dataset is not None - else SystemCreateProps(metadata=metadata, system_output=system_output) + parser.add_argument( + "--server", + type=str, + required=False, + default="main", + choices=["main", "staging", "local"], + help='Which server to use, "main" should be sufficient', ) + args = parser.parse_args() client_config = Config( args.email, @@ -181,11 +111,26 @@ def main(): ) client = ExplainaboardClient(client_config) - result: System = client.systems_post(create_props) try: - sys_id = result.system_id - client.systems_get_by_id(sys_id) + evaluation_data = client.evaluate_system_file( + task=args.task, + system_name=args.system_name, + system_output_file=args.system_output_file, + system_output_file_type=args.system_output_file_type, + dataset=args.dataset, + sub_dataset=args.sub_dataset, + split=args.split, + custom_dataset_file=args.custom_dataset_file, + custom_dataset_file_type=args.custom_dataset_file_type, + metric_names=args.metric_names, + source_language=args.source_language, + target_language=args.target_language, + system_details_file=args.system_details_file, + public=args.public, + shared_users=args.shared_users, + ) frontend = client_config.get_env_host_map()[args.server].frontend + sys_id = evaluation_data["system_id"] print( f"successfully evaluated system 
{args.system_name} with ID {sys_id}\n" f"view it at {frontend}/systems?system_id={sys_id}\n" diff --git a/explainaboard_client/client.py b/explainaboard_client/client.py index 44b233e..73357ad 100644 --- a/explainaboard_client/client.py +++ b/explainaboard_client/client.py @@ -1,21 +1,35 @@ +from __future__ import annotations + +import json +import logging from multiprocessing.pool import ApplyResult from typing import Union from explainaboard_api_client import ApiClient from explainaboard_api_client.api.default_api import DefaultApi +from explainaboard_api_client.model.system_metadata import SystemMetadata from explainaboard_api_client.models import System, SystemCreateProps, SystemOutputProps from explainaboard_client.config import Config -from explainaboard_client.utils import encode_file_to_base64 +from explainaboard_client.tasks import DEFAULT_METRICS, infer_file_type, TaskType +from explainaboard_client.utils import encode_file_to_base64, generate_dataset_id -class ExplainaboardClient(DefaultApi): +class ExplainaboardClient: + # ---- Initializers, etc. def __init__(self, config: Config) -> None: - self._config = config + """Initialize the ExplainaBoard client with a specific configuration. + + Args: + config (Config): The configuration for the ExplainaBoard client. 
+ """ + self._config: Config = config api_client = ApiClient(self._config.to_client_config()) - super().__init__(api_client) + self._default_api: DefaultApi = DefaultApi(api_client) + self._active: bool = True def close(self): - self.api_client.close() + self._default_api.api_client.close() + self._active = False def __enter__(self): return self @@ -23,9 +37,148 @@ def __enter__(self): def __exit__(self): self.close() + # ---- Client Functions + def evaluate_system_file( + self, + task: str, + system_name: str, + system_output_file: str, + system_output_file_type: str | None = None, + dataset: str | None = None, + sub_dataset: str | None = None, + split: str | None = None, + custom_dataset_file: str | None = None, + custom_dataset_file_type: str | None = None, + metric_names: list[str] | None = None, + source_language: str | None = None, + target_language: str | None = None, + system_details_file: str | None = None, + public: bool = False, + shared_users: list[str] | None = None, + ) -> dict: + """Evaluate a system output file and return a dictionary of results. + + Args: + task: What task you will be analyzing. + system_name: Name of the system that you are evaluating. + system_output_file: Path to the system output file. + system_output_file_type: File type of the system output + (eg text/json/tsv/conll). + dataset: A dataset name from DataLab. + sub_dataset: A sub-dataset name from DataLab. + split: The name of the dataset split to process. + custom_dataset_file: The path to a custom dataset file. + custom_dataset_file_type: File type of the custom dataset + (eg text/json/tsv/conll) + metric_names: The metrics to compute, leave blank for task defaults + source_language: The language on the input side. + target_language: The language on the output side. + system_details_file: File of system details in JSON format. + public: Make the evaluation results public. + shared_users: Emails of users to share with. 
+ """ + # Sanity checks + if not (source_language or target_language): + raise ValueError("You must specify source and/or target language") + + # Infer missing values + task = TaskType(task) + metric_names = metric_names or DEFAULT_METRICS[task] + source_language = source_language or target_language + target_language = target_language or source_language + system_output_file_type = system_output_file_type or infer_file_type( + system_output_file, task + ) + custom_dataset_file_type = custom_dataset_file_type or infer_file_type( + custom_dataset_file_type, task + ) + shared_users = shared_users or [] + + # Read system details file + system_details: dict = {} + if system_details_file is not None: + with open(system_details_file, "r") as fin: + system_details = json.load(fin) + + # Do the actual upload + system_output = SystemOutputProps( + data=system_output_file, + file_type=system_output_file_type, + ) + metadata = SystemMetadata( + task=task, + is_private=not public, + system_name=system_name, + metric_names=metric_names, + source_language=source_language, + target_language=target_language, + dataset_split=split, + shared_users=shared_users, + system_details=system_details, + ) + custom_dataset = None + if custom_dataset_file: + custom_dataset = SystemOutputProps( + data=custom_dataset_file, + file_type=custom_dataset_file_type, + ) + elif dataset is not None: + metadata.dataset_metadata_id = generate_dataset_id(dataset, sub_dataset) + else: + raise ValueError("Must specify dataset or custom_dataset_file") + create_props = ( + SystemCreateProps( + metadata=metadata, + system_output=system_output, + custom_dataset=custom_dataset, + ) + if custom_dataset is not None + else SystemCreateProps(metadata=metadata, system_output=system_output) + ) + + result: System = self._systems_post(create_props) + return result.to_dict() + + # --- Pass-through API calls that will be deprecated def systems_post( self, system_create_props: SystemCreateProps, **kwargs ) -> Union[System, 
ApplyResult]: + """Post a system using the client. + + The public function is deprecated and will be removed.""" + logging.getLogger("explainaboard_client").warning( + "WARNING: systems_post() is deprecated and may be removed in the future." + " Please use evaluate_system_file() instead." + ) + return self._systems_post(system_create_props, **kwargs) + + def systems_get_by_id(self, system_id: str, **kwargs): + """API call to get systems. Will be replaced in the future.""" + return self._default_api.systems_get_by_id(system_id, **kwargs) + + def systems_delete_by_id(self, system_id: str, **kwargs): + """API call to delete systems. Will be replaced in the future.""" + self._default_api.systems_delete_by_id(system_id, **kwargs) + + def systems_get(self, **kwargs): + """API call to get systems. Will be replaced in the future.""" + return self._default_api.systems_get(**kwargs) + + def info_get(self, **kwargs): + """API call to get info. Will be replaced in the future.""" + return self._default_api.info_get(**kwargs) + + def user_get(self, **kwargs): + """API call to get a user. 
Will be replaced in the future.""" + return self._default_api.user_get(**kwargs) + + # --- Private utility functions + def _systems_post( + self, system_create_props: SystemCreateProps, **kwargs + ) -> Union[System, ApplyResult]: + """Post a system using the client.""" + if not self._active: + raise RuntimeError("Client is closed.") loaded_system_output = SystemOutputProps( data=encode_file_to_base64(system_create_props.system_output.data), file_type=system_create_props.system_output.file_type, @@ -45,4 +198,4 @@ def systems_post( metadata=system_create_props.metadata, system_output=loaded_system_output, ) - return super().systems_post(props_with_loaded_file, **kwargs) + return self._default_api.systems_post(props_with_loaded_file, **kwargs) diff --git a/explainaboard_client/config.py b/explainaboard_client/config.py index 716e8db..8bdfd45 100644 --- a/explainaboard_client/config.py +++ b/explainaboard_client/config.py @@ -33,9 +33,13 @@ class HostConfig: @dataclass class Config: - """configurations for explainaboard CLI - :param host: if specified, it takes precedence over environment + """Configurations for explainaboard CLI + Vars: + user_email: The email of the user + api_key: API key for explainaboard + environment: Environment where the call should be made + host: A custom host to use """ user_email: str @@ -51,6 +55,9 @@ def __post_init__(self): def get_env_host_map(): return ENV_HOST_MAP + def get_env(self): + return ENV_HOST_MAP[self.environment] + def to_client_config(self): client_config = Configuration() client_config.host = ENV_HOST_MAP[self.environment].host diff --git a/explainaboard_client/tests/test_system.py b/explainaboard_client/tests/test_system.py index cc43314..0034abc 100644 --- a/explainaboard_client/tests/test_system.py +++ b/explainaboard_client/tests/test_system.py @@ -1,70 +1,47 @@ import os -from explainaboard_api_client.models import ( - System, - SystemCreateProps, - SystemMetadata, - SystemOutputProps, -) from 
explainaboard_client.tests.test_utils import test_artifacts_path, TestEndpointsE2E -from explainaboard_client.utils import generate_dataset_id class TestSystem(TestEndpointsE2E): - _SYSTEM_OUTPUT = SystemOutputProps( - data=os.path.join(test_artifacts_path, "sst2-lstm-output.txt"), - file_type="text", - ) + _SYSTEM_OUTPUT = os.path.join(test_artifacts_path, "sst2-lstm-output.txt") + _DATASET = os.path.join(test_artifacts_path, "sst2-dataset.tsv") def test_no_custom_dataset(self): - metadata = SystemMetadata( + result: dict = self._client.evaluate_system_file( + system_output_file=self._SYSTEM_OUTPUT, + system_output_file_type="text", task="text-classification", - is_private=True, system_name="test_cli", metric_names=["Accuracy"], source_language="en", target_language="en", - dataset_metadata_id=generate_dataset_id("sst2", None), - dataset_split="test", + dataset="sst2", + split="test", shared_users=["explainaboard@gmail.com"], - system_details={"hello": "world"}, ) - create_props = SystemCreateProps( - metadata=metadata, system_output=self._SYSTEM_OUTPUT - ) - result: System = self._client.systems_post(create_props) - sys_id = result.system_id + sys_id = result["system_id"] try: sys = self._client.systems_get_by_id(sys_id) self.assertIn("dataset", sys) self.assertIn("system_info", sys) - finally: # cleanup self._client.systems_delete_by_id(sys_id) def test_custom_dataset(self): - metadata = SystemMetadata( + result: dict = self._client.evaluate_system_file( + system_output_file=self._SYSTEM_OUTPUT, + system_output_file_type="text", + custom_dataset_file=self._DATASET, + custom_dataset_file_type="tsv", task="text-classification", - is_private=True, system_name="test_cli", metric_names=["Accuracy"], source_language="en", target_language="en", - dataset_split="test", + split="test", # TODO(gneubig): required, but probably shouldn't be shared_users=["explainaboard@gmail.com"], - system_details={"hello": "world"}, - ) - custom_dataset = SystemOutputProps( - 
data=os.path.join(test_artifacts_path, "sst2-dataset.tsv"), - file_type="tsv", ) - create_props = SystemCreateProps( - metadata=metadata, - system_output=self._SYSTEM_OUTPUT, - custom_dataset=custom_dataset, - ) - result: System = self._client.systems_post(create_props) - # cleanup - sys_id = result.system_id + sys_id = result["system_id"] self._client.systems_delete_by_id(sys_id)