From 01a4f959be15e7e11124b208069b38a0c57bccee Mon Sep 17 00:00:00 2001 From: Angela Date: Tue, 30 Jul 2024 16:54:57 -0700 Subject: [PATCH 1/6] cleanlab studio beta api --- cleanlab_studio/errors.py | 8 + cleanlab_studio/internal/api/api.py | 118 ++++++--------- cleanlab_studio/internal/api/api_helper.py | 52 ++++++- cleanlab_studio/internal/api/beta_api.py | 95 ++++++++++++ cleanlab_studio/internal/studio_base.py | 37 +++++ cleanlab_studio/internal/upload_helpers.py | 10 +- cleanlab_studio/studio/studio.py | 51 ++----- cleanlab_studio/studio_beta/__init__.py | 1 + cleanlab_studio/studio_beta/beta_dataset.py | 68 +++++++++ cleanlab_studio/studio_beta/beta_job.py | 157 ++++++++++++++++++++ cleanlab_studio/studio_beta/studio_beta.py | 64 ++++++++ 11 files changed, 545 insertions(+), 116 deletions(-) create mode 100644 cleanlab_studio/internal/api/beta_api.py create mode 100644 cleanlab_studio/internal/studio_base.py create mode 100644 cleanlab_studio/studio_beta/__init__.py create mode 100644 cleanlab_studio/studio_beta/beta_dataset.py create mode 100644 cleanlab_studio/studio_beta/beta_job.py create mode 100644 cleanlab_studio/studio_beta/studio_beta.py diff --git a/cleanlab_studio/errors.py b/cleanlab_studio/errors.py index c7829ecf..42d8aaeb 100644 --- a/cleanlab_studio/errors.py +++ b/cleanlab_studio/errors.py @@ -152,3 +152,11 @@ def __init__(self, filepath: Union[str, pathlib.Path] = "") -> None: if isinstance(filepath, pathlib.Path): filepath = str(filepath) super().__init__(f"File could not be found at {filepath}. Please check the file path.") + + +class BetaJobError(HandledError): + pass + + +class DownloadResultsError(HandledError): + pass diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index 9e749205..24011763 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -40,52 +40,22 @@ pyspark_exists = False from cleanlab_studio.errors import NotInstalledError -from cleanlab_studio.internal.api.api_helper import check_uuid_well_formed +from cleanlab_studio.internal.api.api_helper import ( + check_uuid_well_formed, + construct_headers, + handle_api_error, +) from cleanlab_studio.internal.types import JSONDict, SchemaOverride from cleanlab_studio.version import __version__ -base_url = os.environ.get("CLEANLAB_API_BASE_URL", "https://api.cleanlab.ai/api") -cli_base_url = f"{base_url}/cli/v0" -upload_base_url = f"{base_url}/upload/v1" -dataset_base_url = f"{base_url}/datasets" -project_base_url = f"{base_url}/projects" -cleanset_base_url = f"{base_url}/cleansets" -model_base_url = f"{base_url}/v1/deployment" -tlm_base_url = f"{base_url}/v0/trustworthy_llm" - - -def _construct_headers( - api_key: Optional[str], content_type: Optional[str] = "application/json" -) -> JSONDict: - retval = dict() - if api_key: - retval["Authorization"] = f"bearer {api_key}" - if content_type: - retval["Content-Type"] = content_type - retval["Client-Type"] = "python-api" - return retval - - -def handle_api_error(res: requests.Response) -> None: - handle_api_error_from_json(res.json(), res.status_code) - - -def handle_api_error_from_json(res_json: JSONDict, status_code: Optional[int] = None) -> None: - if "code" in res_json and "description" in res_json: # AuthError or UserQuotaError format - if res_json["code"] == "user_soft_quota_exceeded": - pass # soft quota limit is going away soon, so ignore it - else: - raise APIError(res_json["description"]) - - if res_json.get("error", None) is not None: - error = res_json["error"] - if ( - 
status_code == 422 - and isinstance(error, dict) - and error.get("code", None) == "UNSUPPORTED_PROJECT_CONFIGURATION" - ): - raise InvalidProjectConfiguration(error["description"]) - raise APIError(res_json["error"]) +API_BASE_URL = os.environ.get("CLEANLAB_API_BASE_URL", "https://api.cleanlab.ai/api") +cli_base_url = f"{API_BASE_URL}/cli/v0" +upload_base_url = f"{API_BASE_URL}/upload/v1" +dataset_base_url = f"{API_BASE_URL}/datasets" +project_base_url = f"{API_BASE_URL}/projects" +cleanset_base_url = f"{API_BASE_URL}/cleansets" +model_base_url = f"{API_BASE_URL}/v1/deployment" +tlm_base_url = f"{API_BASE_URL}/v0/trustworthy_llm" def handle_rate_limit_error_from_resp(resp: aiohttp.ClientResponse) -> None: @@ -134,7 +104,7 @@ def validate_api_key(api_key: str) -> bool: res = requests.get( cli_base_url + "/validate", json=dict(api_key=api_key), - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) valid: bool = res.json()["valid"] @@ -154,7 +124,7 @@ def initialize_upload( res = requests.post( f"{upload_base_url}/file/initialize", json=dict(size_in_bytes=str(file_size), filename=filename, file_type=file_type), - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) upload_id: str = res.json()["upload_id"] @@ -169,7 +139,7 @@ def complete_file_upload(api_key: str, upload_id: str, upload_parts: List[JSONDi res = requests.post( f"{upload_base_url}/file/complete", json=request_json, - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) @@ -184,7 +154,7 @@ def confirm_upload( res = requests.post( f"{upload_base_url}/confirm", json=request_json, - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) @@ -199,7 +169,7 @@ def update_schema( res = requests.patch( f"{upload_base_url}/schema", json=request_json, - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) @@ -209,7 +179,7 @@ def get_ingestion_status(api_key: str, upload_id: str) -> JSONDict: res = requests.get( f"{upload_base_url}/total_progress", params=dict(upload_id=upload_id), - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) res_json: JSONDict = res.json() @@ -221,7 +191,7 @@ def get_dataset_id(api_key: str, upload_id: str) -> JSONDict: res = requests.get( f"{upload_base_url}/dataset_id", params=dict(upload_id=upload_id), - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) res_json: JSONDict = res.json() @@ -232,7 +202,7 @@ def get_project_of_cleanset(api_key: str, cleanset_id: str) -> str: check_uuid_well_formed(cleanset_id, "cleanset ID") res = requests.get( cli_base_url + f"/cleansets/{cleanset_id}/project", - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) project_id: str = res.json()["project_id"] @@ -243,7 +213,7 @@ def get_label_column_of_project(api_key: str, project_id: str) -> str: check_uuid_well_formed(project_id, "project ID") res = requests.get( cli_base_url + f"/projects/{project_id}/label_column", - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) label_column: str = res.json()["label_column"] @@ -274,7 +244,7 @@ def download_cleanlab_columns( include_cleanlab_columns=include_cleanlab_columns, include_project_details=include_project_details, ), - headers=_construct_headers(api_key), + 
headers=construct_headers(api_key), ) handle_api_error(res) id_col = get_id_column(api_key, cleanset_id) @@ -306,7 +276,7 @@ def download_array( check_uuid_well_formed(cleanset_id, "cleanset ID") res = requests.get( cli_base_url + f"/cleansets/{cleanset_id}/{name}", - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) res_json: JSONDict = res.json() @@ -323,7 +293,7 @@ def get_id_column(api_key: str, cleanset_id: str) -> str: check_uuid_well_formed(cleanset_id, "cleanset ID") res = requests.get( cli_base_url + f"/cleansets/{cleanset_id}/id_column", - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) id_column: str = res.json()["id_column"] @@ -334,7 +304,7 @@ def get_dataset_of_project(api_key: str, project_id: str) -> str: check_uuid_well_formed(project_id, "project ID") res = requests.get( cli_base_url + f"/projects/{project_id}/dataset", - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) dataset_id: str = res.json()["dataset_id"] @@ -345,7 +315,7 @@ def get_dataset_schema(api_key: str, dataset_id: str) -> JSONDict: check_uuid_well_formed(dataset_id, "dataset ID") res = requests.get( cli_base_url + f"/datasets/{dataset_id}/schema", - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) schema: JSONDict = res.json()["schema"] @@ -357,7 +327,7 @@ def get_dataset_details(api_key: str, dataset_id: str, task_type: Optional[str]) res = requests.get( project_base_url + f"/dataset_details/{dataset_id}", params=dict(tasktype=task_type), - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) dataset_details: JSONDict = res.json() @@ -368,7 +338,7 @@ def check_column_diversity(api_key: str, dataset_id: str, column_name: str) -> J check_uuid_well_formed(dataset_id, "dataset ID") res = requests.get( dataset_base_url + f"/diversity/{dataset_id}/{column_name}", - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) column_diversity: JSONDict = res.json() @@ -379,7 +349,7 @@ def is_valid_multilabel_column(api_key: str, dataset_id: str, column_name: str) check_uuid_well_formed(dataset_id, "dataset ID") res = requests.get( dataset_base_url + f"/check_valid_multilabel/{dataset_id}/{column_name}", - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) multilabel_column: JSONDict = res.json() @@ -410,7 +380,7 @@ def clean_dataset( ) res = requests.post( project_base_url + f"/clean", - headers=_construct_headers(api_key), + headers=construct_headers(api_key), json=request_json, ) handle_api_error(res) @@ -422,7 +392,7 @@ def get_latest_cleanset_id(api_key: str, project_id: str) -> str: check_uuid_well_formed(project_id, "project ID") res = requests.get( cleanset_base_url + f"/project/{project_id}/latest_cleanset_id", - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) cleanset_id = res.json()["cleanset_id"] @@ -448,7 +418,7 @@ def get_dataset_id_for_name( res = requests.get( dataset_base_url + f"/dataset_id_for_name", params=dict(dataset_name=dataset_name), - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) return cast(Optional[str], res.json().get("dataset_id", None)) @@ -458,7 +428,7 @@ def get_cleanset_status(api_key: str, cleanset_id: str) -> JSONDict: 
check_uuid_well_formed(cleanset_id, "cleanset ID") res = requests.get( cleanset_base_url + f"/{cleanset_id}/status", - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) status: JSONDict = res.json() @@ -467,13 +437,13 @@ def get_cleanset_status(api_key: str, cleanset_id: str) -> JSONDict: def delete_dataset(api_key: str, dataset_id: str) -> None: check_uuid_well_formed(dataset_id, "dataset ID") - res = requests.delete(dataset_base_url + f"/{dataset_id}", headers=_construct_headers(api_key)) + res = requests.delete(dataset_base_url + f"/{dataset_id}", headers=construct_headers(api_key)) handle_api_error(res) def delete_project(api_key: str, project_id: str) -> None: check_uuid_well_formed(project_id, "project ID") - res = requests.delete(project_base_url + f"/{project_id}", headers=_construct_headers(api_key)) + res = requests.delete(project_base_url + f"/{project_id}", headers=construct_headers(api_key)) handle_api_error(res) @@ -528,7 +498,7 @@ def deploy_model(api_key: str, cleanset_id: str, model_name: str) -> str: check_uuid_well_formed(cleanset_id, "cleanset ID") res = requests.post( model_base_url, - headers=_construct_headers(api_key), + headers=construct_headers(api_key), json=dict(cleanset_id=cleanset_id, deployment_name=model_name), ) @@ -542,7 +512,7 @@ def get_deployment_status(api_key: str, model_id: str) -> str: check_uuid_well_formed(model_id, "model ID") res = requests.get( f"{model_base_url}/{model_id}", - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) deployment: JSONDict = res.json() @@ -555,7 +525,7 @@ def upload_predict_batch(api_key: str, model_id: str, batch: io.StringIO) -> str url = f"{model_base_url}/{model_id}/upload" res = requests.post( url, - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) @@ -573,7 +543,7 @@ def start_prediction(api_key: str, model_id: str, query_id: str) -> None: check_uuid_well_formed(query_id, "query ID") res = requests.post( f"{model_base_url}/{model_id}/predict/{query_id}", - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) @@ -584,7 +554,7 @@ def get_prediction_status(api_key: str, query_id: str) -> Dict[str, str]: check_uuid_well_formed(query_id, "query ID") res = requests.get( f"{model_base_url}/predict/{query_id}", - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) @@ -596,7 +566,7 @@ def get_deployed_model_info(api_key: str, model_id: str) -> Dict[str, str]: check_uuid_well_formed(model_id, "model ID") res = requests.get( f"{model_base_url}/{model_id}", - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) handle_api_error(res) @@ -672,7 +642,7 @@ async def tlm_prompt( res = await client_session.post( f"{tlm_base_url}/prompt", json=dict(prompt=prompt, quality=quality_preset, options=options or {}), - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) res_json = await res.json() @@ -733,7 +703,7 @@ async def tlm_get_confidence_score( quality=quality_preset, options=options or {}, ), - headers=_construct_headers(api_key), + headers=construct_headers(api_key), ) res_json = await res.json() diff --git a/cleanlab_studio/internal/api/api_helper.py b/cleanlab_studio/internal/api/api_helper.py index a531e5e2..2864657f 100644 --- a/cleanlab_studio/internal/api/api_helper.py +++ b/cleanlab_studio/internal/api/api_helper.py @@ -1,6 +1,22 @@ 
import uuid +from typing import List, Optional, TypedDict -from cleanlab_studio.errors import InvalidUUIDError +import requests + +from cleanlab_studio.errors import ( + APIError, + InvalidProjectConfiguration, + InvalidUUIDError, +) +from cleanlab_studio.internal.types import JSONDict + + +class UploadPart(TypedDict): + ETag: str + PartNumber: int + + +UploadParts = List[UploadPart] def check_uuid_well_formed(uuid_string: str, id_name: str) -> None: @@ -10,3 +26,37 @@ def check_uuid_well_formed(uuid_string: str, id_name: str) -> None: raise InvalidUUIDError( f"{uuid_string} is not a well-formed {id_name}, please double check and try again." ) + + +def construct_headers( + api_key: Optional[str], content_type: Optional[str] = "application/json" +) -> JSONDict: + retval = dict() + if api_key: + retval["Authorization"] = f"bearer {api_key}" + if content_type: + retval["Content-Type"] = content_type + retval["Client-Type"] = "python-api" + return retval + + +def handle_api_error(res: requests.Response) -> None: + handle_api_error_from_json(res.json(), res.status_code) + + +def handle_api_error_from_json(res_json: JSONDict, status_code: Optional[int] = None) -> None: + if "code" in res_json and "description" in res_json: # AuthError or UserQuotaError format + if res_json["code"] == "user_soft_quota_exceeded": + pass # soft quota limit is going away soon, so ignore it + else: + raise APIError(res_json["description"]) + + if res_json.get("error", None) is not None: + error = res_json["error"] + if ( + status_code == 422 + and isinstance(error, dict) + and error.get("code", None) == "UNSUPPORTED_PROJECT_CONFIGURATION" + ): + raise InvalidProjectConfiguration(error["description"]) + raise APIError(res_json["error"]) diff --git a/cleanlab_studio/internal/api/beta_api.py b/cleanlab_studio/internal/api/beta_api.py new file mode 100644 index 00000000..311ac9be --- /dev/null +++ b/cleanlab_studio/internal/api/beta_api.py @@ -0,0 +1,95 @@ +from typing import Any, Dict, List + +import requests + +from .api import API_BASE_URL, construct_headers +from .api_helper import JSONDict, UploadParts, handle_api_error + +experimental_jobs_base_url = f"{API_BASE_URL}/v0/experimental_jobs" + + +def initialize_upload( + api_key: str, filename: str, file_type: str, file_size: int +) -> Dict[str, Any]: + url = f"{experimental_jobs_base_url}/upload/initialize" + headers = construct_headers(api_key) + data = { + "filename": filename, + "file_type": file_type, + "size_in_bytes": file_size, + } + resp = requests.post(url, headers=headers, json=data) + resp.raise_for_status() + return resp.json() + + +def complete_upload(api_key: str, dataset_id: str, upload_parts: UploadParts) -> JSONDict: + url = f"{experimental_jobs_base_url}/upload/complete" + headers = construct_headers(api_key) + data = { + "dataset_id": dataset_id, + "upload_parts": upload_parts, + } + resp = requests.post(url, headers=headers, json=data) + handle_api_error(resp) + return resp.json() + + +def get_dataset(api_key: str, dataset_id: str) -> JSONDict: + url = f"{experimental_jobs_base_url}/datasets/{dataset_id}" + headers = construct_headers(api_key) + resp = requests.get(url, headers=headers) + handle_api_error(resp) + return resp.json() + + +def run_job(api_key: str, dataset_id: str, job_definition_name: str) -> JSONDict: + url = f"{experimental_jobs_base_url}/run" + headers = construct_headers(api_key) + data = { + "dataset_id": dataset_id, + "job_definition_name": job_definition_name, + } + resp = requests.post(url, headers=headers, json=data) + 
handle_api_error(resp) + return resp.json() + + +def get_job(api_key: str, job_id: str) -> JSONDict: + url = f"{experimental_jobs_base_url}/{job_id}" + headers = construct_headers(api_key) + resp = requests.get(url, headers=headers) + handle_api_error(resp) + return resp.json() + + +def get_job_status(api_key: str, job_id: str) -> JSONDict: + url = f"{experimental_jobs_base_url}/{job_id}/status" + headers = construct_headers(api_key) + resp = requests.get(url, headers=headers) + resp.raise_for_status() + return resp.json() + + +def get_results(api_key: str, job_id: str) -> JSONDict: + url = f"{experimental_jobs_base_url}/{job_id}/results" + headers = construct_headers(api_key) + resp = requests.get(url, headers=headers) + resp.raise_for_status() + return resp.json() + + +def list_datasets(api_key: str) -> List[JSONDict]: + url = f"{experimental_jobs_base_url}/datasets" + headers = construct_headers(api_key) + resp = requests.get(url, headers=headers) + handle_api_error(resp) + return resp.json()["datasets"] + + +def list_jobs(api_key: str) -> List[JSONDict]: + url = f"{experimental_jobs_base_url}/jobs" + headers = construct_headers(api_key) + resp = requests.get(url, headers=headers) + handle_api_error(resp) + return resp.json()["jobs"] diff --git a/cleanlab_studio/internal/studio_base.py b/cleanlab_studio/internal/studio_base.py new file mode 100644 index 00000000..2d6c6d2b --- /dev/null +++ b/cleanlab_studio/internal/studio_base.py @@ -0,0 +1,37 @@ +from aiohttp_retry import Optional + +from cleanlab_studio.errors import MissingAPIKeyError, VersionError +from cleanlab_studio.internal.api import api +from cleanlab_studio.internal.settings import CleanlabSettings + + +class StudioBase: + _api_key: str + + def __init__(self, api_key: Optional[str]): + """ + Creates a Cleanlab Studio client. + + Args: + api_key: You can find your API key on your [account page](https://app.cleanlab.ai/account) in Cleanlab Studio. Instead of specifying the API key here, you can also log in with `cleanlab login` on the command-line. + + """ + if not api.is_valid_client_version(): + raise VersionError( + "CLI is out of date and must be updated. Run 'pip install --upgrade cleanlab-studio'." 
+ ) + if api_key is None: + try: + api_key = CleanlabSettings.load().api_key + if api_key is None: + raise ValueError + except (FileNotFoundError, KeyError, ValueError): + raise MissingAPIKeyError( + "No API key found; either specify API key or log in with 'cleanlab login' first" + ) + if not api.validate_api_key(api_key): + raise ValueError( + f"Invalid API key, please check if it is properly specified: {api_key}" + ) + + self._api_key = api_key diff --git a/cleanlab_studio/internal/upload_helpers.py b/cleanlab_studio/internal/upload_helpers.py index 2a70190d..5cade0c4 100644 --- a/cleanlab_studio/internal/upload_helpers.py +++ b/cleanlab_studio/internal/upload_helpers.py @@ -2,17 +2,19 @@ import functools import json from typing import Any, Dict, List, Optional -from tqdm import tqdm import aiohttp -from multidict import CIMultiDictProxy import requests +from multidict import CIMultiDictProxy from requests.adapters import HTTPAdapter, Retry +from tqdm import tqdm + +from cleanlab_studio.errors import InvalidSchemaTypeError from .api import api +from .api.api_helper import UploadParts from .dataset_source import DatasetSource from .types import JSONDict, SchemaOverride -from cleanlab_studio.errors import InvalidSchemaTypeError def upload_dataset( @@ -64,7 +66,7 @@ async def upload_file_parts_async( def upload_file_parts( dataset_source: DatasetSource, part_sizes: List[int], presigned_posts: List[str] -) -> List[JSONDict]: +) -> UploadParts: session = requests.Session() session.mount("https://", adapter=HTTPAdapter(max_retries=Retry(total=3, backoff_factor=1))) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 6f41dd7f..788bfeec 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -2,30 +2,29 @@ Python API for Cleanlab Studio. """ -from typing import Any, List, Literal, Optional, Union -from types import FunctionType import warnings +from types import FunctionType +from typing import Any, List, Literal, Optional, Union import numpy as np import numpy.typing as npt import pandas as pd -from . import inference -from . import trustworthy_language_model -from cleanlab_studio.utils import tlm_hybrid -from cleanlab_studio.errors import CleansetError +from cleanlab_studio.errors import CleansetError, InvalidDatasetError from cleanlab_studio.internal import clean_helpers, deploy_helpers, upload_helpers from cleanlab_studio.internal.api import api +from cleanlab_studio.internal.studio_base import StudioBase +from cleanlab_studio.internal.types import SchemaOverride, TLMQualityPreset from cleanlab_studio.internal.util import ( - init_dataset_source, - telemetry, + apply_corrections_pd_df, apply_corrections_snowpark_df, apply_corrections_spark_df, - apply_corrections_pd_df, + init_dataset_source, + telemetry, ) -from cleanlab_studio.internal.settings import CleanlabSettings -from cleanlab_studio.internal.types import SchemaOverride, TLMQualityPreset -from cleanlab_studio.errors import VersionError, MissingAPIKeyError, InvalidDatasetError +from cleanlab_studio.utils import tlm_hybrid + +from . import inference, trustworthy_language_model _snowflake_exists = api.snowflake_exists if _snowflake_exists: @@ -36,36 +35,14 @@ import pyspark.sql -class Studio: - _api_key: str - - def __init__(self, api_key: Optional[str]): +class Studio(StudioBase): + def __init__(self, api_key: str) -> None: """ Creates a Cleanlab Studio client. 
- Args: api_key: You can find your API key on your [account page](https://app.cleanlab.ai/account) in Cleanlab Studio. Instead of specifying the API key here, you can also log in with `cleanlab login` on the command-line. - """ - if not api.is_valid_client_version(): - raise VersionError( - "CLI is out of date and must be updated. Run 'pip install --upgrade cleanlab-studio'." - ) - if api_key is None: - try: - api_key = CleanlabSettings.load().api_key - if api_key is None: - raise ValueError - except (FileNotFoundError, KeyError, ValueError): - raise MissingAPIKeyError( - "No API key found; either specify API key or log in with 'cleanlab login' first" - ) - if not api.validate_api_key(api_key): - raise ValueError( - f"Invalid API key, please check if it is properly specified: {api_key}" - ) - - self._api_key = api_key + super().__init__(api_key) def upload_dataset( self, diff --git a/cleanlab_studio/studio_beta/__init__.py b/cleanlab_studio/studio_beta/__init__.py new file mode 100644 index 00000000..18721a11 --- /dev/null +++ b/cleanlab_studio/studio_beta/__init__.py @@ -0,0 +1 @@ +from cleanlab_studio.studio_beta.studio_beta import StudioBeta as StudioBeta diff --git a/cleanlab_studio/studio_beta/beta_dataset.py b/cleanlab_studio/studio_beta/beta_dataset.py new file mode 100644 index 00000000..34d3c393 --- /dev/null +++ b/cleanlab_studio/studio_beta/beta_dataset.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +import pathlib +from dataclasses import dataclass +from typing import List + +from cleanlab_studio.internal.api.beta_api import ( + complete_upload, + get_dataset, + initialize_upload, + list_datasets, +) +from cleanlab_studio.internal.dataset_source import FilepathDatasetSource +from cleanlab_studio.internal.upload_helpers import upload_file_parts + + +@dataclass +class BetaDataset: + id: str + filename: str + upload_complete: bool + upload_date: int + + @classmethod + def from_id(cls, api_key: str, dataset_id: str) -> "BetaDataset": + dataset = get_dataset(api_key, dataset_id) + return cls( + id=dataset_id, + filename=dataset["filename"], + upload_complete=dataset["complete"], + upload_date=dataset["upload_date"], + ) + + @classmethod + def from_filepath(cls, api_key: str, filepath: str) -> "BetaDataset": + dataset_source = FilepathDatasetSource(filepath=pathlib.Path(filepath)) + initialize_response = initialize_upload( + api_key, + dataset_source.get_filename(), + dataset_source.file_type, + dataset_source.file_size, + ) + dataset_id = initialize_response["id"] + part_sizes = initialize_response["part_sizes"] + presigned_posts = initialize_response["presigned_posts"] + + # TODO: upload file parts + upload_parts = upload_file_parts(dataset_source, part_sizes, presigned_posts) + dataset = complete_upload(api_key, dataset_id, upload_parts) + return cls( + id=dataset_id, + filename=dataset["filename"], + upload_complete=dataset["complete"], + upload_date=dataset["upload_date"], + ) + + @classmethod + def list(cls, api_key: str) -> List[BetaDataset]: + datasets = list_datasets(api_key) + return [ + cls( + id=dataset["id"], + filename=dataset["filename"], + upload_complete=dataset["complete"], + upload_date=dataset["upload_date"], + ) + for dataset in datasets + ] diff --git a/cleanlab_studio/studio_beta/beta_job.py b/cleanlab_studio/studio_beta/beta_job.py new file mode 100644 index 00000000..3a271dad --- /dev/null +++ b/cleanlab_studio/studio_beta/beta_job.py @@ -0,0 +1,157 @@ +from __future__ import annotations + +import enum +import itertools +import pathlib +import time 
+from dataclasses import dataclass +from typing import Optional + +import requests +from tqdm import tqdm + +from cleanlab_studio.errors import BetaJobError, DownloadResultsError +from cleanlab_studio.internal.api.beta_api import ( + get_job, + get_job_status, + get_results, + list_jobs, + run_job, +) + + +class JobStatus(enum.Enum): + CREATED = 0 + RUNNING = 1 + READY = 2 + FAILED = -1 + + @classmethod + def from_name(cls, name: str) -> "JobStatus": + return cls[name.upper()] + + +@dataclass +class BetaJob: + id: str + status: JobStatus + dataset_id: str + job_definition_name: str + created_at: int + _api_key: str + + @classmethod + def from_id(cls, api_key: str, job_id: str) -> "BetaJob": + """Loads an existing job by ID. + Args: + api_key: Your API key. + job_id: The ID of the job to load. + Returns: + The job object. + """ + job_resp = get_job(api_key, job_id) + job = cls( + _api_key=api_key, + id=job_resp["id"], + dataset_id=job_resp["dataset_id"], + job_definition_name=job_resp["job_definition_name"], + status=JobStatus.from_name(job_resp["status"]), + created_at=job_resp["created_at"], + ) + return job + + @classmethod + def run(cls, api_key: str, dataset_id: str, job_definition_name: str) -> "BetaJob": + """Creates and runs a new job with the given dataset and job definition. + + Args: + api_key: Your API key. + dataset_id: The ID of the dataset to run the job on. + job_definition_name: The name of the job definition to run. + """ + job_resp = run_job(api_key, dataset_id, job_definition_name) + job = cls( + _api_key=api_key, + id=job_resp["id"], + dataset_id=dataset_id, + job_definition_name=job_definition_name, + status=JobStatus.from_name(job_resp["status"]), + created_at=job_resp["created_at"], + ) + return job + + def wait_until_ready(self, timeout: Optional[int] = None) -> None: + """Blocks until a job is ready or the timeout is reached. + + Args: + timeout (Optional[float], optional): timeout for polling, in seconds. Defaults to None. 
+ + Raises: + TimeoutError: if job is not ready by end of timeout + BetaJobError: if job fails + """ + start_time = time.time() + res = get_job_status(self._api_key, self.id) + self.status = JobStatus.from_name(res["status"]) + spinner = itertools.cycle("|/-\\") + + with tqdm( + total=JobStatus.READY.value, + desc="Job Progress: \\", + bar_format="{desc} {postfix}", + ) as pbar: + while self.status != JobStatus.READY and self.status != JobStatus.FAILED: + pbar.set_postfix_str(self.status.name.capitalize()) + pbar.update(int(self.status.value) - pbar.n) + + if timeout is not None and time.time() - start_time > timeout: + raise TimeoutError("Result not ready before timeout") + + for _ in range(50): + time.sleep(0.1) + pbar.set_description_str(f"Job Progress: {next(spinner)}") + + res = get_job_status(self._api_key, self.id) + self.status = JobStatus.from_name(res["status"]) + + if self.status == JobStatus.READY: + pbar.update(pbar.total - pbar.n) + pbar.set_postfix_str(self.status.name.capitalize()) + return + + if self.status == JobStatus.FAILED: + pbar.set_postfix_str(self.status.name.capitalize()) + raise BetaJobError(f"Experimental job {self.id} failed to complete") + + def download_results(self, output_filepath: str) -> None: + output_path = pathlib.Path(output_filepath) + if self.status != JobStatus.READY: + raise BetaJobError("Job must be ready to download results") + + if self.status == JobStatus.FAILED: + raise BetaJobError("Job failed, cannot download results") + + results = get_results(self._api_key, self.id) + if output_path.suffix != results["result_file_type"]: + raise DownloadResultsError( + f"Output file extension does not match result file type {results['result_file_type']}" + ) + + resp = requests.get(results["result_url"]) + resp.raise_for_status() + output_path.write_bytes(resp.content) + + @classmethod + def list(cls, api_key: str) -> None: + jobs = list_jobs(api_key) + return [ + cls( + _api_key=api_key, + id=job["id"], + dataset_id=job["dataset_id"], + job_definition_name=job["job_definition_name"], + status=JobStatus.from_name(job["status"]), + created_at=job["created_at"], + ) + for job in jobs + ] diff --git a/cleanlab_studio/studio_beta/studio_beta.py b/cleanlab_studio/studio_beta/studio_beta.py new file mode 100644 index 00000000..85db729a --- /dev/null +++ b/cleanlab_studio/studio_beta/studio_beta.py @@ -0,0 +1,64 @@ +from typing import List + +from cleanlab_studio.internal.studio_base import StudioBase +from cleanlab_studio.studio_beta.beta_dataset import BetaDataset +from cleanlab_studio.studio_beta.beta_job import BetaJob + + +class StudioBeta(StudioBase): + def __init__(self, api_key: str): + """ + Creates a client to interact with the Cleanlab Studio Beta API. + Args: + api_key: You can find your API key on your [account page](https://app.cleanlab.ai/account) in Cleanlab Studio. Instead of specifying the API key here, you can also log in with `cleanlab login` on the command-line. + """ + super().__init__(api_key) + + def upload_dataset( + self, + filepath: str, + ) -> BetaDataset: + """Uploads a dataset from the given filepath for use in the Cleanlab Studio Beta API. + Args: + filepath: The path to the dataset file. + + Returns: + The uploaded dataset object. You can use this object to obtain the ID of the uploaded dataset. + Use this ID to run jobs on the dataset. 
+ """ + return BetaDataset.from_filepath(self._api_key, filepath) + + def run_job(self, dataset_id: str, job_definition_name: str) -> BetaJob: + """Runs a Cleanlab Studio Beta job with the given dataset and job definition. + Args: + dataset_id: The ID of the dataset to run the job on. + job_definition_name: The name of the job definition to run. + + Returns: + The object representing the job. You can use this object to check the status of the job and download the results. + """ + return BetaJob.run(self._api_key, dataset_id, job_definition_name) + + def download_results(self, job_id: str, output_filename: str) -> None: + """Downloads the results of an experimental job to the given output filename. + Args: + job_id: The ID of the job to download the results for. + output_filename: The path to save the downloaded results to. + """ + BetaJob.from_id(self._api_key, job_id).download_results(output_filename) + + def list_datasets(self) -> List[BetaDataset]: + """Lists all datasets you have uploaded through the Beta API. + + Returns: + A list of all the datasets you have uploaded through the Cleanlab Studio Beta API. + """ + return BetaDataset.list(self._api_key) + + def list_jobs(self) -> List[BetaJob]: + """Lists all jobs you have run through the Beta API. + + Returns: + A list of all the jobs you have run through the Cleanlab Studio Beta API. + """ + return BetaJob.list(self._api_key) From ac216fd0723b980b9e85d1c2fd88c256e390eefe Mon Sep 17 00:00:00 2001 From: Angela Date: Wed, 31 Jul 2024 23:23:25 -0700 Subject: [PATCH 2/6] mypy --- cleanlab_studio/internal/api/api.py | 3 ++- cleanlab_studio/studio_beta/beta_job.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index 24011763..37cf0983 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -41,6 +41,7 @@ from cleanlab_studio.errors import NotInstalledError from cleanlab_studio.internal.api.api_helper import ( + UploadParts, check_uuid_well_formed, construct_headers, handle_api_error, @@ -133,7 +134,7 @@ def initialize_upload( return upload_id, part_sizes, presigned_posts -def complete_file_upload(api_key: str, upload_id: str, upload_parts: List[JSONDict]) -> None: +def complete_file_upload(api_key: str, upload_id: str, upload_parts: UploadParts) -> None: check_uuid_well_formed(upload_id, "upload ID") request_json = dict(upload_id=upload_id, upload_parts=upload_parts) res = requests.post( diff --git a/cleanlab_studio/studio_beta/beta_job.py b/cleanlab_studio/studio_beta/beta_job.py index 3a271dad..b95195ba 100644 --- a/cleanlab_studio/studio_beta/beta_job.py +++ b/cleanlab_studio/studio_beta/beta_job.py @@ -5,7 +5,7 @@ import pathlib import time from dataclasses import dataclass -from typing import Optional +from typing import List, Optional import requests from tqdm import tqdm @@ -142,7 +142,7 @@ def download_results(self, output_filepath: str) -> None: output_path.write_bytes(resp.content) @classmethod - def list(cls, api_key: str) -> None: + def list(cls, api_key: str) -> List[BetaJob]: jobs = list_jobs(api_key) return [ cls( From 655870b2946353a034b0ed9ccf6c642823e1babc Mon Sep 17 00:00:00 2001 From: Angela Date: Thu, 1 Aug 2024 10:31:51 -0700 Subject: [PATCH 3/6] add types-requests dev requirement --- requirements_dev.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements_dev.txt b/requirements_dev.txt index 45e64956..d9a9c809 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ 
-2,3 +2,4 @@ black pre-commit twine mypy +types-requests From 9cd16471eb542e75fbbeac9b3e18eddac751c34f Mon Sep 17 00:00:00 2001 From: Angela Date: Thu, 1 Aug 2024 11:12:41 -0700 Subject: [PATCH 4/6] mypy --- requirements_dev.txt | 1 - setup.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements_dev.txt b/requirements_dev.txt index d9a9c809..45e64956 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -2,4 +2,3 @@ black pre-commit twine mypy -types-requests diff --git a/setup.py b/setup.py index 789bf2cc..9bac45b0 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from os import path -from setuptools import setup, find_packages +from setuptools import find_packages, setup here = path.abspath(path.dirname(__file__)) with open(path.join(here, "README.md"), encoding="utf-8") as f: @@ -46,6 +46,7 @@ python_requires=">=3.8", install_requires=[ "aiohttp>=3.8.1", + "aiohttp-retry>=2.4.0", "Click>=8.1.0,<=8.1.3", "colorama>=0.4.4", "nest_asyncio>=1.5.0", From 1570a619a1965dd705e008ef3d32c8a7b74e0a76 Mon Sep 17 00:00:00 2001 From: Angela Date: Thu, 1 Aug 2024 12:31:47 -0700 Subject: [PATCH 5/6] mypy --- cleanlab_studio/internal/api/beta_api.py | 30 ++++++++++++------------ cleanlab_studio/internal/studio_base.py | 2 +- cleanlab_studio/studio_beta/beta_job.py | 5 ++-- setup.py | 1 - 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/cleanlab_studio/internal/api/beta_api.py b/cleanlab_studio/internal/api/beta_api.py index 311ac9be..d88821e5 100644 --- a/cleanlab_studio/internal/api/beta_api.py +++ b/cleanlab_studio/internal/api/beta_api.py @@ -1,16 +1,16 @@ -from typing import Any, Dict, List +from typing import List, cast import requests -from .api import API_BASE_URL, construct_headers -from .api_helper import JSONDict, UploadParts, handle_api_error +from cleanlab_studio.internal.types import JSONDict + +from .api import API_BASE_URL +from .api_helper import UploadParts, construct_headers, handle_api_error experimental_jobs_base_url = f"{API_BASE_URL}/v0/experimental_jobs" -def initialize_upload( - api_key: str, filename: str, file_type: str, file_size: int -) -> Dict[str, Any]: +def initialize_upload(api_key: str, filename: str, file_type: str, file_size: int) -> JSONDict: url = f"{experimental_jobs_base_url}/upload/initialize" headers = construct_headers(api_key) data = { @@ -20,7 +20,7 @@ def initialize_upload( } resp = requests.post(url, headers=headers, json=data) resp.raise_for_status() - return resp.json() + return cast(JSONDict, resp.json()) def complete_upload(api_key: str, dataset_id: str, upload_parts: UploadParts) -> JSONDict: @@ -32,7 +32,7 @@ def complete_upload(api_key: str, dataset_id: str, upload_parts: UploadParts) -> } resp = requests.post(url, headers=headers, json=data) handle_api_error(resp) - return resp.json() + return cast(JSONDict, resp.json()) def get_dataset(api_key: str, dataset_id: str) -> JSONDict: @@ -40,7 +40,7 @@ def get_dataset(api_key: str, dataset_id: str) -> JSONDict: headers = construct_headers(api_key) resp = requests.get(url, headers=headers) handle_api_error(resp) - return resp.json() + return cast(JSONDict, resp.json()) def run_job(api_key: str, dataset_id: str, job_definition_name: str) -> JSONDict: @@ -52,7 +52,7 @@ def run_job(api_key: str, dataset_id: str, job_definition_name: str) -> JSONDict } resp = requests.post(url, headers=headers, json=data) handle_api_error(resp) - return resp.json() + return cast(JSONDict, resp.json()) def get_job(api_key: str, job_id: str) -> JSONDict: @@ -60,7 +60,7 @@ 
def get_job(api_key: str, job_id: str) -> JSONDict: headers = construct_headers(api_key) resp = requests.get(url, headers=headers) handle_api_error(resp) - return resp.json() + return cast(JSONDict, resp.json()) def get_job_status(api_key: str, job_id: str) -> JSONDict: @@ -68,7 +68,7 @@ def get_job_status(api_key: str, job_id: str) -> JSONDict: headers = construct_headers(api_key) resp = requests.get(url, headers=headers) resp.raise_for_status() - return resp.json() + return cast(JSONDict, resp.json()) def get_results(api_key: str, job_id: str) -> JSONDict: @@ -76,7 +76,7 @@ def get_results(api_key: str, job_id: str) -> JSONDict: headers = construct_headers(api_key) resp = requests.get(url, headers=headers) resp.raise_for_status() - return resp.json() + return cast(JSONDict, resp.json()) def list_datasets(api_key: str) -> List[JSONDict]: @@ -84,7 +84,7 @@ def list_datasets(api_key: str) -> List[JSONDict]: headers = construct_headers(api_key) resp = requests.get(url, headers=headers) handle_api_error(resp) - return resp.json()["datasets"] + return cast(List[JSONDict], resp.json()["datasets"]) def list_jobs(api_key: str) -> List[JSONDict]: @@ -92,4 +92,4 @@ def list_jobs(api_key: str) -> List[JSONDict]: headers = construct_headers(api_key) resp = requests.get(url, headers=headers) handle_api_error(resp) - return resp.json()["jobs"] + return cast(List[JSONDict], resp.json()["jobs"]) diff --git a/cleanlab_studio/internal/studio_base.py b/cleanlab_studio/internal/studio_base.py index 2d6c6d2b..adb5a5e1 100644 --- a/cleanlab_studio/internal/studio_base.py +++ b/cleanlab_studio/internal/studio_base.py @@ -1,4 +1,4 @@ -from aiohttp_retry import Optional +from typing import Optional from cleanlab_studio.errors import MissingAPIKeyError, VersionError from cleanlab_studio.internal.api import api diff --git a/cleanlab_studio/studio_beta/beta_job.py b/cleanlab_studio/studio_beta/beta_job.py index b95195ba..7931afe2 100644 --- a/cleanlab_studio/studio_beta/beta_job.py +++ b/cleanlab_studio/studio_beta/beta_job.py @@ -125,12 +125,13 @@ def wait_until_ready(self, timeout: Optional[int] = None) -> None: def download_results(self, output_filepath: str) -> None: output_path = pathlib.Path(output_filepath) - if self.status != JobStatus.READY: - raise BetaJobError("Job must be ready to download results") if self.status == JobStatus.FAILED: raise BetaJobError("Job failed, cannot download results") + if self.status != JobStatus.READY: + raise BetaJobError("Job must be ready to download results") + results = get_results(self._api_key, self.id) if output_path.suffix != results["result_file_type"]: raise DownloadResultsError( diff --git a/setup.py b/setup.py index 9bac45b0..794a3f9a 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,6 @@ python_requires=">=3.8", install_requires=[ "aiohttp>=3.8.1", - "aiohttp-retry>=2.4.0", "Click>=8.1.0,<=8.1.3", "colorama>=0.4.4", "nest_asyncio>=1.5.0", From aa0948b63a322bdd9c5779174035d885b8846a32 Mon Sep 17 00:00:00 2001 From: Angela Date: Mon, 12 Aug 2024 10:25:02 -0700 Subject: [PATCH 6/6] update docstrings --- cleanlab_studio/studio_beta/beta_dataset.py | 25 +++++++++++++++++++++ cleanlab_studio/studio_beta/beta_job.py | 25 ++++++++++++++++++--- cleanlab_studio/studio_beta/studio_beta.py | 6 ++++- 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/cleanlab_studio/studio_beta/beta_dataset.py b/cleanlab_studio/studio_beta/beta_dataset.py index 34d3c393..1fe72e05 100644 --- a/cleanlab_studio/studio_beta/beta_dataset.py +++ 
b/cleanlab_studio/studio_beta/beta_dataset.py @@ -16,6 +16,8 @@ @dataclass class BetaDataset: + """Represents a dataset uploaded to the Cleanlab Studio Beta API.""" + id: str filename: str upload_complete: bool @@ -23,6 +25,14 @@ class BetaDataset: @classmethod def from_id(cls, api_key: str, dataset_id: str) -> "BetaDataset": + """Loads a dataset from the Cleanlab Studio Beta API by its ID. + + Args: + api_key: Your Cleanlab Studio API key. + dataset_id: The ID of the dataset to load. + Returns: + The dataset object if you've uploaded a dataset to the Cleanlab Studio Beta API with that ID. + """ dataset = get_dataset(api_key, dataset_id) return cls( id=dataset_id, @@ -33,6 +43,15 @@ def from_id(cls, api_key: str, dataset_id: str) -> "BetaDataset": @classmethod def from_filepath(cls, api_key: str, filepath: str) -> "BetaDataset": + """Uploads a dataset from the given filepath for use in the Cleanlab Studio Beta API. + + Args: + api_key: Your Cleanlab Studio API key. + filepath: The path to the dataset file. + Returns: + The uploaded dataset object. You can use this object to obtain the ID of the uploaded dataset. + Use this ID to run jobs on the dataset. + """ dataset_source = FilepathDatasetSource(filepath=pathlib.Path(filepath)) initialize_response = initialize_upload( api_key, @@ -56,6 +75,12 @@ def from_filepath(cls, api_key: str, filepath: str) -> "BetaDataset": @classmethod def list(cls, api_key: str) -> List[BetaDataset]: + """Lists all datasets you have uploaded through the Beta API. + + Args: + api_key: Your Cleanlab Studio API key. + Returns: + A list of all the datasets you have uploaded through the Cleanlab Studio Beta API.""" datasets = list_datasets(api_key) return [ cls( diff --git a/cleanlab_studio/studio_beta/beta_job.py b/cleanlab_studio/studio_beta/beta_job.py index 7931afe2..39ab975f 100644 --- a/cleanlab_studio/studio_beta/beta_job.py +++ b/cleanlab_studio/studio_beta/beta_job.py @@ -21,6 +21,8 @@ class JobStatus(enum.Enum): + """The status of a job in the Cleanlab Studio Beta API.""" + CREATED = 0 RUNNING = 1 READY = 2 @@ -33,6 +35,8 @@ def from_name(cls, name: str) -> "JobStatus": @dataclass class BetaJob: + """Represents a job in the Cleanlab Studio Beta API.""" + id: str status: JobStatus dataset_id: str @@ -62,7 +66,7 @@ def from_id(cls, api_key: str, job_id: str) -> "BetaJob": @classmethod def run(cls, api_key: str, dataset_id: str, job_definition_name: str) -> "BetaJob": - """Creates and runs a new job with the given dataset and job definition. + """Creates and runs a new job with the given dataset and job definition. Raises an error if the job definition name is invalid. Args: api_key: Your API key. @@ -87,8 +91,8 @@ def wait_until_ready(self, timeout: Optional[int] = None) -> None: timeout (Optional[float], optional): timeout for polling, in seconds. Defaults to None. Raises: - TimeoutError: if job is not ready by end of timeout - BetaJobError: if job fails + TimeoutError: if job is not ready by end of timeout. + BetaJobError: if job fails. """ start_time = time.time() res = get_job_status(self._api_key, self.id) @@ -124,6 +128,14 @@ def wait_until_ready(self, timeout: Optional[int] = None) -> None: raise BetaJobError(f"Experimental job {self.id} failed to complete") def download_results(self, output_filepath: str) -> None: + """Downloads the results of an experimental job to the given output filepath. + + Args: + output_filepath: The path to save the downloaded results to. + Raises: + BetaJobError: if job is not yet ready or has failed. 
+ DownloadResultsError: if output file extension does not match result file type. + """ output_path = pathlib.Path(output_filepath) if self.status == JobStatus.FAILED: @@ -144,6 +156,13 @@ def download_results(self, output_filepath: str) -> None: @classmethod def list(cls, api_key: str) -> List[BetaJob]: + """Lists all jobs you have run through the Beta API. + + Args: + api_key: Your API key. + Returns: + A list of all the jobs you have run through the Cleanlab Studio Beta API. + """ jobs = list_jobs(api_key) return [ cls( diff --git a/cleanlab_studio/studio_beta/studio_beta.py b/cleanlab_studio/studio_beta/studio_beta.py index 85db729a..9227101c 100644 --- a/cleanlab_studio/studio_beta/studio_beta.py +++ b/cleanlab_studio/studio_beta/studio_beta.py @@ -1,3 +1,7 @@ +""" +Python API for Cleanlab Studio Private Beta functionality. +""" + from typing import List from cleanlab_studio.internal.studio_base import StudioBase @@ -29,7 +33,7 @@ def upload_dataset( return BetaDataset.from_filepath(self._api_key, filepath) def run_job(self, dataset_id: str, job_definition_name: str) -> BetaJob: - """Runs a Cleanlab Studio Beta job with the given dataset and job definition. + """Runs a Cleanlab Studio Beta job with the given dataset and job definition. Raises an error if the job definition name is invalid. Args: dataset_id: The ID of the dataset to run the job on. job_definition_name: The name of the job definition to run.
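
Taken together, the series adds a small public surface: StudioBeta as the client, BetaDataset for uploads, and BetaJob for runs. Below is a minimal end-to-end sketch of how those pieces compose, based only on the signatures introduced above; the API key, file path, job definition name, and output extension are all placeholders.

    from cleanlab_studio.studio_beta import StudioBeta

    # API keys come from the Cleanlab Studio account page; this value is a placeholder.
    studio = StudioBeta("<YOUR_API_KEY>")

    # Upload a local file; BetaDataset.from_filepath performs the multipart upload
    # and returns once the server marks the upload complete.
    dataset = studio.upload_dataset("my_data.csv")  # placeholder path

    # Start a run against the uploaded dataset. The job definition name must be
    # one provisioned for your account; this one is a placeholder.
    job = studio.run_job(dataset.id, "my-job-definition")

    # Block until the job reaches READY. Raises TimeoutError if the deadline
    # passes, or BetaJobError if the job ends in FAILED.
    job.wait_until_ready(timeout=600)

    # The output extension must match the job's result_file_type; otherwise
    # download_results raises DownloadResultsError.
    job.download_results("results.csv")  # placeholder output path

Per the implementation of wait_until_ready, the client sleeps fifty times for 0.1 seconds between status calls, i.e. it polls roughly every five seconds, so a timeout of a few minutes is a reasonable starting point for small datasets.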
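
Because datasets and jobs are addressable by ID, a later process can also reattach without re-uploading. A sketch under the same assumptions (the job ID is a placeholder):

    from cleanlab_studio.studio_beta import StudioBeta

    studio = StudioBeta("<YOUR_API_KEY>")  # placeholder key

    # Enumerate prior uploads and runs.
    for dataset in studio.list_datasets():
        print(dataset.id, dataset.filename, dataset.upload_complete)
    for job in studio.list_jobs():
        print(job.id, job.status.name, job.job_definition_name)

    # Fetch results later by job ID alone; this raises BetaJobError unless
    # the job has already reached READY.
    studio.download_results("<JOB_ID>", "results.csv")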