From 5ce2048e5b62f21e13f264bbe25a028437d4f966 Mon Sep 17 00:00:00 2001
From: Vincent Chen
Date: Thu, 12 Dec 2024 14:09:15 -0800
Subject: [PATCH] catch UC not found

When building the MLflow dataset source for an `hf` or `uc_volume` path
in `log_dataset_uri`, catch the `MlflowException` whose message says the
path "does not exist in Databricks Unified Catalog." and re-raise it as
the new `UCNotFoundError` (a `UserError`) carrying the offending path.
Any other `MlflowException` is re-raised unchanged.

---
 llmfoundry/utils/config_utils.py | 19 +++++++++++++++++--
 llmfoundry/utils/exceptions.py   | 15 +++++++++++++++
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py
index 997273de7f..2063a8a55e 100644
--- a/llmfoundry/utils/config_utils.py
+++ b/llmfoundry/utils/config_utils.py
@@ -6,6 +6,7 @@
 import logging
 import math
 import os
+import re
 import warnings
 from dataclasses import dataclass, fields
 from typing import (
@@ -21,12 +22,14 @@
 import mlflow
 from composer.loggers import Logger
 from composer.utils import dist, parse_uri
+from llmfoundry.utils.exceptions import UCNotFoundError
 from mlflow.data import (
     delta_dataset_source,
     http_dataset_source,
     huggingface_dataset_source,
     uc_volume_dataset_source,
 )
+from mlflow.exceptions import MlflowException
 from omegaconf import MISSING, DictConfig, ListConfig, MissingMandatoryValue
 from omegaconf import OmegaConf as om
 from transformers import PretrainedConfig
@@ -788,13 +791,25 @@ def log_dataset_uri(cfg: dict[str, Any]) -> None:
 
     # Map data source types to their respective MLFlow DataSource.
     for dataset_type, path, split in data_paths:
-
         if dataset_type in dataset_source_mapping:
             source_class = dataset_source_mapping[dataset_type]
             if dataset_type == 'delta_table':
                 source = source_class(delta_table_name=path)
             elif dataset_type == 'hf' or dataset_type == 'uc_volume':
-                source = source_class(path=path)
+                try:
+                    source = source_class(path=path)
+                except MlflowException as e:
+                    # Translate only the "path does not exist" failure; the
+                    # 'MlflowException:' prefix is optional in the message.
+                    error_str = str(e)
+                    match = re.search(
+                        r'(?:MlflowException:\s+)?(.*?)\s+does not exist in Databricks Unified Catalog\.',
+                        error_str,
+                    )
+                    if match:
+                        uc_path = match.group(1)
+                        raise UCNotFoundError(uc_path) from e
+                    raise
             else:
                 source = source_class(url=path)
         else:
diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py
index 53d3baebfc..78272101ae 100644
--- a/llmfoundry/utils/exceptions.py
+++ b/llmfoundry/utils/exceptions.py
@@ -33,6 +33,7 @@
     'StoragePermissionError',
     'UCNotEnabledError',
     'DeltaTableNotFoundError',
+    'UCNotFoundError',
 ]
 
 ALLOWED_RESPONSE_KEYS = {'response', 'completion'}
@@ -585,3 +586,17 @@ def __init__(
             volume_name=volume_name,
             table_name=table_name,
         )
+
+
+class UCNotFoundError(UserError):
+    """Error thrown when the UC path passed in for training doesn't exist."""
+
+    def __init__(
+        self,
+        path: str,
+    ) -> None:
+        message = f'Your data path {path} does not exist. Please double check your UC path'
+        super().__init__(
+            message=message,
+            path=path,
+        )