From 5ce2048e5b62f21e13f264bbe25a028437d4f966 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 12 Dec 2024 14:09:15 -0800 Subject: [PATCH 1/4] catch UC not found --- llmfoundry/utils/config_utils.py | 17 +++++++++++++++-- llmfoundry/utils/exceptions.py | 15 +++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 997273de7f..2063a8a55e 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -6,6 +6,7 @@ import logging import math import os +import re import warnings from dataclasses import dataclass, fields from typing import ( @@ -21,12 +22,14 @@ import mlflow from composer.loggers import Logger from composer.utils import dist, parse_uri +from exceptions import UCNotFoundError from mlflow.data import ( delta_dataset_source, http_dataset_source, huggingface_dataset_source, uc_volume_dataset_source, ) +from mlflow.exceptions import MlflowException from omegaconf import MISSING, DictConfig, ListConfig, MissingMandatoryValue from omegaconf import OmegaConf as om from transformers import PretrainedConfig @@ -788,13 +791,23 @@ def log_dataset_uri(cfg: dict[str, Any]) -> None: # Map data source types to their respective MLFlow DataSource. for dataset_type, path, split in data_paths: - if dataset_type in dataset_source_mapping: source_class = dataset_source_mapping[dataset_type] if dataset_type == 'delta_table': source = source_class(delta_table_name=path) elif dataset_type == 'hf' or dataset_type == 'uc_volume': - source = source_class(path=path) + try: + source = source_class(path=path) + except MlflowException as e: + error_str = str(e) + match = re.search( + r'MlflowException:\s+(.*?)\s+does not exist in Databricks Unified Catalog\.', + error_str, + ) + if match: + uc_path = match.group(1) + raise UCNotFoundError(uc_path) + raise else: source = source_class(url=path) else: diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 53d3baebfc..78272101ae 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -33,6 +33,7 @@ 'StoragePermissionError', 'UCNotEnabledError', 'DeltaTableNotFoundError', + 'UCNotFoundError', ] ALLOWED_RESPONSE_KEYS = {'response', 'completion'} @@ -585,3 +586,17 @@ def __init__( volume_name=volume_name, table_name=table_name, ) + + +class UCNotFoundError(UserError): + """Error thrown when the UC passed in training doesn't exist.""" + + def __init__( + self, + path: str, + ) -> None: + message = f'Your data path {path} does not exist. Please double check your UC path' + super().__init__( + message=message, + path=path, + ) From 58092c0cbd7669cea3e32a1749deb43b82dbb4f8 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 12 Dec 2024 16:46:10 -0800 Subject: [PATCH 2/4] catch double slash --- llmfoundry/utils/config_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 2063a8a55e..1016d46ca9 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -22,7 +22,6 @@ import mlflow from composer.loggers import Logger from composer.utils import dist, parse_uri -from exceptions import UCNotFoundError from mlflow.data import ( delta_dataset_source, http_dataset_source, @@ -37,6 +36,7 @@ from llmfoundry.layers_registry import ffns_with_megablocks from llmfoundry.models.utils import init_empty_weights from llmfoundry.registry import config_transforms +from llmfoundry.utils.exceptions import UCNotFoundError log = logging.getLogger(__name__) @@ -706,6 +706,8 @@ def _process_data_source( true_split (str): The split of the dataset to be added (i.e. train or eval) data_paths (List[Tuple[str, str, str]]): A list of tuples formatted as (data type, path, split) """ + if source_dataset_path: + source_dataset_path = re.sub(r'/+', '/', source_dataset_path) # Check for Delta table if source_dataset_path and len(source_dataset_path.split('.')) == 3: data_paths.append(('delta_table', source_dataset_path, true_split)) From 1f3be0b455e3be7874559d4308450017abfdf359 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 12 Dec 2024 16:49:08 -0800 Subject: [PATCH 3/4] rm old code --- llmfoundry/utils/config_utils.py | 15 +-------------- llmfoundry/utils/exceptions.py | 15 --------------- 2 files changed, 1 insertion(+), 29 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 1016d46ca9..6f3c4d2212 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -28,7 +28,6 @@ huggingface_dataset_source, uc_volume_dataset_source, ) -from mlflow.exceptions import MlflowException from omegaconf import MISSING, DictConfig, ListConfig, MissingMandatoryValue from omegaconf import OmegaConf as om from transformers import PretrainedConfig @@ -36,7 +35,6 @@ from llmfoundry.layers_registry import ffns_with_megablocks from llmfoundry.models.utils import init_empty_weights from llmfoundry.registry import config_transforms -from llmfoundry.utils.exceptions import UCNotFoundError log = logging.getLogger(__name__) @@ -798,18 +796,7 @@ def log_dataset_uri(cfg: dict[str, Any]) -> None: if dataset_type == 'delta_table': source = source_class(delta_table_name=path) elif dataset_type == 'hf' or dataset_type == 'uc_volume': - try: - source = source_class(path=path) - except MlflowException as e: - error_str = str(e) - match = re.search( - r'MlflowException:\s+(.*?)\s+does not exist in Databricks Unified Catalog\.', - error_str, - ) - if match: - uc_path = match.group(1) - raise UCNotFoundError(uc_path) - raise + source = source_class(path=path) else: source = source_class(url=path) else: diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 78272101ae..53d3baebfc 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -33,7 +33,6 @@ 'StoragePermissionError', 'UCNotEnabledError', 'DeltaTableNotFoundError', - 'UCNotFoundError', ] ALLOWED_RESPONSE_KEYS = {'response', 'completion'} @@ -586,17 +585,3 @@ def __init__( volume_name=volume_name, table_name=table_name, ) - - -class UCNotFoundError(UserError): - """Error thrown when the UC passed in training doesn't exist.""" - - def __init__( - self, - path: str, - ) -> None: - message = f'Your data path {path} does not exist. Please double check your UC path' - super().__init__( - message=message, - path=path, - ) From d453c764ab1552741fc8d3503b7b18abee49b60f Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 12 Dec 2024 17:09:40 -0800 Subject: [PATCH 4/4] use pathlib instead --- llmfoundry/utils/config_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 6f3c4d2212..252841cb50 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -6,9 +6,9 @@ import logging import math import os -import re import warnings from dataclasses import dataclass, fields +from pathlib import Path from typing import ( Any, Callable, @@ -705,7 +705,7 @@ def _process_data_source( data_paths (List[Tuple[str, str, str]]): A list of tuples formatted as (data type, path, split) """ if source_dataset_path: - source_dataset_path = re.sub(r'/+', '/', source_dataset_path) + source_dataset_path = str(Path(source_dataset_path)) # Check for Delta table if source_dataset_path and len(source_dataset_path.split('.')) == 3: data_paths.append(('delta_table', source_dataset_path, true_split))