From 92252ce328003d1b4122937c42302861f21cbc56 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Fri, 1 Nov 2024 11:10:18 -0700 Subject: [PATCH] Bump mlflow max version (#1629) --- llmfoundry/callbacks/hf_checkpointer.py | 2 ++ llmfoundry/utils/config_utils.py | 26 +++++++++++++++---------- setup.py | 2 +- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 004b6df09d..a95b68cf28 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -324,6 +324,7 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: ) import mlflow + import mlflow.environment_variables mlflow.environment_variables.MLFLOW_HUGGINGFACE_MODEL_MAX_SHARD_SIZE.set( '1GB', ) @@ -694,6 +695,7 @@ def tensor_hook( # TODO: Remove after mlflow fixes the bug that makes this necessary import mlflow + import mlflow.store mlflow.store._unity_catalog.registry.rest_store.get_feature_dependencies = lambda *args, **kwargs: '' model_saving_kwargs: dict[str, Any] = { 'path': local_save_path, diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 03f8812fa3..997273de7f 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -21,6 +21,12 @@ import mlflow from composer.loggers import Logger from composer.utils import dist, parse_uri +from mlflow.data import ( + delta_dataset_source, + http_dataset_source, + huggingface_dataset_source, + uc_volume_dataset_source, +) from omegaconf import MISSING, DictConfig, ListConfig, MissingMandatoryValue from omegaconf import OmegaConf as om from transformers import PretrainedConfig @@ -769,15 +775,15 @@ def log_dataset_uri(cfg: dict[str, Any]) -> None: data_paths = _parse_source_dataset(cfg) dataset_source_mapping = { - 's3': mlflow.data.http_dataset_source.HTTPDatasetSource, - 'oci': mlflow.data.http_dataset_source.HTTPDatasetSource, - 'azure': mlflow.data.http_dataset_source.HTTPDatasetSource, - 'gs': mlflow.data.http_dataset_source.HTTPDatasetSource, - 'https': mlflow.data.http_dataset_source.HTTPDatasetSource, - 'hf': mlflow.data.huggingface_dataset_source.HuggingFaceDatasetSource, - 'delta_table': mlflow.data.delta_dataset_source.DeltaDatasetSource, - 'uc_volume': mlflow.data.uc_volume_dataset_source.UCVolumeDatasetSource, - 'local': mlflow.data.http_dataset_source.HTTPDatasetSource, + 's3': http_dataset_source.HTTPDatasetSource, + 'oci': http_dataset_source.HTTPDatasetSource, + 'azure': http_dataset_source.HTTPDatasetSource, + 'gs': http_dataset_source.HTTPDatasetSource, + 'https': http_dataset_source.HTTPDatasetSource, + 'hf': huggingface_dataset_source.HuggingFaceDatasetSource, + 'delta_table': delta_dataset_source.DeltaDatasetSource, + 'uc_volume': uc_volume_dataset_source.UCVolumeDatasetSource, + 'local': http_dataset_source.HTTPDatasetSource, } # Map data source types to their respective MLFlow DataSource. @@ -795,7 +801,7 @@ def log_dataset_uri(cfg: dict[str, Any]) -> None: log.info( f'{dataset_type} unknown, defaulting to http dataset source', ) - source = mlflow.data.http_dataset_source.HTTPDatasetSource(url=path) + source = http_dataset_source.HTTPDatasetSource(url=path) mlflow.log_input( mlflow.data.meta_dataset.MetaDataset(source, name=split), diff --git a/setup.py b/setup.py index 00934a30e0..86d696ed5c 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ install_requires = [ 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.26.0,<0.27', - 'mlflow>=2.14.1,<2.17', + 'mlflow>=2.14.1,<2.18', 'accelerate>=0.25,<0.34', # for HF inference `device_map` 'transformers>=4.43.2,<4.44', 'mosaicml-streaming>=0.9.0,<0.10',