From bbfe91ad81406ac1fa5fe9bc1a684c1ff37b9583 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Sun, 3 Nov 2024 14:58:25 +0800 Subject: [PATCH 01/48] debug --- sky/task.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sky/task.py b/sky/task.py index cebc616dc6d..340d3c9d4b4 100644 --- a/sky/task.py +++ b/sky/task.py @@ -442,6 +442,8 @@ def from_yaml_config( mount_path),) + e.args[1:] raise e task_storage_mounts[mount_path] = storage_obj + print(f"task_storage_mounts: {task_storage_mounts}") + print(f"file_mounts: {file_mounts}") task.set_storage_mounts(task_storage_mounts) if config.get('inputs') is not None: From 043dd285cbd889fb7c7a98b712f86947af4234d4 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 4 Nov 2024 13:12:15 +0800 Subject: [PATCH 02/48] support workdir_bucket_name config on yaml file --- sky/task.py | 4 ++-- sky/utils/controller_utils.py | 39 +++++++++++++++++++++++++++++++---- sky/utils/schemas.py | 29 ++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 6 deletions(-) diff --git a/sky/task.py b/sky/task.py index 340d3c9d4b4..76ceea9785d 100644 --- a/sky/task.py +++ b/sky/task.py @@ -905,7 +905,7 @@ def update_storage_mounts( task_storage_mounts.update(storage_mounts) return self.set_storage_mounts(task_storage_mounts) - def _get_preferred_store( + def get_preferred_store( self) -> Tuple[storage_lib.StoreType, Optional[str]]: """Returns the preferred store type and region for this task.""" # TODO(zhwu, romilb): The optimizer should look at the source and @@ -959,7 +959,7 @@ def sync_storage_mounts(self) -> None: """ for storage in self.storage_mounts.values(): if len(storage.stores) == 0: - store_type, store_region = self._get_preferred_store() + store_type, store_region = self.get_preferred_store() self.storage_plans[storage] = store_type storage.add_store(store_type, store_region) else: diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 0ab2fd7e117..fa1d4c31e5f 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -675,6 +675,23 @@ def replace_skypilot_config_path_in_file_mounts( logger.debug(f'Replaced {_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX} ' f'with the real path in file mounts: {file_mounts}') +def _get_workdir_bucket_name_from_config(store_type: storage_lib.StoreType) -> None: + nested_key = ('aws', 'workdir_bucket_name') + match store_type: + case storage_lib.StoreType.S3: + nested_key = (str(clouds.AWS()).lower(), 'workdir_bucket_name') + case storage_lib.StoreType.GCS: + nested_key = (str(clouds.GCP()).lower(), 'workdir_bucket_name') + case storage_lib.StoreType.AZURE: + nested_key = (str(clouds.Azure()).lower(), 'workdir_bucket_name') + case storage_lib.StoreType.R2: + nested_key = (cloudflare.NAME.lower(), 'workdir_bucket_name') + case storage_lib.StoreType.IBM: + nested_key = (str(clouds.IBM()).lower(), 'workdir_bucket_name') + case _: + raise ValueError(f"Unsupported store type: {store_type}") + return skypilot_config.get_nested(nested_key, None) + def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', path: str) -> None: @@ -720,8 +737,14 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', # Step 1: Translate the workdir to SkyPilot storage. 
new_storage_mounts = {} if task.workdir is not None: - bucket_name = constants.WORKDIR_BUCKET_NAME.format( + store_type, store_region = task.get_preferred_store() + fixed_bucket_name = _get_workdir_bucket_name_from_config(store_type) + if fixed_bucket_name is None: + bucket_name = constants.WORKDIR_BUCKET_NAME.format( username=common_utils.get_cleaned_username(), id=run_id) + else: + bucket_name = fixed_bucket_name + workdir = task.workdir task.workdir = None if (constants.SKY_REMOTE_WORKDIR in original_file_mounts or @@ -729,14 +752,22 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', raise ValueError( f'Cannot mount {constants.SKY_REMOTE_WORKDIR} as both the ' 'workdir and file_mounts contains it as the target.') - new_storage_mounts[ - constants. - SKY_REMOTE_WORKDIR] = storage_lib.Storage.from_yaml_config({ + storage = storage_lib.Storage.from_yaml_config({ 'name': bucket_name, 'source': workdir, 'persistent': False, 'mode': 'COPY', }) + if fixed_bucket_name is not None: + # We load the bucket name from the config file nested under + # specific cloud, in this case we only want get_preferred_store + # be called once. If get_preferred_store is called multiple + # times, we might get different store_type and store_region + # in the future. + storage.add_store(store_type, store_region) + new_storage_mounts[ + constants. + SKY_REMOTE_WORKDIR] = storage # Check of the existence of the workdir in file_mounts is done in # the task construction. logger.info(f' {colorama.Style.DIM}Workdir: {workdir!r} ' diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 81c4cb332a6..1bf30d24ab6 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -725,6 +725,9 @@ def get_config_schema(): 'disk_encrypted': { 'type': 'boolean', }, + 'workdir_bucket_name': { + 'type': 'string', + }, 'security_group_name': (_PRORPERTY_NAME_OR_CLUSTER_NAME_TO_PROPERTY), **_LABELS_SCHEMA, @@ -765,6 +768,9 @@ def get_config_schema(): 'enable_gvnic': { 'type': 'boolean' }, + 'workdir_bucket_name': { + 'type': 'string', + }, **_LABELS_SCHEMA, **_NETWORK_CONFIG_SCHEMA, }, @@ -781,6 +787,29 @@ def get_config_schema(): 'resource_group_vm': { 'type': 'string', }, + 'workdir_bucket_name': { + 'type': 'string', + }, + } + }, + 'cloudflare': { + 'type': 'object', + 'required': [], + 'additionalProperties': False, + 'properties': { + 'workdir_bucket_name': { + 'type': 'string', + }, + } + }, + 'ibm': { + 'type': 'object', + 'required': [], + 'additionalProperties': False, + 'properties': { + 'workdir_bucket_name': { + 'type': 'string', + }, } }, 'kubernetes': { From bc767c8c4d2759575c24d3fc2da8574b4fd22e7f Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 4 Nov 2024 13:31:42 +0800 Subject: [PATCH 03/48] change the match statement to if else due to mypy limit --- sky/utils/controller_utils.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index fa1d4c31e5f..16c48052584 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -677,19 +677,18 @@ def replace_skypilot_config_path_in_file_mounts( def _get_workdir_bucket_name_from_config(store_type: storage_lib.StoreType) -> None: nested_key = ('aws', 'workdir_bucket_name') - match store_type: - case storage_lib.StoreType.S3: - nested_key = (str(clouds.AWS()).lower(), 'workdir_bucket_name') - case storage_lib.StoreType.GCS: - nested_key = (str(clouds.GCP()).lower(), 'workdir_bucket_name') - case 
storage_lib.StoreType.AZURE: - nested_key = (str(clouds.Azure()).lower(), 'workdir_bucket_name') - case storage_lib.StoreType.R2: - nested_key = (cloudflare.NAME.lower(), 'workdir_bucket_name') - case storage_lib.StoreType.IBM: - nested_key = (str(clouds.IBM()).lower(), 'workdir_bucket_name') - case _: - raise ValueError(f"Unsupported store type: {store_type}") + if store_type == storage_lib.StoreType.S3: + nested_key = (str(clouds.AWS()).lower(), 'workdir_bucket_name') + elif store_type == storage_lib.StoreType.GCS: + nested_key = (str(clouds.GCP()).lower(), 'workdir_bucket_name') + elif store_type == storage_lib.StoreType.AZURE: + nested_key = (str(clouds.Azure()).lower(), 'workdir_bucket_name') + elif store_type == storage_lib.StoreType.R2: + nested_key = (cloudflare.NAME.lower(), 'workdir_bucket_name') + elif store_type == storage_lib.StoreType.IBM: + nested_key = (str(clouds.IBM()).lower(), 'workdir_bucket_name') + else: + raise ValueError(f"Unsupported store type: {store_type}") return skypilot_config.get_nested(nested_key, None) From 8200400ee95aec72b4dd77206ebdfcc75b7ad3b5 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 4 Nov 2024 13:34:13 +0800 Subject: [PATCH 04/48] pass mypy --- sky/utils/controller_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 16c48052584..b2f7091b83c 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -675,7 +675,7 @@ def replace_skypilot_config_path_in_file_mounts( logger.debug(f'Replaced {_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX} ' f'with the real path in file mounts: {file_mounts}') -def _get_workdir_bucket_name_from_config(store_type: storage_lib.StoreType) -> None: +def _get_workdir_bucket_name_from_config(store_type: storage_lib.StoreType) -> Optional[str]: nested_key = ('aws', 'workdir_bucket_name') if store_type == storage_lib.StoreType.S3: nested_key = (str(clouds.AWS()).lower(), 'workdir_bucket_name') From 0e395ca34ffabc91f130dc7d269979551bc7313e Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 4 Nov 2024 13:38:37 +0800 Subject: [PATCH 05/48] yapf format fix --- sky/utils/controller_utils.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index b2f7091b83c..9da12f4e368 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -675,7 +675,8 @@ def replace_skypilot_config_path_in_file_mounts( logger.debug(f'Replaced {_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX} ' f'with the real path in file mounts: {file_mounts}') -def _get_workdir_bucket_name_from_config(store_type: storage_lib.StoreType) -> Optional[str]: +def _get_workdir_bucket_name_from_config( + store_type: storage_lib.StoreType) -> Optional[str]: nested_key = ('aws', 'workdir_bucket_name') if store_type == storage_lib.StoreType.S3: nested_key = (str(clouds.AWS()).lower(), 'workdir_bucket_name') @@ -740,7 +741,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', fixed_bucket_name = _get_workdir_bucket_name_from_config(store_type) if fixed_bucket_name is None: bucket_name = constants.WORKDIR_BUCKET_NAME.format( - username=common_utils.get_cleaned_username(), id=run_id) + username=common_utils.get_cleaned_username(), id=run_id) else: bucket_name = fixed_bucket_name @@ -752,11 +753,11 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', f'Cannot mount {constants.SKY_REMOTE_WORKDIR} as both the ' 'workdir and 
file_mounts contains it as the target.') storage = storage_lib.Storage.from_yaml_config({ - 'name': bucket_name, - 'source': workdir, - 'persistent': False, - 'mode': 'COPY', - }) + 'name': bucket_name, + 'source': workdir, + 'persistent': False, + 'mode': 'COPY', + }) if fixed_bucket_name is not None: # We load the bucket name from the config file nested under # specific cloud, in this case we only want get_preferred_store @@ -764,9 +765,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', # times, we might get different store_type and store_region # in the future. storage.add_store(store_type, store_region) - new_storage_mounts[ - constants. - SKY_REMOTE_WORKDIR] = storage + new_storage_mounts[constants.SKY_REMOTE_WORKDIR] = storage # Check of the existence of the workdir in file_mounts is done in # the task construction. logger.info(f' {colorama.Style.DIM}Workdir: {workdir!r} ' From 2b1a0630b87bd6ce1ce365690115af031bc77196 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 4 Nov 2024 13:46:58 +0800 Subject: [PATCH 06/48] reformat --- sky/utils/controller_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 9da12f4e368..4beb06a68f4 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -675,6 +675,7 @@ def replace_skypilot_config_path_in_file_mounts( logger.debug(f'Replaced {_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX} ' f'with the real path in file mounts: {file_mounts}') + def _get_workdir_bucket_name_from_config( store_type: storage_lib.StoreType) -> Optional[str]: nested_key = ('aws', 'workdir_bucket_name') @@ -689,7 +690,7 @@ def _get_workdir_bucket_name_from_config( elif store_type == storage_lib.StoreType.IBM: nested_key = (str(clouds.IBM()).lower(), 'workdir_bucket_name') else: - raise ValueError(f"Unsupported store type: {store_type}") + raise ValueError(f'Unsupported store type: {store_type}') return skypilot_config.get_nested(nested_key, None) From fa9cc691b5b3d6cbce5999e927a476c6559e59e4 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 4 Nov 2024 13:56:49 +0800 Subject: [PATCH 07/48] remove debug line --- sky/task.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sky/task.py b/sky/task.py index 76ceea9785d..fc0720b00c7 100644 --- a/sky/task.py +++ b/sky/task.py @@ -442,8 +442,6 @@ def from_yaml_config( mount_path),) + e.args[1:] raise e task_storage_mounts[mount_path] = storage_obj - print(f"task_storage_mounts: {task_storage_mounts}") - print(f"file_mounts: {file_mounts}") task.set_storage_mounts(task_storage_mounts) if config.get('inputs') is not None: From 4c98e11e34e9e11d274d76f408e3ff271d305578 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Wed, 6 Nov 2024 16:20:49 +0800 Subject: [PATCH 08/48] all dir to same bucket --- sky/data/storage.py | 21 ++++++++++ sky/skylet/constants.py | 4 +- sky/utils/controller_utils.py | 79 ++++++++++++++--------------------- sky/utils/schemas.py | 38 ++++------------- 4 files changed, 62 insertions(+), 80 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index 6fbb95a8c56..c96a716da94 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -189,6 +189,27 @@ def get_endpoint_url(cls, store: 'AbstractStore', path: str) -> str: return bucket_endpoint_url +class StorePrefix(enum.Enum): + """Enum for the prefix of different stores.""" + S3 = 's3://' + GCS = 'gs://' + AZURE = 'https://' + R2 = 'r2://' + IBM = 'cos://' + + def to_store_type(self) -> StoreType: + if self == 
StorePrefix.S3: + return StoreType.S3 + elif self == StorePrefix.GCS: + return StoreType.GCS + elif self == StorePrefix.AZURE: + return StoreType.AZURE + elif self == StorePrefix.R2: + return StoreType.R2 + elif self == StorePrefix.IBM: + return StoreType.IBM + + class StorageMode(enum.Enum): MOUNT = 'MOUNT' COPY = 'COPY' diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 032ad5d25b1..0a297dc9f13 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -241,9 +241,7 @@ # Used for translate local file mounts to cloud storage. Please refer to # sky/execution.py::_maybe_translate_local_file_mounts_and_sync_up for # more details. -WORKDIR_BUCKET_NAME = 'skypilot-workdir-{username}-{id}' -FILE_MOUNTS_BUCKET_NAME = 'skypilot-filemounts-folder-{username}-{id}' -FILE_MOUNTS_FILE_ONLY_BUCKET_NAME = 'skypilot-filemounts-files-{username}-{id}' +FILE_MOUNTS_BUCKET_NAME = 'skypilot-filemounts-{username}-{id}' FILE_MOUNTS_LOCAL_TMP_DIR = 'skypilot-filemounts-files-{id}' FILE_MOUNTS_REMOTE_TMP_DIR = '/tmp/sky-{}-filemounts-files' diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 4beb06a68f4..d09c8958ea0 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -676,22 +676,17 @@ def replace_skypilot_config_path_in_file_mounts( f'with the real path in file mounts: {file_mounts}') -def _get_workdir_bucket_name_from_config( - store_type: storage_lib.StoreType) -> Optional[str]: - nested_key = ('aws', 'workdir_bucket_name') - if store_type == storage_lib.StoreType.S3: - nested_key = (str(clouds.AWS()).lower(), 'workdir_bucket_name') - elif store_type == storage_lib.StoreType.GCS: - nested_key = (str(clouds.GCP()).lower(), 'workdir_bucket_name') - elif store_type == storage_lib.StoreType.AZURE: - nested_key = (str(clouds.Azure()).lower(), 'workdir_bucket_name') - elif store_type == storage_lib.StoreType.R2: - nested_key = (cloudflare.NAME.lower(), 'workdir_bucket_name') - elif store_type == storage_lib.StoreType.IBM: - nested_key = (str(clouds.IBM()).lower(), 'workdir_bucket_name') - else: - raise ValueError(f'Unsupported store type: {store_type}') - return skypilot_config.get_nested(nested_key, None) +def _get_bucket_name_and_store_type_from_job_config( +) -> tuple[Optional[str], Optional[str]]: + bucket_wth_prefix = skypilot_config.get_nested(('jobs', 'bucket'), None) + if bucket_wth_prefix is None: + return None, None + + for prefix in storage_lib.StorePrefix: + if bucket_wth_prefix.startswith(prefix.value): + bucket_name = bucket_wth_prefix[len(prefix.value):] + store = prefix.to_store_type().value + return bucket_name, store def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', @@ -735,17 +730,16 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', ux_utils.spinner_message( f'Translating {msg} to SkyPilot Storage...')) + # Get the bucket name for the workdir and file mounts, + # we stores all these files in same bucket from config. + bucket_name, store = _get_bucket_name_and_store_type_from_job_config() + if bucket_name is None: + bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format( + username=common_utils.get_cleaned_username(), id=run_id) + # Step 1: Translate the workdir to SkyPilot storage. 
new_storage_mounts = {} if task.workdir is not None: - store_type, store_region = task.get_preferred_store() - fixed_bucket_name = _get_workdir_bucket_name_from_config(store_type) - if fixed_bucket_name is None: - bucket_name = constants.WORKDIR_BUCKET_NAME.format( - username=common_utils.get_cleaned_username(), id=run_id) - else: - bucket_name = fixed_bucket_name - workdir = task.workdir task.workdir = None if (constants.SKY_REMOTE_WORKDIR in original_file_mounts or @@ -753,20 +747,15 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', raise ValueError( f'Cannot mount {constants.SKY_REMOTE_WORKDIR} as both the ' 'workdir and file_mounts contains it as the target.') - storage = storage_lib.Storage.from_yaml_config({ - 'name': bucket_name, - 'source': workdir, - 'persistent': False, - 'mode': 'COPY', - }) - if fixed_bucket_name is not None: - # We load the bucket name from the config file nested under - # specific cloud, in this case we only want get_preferred_store - # be called once. If get_preferred_store is called multiple - # times, we might get different store_type and store_region - # in the future. - storage.add_store(store_type, store_region) - new_storage_mounts[constants.SKY_REMOTE_WORKDIR] = storage + new_storage_mounts[ + constants. + SKY_REMOTE_WORKDIR] = storage_lib.Storage.from_yaml_config({ + 'name': bucket_name, + 'source': workdir, + 'persistent': False, + 'mode': 'COPY', + 'store': store, + }) # Check of the existence of the workdir in file_mounts is done in # the task construction. logger.info(f' {colorama.Style.DIM}Workdir: {workdir!r} ' @@ -784,15 +773,12 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', if os.path.isfile(os.path.abspath(os.path.expanduser(src))): copy_mounts_with_file_in_src[dst] = src continue - bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format( - username=common_utils.get_cleaned_username(), - id=f'{run_id}-{i}', - ) new_storage_mounts[dst] = storage_lib.Storage.from_yaml_config({ 'name': bucket_name, 'source': src, 'persistent': False, 'mode': 'COPY', + 'store': store, }) logger.info(f' {colorama.Style.DIM}Folder : {src!r} ' f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}') @@ -803,8 +789,6 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', tempfile.gettempdir(), constants.FILE_MOUNTS_LOCAL_TMP_DIR.format(id=run_id)) os.makedirs(local_fm_path, exist_ok=True) - file_bucket_name = constants.FILE_MOUNTS_FILE_ONLY_BUCKET_NAME.format( - username=common_utils.get_cleaned_username(), id=run_id) file_mount_remote_tmp_dir = constants.FILE_MOUNTS_REMOTE_TMP_DIR.format( path) if copy_mounts_with_file_in_src: @@ -816,10 +800,11 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', new_storage_mounts[ file_mount_remote_tmp_dir] = storage_lib.Storage.from_yaml_config({ - 'name': file_bucket_name, + 'name': bucket_name, 'source': local_fm_path, 'persistent': False, 'mode': 'MOUNT', + 'store': store, }) if file_mount_remote_tmp_dir in original_storage_mounts: with ux_utils.print_exception_no_traceback(): @@ -830,7 +815,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', sources = list(src_to_file_id.keys()) sources_str = '\n '.join(sources) logger.info(f' {colorama.Style.DIM}Files (listed below) ' - f' -> storage: {file_bucket_name}:' + f' -> storage: {bucket_name}:' f'\n {sources_str}{colorama.Style.RESET_ALL}') rich_utils.force_update_status( ux_utils.spinner_message('Uploading translated local files/folders')) @@ -879,7 
+864,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', store_type = list(storage_obj.stores.keys())[0] store_object = storage_obj.stores[store_type] bucket_url = storage_lib.StoreType.get_endpoint_url( - store_object, file_bucket_name) + store_object, bucket_name) for dst, src in copy_mounts_with_file_in_src.items(): file_id = src_to_file_id[src] new_file_mounts[dst] = bucket_url + f'/file-{file_id}' diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 1bf30d24ab6..05febfb03c4 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -3,6 +3,7 @@ Schemas conform to the JSON Schema specification as defined at https://json-schema.org/ """ +import copy import enum from typing import Any, Dict, List, Tuple @@ -707,6 +708,12 @@ def get_config_schema(): }, } } + jobs_configs = copy.deepcopy(controller_resources_schema) + jobs_configs['properties']['bucket'] = { + 'type': 'string', + 'pattern': '^(https|s3|gs|r2|cos)://.+', + 'required': [] + } cloud_configs = { 'aws': { 'type': 'object', @@ -725,9 +732,6 @@ def get_config_schema(): 'disk_encrypted': { 'type': 'boolean', }, - 'workdir_bucket_name': { - 'type': 'string', - }, 'security_group_name': (_PRORPERTY_NAME_OR_CLUSTER_NAME_TO_PROPERTY), **_LABELS_SCHEMA, @@ -768,9 +772,6 @@ def get_config_schema(): 'enable_gvnic': { 'type': 'boolean' }, - 'workdir_bucket_name': { - 'type': 'string', - }, **_LABELS_SCHEMA, **_NETWORK_CONFIG_SCHEMA, }, @@ -787,29 +788,6 @@ def get_config_schema(): 'resource_group_vm': { 'type': 'string', }, - 'workdir_bucket_name': { - 'type': 'string', - }, - } - }, - 'cloudflare': { - 'type': 'object', - 'required': [], - 'additionalProperties': False, - 'properties': { - 'workdir_bucket_name': { - 'type': 'string', - }, - } - }, - 'ibm': { - 'type': 'object', - 'required': [], - 'additionalProperties': False, - 'properties': { - 'workdir_bucket_name': { - 'type': 'string', - }, } }, 'kubernetes': { @@ -956,7 +934,7 @@ def get_config_schema(): 'required': [], 'additionalProperties': False, 'properties': { - 'jobs': controller_resources_schema, + 'jobs': jobs_configs, 'spot': controller_resources_schema, 'serve': controller_resources_schema, 'allowed_clouds': allowed_clouds, From 144a4a317ee787aaf4023ad2a4ef53e459a14fb4 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Wed, 6 Nov 2024 16:45:55 +0800 Subject: [PATCH 09/48] private member function --- sky/task.py | 4 ++-- sky/utils/controller_utils.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sky/task.py b/sky/task.py index fc0720b00c7..cebc616dc6d 100644 --- a/sky/task.py +++ b/sky/task.py @@ -903,7 +903,7 @@ def update_storage_mounts( task_storage_mounts.update(storage_mounts) return self.set_storage_mounts(task_storage_mounts) - def get_preferred_store( + def _get_preferred_store( self) -> Tuple[storage_lib.StoreType, Optional[str]]: """Returns the preferred store type and region for this task.""" # TODO(zhwu, romilb): The optimizer should look at the source and @@ -957,7 +957,7 @@ def sync_storage_mounts(self) -> None: """ for storage in self.storage_mounts.values(): if len(storage.stores) == 0: - store_type, store_region = self.get_preferred_store() + store_type, store_region = self._get_preferred_store() self.storage_plans[storage] = store_type storage.add_store(store_type, store_region) else: diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index d09c8958ea0..5c5f39c2398 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -688,6 +688,8 @@ def 
_get_bucket_name_and_store_type_from_job_config( store = prefix.to_store_type().value return bucket_name, store + raise ValueError(f'Invalid bucket name with prefix: {bucket_wth_prefix}') + def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', path: str) -> None: From 42de23ad1ea73f4a31a2395e09d1db6c95df97e9 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Wed, 6 Nov 2024 16:53:00 +0800 Subject: [PATCH 10/48] fix mypy --- sky/data/storage.py | 2 ++ sky/utils/controller_utils.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index c96a716da94..fbb79fe2e13 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -208,6 +208,8 @@ def to_store_type(self) -> StoreType: return StoreType.R2 elif self == StorePrefix.IBM: return StoreType.IBM + else: + raise ValueError(f'Unknown store prefix: {self}') class StorageMode(enum.Enum): diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 5c5f39c2398..e40bbf73299 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -6,7 +6,7 @@ import os import tempfile import typing -from typing import Any, Dict, Iterable, List, Optional, Set +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple import colorama @@ -677,7 +677,7 @@ def replace_skypilot_config_path_in_file_mounts( def _get_bucket_name_and_store_type_from_job_config( -) -> tuple[Optional[str], Optional[str]]: +) -> Tuple[Optional[str], Optional[str]]: bucket_wth_prefix = skypilot_config.get_nested(('jobs', 'bucket'), None) if bucket_wth_prefix is None: return None, None From 888c0fa35431fb97659bb2d1ae0ab1ba6d86c7c5 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Thu, 7 Nov 2024 19:20:23 +0800 Subject: [PATCH 11/48] support sub dir config to separate to different directory --- sky/data/storage.py | 98 ++++++++++++++++++++++++----------- sky/utils/controller_utils.py | 3 ++ sky/utils/schemas.py | 3 ++ 3 files changed, 73 insertions(+), 31 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index fbb79fe2e13..eab92a18cb7 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -255,7 +255,8 @@ def __init__(self, source: Optional[SourceType], region: Optional[str] = None, is_sky_managed: Optional[bool] = None, - sync_on_reconstruction: Optional[bool] = True): + sync_on_reconstruction: Optional[bool] = True, + sub_dir: Optional[str] = None): """Initialize AbstractStore Args: @@ -269,6 +270,11 @@ def __init__(self, there. This is set to false when the Storage object is created not for direct use, e.g. for 'sky storage delete', or the storage is being re-used, e.g., for `sky start` on a stopped cluster. + sub_dir: str; The prefix of the directory to be created in the + store, e.g. if sub_dir=my-dir, the files will be uploaded to + s3:///my-dir/. + This only works if source is a local directory. + # TODO(zpoint): Add support for non-local source. Raises: StorageBucketCreateError: If bucket creation fails @@ -280,6 +286,8 @@ def __init__(self, self.region = region self.is_sky_managed = is_sky_managed self.sync_on_reconstruction = sync_on_reconstruction + + self.sub_dir = sub_dir # Whether sky is responsible for the lifecycle of the Store. 
self._validate() self.initialize() @@ -481,7 +489,8 @@ def __init__(self, stores: Optional[Dict[StoreType, AbstractStore]] = None, persistent: Optional[bool] = True, mode: StorageMode = StorageMode.MOUNT, - sync_on_reconstruction: bool = True) -> None: + sync_on_reconstruction: bool = True, + sub_dir: Optional[str] = None) -> None: """Initializes a Storage object. Three fields are required: the name of the storage, the source @@ -519,6 +528,8 @@ def __init__(self, there. This is set to false when the Storage object is created not for direct use, e.g. for 'sky storage delete', or the storage is being re-used, e.g., for `sky start` on a stopped cluster. + sub_dir: Optional[str]; The subdirectory to use for the + storage object. """ self.name: str self.source = source @@ -526,6 +537,7 @@ def __init__(self, self.mode = mode assert mode in StorageMode self.sync_on_reconstruction = sync_on_reconstruction + self.sub_dir = sub_dir # TODO(romilb, zhwu): This is a workaround to support storage deletion # for spot. Once sky storage supports forced management for external @@ -838,7 +850,9 @@ def _add_store_from_metadata( 'to be reconstructed while the corresponding ' 'bucket was externally deleted.') continue - + # This one can't be retrieved from metadata since its set every time + # we create a new storage object. + store.sub_dir = self.sub_dir self._add_store(store, is_reconstructed=True) @classmethod @@ -894,6 +908,7 @@ def add_store(self, f'storage account {storage_account_name!r}.') else: logger.info(f'Storage type {store_type} already exists.') + return self.stores[store_type] store_cls: Type[AbstractStore] @@ -918,7 +933,8 @@ def add_store(self, name=self.name, source=self.source, region=region, - sync_on_reconstruction=self.sync_on_reconstruction) + sync_on_reconstruction=self.sync_on_reconstruction, + sub_dir=self.sub_dir) except exceptions.StorageBucketCreateError: # Creation failed, so this must be sky managed store. Add failure # to state. 
@@ -1047,6 +1063,7 @@ def from_yaml_config(cls, config: Dict[str, Any]) -> 'Storage': store = config.pop('store', None) mode_str = config.pop('mode', None) force_delete = config.pop('_force_delete', None) + sub_dir = config.pop('sub_dir', None) if force_delete is None: force_delete = False @@ -1066,7 +1083,8 @@ def from_yaml_config(cls, config: Dict[str, Any]) -> 'Storage': storage_obj = cls(name=name, source=source, persistent=persistent, - mode=mode) + mode=mode, + sub_dir=sub_dir) if store is not None: storage_obj.add_store(StoreType(store.upper())) @@ -1112,11 +1130,12 @@ def __init__(self, source: str, region: Optional[str] = 'us-east-2', is_sky_managed: Optional[bool] = None, - sync_on_reconstruction: bool = True): + sync_on_reconstruction: bool = True, + sub_dir: Optional[str] = None): self.client: 'boto3.client.Client' self.bucket: 'StorageHandle' super().__init__(name, source, region, is_sky_managed, - sync_on_reconstruction) + sync_on_reconstruction, sub_dir) def _validate(self): if self.source is not None and isinstance(self.source, str): @@ -1314,9 +1333,10 @@ def get_file_sync_command(base_dir_path, file_names): for file_name in file_names ]) base_dir_path = shlex.quote(base_dir_path) + sub_dir = f'/{self.sub_dir}' if self.sub_dir else '' sync_command = ('aws s3 sync --no-follow-symlinks --exclude="*" ' f'{includes} {base_dir_path} ' - f's3://{self.name}') + f's3://{self.name}{sub_dir}') return sync_command def get_dir_sync_command(src_dir_path, dest_dir_name): @@ -1328,9 +1348,11 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): for file_name in excluded_list ]) src_dir_path = shlex.quote(src_dir_path) + sub_dir = f'/{self.sub_dir}' if self.sub_dir else '' sync_command = (f'aws s3 sync --no-follow-symlinks {excludes} ' f'{src_dir_path} ' - f's3://{self.name}/{dest_dir_name}') + f's3://{self.name}{sub_dir}/{dest_dir_name}') + print(sync_command) return sync_command # Generate message for upload @@ -1544,11 +1566,12 @@ def __init__(self, source: str, region: Optional[str] = 'us-central1', is_sky_managed: Optional[bool] = None, - sync_on_reconstruction: Optional[bool] = True): + sync_on_reconstruction: Optional[bool] = True, + sub_dir: Optional[str] = None): self.client: 'storage.Client' self.bucket: StorageHandle super().__init__(name, source, region, is_sky_managed, - sync_on_reconstruction) + sync_on_reconstruction, sub_dir) def _validate(self): if self.source is not None and isinstance(self.source, str): @@ -1782,9 +1805,10 @@ def get_file_sync_command(base_dir_path, file_names): sync_format = '|'.join(file_names) gsutil_alias, alias_gen = data_utils.get_gsutil_command() base_dir_path = shlex.quote(base_dir_path) + sub_dir = f'/{self.sub_dir}' if self.sub_dir else '' sync_command = (f'{alias_gen}; {gsutil_alias} ' f'rsync -e -x \'^(?!{sync_format}$).*\' ' - f'{base_dir_path} gs://{self.name}') + f'{base_dir_path} gs://{self.name}{sub_dir}') return sync_command def get_dir_sync_command(src_dir_path, dest_dir_name): @@ -1794,9 +1818,10 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): excludes = '|'.join(excluded_list) gsutil_alias, alias_gen = data_utils.get_gsutil_command() src_dir_path = shlex.quote(src_dir_path) + sub_dir = f'/{self.sub_dir}' if self.sub_dir else '' sync_command = (f'{alias_gen}; {gsutil_alias} ' f'rsync -e -r -x \'({excludes})\' {src_dir_path} ' - f'gs://{self.name}/{dest_dir_name}') + f'gs://{self.name}{sub_dir}/{dest_dir_name}') return sync_command # Generate message for upload @@ -2026,7 +2051,8 @@ def __init__(self, 
storage_account_name: str = '', region: Optional[str] = 'eastus', is_sky_managed: Optional[bool] = None, - sync_on_reconstruction: bool = True): + sync_on_reconstruction: bool = True, + sub_dir: Optional[str] = None): self.storage_client: 'storage.Client' self.resource_client: 'storage.Client' self.container_name: str @@ -2038,7 +2064,7 @@ def __init__(self, if region is None: region = 'eastus' super().__init__(name, source, region, is_sky_managed, - sync_on_reconstruction) + sync_on_reconstruction, sub_dir) @classmethod def from_metadata(cls, metadata: AbstractStore.StoreMetadata, @@ -2506,13 +2532,15 @@ def get_file_sync_command(base_dir_path, file_names) -> str: includes_list = ';'.join(file_names) includes = f'--include-pattern "{includes_list}"' base_dir_path = shlex.quote(base_dir_path) + container_path = (f'{self.container_name}/{self.sub_dir}' + if self.sub_dir else self.container_name) sync_command = (f'az storage blob sync ' f'--account-name {self.storage_account_name} ' f'--account-key {self.storage_account_key} ' f'{includes} ' '--delete-destination false ' f'--source {base_dir_path} ' - f'--container {self.container_name}') + f'--container {container_path}') return sync_command def get_dir_sync_command(src_dir_path, dest_dir_name) -> str: @@ -2523,8 +2551,9 @@ def get_dir_sync_command(src_dir_path, dest_dir_name) -> str: [file_name.rstrip('*') for file_name in excluded_list]) excludes = f'--exclude-path "{excludes_list}"' src_dir_path = shlex.quote(src_dir_path) - container_path = (f'{self.container_name}/{dest_dir_name}' - if dest_dir_name else self.container_name) + container_path = ( + f'{self.container_name}/{self.sub_dir}/{dest_dir_name}' + if self.sub_dir else f'{self.container_name}/{dest_dir_name}') sync_command = (f'az storage blob sync ' f'--account-name {self.storage_account_name} ' f'--account-key {self.storage_account_key} ' @@ -2769,11 +2798,12 @@ def __init__(self, source: str, region: Optional[str] = 'auto', is_sky_managed: Optional[bool] = None, - sync_on_reconstruction: Optional[bool] = True): + sync_on_reconstruction: Optional[bool] = True, + sub_dir: Optional[str] = None): self.client: 'boto3.client.Client' self.bucket: 'StorageHandle' super().__init__(name, source, region, is_sky_managed, - sync_on_reconstruction) + sync_on_reconstruction, sub_dir) def _validate(self): if self.source is not None and isinstance(self.source, str): @@ -2912,11 +2942,12 @@ def get_file_sync_command(base_dir_path, file_names): ]) endpoint_url = cloudflare.create_endpoint() base_dir_path = shlex.quote(base_dir_path) + sub_dir = f'/{self.sub_dir}' if self.sub_dir else '' sync_command = ('AWS_SHARED_CREDENTIALS_FILE=' f'{cloudflare.R2_CREDENTIALS_PATH} ' 'aws s3 sync --no-follow-symlinks --exclude="*" ' f'{includes} {base_dir_path} ' - f's3://{self.name} ' + f's3://{self.name}{sub_dir} ' f'--endpoint {endpoint_url} ' f'--profile={cloudflare.R2_PROFILE_NAME}') return sync_command @@ -2931,11 +2962,12 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): ]) endpoint_url = cloudflare.create_endpoint() src_dir_path = shlex.quote(src_dir_path) + sub_dir = f'/{self.sub_dir}' if self.sub_dir else '' sync_command = ('AWS_SHARED_CREDENTIALS_FILE=' f'{cloudflare.R2_CREDENTIALS_PATH} ' f'aws s3 sync --no-follow-symlinks {excludes} ' f'{src_dir_path} ' - f's3://{self.name}/{dest_dir_name} ' + f's3://{self.name}{sub_dir}/{dest_dir_name} ' f'--endpoint {endpoint_url} ' f'--profile={cloudflare.R2_PROFILE_NAME}') return sync_command @@ -3154,11 +3186,12 @@ def __init__(self, source: str, 
region: Optional[str] = 'us-east', is_sky_managed: Optional[bool] = None, - sync_on_reconstruction: bool = True): + sync_on_reconstruction: bool = True, + sub_dir: Optional[str] = None): self.client: 'storage.Client' self.bucket: 'StorageHandle' super().__init__(name, source, region, is_sky_managed, - sync_on_reconstruction) + sync_on_reconstruction, sub_dir) self.bucket_rclone_profile = \ Rclone.generate_rclone_bucket_profile_name( self.name, Rclone.RcloneClouds.IBM) @@ -3347,10 +3380,11 @@ def get_dir_sync_command(src_dir_path, dest_dir_name) -> str: # .git directory is excluded from the sync # wrapping src_dir_path with "" to support path with spaces src_dir_path = shlex.quote(src_dir_path) - sync_command = ( - 'rclone copy --exclude ".git/*" ' - f'{src_dir_path} ' - f'{self.bucket_rclone_profile}:{self.name}/{dest_dir_name}') + sub_dir = f'/{self.sub_dir}' if self.sub_dir else '' + sync_command = ('rclone copy --exclude ".git/*" ' + f'{src_dir_path} ' + f'{self.bucket_rclone_profile}:{self.name}{sub_dir}' + f'/{dest_dir_name}') return sync_command def get_file_sync_command(base_dir_path, file_names) -> str: @@ -3376,9 +3410,11 @@ def get_file_sync_command(base_dir_path, file_names) -> str: for file_name in file_names ]) base_dir_path = shlex.quote(base_dir_path) - sync_command = ('rclone copy ' - f'{includes} {base_dir_path} ' - f'{self.bucket_rclone_profile}:{self.name}') + sub_dir = f'/{self.sub_dir}' if self.sub_dir else '' + sync_command = ( + 'rclone copy ' + f'{includes} {base_dir_path} ' + f'{self.bucket_rclone_profile}:{self.name}{sub_dir}') return sync_command # Generate message for upload diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index e40bbf73299..72f39f3c590 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -757,6 +757,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', 'persistent': False, 'mode': 'COPY', 'store': store, + 'sub_dir': f'job-{run_id}/workdir', }) # Check of the existence of the workdir in file_mounts is done in # the task construction. 
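# Editorial note (not part of the patch): taken together, the three controller_utils
# hunks in this commit place every translated artifact for a job under one bucket,
# separated only by per-job prefixes. A minimal, illustrative sketch of the resulting
# layout; the run id and bucket name below are placeholders, not values produced by
# SkyPilot, and only the prefix strings come from the diff itself.
run_id = 'abc123'                              # illustrative; generated per launch
bucket = 'skypilot-filemounts-alice-abc123'    # follows FILE_MOUNTS_BUCKET_NAME format
prefixes = [
    f'job-{run_id}/workdir',                   # task.workdir contents
    f'job-{run_id}/local-file-mounts/0',       # one prefix per local dir in file_mounts
    f'job-{run_id}/tmp-files',                 # individual small files gathered together
]
for prefix in prefixes:
    print(f's3://{bucket}/{prefix}')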
@@ -781,6 +782,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', 'persistent': False, 'mode': 'COPY', 'store': store, + 'sub_dir': f'job-{run_id}/local-file-mounts/{i}', }) logger.info(f' {colorama.Style.DIM}Folder : {src!r} ' f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}') @@ -807,6 +809,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', 'persistent': False, 'mode': 'MOUNT', 'store': store, + 'sub_dir': f'job-{run_id}/tmp-files', }) if file_mount_remote_tmp_dir in original_storage_mounts: with ux_utils.print_exception_no_traceback(): diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 05febfb03c4..c6026f05952 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -300,6 +300,9 @@ def get_storage_schema(): mode.value for mode in storage.StorageMode ] }, + 'sub_dir': { + 'type': 'string', + }, '_force_delete': { 'type': 'boolean', } From 3b1adcc5ece481e69ffc380569aebea904a8c4c4 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Fri, 8 Nov 2024 17:12:21 +0800 Subject: [PATCH 12/48] rename and add smoke test --- sky/data/storage.py | 129 ++++++++++++---------- sky/utils/controller_utils.py | 6 +- sky/utils/schemas.py | 2 +- tests/test_smoke.py | 19 ++++ tests/test_yamls/intermediate_bucket.yaml | 14 +++ 5 files changed, 108 insertions(+), 62 deletions(-) create mode 100644 tests/test_yamls/intermediate_bucket.yaml diff --git a/sky/data/storage.py b/sky/data/storage.py index eab92a18cb7..e353c18ebc1 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -256,7 +256,7 @@ def __init__(self, region: Optional[str] = None, is_sky_managed: Optional[bool] = None, sync_on_reconstruction: Optional[bool] = True, - sub_dir: Optional[str] = None): + bucket_sub_path: Optional[str] = None): """Initialize AbstractStore Args: @@ -270,9 +270,9 @@ def __init__(self, there. This is set to false when the Storage object is created not for direct use, e.g. for 'sky storage delete', or the storage is being re-used, e.g., for `sky start` on a stopped cluster. - sub_dir: str; The prefix of the directory to be created in the - store, e.g. if sub_dir=my-dir, the files will be uploaded to - s3:///my-dir/. + bucket_sub_path: str; The prefix of the bucket directory to be + created in the store, e.g. if bucket_sub_path=my-dir, the files + will be uploaded to s3:///my-dir/. This only works if source is a local directory. # TODO(zpoint): Add support for non-local source. @@ -287,7 +287,7 @@ def __init__(self, self.is_sky_managed = is_sky_managed self.sync_on_reconstruction = sync_on_reconstruction - self.sub_dir = sub_dir + self.bucket_sub_path = bucket_sub_path # Whether sky is responsible for the lifecycle of the Store. self._validate() self.initialize() @@ -490,7 +490,7 @@ def __init__(self, persistent: Optional[bool] = True, mode: StorageMode = StorageMode.MOUNT, sync_on_reconstruction: bool = True, - sub_dir: Optional[str] = None) -> None: + bucket_sub_path: Optional[str] = None) -> None: """Initializes a Storage object. Three fields are required: the name of the storage, the source @@ -528,7 +528,7 @@ def __init__(self, there. This is set to false when the Storage object is created not for direct use, e.g. for 'sky storage delete', or the storage is being re-used, e.g., for `sky start` on a stopped cluster. - sub_dir: Optional[str]; The subdirectory to use for the + bucket_sub_path: Optional[str]; The subdirectory to use for the storage object. 
""" self.name: str @@ -537,7 +537,7 @@ def __init__(self, self.mode = mode assert mode in StorageMode self.sync_on_reconstruction = sync_on_reconstruction - self.sub_dir = sub_dir + self.bucket_sub_path = bucket_sub_path # TODO(romilb, zhwu): This is a workaround to support storage deletion # for spot. Once sky storage supports forced management for external @@ -852,7 +852,7 @@ def _add_store_from_metadata( continue # This one can't be retrieved from metadata since its set every time # we create a new storage object. - store.sub_dir = self.sub_dir + store.bucket_sub_path = self.bucket_sub_path self._add_store(store, is_reconstructed=True) @classmethod @@ -934,7 +934,7 @@ def add_store(self, source=self.source, region=region, sync_on_reconstruction=self.sync_on_reconstruction, - sub_dir=self.sub_dir) + bucket_sub_path=self.bucket_sub_path) except exceptions.StorageBucketCreateError: # Creation failed, so this must be sky managed store. Add failure # to state. @@ -1063,7 +1063,7 @@ def from_yaml_config(cls, config: Dict[str, Any]) -> 'Storage': store = config.pop('store', None) mode_str = config.pop('mode', None) force_delete = config.pop('_force_delete', None) - sub_dir = config.pop('sub_dir', None) + bucket_sub_path = config.pop('bucket_sub_path', None) if force_delete is None: force_delete = False @@ -1084,7 +1084,7 @@ def from_yaml_config(cls, config: Dict[str, Any]) -> 'Storage': source=source, persistent=persistent, mode=mode, - sub_dir=sub_dir) + bucket_sub_path=bucket_sub_path) if store is not None: storage_obj.add_store(StoreType(store.upper())) @@ -1131,11 +1131,11 @@ def __init__(self, region: Optional[str] = 'us-east-2', is_sky_managed: Optional[bool] = None, sync_on_reconstruction: bool = True, - sub_dir: Optional[str] = None): + bucket_sub_path: Optional[str] = None): self.client: 'boto3.client.Client' self.bucket: 'StorageHandle' super().__init__(name, source, region, is_sky_managed, - sync_on_reconstruction, sub_dir) + sync_on_reconstruction, bucket_sub_path) def _validate(self): if self.source is not None and isinstance(self.source, str): @@ -1333,10 +1333,11 @@ def get_file_sync_command(base_dir_path, file_names): for file_name in file_names ]) base_dir_path = shlex.quote(base_dir_path) - sub_dir = f'/{self.sub_dir}' if self.sub_dir else '' + bucket_sub_path = (f'/{self.bucket_sub_path}' + if self.bucket_sub_path else '') sync_command = ('aws s3 sync --no-follow-symlinks --exclude="*" ' f'{includes} {base_dir_path} ' - f's3://{self.name}{sub_dir}') + f's3://{self.name}{bucket_sub_path}') return sync_command def get_dir_sync_command(src_dir_path, dest_dir_name): @@ -1348,11 +1349,12 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): for file_name in excluded_list ]) src_dir_path = shlex.quote(src_dir_path) - sub_dir = f'/{self.sub_dir}' if self.sub_dir else '' - sync_command = (f'aws s3 sync --no-follow-symlinks {excludes} ' - f'{src_dir_path} ' - f's3://{self.name}{sub_dir}/{dest_dir_name}') - print(sync_command) + bucket_sub_path = (f'/{self.bucket_sub_path}' + if self.bucket_sub_path else '') + sync_command = ( + f'aws s3 sync --no-follow-symlinks {excludes} ' + f'{src_dir_path} ' + f's3://{self.name}{bucket_sub_path}/{dest_dir_name}') return sync_command # Generate message for upload @@ -1567,11 +1569,11 @@ def __init__(self, region: Optional[str] = 'us-central1', is_sky_managed: Optional[bool] = None, sync_on_reconstruction: Optional[bool] = True, - sub_dir: Optional[str] = None): + bucket_sub_path: Optional[str] = None): self.client: 'storage.Client' 
self.bucket: StorageHandle super().__init__(name, source, region, is_sky_managed, - sync_on_reconstruction, sub_dir) + sync_on_reconstruction, bucket_sub_path) def _validate(self): if self.source is not None and isinstance(self.source, str): @@ -1805,10 +1807,12 @@ def get_file_sync_command(base_dir_path, file_names): sync_format = '|'.join(file_names) gsutil_alias, alias_gen = data_utils.get_gsutil_command() base_dir_path = shlex.quote(base_dir_path) - sub_dir = f'/{self.sub_dir}' if self.sub_dir else '' - sync_command = (f'{alias_gen}; {gsutil_alias} ' - f'rsync -e -x \'^(?!{sync_format}$).*\' ' - f'{base_dir_path} gs://{self.name}{sub_dir}') + bucket_sub_path = (f'/{self.bucket_sub_path}' + if self.bucket_sub_path else '') + sync_command = ( + f'{alias_gen}; {gsutil_alias} ' + f'rsync -e -x \'^(?!{sync_format}$).*\' ' + f'{base_dir_path} gs://{self.name}{bucket_sub_path}') return sync_command def get_dir_sync_command(src_dir_path, dest_dir_name): @@ -1818,10 +1822,12 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): excludes = '|'.join(excluded_list) gsutil_alias, alias_gen = data_utils.get_gsutil_command() src_dir_path = shlex.quote(src_dir_path) - sub_dir = f'/{self.sub_dir}' if self.sub_dir else '' - sync_command = (f'{alias_gen}; {gsutil_alias} ' - f'rsync -e -r -x \'({excludes})\' {src_dir_path} ' - f'gs://{self.name}{sub_dir}/{dest_dir_name}') + bucket_sub_path = (f'/{self.bucket_sub_path}' + if self.bucket_sub_path else '') + sync_command = ( + f'{alias_gen}; {gsutil_alias} ' + f'rsync -e -r -x \'({excludes})\' {src_dir_path} ' + f'gs://{self.name}{bucket_sub_path}/{dest_dir_name}') return sync_command # Generate message for upload @@ -2052,7 +2058,7 @@ def __init__(self, region: Optional[str] = 'eastus', is_sky_managed: Optional[bool] = None, sync_on_reconstruction: bool = True, - sub_dir: Optional[str] = None): + bucket_sub_path: Optional[str] = None): self.storage_client: 'storage.Client' self.resource_client: 'storage.Client' self.container_name: str @@ -2064,7 +2070,7 @@ def __init__(self, if region is None: region = 'eastus' super().__init__(name, source, region, is_sky_managed, - sync_on_reconstruction, sub_dir) + sync_on_reconstruction, bucket_sub_path) @classmethod def from_metadata(cls, metadata: AbstractStore.StoreMetadata, @@ -2532,8 +2538,8 @@ def get_file_sync_command(base_dir_path, file_names) -> str: includes_list = ';'.join(file_names) includes = f'--include-pattern "{includes_list}"' base_dir_path = shlex.quote(base_dir_path) - container_path = (f'{self.container_name}/{self.sub_dir}' - if self.sub_dir else self.container_name) + container_path = (f'{self.container_name}/{self.bucket_sub_path}' + if self.bucket_sub_path else self.container_name) sync_command = (f'az storage blob sync ' f'--account-name {self.storage_account_name} ' f'--account-key {self.storage_account_key} ' @@ -2552,8 +2558,9 @@ def get_dir_sync_command(src_dir_path, dest_dir_name) -> str: excludes = f'--exclude-path "{excludes_list}"' src_dir_path = shlex.quote(src_dir_path) container_path = ( - f'{self.container_name}/{self.sub_dir}/{dest_dir_name}' - if self.sub_dir else f'{self.container_name}/{dest_dir_name}') + f'{self.container_name}/{self.bucket_sub_path}/{dest_dir_name}' + if self.bucket_sub_path else + f'{self.container_name}/{dest_dir_name}') sync_command = (f'az storage blob sync ' f'--account-name {self.storage_account_name} ' f'--account-key {self.storage_account_key} ' @@ -2799,11 +2806,11 @@ def __init__(self, region: Optional[str] = 'auto', is_sky_managed: 
Optional[bool] = None, sync_on_reconstruction: Optional[bool] = True, - sub_dir: Optional[str] = None): + bucket_sub_path: Optional[str] = None): self.client: 'boto3.client.Client' self.bucket: 'StorageHandle' super().__init__(name, source, region, is_sky_managed, - sync_on_reconstruction, sub_dir) + sync_on_reconstruction, bucket_sub_path) def _validate(self): if self.source is not None and isinstance(self.source, str): @@ -2942,12 +2949,13 @@ def get_file_sync_command(base_dir_path, file_names): ]) endpoint_url = cloudflare.create_endpoint() base_dir_path = shlex.quote(base_dir_path) - sub_dir = f'/{self.sub_dir}' if self.sub_dir else '' + bucket_sub_path = (f'/{self.bucket_sub_path}' + if self.bucket_sub_path else '') sync_command = ('AWS_SHARED_CREDENTIALS_FILE=' f'{cloudflare.R2_CREDENTIALS_PATH} ' 'aws s3 sync --no-follow-symlinks --exclude="*" ' f'{includes} {base_dir_path} ' - f's3://{self.name}{sub_dir} ' + f's3://{self.name}{bucket_sub_path} ' f'--endpoint {endpoint_url} ' f'--profile={cloudflare.R2_PROFILE_NAME}') return sync_command @@ -2962,14 +2970,16 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): ]) endpoint_url = cloudflare.create_endpoint() src_dir_path = shlex.quote(src_dir_path) - sub_dir = f'/{self.sub_dir}' if self.sub_dir else '' - sync_command = ('AWS_SHARED_CREDENTIALS_FILE=' - f'{cloudflare.R2_CREDENTIALS_PATH} ' - f'aws s3 sync --no-follow-symlinks {excludes} ' - f'{src_dir_path} ' - f's3://{self.name}{sub_dir}/{dest_dir_name} ' - f'--endpoint {endpoint_url} ' - f'--profile={cloudflare.R2_PROFILE_NAME}') + bucket_sub_path = (f'/{self.bucket_sub_path}' + if self.bucket_sub_path else '') + sync_command = ( + 'AWS_SHARED_CREDENTIALS_FILE=' + f'{cloudflare.R2_CREDENTIALS_PATH} ' + f'aws s3 sync --no-follow-symlinks {excludes} ' + f'{src_dir_path} ' + f's3://{self.name}{bucket_sub_path}/{dest_dir_name} ' + f'--endpoint {endpoint_url} ' + f'--profile={cloudflare.R2_PROFILE_NAME}') return sync_command # Generate message for upload @@ -3187,11 +3197,11 @@ def __init__(self, region: Optional[str] = 'us-east', is_sky_managed: Optional[bool] = None, sync_on_reconstruction: bool = True, - sub_dir: Optional[str] = None): + bucket_sub_path: Optional[str] = None): self.client: 'storage.Client' self.bucket: 'StorageHandle' super().__init__(name, source, region, is_sky_managed, - sync_on_reconstruction, sub_dir) + sync_on_reconstruction, bucket_sub_path) self.bucket_rclone_profile = \ Rclone.generate_rclone_bucket_profile_name( self.name, Rclone.RcloneClouds.IBM) @@ -3380,11 +3390,13 @@ def get_dir_sync_command(src_dir_path, dest_dir_name) -> str: # .git directory is excluded from the sync # wrapping src_dir_path with "" to support path with spaces src_dir_path = shlex.quote(src_dir_path) - sub_dir = f'/{self.sub_dir}' if self.sub_dir else '' - sync_command = ('rclone copy --exclude ".git/*" ' - f'{src_dir_path} ' - f'{self.bucket_rclone_profile}:{self.name}{sub_dir}' - f'/{dest_dir_name}') + bucket_sub_path = (f'/{self.bucket_sub_path}' + if self.bucket_sub_path else '') + sync_command = ( + 'rclone copy --exclude ".git/*" ' + f'{src_dir_path} ' + f'{self.bucket_rclone_profile}:{self.name}{bucket_sub_path}' + f'/{dest_dir_name}') return sync_command def get_file_sync_command(base_dir_path, file_names) -> str: @@ -3410,11 +3422,12 @@ def get_file_sync_command(base_dir_path, file_names) -> str: for file_name in file_names ]) base_dir_path = shlex.quote(base_dir_path) - sub_dir = f'/{self.sub_dir}' if self.sub_dir else '' + bucket_sub_path = (f'/{self.bucket_sub_path}' + 
if self.bucket_sub_path else '') sync_command = ( 'rclone copy ' f'{includes} {base_dir_path} ' - f'{self.bucket_rclone_profile}:{self.name}{sub_dir}') + f'{self.bucket_rclone_profile}:{self.name}{bucket_sub_path}') return sync_command # Generate message for upload diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 72f39f3c590..f654428c14b 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -757,7 +757,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', 'persistent': False, 'mode': 'COPY', 'store': store, - 'sub_dir': f'job-{run_id}/workdir', + 'bucket_sub_path': f'job-{run_id}/workdir', }) # Check of the existence of the workdir in file_mounts is done in # the task construction. @@ -782,7 +782,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', 'persistent': False, 'mode': 'COPY', 'store': store, - 'sub_dir': f'job-{run_id}/local-file-mounts/{i}', + 'bucket_sub_path': f'job-{run_id}/local-file-mounts/{i}', }) logger.info(f' {colorama.Style.DIM}Folder : {src!r} ' f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}') @@ -809,7 +809,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', 'persistent': False, 'mode': 'MOUNT', 'store': store, - 'sub_dir': f'job-{run_id}/tmp-files', + 'bucket_sub_path': f'job-{run_id}/tmp-files', }) if file_mount_remote_tmp_dir in original_storage_mounts: with ux_utils.print_exception_no_traceback(): diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index c6026f05952..2b1c62bb9d4 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -300,7 +300,7 @@ def get_storage_schema(): mode.value for mode in storage.StorageMode ] }, - 'sub_dir': { + 'bucket_sub_path': { 'type': 'string', }, '_force_delete': { diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 61bf0954131..9ec7c73fe14 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -5772,3 +5772,22 @@ def test_kubernetes_context_failover(): env={'SKYPILOT_CONFIG': f.name}, ) run_one_test(test) + + +@pytest.mark.aws +def test_intermediate_bucket(): + name = _get_cluster_name() + bucket_name = 'sky-bucket-int' + test = Test( + 'interm-resources', + [ + '[ ! -f ~/.sky/config.yaml ] || mv ~/.sky/config.yaml ~/.sky/config.yaml.bak_intermediate_bucket_test', + f'echo "jobs:\n bucket: \"s3://{bucket_name}\"" > ~/.sky/config.yaml', + f'sky jobs launch -n {name} tests/test_yamls/intermediate_bucket.yaml -y -d', + f'sky storage ls | grep {bucket_name}' # the bucket name is created + '[ ! -f ~/.sky/config.yaml.bak_intermediate_bucket_test ] || mv ~/.sky/config.yaml.bak_intermediate_bucket_test ~/.sky/config.yaml' + ], + f'sky jobs cancel -y -n {name}', + timeout=25 * 60, + ) + run_one_test(test) diff --git a/tests/test_yamls/intermediate_bucket.yaml b/tests/test_yamls/intermediate_bucket.yaml new file mode 100644 index 00000000000..ebaffad0acc --- /dev/null +++ b/tests/test_yamls/intermediate_bucket.yaml @@ -0,0 +1,14 @@ +name: intermediate-bucket + +file_mounts: + file_mounts_dir: . + +workdir: . 
+ + +setup: | + echo "running setup" + +run: | + conda env list + echo "task run finish" From 06c78917924fd1d65c92aac02d73220cc70e9067 Mon Sep 17 00:00:00 2001 From: zpoint Date: Sat, 9 Nov 2024 23:36:03 +0800 Subject: [PATCH 13/48] bucketname --- tests/test_smoke.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 9ec7c73fe14..297392f0f0b 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -5777,7 +5777,7 @@ def test_kubernetes_context_failover(): @pytest.mark.aws def test_intermediate_bucket(): name = _get_cluster_name() - bucket_name = 'sky-bucket-int' + bucket_name = f'sky-bucket-{int(time.time())}' test = Test( 'interm-resources', [ From 3aaf0f12001deb09b871a8257c210d9a47d11cda Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 11 Nov 2024 14:28:11 +0800 Subject: [PATCH 14/48] support sub dir mount --- sky/data/mounting_utils.py | 41 +++++++++++++++++++++++++++++--------- sky/data/storage.py | 26 ++++++++++++++---------- 2 files changed, 48 insertions(+), 19 deletions(-) diff --git a/sky/data/mounting_utils.py b/sky/data/mounting_utils.py index 5d4eb61156c..d110cf53b8a 100644 --- a/sky/data/mounting_utils.py +++ b/sky/data/mounting_utils.py @@ -30,12 +30,17 @@ def get_s3_mount_install_cmd() -> str: return install_cmd -def get_s3_mount_cmd(bucket_name: str, mount_path: str) -> str: +def get_s3_mount_cmd(bucket_name: str, bucket_sub_path: Optional[str], + mount_path: str) -> str: """Returns a command to mount an S3 bucket using goofys.""" + if bucket_sub_path is None: + bucket_sub_path = '' + else: + bucket_sub_path = f':{bucket_sub_path}' mount_cmd = ('goofys -o allow_other ' f'--stat-cache-ttl {_STAT_CACHE_TTL} ' f'--type-cache-ttl {_TYPE_CACHE_TTL} ' - f'{bucket_name} {mount_path}') + f'{bucket_name}{bucket_sub_path} {mount_path}') return mount_cmd @@ -49,15 +54,18 @@ def get_gcs_mount_install_cmd() -> str: return install_cmd -def get_gcs_mount_cmd(bucket_name: str, mount_path: str) -> str: +def get_gcs_mount_cmd(bucket_name: str, bucket_sub_path: Optional[str], + mount_path: str) -> str: """Returns a command to mount a GCS bucket using gcsfuse.""" - + bucket_sub_path_arg = f'--only-dir {bucket_sub_path} '\ + if bucket_sub_path else '' mount_cmd = ('gcsfuse -o allow_other ' '--implicit-dirs ' f'--stat-cache-capacity {_STAT_CACHE_CAPACITY} ' f'--stat-cache-ttl {_STAT_CACHE_TTL} ' f'--type-cache-ttl {_TYPE_CACHE_TTL} ' f'--rename-dir-limit {_RENAME_DIR_LIMIT} ' + f'{bucket_sub_path_arg}' f'{bucket_name} {mount_path}') return mount_cmd @@ -79,6 +87,7 @@ def get_az_mount_install_cmd() -> str: def get_az_mount_cmd(container_name: str, + bucket_sub_path: Optional[str], storage_account_name: str, mount_path: str, storage_account_key: Optional[str] = None) -> str: @@ -86,6 +95,7 @@ def get_az_mount_cmd(container_name: str, Args: container_name: Name of the mounting container. + bucket_sub_path: Sub path of the mounting container. storage_account_name: Name of the storage account the given container belongs to. mount_path: Path where the container will be mounting. 
@@ -106,25 +116,34 @@ def get_az_mount_cmd(container_name: str, cache_path = _BLOBFUSE_CACHE_DIR.format( storage_account_name=storage_account_name, container_name=container_name) + if bucket_sub_path is None: + bucket_sub_path_arg = '' + else: + bucket_sub_path_arg = f'--subdirectory={bucket_sub_path}/ ' mount_cmd = (f'AZURE_STORAGE_ACCOUNT={storage_account_name} ' f'{key_env_var} ' f'blobfuse2 {mount_path} --allow-other --no-symlinks ' '-o umask=022 -o default_permissions ' f'--tmp-path {cache_path} ' + f'{bucket_sub_path_arg}' f'--container-name {container_name}') return mount_cmd def get_r2_mount_cmd(r2_credentials_path: str, r2_profile_name: str, endpoint_url: str, bucket_name: str, - mount_path: str) -> str: + bucket_sub_path: Optional[str], mount_path: str) -> str: """Returns a command to install R2 mount utility goofys.""" + if bucket_sub_path is None: + bucket_sub_path = '' + else: + bucket_sub_path = f':{bucket_sub_path}' mount_cmd = (f'AWS_SHARED_CREDENTIALS_FILE={r2_credentials_path} ' f'AWS_PROFILE={r2_profile_name} goofys -o allow_other ' f'--stat-cache-ttl {_STAT_CACHE_TTL} ' f'--type-cache-ttl {_TYPE_CACHE_TTL} ' f'--endpoint {endpoint_url} ' - f'{bucket_name} {mount_path}') + f'{bucket_name}{bucket_sub_path} {mount_path}') return mount_cmd @@ -138,7 +157,7 @@ def get_cos_mount_install_cmd() -> str: def get_cos_mount_cmd(rclone_config_data: str, rclone_config_path: str, bucket_rclone_profile: str, bucket_name: str, - mount_path: str) -> str: + bucket_sub_path: Optional[str], mount_path: str) -> str: """Returns a command to mount an IBM COS bucket using rclone.""" # creates a fusermount soft link on older (<22) Ubuntu systems for # rclone's mount utility. @@ -150,10 +169,14 @@ def get_cos_mount_cmd(rclone_config_data: str, rclone_config_path: str, 'mkdir -p ~/.config/rclone/ && ' f'echo "{rclone_config_data}" >> ' f'{rclone_config_path}') + if bucket_sub_path is None: + sub_path_arg = f'{bucket_name}/{bucket_name}' + else: + sub_path_arg = f'/{bucket_name}' # --daemon will keep the mounting process running in the background. mount_cmd = (f'{configure_rclone_profile} && ' 'rclone mount ' - f'{bucket_rclone_profile}:{bucket_name} {mount_path} ' + f'{bucket_rclone_profile}:{sub_path_arg} {mount_path} ' '--daemon') return mount_cmd @@ -209,7 +232,7 @@ def get_mounting_script( script = textwrap.dedent(f""" #!/usr/bin/env bash set -e - + {command_runner.ALIAS_SUDO_TO_EMPTY_FOR_ROOT_CMD} MOUNT_PATH={mount_path} diff --git a/sky/data/storage.py b/sky/data/storage.py index e353c18ebc1..f926923b396 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -287,7 +287,10 @@ def __init__(self, self.is_sky_managed = is_sky_managed self.sync_on_reconstruction = sync_on_reconstruction - self.bucket_sub_path = bucket_sub_path + if bucket_sub_path is not None: + self.bucket_sub_path: Optional[str] = bucket_sub_path.strip('/') + else: + self.bucket_sub_path = None # Whether sky is responsible for the lifecycle of the Store. 
self._validate() self.initialize() @@ -1465,6 +1468,7 @@ def mount_command(self, mount_path: str) -> str: """ install_cmd = mounting_utils.get_s3_mount_install_cmd() mount_cmd = mounting_utils.get_s3_mount_cmd(self.bucket.name, + self.bucket_sub_path, mount_path) return mounting_utils.get_mounting_command(mount_path, install_cmd, mount_cmd) @@ -1925,6 +1929,7 @@ def mount_command(self, mount_path: str) -> str: """ install_cmd = mounting_utils.get_gcs_mount_install_cmd() mount_cmd = mounting_utils.get_gcs_mount_cmd(self.bucket.name, + self.bucket_sub_path, mount_path) version_check_cmd = ( f'gcsfuse --version | grep -q {mounting_utils.GCSFUSE_VERSION}') @@ -2557,10 +2562,11 @@ def get_dir_sync_command(src_dir_path, dest_dir_name) -> str: [file_name.rstrip('*') for file_name in excluded_list]) excludes = f'--exclude-path "{excludes_list}"' src_dir_path = shlex.quote(src_dir_path) - container_path = ( - f'{self.container_name}/{self.bucket_sub_path}/{dest_dir_name}' - if self.bucket_sub_path else - f'{self.container_name}/{dest_dir_name}') + container_path = (f'{self.container_name}/{self.bucket_sub_path}' + if self.bucket_sub_path else + f'{self.container_name}') + if dest_dir_name: + container_path = f'{container_path}/{dest_dir_name}' sync_command = (f'az storage blob sync ' f'--account-name {self.storage_account_name} ' f'--account-key {self.storage_account_key} ' @@ -2704,6 +2710,7 @@ def mount_command(self, mount_path: str) -> str: """ install_cmd = mounting_utils.get_az_mount_install_cmd() mount_cmd = mounting_utils.get_az_mount_cmd(self.container_name, + self.bucket_sub_path, self.storage_account_name, mount_path, self.storage_account_key) @@ -3101,11 +3108,9 @@ def mount_command(self, mount_path: str) -> str: endpoint_url = cloudflare.create_endpoint() r2_credential_path = cloudflare.R2_CREDENTIALS_PATH r2_profile_name = cloudflare.R2_PROFILE_NAME - mount_cmd = mounting_utils.get_r2_mount_cmd(r2_credential_path, - r2_profile_name, - endpoint_url, - self.bucket.name, - mount_path) + mount_cmd = mounting_utils.get_r2_mount_cmd( + r2_credential_path, r2_profile_name, endpoint_url, self.bucket.name, + self.bucket_sub_path, mount_path) return mounting_utils.get_mounting_command(mount_path, install_cmd, mount_cmd) @@ -3551,6 +3556,7 @@ def mount_command(self, mount_path: str) -> str: Rclone.RCLONE_CONFIG_PATH, self.bucket_rclone_profile, self.bucket.name, + self.bucket_sub_path, mount_path) return mounting_utils.get_mounting_command(mount_path, install_cmd, mount_cmd) From 0116fa04eb9358e809dac1f0522229320c18800f Mon Sep 17 00:00:00 2001 From: zepingguo Date: Tue, 12 Nov 2024 12:12:46 +0800 Subject: [PATCH 15/48] private member for _bucket_sub_path and smoke test fix --- sky/data/storage.py | 65 ++++++++++++----------- sky/utils/controller_utils.py | 6 +-- sky/utils/schemas.py | 2 +- tests/test_yamls/intermediate_bucket.yaml | 7 ++- 4 files changed, 43 insertions(+), 37 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index f926923b396..efdb8a370fc 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -288,9 +288,9 @@ def __init__(self, self.sync_on_reconstruction = sync_on_reconstruction if bucket_sub_path is not None: - self.bucket_sub_path: Optional[str] = bucket_sub_path.strip('/') + self._bucket_sub_path: Optional[str] = bucket_sub_path.strip('/') else: - self.bucket_sub_path = None + self._bucket_sub_path = None # Whether sky is responsible for the lifecycle of the Store. 
self._validate() self.initialize() @@ -540,7 +540,7 @@ def __init__(self, self.mode = mode assert mode in StorageMode self.sync_on_reconstruction = sync_on_reconstruction - self.bucket_sub_path = bucket_sub_path + self._bucket_sub_path = bucket_sub_path # TODO(romilb, zhwu): This is a workaround to support storage deletion # for spot. Once sky storage supports forced management for external @@ -855,7 +855,10 @@ def _add_store_from_metadata( continue # This one can't be retrieved from metadata since its set every time # we create a new storage object. - store.bucket_sub_path = self.bucket_sub_path + # This private member setting against coding style guide, but + # we want to keep it private member for internal usage only. + # pylint: disable=protected-access + store._bucket_sub_path = self._bucket_sub_path self._add_store(store, is_reconstructed=True) @classmethod @@ -937,7 +940,7 @@ def add_store(self, source=self.source, region=region, sync_on_reconstruction=self.sync_on_reconstruction, - bucket_sub_path=self.bucket_sub_path) + bucket_sub_path=self._bucket_sub_path) except exceptions.StorageBucketCreateError: # Creation failed, so this must be sky managed store. Add failure # to state. @@ -1066,7 +1069,7 @@ def from_yaml_config(cls, config: Dict[str, Any]) -> 'Storage': store = config.pop('store', None) mode_str = config.pop('mode', None) force_delete = config.pop('_force_delete', None) - bucket_sub_path = config.pop('bucket_sub_path', None) + bucket_sub_path = config.pop('_bucket_sub_path', None) if force_delete is None: force_delete = False @@ -1336,8 +1339,8 @@ def get_file_sync_command(base_dir_path, file_names): for file_name in file_names ]) base_dir_path = shlex.quote(base_dir_path) - bucket_sub_path = (f'/{self.bucket_sub_path}' - if self.bucket_sub_path else '') + bucket_sub_path = (f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') sync_command = ('aws s3 sync --no-follow-symlinks --exclude="*" ' f'{includes} {base_dir_path} ' f's3://{self.name}{bucket_sub_path}') @@ -1352,8 +1355,8 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): for file_name in excluded_list ]) src_dir_path = shlex.quote(src_dir_path) - bucket_sub_path = (f'/{self.bucket_sub_path}' - if self.bucket_sub_path else '') + bucket_sub_path = (f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') sync_command = ( f'aws s3 sync --no-follow-symlinks {excludes} ' f'{src_dir_path} ' @@ -1468,7 +1471,7 @@ def mount_command(self, mount_path: str) -> str: """ install_cmd = mounting_utils.get_s3_mount_install_cmd() mount_cmd = mounting_utils.get_s3_mount_cmd(self.bucket.name, - self.bucket_sub_path, + self._bucket_sub_path, mount_path) return mounting_utils.get_mounting_command(mount_path, install_cmd, mount_cmd) @@ -1811,8 +1814,8 @@ def get_file_sync_command(base_dir_path, file_names): sync_format = '|'.join(file_names) gsutil_alias, alias_gen = data_utils.get_gsutil_command() base_dir_path = shlex.quote(base_dir_path) - bucket_sub_path = (f'/{self.bucket_sub_path}' - if self.bucket_sub_path else '') + bucket_sub_path = (f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') sync_command = ( f'{alias_gen}; {gsutil_alias} ' f'rsync -e -x \'^(?!{sync_format}$).*\' ' @@ -1826,8 +1829,8 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): excludes = '|'.join(excluded_list) gsutil_alias, alias_gen = data_utils.get_gsutil_command() src_dir_path = shlex.quote(src_dir_path) - bucket_sub_path = (f'/{self.bucket_sub_path}' - if self.bucket_sub_path else '') + bucket_sub_path = 
(f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') sync_command = ( f'{alias_gen}; {gsutil_alias} ' f'rsync -e -r -x \'({excludes})\' {src_dir_path} ' @@ -1929,7 +1932,7 @@ def mount_command(self, mount_path: str) -> str: """ install_cmd = mounting_utils.get_gcs_mount_install_cmd() mount_cmd = mounting_utils.get_gcs_mount_cmd(self.bucket.name, - self.bucket_sub_path, + self._bucket_sub_path, mount_path) version_check_cmd = ( f'gcsfuse --version | grep -q {mounting_utils.GCSFUSE_VERSION}') @@ -2543,8 +2546,8 @@ def get_file_sync_command(base_dir_path, file_names) -> str: includes_list = ';'.join(file_names) includes = f'--include-pattern "{includes_list}"' base_dir_path = shlex.quote(base_dir_path) - container_path = (f'{self.container_name}/{self.bucket_sub_path}' - if self.bucket_sub_path else self.container_name) + container_path = (f'{self.container_name}/{self._bucket_sub_path}' + if self._bucket_sub_path else self.container_name) sync_command = (f'az storage blob sync ' f'--account-name {self.storage_account_name} ' f'--account-key {self.storage_account_key} ' @@ -2562,8 +2565,8 @@ def get_dir_sync_command(src_dir_path, dest_dir_name) -> str: [file_name.rstrip('*') for file_name in excluded_list]) excludes = f'--exclude-path "{excludes_list}"' src_dir_path = shlex.quote(src_dir_path) - container_path = (f'{self.container_name}/{self.bucket_sub_path}' - if self.bucket_sub_path else + container_path = (f'{self.container_name}/{self._bucket_sub_path}' + if self._bucket_sub_path else f'{self.container_name}') if dest_dir_name: container_path = f'{container_path}/{dest_dir_name}' @@ -2710,7 +2713,7 @@ def mount_command(self, mount_path: str) -> str: """ install_cmd = mounting_utils.get_az_mount_install_cmd() mount_cmd = mounting_utils.get_az_mount_cmd(self.container_name, - self.bucket_sub_path, + self._bucket_sub_path, self.storage_account_name, mount_path, self.storage_account_key) @@ -2956,8 +2959,8 @@ def get_file_sync_command(base_dir_path, file_names): ]) endpoint_url = cloudflare.create_endpoint() base_dir_path = shlex.quote(base_dir_path) - bucket_sub_path = (f'/{self.bucket_sub_path}' - if self.bucket_sub_path else '') + bucket_sub_path = (f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') sync_command = ('AWS_SHARED_CREDENTIALS_FILE=' f'{cloudflare.R2_CREDENTIALS_PATH} ' 'aws s3 sync --no-follow-symlinks --exclude="*" ' @@ -2977,8 +2980,8 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): ]) endpoint_url = cloudflare.create_endpoint() src_dir_path = shlex.quote(src_dir_path) - bucket_sub_path = (f'/{self.bucket_sub_path}' - if self.bucket_sub_path else '') + bucket_sub_path = (f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') sync_command = ( 'AWS_SHARED_CREDENTIALS_FILE=' f'{cloudflare.R2_CREDENTIALS_PATH} ' @@ -3110,7 +3113,7 @@ def mount_command(self, mount_path: str) -> str: r2_profile_name = cloudflare.R2_PROFILE_NAME mount_cmd = mounting_utils.get_r2_mount_cmd( r2_credential_path, r2_profile_name, endpoint_url, self.bucket.name, - self.bucket_sub_path, mount_path) + self._bucket_sub_path, mount_path) return mounting_utils.get_mounting_command(mount_path, install_cmd, mount_cmd) @@ -3395,8 +3398,8 @@ def get_dir_sync_command(src_dir_path, dest_dir_name) -> str: # .git directory is excluded from the sync # wrapping src_dir_path with "" to support path with spaces src_dir_path = shlex.quote(src_dir_path) - bucket_sub_path = (f'/{self.bucket_sub_path}' - if self.bucket_sub_path else '') + bucket_sub_path = 
(f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') sync_command = ( 'rclone copy --exclude ".git/*" ' f'{src_dir_path} ' @@ -3427,8 +3430,8 @@ def get_file_sync_command(base_dir_path, file_names) -> str: for file_name in file_names ]) base_dir_path = shlex.quote(base_dir_path) - bucket_sub_path = (f'/{self.bucket_sub_path}' - if self.bucket_sub_path else '') + bucket_sub_path = (f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') sync_command = ( 'rclone copy ' f'{includes} {base_dir_path} ' @@ -3556,7 +3559,7 @@ def mount_command(self, mount_path: str) -> str: Rclone.RCLONE_CONFIG_PATH, self.bucket_rclone_profile, self.bucket.name, - self.bucket_sub_path, + self._bucket_sub_path, mount_path) return mounting_utils.get_mounting_command(mount_path, install_cmd, mount_cmd) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index f654428c14b..741684240ad 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -757,7 +757,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', 'persistent': False, 'mode': 'COPY', 'store': store, - 'bucket_sub_path': f'job-{run_id}/workdir', + '_bucket_sub_path': f'job-{run_id}/workdir', }) # Check of the existence of the workdir in file_mounts is done in # the task construction. @@ -782,7 +782,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', 'persistent': False, 'mode': 'COPY', 'store': store, - 'bucket_sub_path': f'job-{run_id}/local-file-mounts/{i}', + '_bucket_sub_path': f'job-{run_id}/local-file-mounts/{i}', }) logger.info(f' {colorama.Style.DIM}Folder : {src!r} ' f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}') @@ -809,7 +809,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', 'persistent': False, 'mode': 'MOUNT', 'store': store, - 'bucket_sub_path': f'job-{run_id}/tmp-files', + '_bucket_sub_path': f'job-{run_id}/tmp-files', }) if file_mount_remote_tmp_dir in original_storage_mounts: with ux_utils.print_exception_no_traceback(): diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 2b1c62bb9d4..f4013a4a67f 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -300,7 +300,7 @@ def get_storage_schema(): mode.value for mode in storage.StorageMode ] }, - 'bucket_sub_path': { + '_bucket_sub_path': { 'type': 'string', }, '_force_delete': { diff --git a/tests/test_yamls/intermediate_bucket.yaml b/tests/test_yamls/intermediate_bucket.yaml index ebaffad0acc..f795e794dfd 100644 --- a/tests/test_yamls/intermediate_bucket.yaml +++ b/tests/test_yamls/intermediate_bucket.yaml @@ -1,7 +1,7 @@ name: intermediate-bucket file_mounts: - file_mounts_dir: . + /file_mounts_dir: . workdir: . @@ -10,5 +10,8 @@ setup: | echo "running setup" run: | - conda env list + echo "listing workdir" + ls . 
+ echo "listing file_mounts_dir" + ls /file_mounts_dir echo "task run finish" From 0750900ea090da19820aa848b992b86bbc3f558a Mon Sep 17 00:00:00 2001 From: zepingguo Date: Tue, 12 Nov 2024 14:50:07 +0800 Subject: [PATCH 16/48] support copy mount for sub dir --- sky/data/storage.py | 26 ++++++++++++++++++-------- sky/task.py | 10 ++++++++++ 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index efdb8a370fc..a5f1c003b60 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -287,14 +287,19 @@ def __init__(self, self.is_sky_managed = is_sky_managed self.sync_on_reconstruction = sync_on_reconstruction - if bucket_sub_path is not None: - self._bucket_sub_path: Optional[str] = bucket_sub_path.strip('/') - else: - self._bucket_sub_path = None + # To avoid mypy error + self._bucket_sub_path: Optional[str] = None + self.set_bucket_sub_path(bucket_sub_path) # Whether sky is responsible for the lifecycle of the Store. self._validate() self.initialize() + def set_bucket_sub_path(self, bucket_sub_path: Optional[str]) -> None: + if bucket_sub_path is not None: + self._bucket_sub_path = bucket_sub_path.strip('/') + else: + self._bucket_sub_path = None + @classmethod def from_metadata(cls, metadata: StoreMetadata, **override_args): """Create a Store from a StoreMetadata object. @@ -601,6 +606,12 @@ def __init__(self, elif self.source.startswith('cos://'): self.add_store(StoreType.IBM) + def get_bucket_sub_path_prefix(self, blob_path: str) -> str: + """Adds the bucket sub path prefix to the blob path.""" + if self._bucket_sub_path is not None: + return f'{blob_path}/{self._bucket_sub_path}' + return blob_path + @staticmethod def _validate_source( source: SourceType, mode: StorageMode, @@ -855,10 +866,7 @@ def _add_store_from_metadata( continue # This one can't be retrieved from metadata since its set every time # we create a new storage object. - # This private member setting against coding style guide, but - # we want to keep it private member for internal usage only. 
- # pylint: disable=protected-access - store._bucket_sub_path = self._bucket_sub_path + store.set_bucket_sub_path(self._bucket_sub_path) self._add_store(store, is_reconstructed=True) @classmethod @@ -1121,6 +1129,8 @@ def add_if_not_none(key: str, value: Optional[Any]): add_if_not_none('mode', self.mode.value) if self.force_delete: config['_force_delete'] = True + if self._bucket_sub_path is not None: + config['_bucket_sub_path'] = self._bucket_sub_path return config diff --git a/sky/task.py b/sky/task.py index cebc616dc6d..559da68f4b9 100644 --- a/sky/task.py +++ b/sky/task.py @@ -977,6 +977,8 @@ def sync_storage_mounts(self) -> None: else: assert storage.name is not None, storage blob_path = 's3://' + storage.name + blob_path = storage.get_bucket_sub_path_prefix( + blob_path) self.update_file_mounts({ mnt_path: blob_path, }) @@ -987,6 +989,8 @@ def sync_storage_mounts(self) -> None: else: assert storage.name is not None, storage blob_path = 'gs://' + storage.name + blob_path = storage.get_bucket_sub_path_prefix( + blob_path) self.update_file_mounts({ mnt_path: blob_path, }) @@ -1005,6 +1009,8 @@ def sync_storage_mounts(self) -> None: blob_path = data_utils.AZURE_CONTAINER_URL.format( storage_account_name=storage_account_name, container_name=storage.name) + blob_path = storage.get_bucket_sub_path_prefix( + blob_path) self.update_file_mounts({ mnt_path: blob_path, }) @@ -1015,6 +1021,8 @@ def sync_storage_mounts(self) -> None: blob_path = storage.source else: blob_path = 'r2://' + storage.name + blob_path = storage.get_bucket_sub_path_prefix( + blob_path) self.update_file_mounts({ mnt_path: blob_path, }) @@ -1030,6 +1038,8 @@ def sync_storage_mounts(self) -> None: cos_region = data_utils.Rclone.get_region_from_rclone( storage.name, data_utils.Rclone.RcloneClouds.IBM) blob_path = f'cos://{cos_region}/{storage.name}' + blob_path = storage.get_bucket_sub_path_prefix( + blob_path) self.update_file_mounts({mnt_path: blob_path}) else: with ux_utils.print_exception_no_traceback(): From 51dfcd33dd3c70381ca8218bf4613b7f5c0a1c34 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Tue, 12 Nov 2024 17:38:57 +0800 Subject: [PATCH 17/48] support gcs, s3 delete folder --- sky/backends/cloud_vm_ray_backend.py | 2 +- sky/data/storage.py | 160 +++++++++++++++++++++------ 2 files changed, 130 insertions(+), 32 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 0013e6cbaf9..8ab64ddd4df 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3505,7 +3505,7 @@ def _teardown_ephemeral_storage(self, task: task_lib.Task) -> None: if storage_mounts is not None: for _, storage in storage_mounts.items(): if not storage.persistent: - storage.delete() + storage.delete(only_delete_sub_path_if_exists=True) def _teardown(self, handle: CloudVmRayResourceHandle, diff --git a/sky/data/storage.py b/sky/data/storage.py index a5f1c003b60..125d4927812 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -294,6 +294,9 @@ def __init__(self, self._validate() self.initialize() + def get_bucket_sub_path(self) -> Optional[str]: + return self._bucket_sub_path + def set_bucket_sub_path(self, bucket_sub_path: Optional[str]) -> None: if bucket_sub_path is not None: self._bucket_sub_path = bucket_sub_path.strip('/') @@ -347,7 +350,10 @@ def upload(self) -> None: raise NotImplementedError def delete(self) -> None: - """Removes the Storage object from the cloud.""" + """Removes the Storage from the cloud.""" + raise NotImplementedError + + 
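    # Unlike delete(), this removes only the objects under the store's
    # _bucket_sub_path prefix and leaves the bucket itself in place; it is
    # invoked when tearing down ephemeral per-job prefixes in a shared bucket.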
def remove_objects_from_sub_path(self) -> None: raise NotImplementedError def get_handle(self) -> StorageHandle: @@ -990,7 +996,9 @@ def _add_store(self, store: AbstractStore, is_reconstructed: bool = False): global_user_state.add_or_update_storage(self.name, self.handle, StorageStatus.INIT) - def delete(self, store_type: Optional[StoreType] = None) -> None: + def delete(self, + store_type: Optional[StoreType] = None, + only_delete_sub_path_if_exists: bool = False) -> None: """Deletes data for all sky-managed storage objects. If a storage is not managed by sky, it is not deleted from the cloud. @@ -999,12 +1007,22 @@ def delete(self, store_type: Optional[StoreType] = None) -> None: Args: store_type: StoreType; Specific cloud store to remove from the list of backing stores. + only_delete_sub_path_if_exists: bool; Whether to delete only the + bucket sub path instead of the whole bucket if bucket sub path + is set. """ - if not self.stores: + if not self.stores and not only_delete_sub_path_if_exists: logger.info('No backing stores found. Deleting storage.') global_user_state.remove_storage(self.name) if store_type: store = self.stores[store_type] + # We delete the bucket sub path if it exists, and then return. + # Without interfering with the global state. + # User should still call storage.delete() to remove the bucket. + if only_delete_sub_path_if_exists and store.get_bucket_sub_path(): + store.remove_objects_from_sub_path() + return + is_sky_managed = store.is_sky_managed # We delete a store from the cloud if it's sky managed. Else just # remove handle and return @@ -1024,15 +1042,24 @@ def delete(self, store_type: Optional[StoreType] = None) -> None: # Remove store from bookkeeping del self.stores[store_type] else: - for _, store in self.stores.items(): + keys_to_delete = [] + for key, store in self.stores.items(): + if only_delete_sub_path_if_exists and store.get_bucket_sub_path( + ): + store.remove_objects_from_sub_path() + continue + if store.is_sky_managed: self.handle.remove_store(store) store.delete() elif self.force_delete: store.delete() - self.stores = {} - # Remove storage from global_user_state if present - global_user_state.remove_storage(self.name) + keys_to_delete.append(key) + for key in keys_to_delete: + del self.stores[key] + if len(self.stores) == 0: + # Remove storage from global_user_state if present + global_user_state.remove_storage(self.name) def sync_all_stores(self): """Syncs the source and destinations of all stores in the Storage""" @@ -1319,6 +1346,19 @@ def delete(self) -> None: logger.info(f'{colorama.Fore.GREEN}{msg_str}' f'{colorama.Style.RESET_ALL}') + def remove_objects_from_sub_path(self) -> None: + assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' + deleted_by_skypilot = self._delete_s3_bucket_sub_path( + self.name, self._bucket_sub_path) + if deleted_by_skypilot: + msg_str = f'Removed objects from S3 bucket ' \ + f'{self.name}/{self._bucket_sub_path}.' + else: + msg_str = f'Failed to remove objects from S3 bucket ' \ + f'{self.name}/{self._bucket_sub_path}.' 
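        # _delete_s3_bucket_sub_path() returns False only when the bucket
        # itself no longer exists (NoSuchBucket); any other failure raises
        # StorageBucketDeleteError before reaching this log line.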
+ logger.info(f'{colorama.Fore.GREEN}{msg_str}' + f'{colorama.Style.RESET_ALL}') + def get_handle(self) -> StorageHandle: return aws.resource('s3').Bucket(self.name) @@ -1532,6 +1572,27 @@ def _create_s3_bucket(self, ) from e return aws.resource('s3').Bucket(bucket_name) + def _execute_s3_remove_command(self, command: str, bucket_name: str, + hint_operating: str, + hint_failed: str) -> bool: + try: + with rich_utils.safe_status( + ux_utils.spinner_message(hint_operating)): + subprocess.check_output(command.split(' '), + stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + if 'NoSuchBucket' in e.output.decode('utf-8'): + logger.debug( + _BUCKET_EXTERNALLY_DELETED_DEBUG_MESSAGE.format( + bucket_name=bucket_name)) + return False + else: + with ux_utils.print_exception_no_traceback(): + raise exceptions.StorageBucketDeleteError( + f'{hint_failed}' + f'Detailed error: {e.output}') + return True + def _delete_s3_bucket(self, bucket_name: str) -> bool: """Deletes S3 bucket, including all objects in bucket @@ -1549,29 +1610,28 @@ def _delete_s3_bucket(self, bucket_name: str) -> bool: # The fastest way to delete is to run `aws s3 rb --force`, # which removes the bucket by force. remove_command = f'aws s3 rb s3://{bucket_name} --force' - try: - with rich_utils.safe_status( - ux_utils.spinner_message( - f'Deleting S3 bucket [green]{bucket_name}')): - subprocess.check_output(remove_command.split(' '), - stderr=subprocess.STDOUT) - except subprocess.CalledProcessError as e: - if 'NoSuchBucket' in e.output.decode('utf-8'): - logger.debug( - _BUCKET_EXTERNALLY_DELETED_DEBUG_MESSAGE.format( - bucket_name=bucket_name)) - return False - else: - with ux_utils.print_exception_no_traceback(): - raise exceptions.StorageBucketDeleteError( - f'Failed to delete S3 bucket {bucket_name}.' - f'Detailed error: {e.output}') + success = self._execute_s3_remove_command( + remove_command, bucket_name, + f'Deleting S3 bucket [green]{bucket_name}', + f'Failed to delete S3 bucket {bucket_name}.') + if not success: + return False # Wait until bucket deletion propagates on AWS servers while data_utils.verify_s3_bucket(bucket_name): time.sleep(0.1) return True + def _delete_s3_bucket_sub_path(self, bucket_name: str, + sub_path: str) -> bool: + """Deletes the sub path from the bucket.""" + remove_command = f'aws s3 rm s3://{bucket_name}/{sub_path}/ --recursive' + return self._execute_s3_remove_command( + remove_command, bucket_name, + f'Removing objects from S3 bucket [green]{bucket_name}/{sub_path}', + f'Failed to remove objects from S3 bucket {bucket_name}/{sub_path}.' + ) + class GcsStore(AbstractStore): """GcsStore inherits from Storage Object and represents the backend @@ -1757,6 +1817,19 @@ def delete(self) -> None: logger.info(f'{colorama.Fore.GREEN}{msg_str}' f'{colorama.Style.RESET_ALL}') + def remove_objects_from_sub_path(self) -> None: + assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' + deleted_by_skypilot = self._delete_gcs_bucket(self.name, + self._bucket_sub_path) + if deleted_by_skypilot: + msg_str = f'Deleted objects in GCS bucket ' \ + f'{self.name}/{self._bucket_sub_path}.' + else: + msg_str = f'GCS bucket {self.name} may have ' \ + 'been deleted externally.' 
+ logger.info(f'{colorama.Fore.GREEN}{msg_str}' + f'{colorama.Style.RESET_ALL}') + def get_handle(self) -> StorageHandle: return self.client.get_bucket(self.name) @@ -1983,19 +2056,30 @@ def _create_gcs_bucket(self, f'{new_bucket.storage_class}{colorama.Style.RESET_ALL}') return new_bucket - def _delete_gcs_bucket(self, bucket_name: str) -> bool: - """Deletes GCS bucket, including all objects in bucket + def _delete_gcs_bucket(self, + bucket_name: str, + bucket_sub_path: Optional[str] = None) -> bool: + """Deletes objects in GCS bucket Args: bucket_name: str; Name of bucket + bucket_sub_path: str; Sub path in the bucket, if provided only objects + in the sub path will be deleted, else the whole bucket will be + deleted Returns: bool; True if bucket was deleted, False if it was deleted externally. """ - + if bucket_sub_path is not None: + command_suffix = f'/{bucket_sub_path}' + hint_text = 'objects in ' + else: + command_suffix = '' + hint_text = '' with rich_utils.safe_status( ux_utils.spinner_message( - f'Deleting GCS bucket [green]{bucket_name}')): + f'Deleting {hint_text}GCS bucket ' + f'[green]{bucket_name}{command_suffix}')): try: self.client.get_bucket(bucket_name) except gcp.forbidden_exception() as e: @@ -2013,8 +2097,9 @@ def _delete_gcs_bucket(self, bucket_name: str) -> bool: return False try: gsutil_alias, alias_gen = data_utils.get_gsutil_command() - remove_obj_command = (f'{alias_gen};{gsutil_alias} ' - f'rm -r gs://{bucket_name}') + remove_obj_command = ( + f'{alias_gen};{gsutil_alias} ' + f'rm -r gs://{bucket_name}{command_suffix}') subprocess.check_output(remove_obj_command, stderr=subprocess.STDOUT, shell=True, @@ -2023,7 +2108,8 @@ def _delete_gcs_bucket(self, bucket_name: str) -> bool: except subprocess.CalledProcessError as e: with ux_utils.print_exception_no_traceback(): raise exceptions.StorageBucketDeleteError( - f'Failed to delete GCS bucket {bucket_name}.' + f'Failed to delete {hint_text}GCS bucket ' + f'{bucket_name}{command_suffix}.' f'Detailed error: {e.output}') @@ -2530,6 +2616,10 @@ def delete(self) -> None: logger.info(f'{colorama.Fore.GREEN}{msg_str}' f'{colorama.Style.RESET_ALL}') + def remove_objects_from_sub_path(self) -> None: + assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' + raise NotImplementedError('Not implemented') + def get_handle(self) -> StorageHandle: """Returns the Storage Handle object.""" return self.storage_client.blob_containers.get( @@ -2938,6 +3028,10 @@ def delete(self) -> None: logger.info(f'{colorama.Fore.GREEN}{msg_str}' f'{colorama.Style.RESET_ALL}') + def remove_objects_from_sub_path(self) -> None: + assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' + raise NotImplementedError('Not implemented') + def get_handle(self) -> StorageHandle: return cloudflare.resource('s3').Bucket(self.name) @@ -3368,6 +3462,10 @@ def delete(self) -> None: logger.info(f'{colorama.Fore.GREEN}Deleted COS bucket {self.name}.' 
f'{colorama.Style.RESET_ALL}') + def remove_objects_from_sub_path(self) -> None: + assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' + raise NotImplementedError('Not implemented') + def get_handle(self) -> StorageHandle: return self.s3_resource.Bucket(self.name) From 6ba05cc6c1e539152914e1ee3eec11f67f6de89e Mon Sep 17 00:00:00 2001 From: zepingguo Date: Tue, 12 Nov 2024 17:46:05 +0800 Subject: [PATCH 18/48] doc --- docs/source/reference/config.rst | 4 ++++ sky/utils/schemas.py | 14 ++++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index b8255b46402..59e7e28cc3f 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -24,6 +24,10 @@ Available fields and semantics: # # Ref: https://skypilot.readthedocs.io/en/latest/examples/managed-jobs.html#customizing-job-controller-resources jobs: + # Bucket to store managed jobs mount files and tmp files. + # Its optional, if not set, SkyPilot will create a new bucket for each managed jobs . + # Support https|s3|gs|r2|cos + bucket: s3://sky-bucket-with-permission controller: resources: # same spec as 'resources' in a task YAML cloud: gcp diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index f4013a4a67f..b4ddbbbeea9 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -3,7 +3,6 @@ Schemas conform to the JSON Schema specification as defined at https://json-schema.org/ """ -import copy import enum from typing import Any, Dict, List, Tuple @@ -709,14 +708,13 @@ def get_config_schema(): 'resources': resources_schema, } }, + 'bucket': { + 'type': 'string', + 'pattern': '^(https|s3|gs|r2|cos)://.+', + 'required': [], + } } } - jobs_configs = copy.deepcopy(controller_resources_schema) - jobs_configs['properties']['bucket'] = { - 'type': 'string', - 'pattern': '^(https|s3|gs|r2|cos)://.+', - 'required': [] - } cloud_configs = { 'aws': { 'type': 'object', @@ -937,7 +935,7 @@ def get_config_schema(): 'required': [], 'additionalProperties': False, 'properties': { - 'jobs': jobs_configs, + 'jobs': controller_resources_schema, 'spot': controller_resources_schema, 'serve': controller_resources_schema, 'allowed_clouds': allowed_clouds, From 1751fab549426c8a868ec1b01dece640fbc03306 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Tue, 12 Nov 2024 18:20:51 +0800 Subject: [PATCH 19/48] r2 remove_objects_from_sub_path --- sky/data/storage.py | 72 +++++++++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 19 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index 125d4927812..b3e680fb1e3 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -3030,7 +3030,16 @@ def delete(self) -> None: def remove_objects_from_sub_path(self) -> None: assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' - raise NotImplementedError('Not implemented') + deleted_by_skypilot = self._delete_r2_bucket_sub_path( + self.name, self._bucket_sub_path) + if deleted_by_skypilot: + msg_str = f'Removed objects from R2 bucket ' \ + f'{self.name}/{self._bucket_sub_path}.' + else: + msg_str = f'Failed to remove objects from R2 bucket ' \ + f'{self.name}/{self._bucket_sub_path}.' 
+ logger.info(f'{colorama.Fore.GREEN}{msg_str}' + f'{colorama.Style.RESET_ALL}') def get_handle(self) -> StorageHandle: return cloudflare.resource('s3').Bucket(self.name) @@ -3250,6 +3259,43 @@ def _create_r2_bucket(self, f'{self.name} but failed.') from e return cloudflare.resource('s3').Bucket(bucket_name) + def _execute_r2_remove_command(self, command: str, bucket_name: str, + hint_operating: str, + hint_failed: str) -> bool: + try: + with rich_utils.safe_status( + ux_utils.spinner_message(hint_operating)): + subprocess.check_output(command.split(' '), + stderr=subprocess.STDOUT, + shell=True) + except subprocess.CalledProcessError as e: + if 'NoSuchBucket' in e.output.decode('utf-8'): + logger.debug( + _BUCKET_EXTERNALLY_DELETED_DEBUG_MESSAGE.format( + bucket_name=bucket_name)) + return False + else: + with ux_utils.print_exception_no_traceback(): + raise exceptions.StorageBucketDeleteError( + f'{hint_failed}' + f'Detailed error: {e.output}') + return True + + def _delete_r2_bucket_sub_path(self, bucket_name: str, + sub_path: str) -> bool: + """Deletes the sub path from the bucket.""" + endpoint_url = cloudflare.create_endpoint() + remove_command = ( + f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} ' + f'aws s3 rm s3://{bucket_name}/{sub_path}/ --recursive ' + f'--endpoint {endpoint_url} ' + f'--profile={cloudflare.R2_PROFILE_NAME}') + return self._execute_r2_remove_command( + remove_command, bucket_name, + f'Removing objects from R2 bucket {bucket_name}/{sub_path}', + f'Failed to remove objects from R2 bucket {bucket_name}/{sub_path}.' + ) + def _delete_r2_bucket(self, bucket_name: str) -> bool: """Deletes R2 bucket, including all objects in bucket @@ -3272,24 +3318,12 @@ def _delete_r2_bucket(self, bucket_name: str) -> bool: f'aws s3 rb s3://{bucket_name} --force ' f'--endpoint {endpoint_url} ' f'--profile={cloudflare.R2_PROFILE_NAME}') - try: - with rich_utils.safe_status( - ux_utils.spinner_message( - f'Deleting R2 bucket {bucket_name}')): - subprocess.check_output(remove_command, - stderr=subprocess.STDOUT, - shell=True) - except subprocess.CalledProcessError as e: - if 'NoSuchBucket' in e.output.decode('utf-8'): - logger.debug( - _BUCKET_EXTERNALLY_DELETED_DEBUG_MESSAGE.format( - bucket_name=bucket_name)) - return False - else: - with ux_utils.print_exception_no_traceback(): - raise exceptions.StorageBucketDeleteError( - f'Failed to delete R2 bucket {bucket_name}.' 
- f'Detailed error: {e.output}') + + success = self._execute_r2_remove_command( + remove_command, bucket_name, f'Deleting R2 bucket {bucket_name}', + f'Failed to delete R2 bucket {bucket_name}.') + if not success: + return False # Wait until bucket deletion propagates on AWS servers while data_utils.verify_r2_bucket(bucket_name): From 3bb60c87a130bc9542849ae38aad5cdea486c391 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Wed, 13 Nov 2024 12:22:13 +0800 Subject: [PATCH 20/48] support azure remove directory and cos remove --- sky/data/storage.py | 55 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 8 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index b3e680fb1e3..caac953a5ca 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -2618,7 +2618,29 @@ def delete(self) -> None: def remove_objects_from_sub_path(self) -> None: assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' - raise NotImplementedError('Not implemented') + try: + container_url = data_utils.AZURE_CONTAINER_URL.format( + storage_account_name=self.storage_account_name, + container_name=self.name) + container_client = data_utils.create_az_client( + client_type='container', + container_url=container_url, + storage_account_name=self.storage_account_name, + resource_group_name=self.resource_group_name) + # List and delete blobs in the specified directory + blobs = container_client.list_blobs( + name_starts_with=self._bucket_sub_path + '/') + for blob in blobs: + container_client.delete_blob(blob.name) + logger.info( + f'Deleted objects from sub path {self._bucket_sub_path} ' + f'in container {self.name}.') + except Exception as e: # pylint: disable=broad-except + logger.error( + f'Failed to delete objects from sub path ' + f'{self._bucket_sub_path} in container {self.name}. 
' + f'Details: {common_utils.format_exception(e, use_bracket=True)}' + ) def get_handle(self) -> StorageHandle: """Returns the Storage Handle object.""" @@ -3498,7 +3520,12 @@ def delete(self) -> None: def remove_objects_from_sub_path(self) -> None: assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' - raise NotImplementedError('Not implemented') + bucket = self.s3_resource.Bucket(self.name) + try: + self._delete_cos_bucket_objects(bucket, self._bucket_sub_path + '/') + except ibm.ibm_botocore.exceptions.ClientError as e: + if e.__class__.__name__ == 'NoSuchBucket': + logger.debug('bucket already removed') def get_handle(self) -> StorageHandle: return self.s3_resource.Bucket(self.name) @@ -3740,15 +3767,27 @@ def _create_cos_bucket(self, return self.bucket - def _delete_cos_bucket(self): - bucket = self.s3_resource.Bucket(self.name) - try: - bucket_versioning = self.s3_resource.BucketVersioning(self.name) - if bucket_versioning.status == 'Enabled': + def _delete_cos_bucket_objects(self, + bucket: Any, + prefix: Optional[str] = None): + bucket_versioning = self.s3_resource.BucketVersioning(bucket.name) + if bucket_versioning.status == 'Enabled': + if prefix is not None: + res = list( + bucket.object_versions.filter(Prefix=prefix).delete()) + else: res = list(bucket.object_versions.delete()) + else: + if prefix is not None: + res = list(bucket.objects.filter(Prefix=prefix).delete()) else: res = list(bucket.objects.delete()) - logger.debug(f'Deleted bucket\'s content:\n{res}') + logger.debug(f'Deleted bucket\'s content:\n{res}, prefix: {prefix}') + + def _delete_cos_bucket(self): + bucket = self.s3_resource.Bucket(self.name) + try: + self._delete_cos_bucket_objects(bucket) bucket.delete() bucket.wait_until_not_exists() except ibm.ibm_botocore.exceptions.ClientError as e: From 415a0892f38414ceaf1c5289120abdbbfb573d0c Mon Sep 17 00:00:00 2001 From: zepingguo Date: Wed, 13 Nov 2024 12:26:07 +0800 Subject: [PATCH 21/48] doc string for remove_objects_from_sub_path --- sky/data/storage.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sky/data/storage.py b/sky/data/storage.py index caac953a5ca..a576e01683b 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -354,6 +354,7 @@ def delete(self) -> None: raise NotImplementedError def remove_objects_from_sub_path(self) -> None: + """Removes objects from the sub path in the bucket.""" raise NotImplementedError def get_handle(self) -> StorageHandle: From f0f1fe1b5b64f76fbb56c1b93fb5fc446ac47774 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Wed, 13 Nov 2024 15:12:48 +0800 Subject: [PATCH 22/48] fix sky jobs subdir issue --- sky/task.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/sky/task.py b/sky/task.py index 559da68f4b9..46a479aeaa5 100644 --- a/sky/task.py +++ b/sky/task.py @@ -977,8 +977,7 @@ def sync_storage_mounts(self) -> None: else: assert storage.name is not None, storage blob_path = 's3://' + storage.name - blob_path = storage.get_bucket_sub_path_prefix( - blob_path) + blob_path = storage.get_bucket_sub_path_prefix(blob_path) self.update_file_mounts({ mnt_path: blob_path, }) @@ -989,8 +988,7 @@ def sync_storage_mounts(self) -> None: else: assert storage.name is not None, storage blob_path = 'gs://' + storage.name - blob_path = storage.get_bucket_sub_path_prefix( - blob_path) + blob_path = storage.get_bucket_sub_path_prefix(blob_path) self.update_file_mounts({ mnt_path: blob_path, }) @@ -1009,8 +1007,7 @@ def sync_storage_mounts(self) -> None: blob_path = 
data_utils.AZURE_CONTAINER_URL.format( storage_account_name=storage_account_name, container_name=storage.name) - blob_path = storage.get_bucket_sub_path_prefix( - blob_path) + blob_path = storage.get_bucket_sub_path_prefix(blob_path) self.update_file_mounts({ mnt_path: blob_path, }) @@ -1021,8 +1018,7 @@ def sync_storage_mounts(self) -> None: blob_path = storage.source else: blob_path = 'r2://' + storage.name - blob_path = storage.get_bucket_sub_path_prefix( - blob_path) + blob_path = storage.get_bucket_sub_path_prefix(blob_path) self.update_file_mounts({ mnt_path: blob_path, }) @@ -1038,8 +1034,7 @@ def sync_storage_mounts(self) -> None: cos_region = data_utils.Rclone.get_region_from_rclone( storage.name, data_utils.Rclone.RcloneClouds.IBM) blob_path = f'cos://{cos_region}/{storage.name}' - blob_path = storage.get_bucket_sub_path_prefix( - blob_path) + blob_path = storage.get_bucket_sub_path_prefix(blob_path) self.update_file_mounts({mnt_path: blob_path}) else: with ux_utils.print_exception_no_traceback(): From 1a62d060a6ae980758254fd0005814682aff7abb Mon Sep 17 00:00:00 2001 From: zepingguo Date: Wed, 13 Nov 2024 16:00:55 +0800 Subject: [PATCH 23/48] test case update --- tests/test_smoke.py | 3 ++- tests/test_yamls/intermediate_bucket.yaml | 10 +++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 297392f0f0b..d922be6ac79 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -5783,8 +5783,9 @@ def test_intermediate_bucket(): [ '[ ! -f ~/.sky/config.yaml ] || mv ~/.sky/config.yaml ~/.sky/config.yaml.bak_intermediate_bucket_test', f'echo "jobs:\n bucket: \"s3://{bucket_name}\"" > ~/.sky/config.yaml', - f'sky jobs launch -n {name} tests/test_yamls/intermediate_bucket.yaml -y -d', + f'sky jobs launch -n {name} tests/test_yamls/intermediate_bucket.yaml -y', f'sky storage ls | grep {bucket_name}' # the bucket name is created + f'{_GET_JOB_QUEUE} | grep {name} | grep SUCCEEDED', '[ ! -f ~/.sky/config.yaml.bak_intermediate_bucket_test ] || mv ~/.sky/config.yaml.bak_intermediate_bucket_test ~/.sky/config.yaml' ], f'sky jobs cancel -y -n {name}', diff --git a/tests/test_yamls/intermediate_bucket.yaml b/tests/test_yamls/intermediate_bucket.yaml index f795e794dfd..fe9aafd0675 100644 --- a/tests/test_yamls/intermediate_bucket.yaml +++ b/tests/test_yamls/intermediate_bucket.yaml @@ -1,7 +1,9 @@ name: intermediate-bucket file_mounts: - /file_mounts_dir: . + /setup.py: ./setup.py + /sky: . + /train-00001-of-01024: gs://cloud-tpu-test-datasets/fake_imagenet/train-00001-of-01024 workdir: . @@ -12,6 +14,8 @@ setup: | run: | echo "listing workdir" ls . 
- echo "listing file_mounts_dir" - ls /file_mounts_dir + echo "listing file_mounts" + ls /setup.py + ls /sky + ls /train-00001-of-01024 echo "task run finish" From 79ea48ab879a824a23166ab8e639c6bb7824c32f Mon Sep 17 00:00:00 2001 From: zepingguo Date: Thu, 14 Nov 2024 14:29:22 +0800 Subject: [PATCH 24/48] rename to _bucket_sub_path --- sky/data/mounting_utils.py | 66 ++++++++------ sky/data/storage.py | 178 +++++++++++++++++++------------------ 2 files changed, 130 insertions(+), 114 deletions(-) diff --git a/sky/data/mounting_utils.py b/sky/data/mounting_utils.py index d110cf53b8a..74777dc21c1 100644 --- a/sky/data/mounting_utils.py +++ b/sky/data/mounting_utils.py @@ -30,17 +30,19 @@ def get_s3_mount_install_cmd() -> str: return install_cmd -def get_s3_mount_cmd(bucket_name: str, bucket_sub_path: Optional[str], - mount_path: str) -> str: +# pylint: disable=invalid-name +def get_s3_mount_cmd(bucket_name: str, + mount_path: str, + _bucket_sub_path: Optional[str] = None) -> str: """Returns a command to mount an S3 bucket using goofys.""" - if bucket_sub_path is None: - bucket_sub_path = '' + if _bucket_sub_path is None: + _bucket_sub_path = '' else: - bucket_sub_path = f':{bucket_sub_path}' + _bucket_sub_path = f':{_bucket_sub_path}' mount_cmd = ('goofys -o allow_other ' f'--stat-cache-ttl {_STAT_CACHE_TTL} ' f'--type-cache-ttl {_TYPE_CACHE_TTL} ' - f'{bucket_name}{bucket_sub_path} {mount_path}') + f'{bucket_name}{_bucket_sub_path} {mount_path}') return mount_cmd @@ -54,11 +56,13 @@ def get_gcs_mount_install_cmd() -> str: return install_cmd -def get_gcs_mount_cmd(bucket_name: str, bucket_sub_path: Optional[str], - mount_path: str) -> str: +# pylint: disable=invalid-name +def get_gcs_mount_cmd(bucket_name: str, + mount_path: str, + _bucket_sub_path: Optional[str] = None) -> str: """Returns a command to mount a GCS bucket using gcsfuse.""" - bucket_sub_path_arg = f'--only-dir {bucket_sub_path} '\ - if bucket_sub_path else '' + bucket_sub_path_arg = f'--only-dir {_bucket_sub_path} '\ + if _bucket_sub_path else '' mount_cmd = ('gcsfuse -o allow_other ' '--implicit-dirs ' f'--stat-cache-capacity {_STAT_CACHE_CAPACITY} ' @@ -86,20 +90,21 @@ def get_az_mount_install_cmd() -> str: return install_cmd +# pylint: disable=invalid-name def get_az_mount_cmd(container_name: str, - bucket_sub_path: Optional[str], storage_account_name: str, mount_path: str, - storage_account_key: Optional[str] = None) -> str: + storage_account_key: Optional[str] = None, + _bucket_sub_path: Optional[str] = None) -> str: """Returns a command to mount an AZ Container using blobfuse2. Args: container_name: Name of the mounting container. - bucket_sub_path: Sub path of the mounting container. storage_account_name: Name of the storage account the given container belongs to. mount_path: Path where the container will be mounting. storage_account_key: Access key for the given storage account. + _bucket_sub_path: Sub path of the mounting container. Returns: str: Command used to mount AZ container with blobfuse2. 
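With the sub-path parameter renamed to the optional '_bucket_sub_path' argument, call sites pass it by name. A hypothetical call against the signature introduced in this patch, using placeholder values:

from sky.data import mounting_utils

# Placeholder values; shown only to illustrate the renamed keyword argument.
cmd = mounting_utils.get_az_mount_cmd(
    container_name='my-container',
    storage_account_name='mystorageaccount',
    mount_path='/mnt/data',
    _bucket_sub_path='job-1/workdir',  # becomes: --subdirectory=job-1/workdir/
)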
@@ -116,10 +121,10 @@ def get_az_mount_cmd(container_name: str, cache_path = _BLOBFUSE_CACHE_DIR.format( storage_account_name=storage_account_name, container_name=container_name) - if bucket_sub_path is None: + if _bucket_sub_path is None: bucket_sub_path_arg = '' else: - bucket_sub_path_arg = f'--subdirectory={bucket_sub_path}/ ' + bucket_sub_path_arg = f'--subdirectory={_bucket_sub_path}/ ' mount_cmd = (f'AZURE_STORAGE_ACCOUNT={storage_account_name} ' f'{key_env_var} ' f'blobfuse2 {mount_path} --allow-other --no-symlinks ' @@ -130,20 +135,24 @@ def get_az_mount_cmd(container_name: str, return mount_cmd -def get_r2_mount_cmd(r2_credentials_path: str, r2_profile_name: str, - endpoint_url: str, bucket_name: str, - bucket_sub_path: Optional[str], mount_path: str) -> str: +# pylint: disable=invalid-name +def get_r2_mount_cmd(r2_credentials_path: str, + r2_profile_name: str, + endpoint_url: str, + bucket_name: str, + mount_path: str, + _bucket_sub_path: Optional[str] = None) -> str: """Returns a command to install R2 mount utility goofys.""" - if bucket_sub_path is None: - bucket_sub_path = '' + if _bucket_sub_path is None: + _bucket_sub_path = '' else: - bucket_sub_path = f':{bucket_sub_path}' + _bucket_sub_path = f':{_bucket_sub_path}' mount_cmd = (f'AWS_SHARED_CREDENTIALS_FILE={r2_credentials_path} ' f'AWS_PROFILE={r2_profile_name} goofys -o allow_other ' f'--stat-cache-ttl {_STAT_CACHE_TTL} ' f'--type-cache-ttl {_TYPE_CACHE_TTL} ' f'--endpoint {endpoint_url} ' - f'{bucket_name}{bucket_sub_path} {mount_path}') + f'{bucket_name}{_bucket_sub_path} {mount_path}') return mount_cmd @@ -155,9 +164,12 @@ def get_cos_mount_install_cmd() -> str: return install_cmd -def get_cos_mount_cmd(rclone_config_data: str, rclone_config_path: str, - bucket_rclone_profile: str, bucket_name: str, - bucket_sub_path: Optional[str], mount_path: str) -> str: +def get_cos_mount_cmd(rclone_config_data: str, + rclone_config_path: str, + bucket_rclone_profile: str, + bucket_name: str, + mount_path: str, + _bucket_sub_path: Optional[str] = None) -> str: """Returns a command to mount an IBM COS bucket using rclone.""" # creates a fusermount soft link on older (<22) Ubuntu systems for # rclone's mount utility. @@ -169,8 +181,8 @@ def get_cos_mount_cmd(rclone_config_data: str, rclone_config_path: str, 'mkdir -p ~/.config/rclone/ && ' f'echo "{rclone_config_data}" >> ' f'{rclone_config_path}') - if bucket_sub_path is None: - sub_path_arg = f'{bucket_name}/{bucket_name}' + if _bucket_sub_path is None: + sub_path_arg = f'{bucket_name}/{_bucket_sub_path}' else: sub_path_arg = f'/{bucket_name}' # --daemon will keep the mounting process running in the background. diff --git a/sky/data/storage.py b/sky/data/storage.py index a576e01683b..499f5b48d18 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -256,7 +256,7 @@ def __init__(self, region: Optional[str] = None, is_sky_managed: Optional[bool] = None, sync_on_reconstruction: Optional[bool] = True, - bucket_sub_path: Optional[str] = None): + _bucket_sub_path: Optional[str] = None): # pylint: disable=invalid-name """Initialize AbstractStore Args: @@ -270,8 +270,8 @@ def __init__(self, there. This is set to false when the Storage object is created not for direct use, e.g. for 'sky storage delete', or the storage is being re-used, e.g., for `sky start` on a stopped cluster. - bucket_sub_path: str; The prefix of the bucket directory to be - created in the store, e.g. 
if bucket_sub_path=my-dir, the files + _bucket_sub_path: str; The prefix of the bucket directory to be + created in the store, e.g. if _bucket_sub_path=my-dir, the files will be uploaded to s3:///my-dir/. This only works if source is a local directory. # TODO(zpoint): Add support for non-local source. @@ -289,7 +289,7 @@ def __init__(self, # To avoid mypy error self._bucket_sub_path: Optional[str] = None - self.set_bucket_sub_path(bucket_sub_path) + self.set_bucket_sub_path(_bucket_sub_path) # Whether sky is responsible for the lifecycle of the Store. self._validate() self.initialize() @@ -297,9 +297,10 @@ def __init__(self, def get_bucket_sub_path(self) -> Optional[str]: return self._bucket_sub_path - def set_bucket_sub_path(self, bucket_sub_path: Optional[str]) -> None: - if bucket_sub_path is not None: - self._bucket_sub_path = bucket_sub_path.strip('/') + # pylint: disable=invalid-name + def set_bucket_sub_path(self, _bucket_sub_path: Optional[str]) -> None: + if _bucket_sub_path is not None: + self._bucket_sub_path = _bucket_sub_path.strip('/') else: self._bucket_sub_path = None @@ -498,14 +499,17 @@ def remove_store(self, store: AbstractStore) -> None: if storetype in self.sky_stores: del self.sky_stores[storetype] - def __init__(self, - name: Optional[str] = None, - source: Optional[SourceType] = None, - stores: Optional[Dict[StoreType, AbstractStore]] = None, - persistent: Optional[bool] = True, - mode: StorageMode = StorageMode.MOUNT, - sync_on_reconstruction: bool = True, - bucket_sub_path: Optional[str] = None) -> None: + def __init__( + self, + name: Optional[str] = None, + source: Optional[SourceType] = None, + stores: Optional[Dict[StoreType, AbstractStore]] = None, + persistent: Optional[bool] = True, + mode: StorageMode = StorageMode.MOUNT, + sync_on_reconstruction: bool = True, + # pylint: disable=invalid-name + _bucket_sub_path: Optional[str] = None + ) -> None: """Initializes a Storage object. Three fields are required: the name of the storage, the source @@ -543,7 +547,7 @@ def __init__(self, there. This is set to false when the Storage object is created not for direct use, e.g. for 'sky storage delete', or the storage is being re-used, e.g., for `sky start` on a stopped cluster. - bucket_sub_path: Optional[str]; The subdirectory to use for the + _bucket_sub_path: Optional[str]; The subdirectory to use for the storage object. """ self.name: str @@ -552,7 +556,7 @@ def __init__(self, self.mode = mode assert mode in StorageMode self.sync_on_reconstruction = sync_on_reconstruction - self._bucket_sub_path = bucket_sub_path + self._bucket_sub_path = _bucket_sub_path # TODO(romilb, zhwu): This is a workaround to support storage deletion # for spot. Once sky storage supports forced management for external @@ -955,7 +959,7 @@ def add_store(self, source=self.source, region=region, sync_on_reconstruction=self.sync_on_reconstruction, - bucket_sub_path=self._bucket_sub_path) + _bucket_sub_path=self._bucket_sub_path) except exceptions.StorageBucketCreateError: # Creation failed, so this must be sky managed store. Add failure # to state. 
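The from_yaml_config() hunk below pops '_bucket_sub_path' out of the storage config before the public fields are parsed. For reference, the dict that the jobs controller assembles in the earlier controller_utils changes looks roughly like this sketch (bucket name, source path, and run id are invented):

# Sketch of the controller-generated storage config; all values are invented.
workdir_storage_config = {
    'name': 'skypilot-workdir-myuser-abc123',
    'source': '~/my-project',            # local workdir to upload
    'persistent': False,
    'mode': 'COPY',
    'store': 's3',
    '_bucket_sub_path': 'job-abc123/workdir',
}
# Storage.from_yaml_config(workdir_storage_config) pops the private key and
# threads it through to every store it adds, so uploads land under the prefix.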
@@ -1105,7 +1109,8 @@ def from_yaml_config(cls, config: Dict[str, Any]) -> 'Storage': store = config.pop('store', None) mode_str = config.pop('mode', None) force_delete = config.pop('_force_delete', None) - bucket_sub_path = config.pop('_bucket_sub_path', None) + # pylint: disable=invalid-name + _bucket_sub_path = config.pop('_bucket_sub_path', None) if force_delete is None: force_delete = False @@ -1126,7 +1131,7 @@ def from_yaml_config(cls, config: Dict[str, Any]) -> 'Storage': source=source, persistent=persistent, mode=mode, - bucket_sub_path=bucket_sub_path) + _bucket_sub_path=_bucket_sub_path) if store is not None: storage_obj.add_store(StoreType(store.upper())) @@ -1175,11 +1180,11 @@ def __init__(self, region: Optional[str] = 'us-east-2', is_sky_managed: Optional[bool] = None, sync_on_reconstruction: bool = True, - bucket_sub_path: Optional[str] = None): + _bucket_sub_path: Optional[str] = None): self.client: 'boto3.client.Client' self.bucket: 'StorageHandle' super().__init__(name, source, region, is_sky_managed, - sync_on_reconstruction, bucket_sub_path) + sync_on_reconstruction, _bucket_sub_path) def _validate(self): if self.source is not None and isinstance(self.source, str): @@ -1390,11 +1395,11 @@ def get_file_sync_command(base_dir_path, file_names): for file_name in file_names ]) base_dir_path = shlex.quote(base_dir_path) - bucket_sub_path = (f'/{self._bucket_sub_path}' - if self._bucket_sub_path else '') + sub_path = (f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') sync_command = ('aws s3 sync --no-follow-symlinks --exclude="*" ' f'{includes} {base_dir_path} ' - f's3://{self.name}{bucket_sub_path}') + f's3://{self.name}{sub_path}') return sync_command def get_dir_sync_command(src_dir_path, dest_dir_name): @@ -1406,12 +1411,11 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): for file_name in excluded_list ]) src_dir_path = shlex.quote(src_dir_path) - bucket_sub_path = (f'/{self._bucket_sub_path}' - if self._bucket_sub_path else '') - sync_command = ( - f'aws s3 sync --no-follow-symlinks {excludes} ' - f'{src_dir_path} ' - f's3://{self.name}{bucket_sub_path}/{dest_dir_name}') + sub_path = (f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') + sync_command = (f'aws s3 sync --no-follow-symlinks {excludes} ' + f'{src_dir_path} ' + f's3://{self.name}{sub_path}/{dest_dir_name}') return sync_command # Generate message for upload @@ -1522,8 +1526,8 @@ def mount_command(self, mount_path: str) -> str: """ install_cmd = mounting_utils.get_s3_mount_install_cmd() mount_cmd = mounting_utils.get_s3_mount_cmd(self.bucket.name, - self._bucket_sub_path, - mount_path) + mount_path, + self._bucket_sub_path) return mounting_utils.get_mounting_command(mount_path, install_cmd, mount_cmd) @@ -1647,11 +1651,11 @@ def __init__(self, region: Optional[str] = 'us-central1', is_sky_managed: Optional[bool] = None, sync_on_reconstruction: Optional[bool] = True, - bucket_sub_path: Optional[str] = None): + _bucket_sub_path: Optional[str] = None): self.client: 'storage.Client' self.bucket: StorageHandle super().__init__(name, source, region, is_sky_managed, - sync_on_reconstruction, bucket_sub_path) + sync_on_reconstruction, _bucket_sub_path) def _validate(self): if self.source is not None and isinstance(self.source, str): @@ -1898,12 +1902,11 @@ def get_file_sync_command(base_dir_path, file_names): sync_format = '|'.join(file_names) gsutil_alias, alias_gen = data_utils.get_gsutil_command() base_dir_path = shlex.quote(base_dir_path) - bucket_sub_path = 
(f'/{self._bucket_sub_path}' - if self._bucket_sub_path else '') - sync_command = ( - f'{alias_gen}; {gsutil_alias} ' - f'rsync -e -x \'^(?!{sync_format}$).*\' ' - f'{base_dir_path} gs://{self.name}{bucket_sub_path}') + sub_path = (f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') + sync_command = (f'{alias_gen}; {gsutil_alias} ' + f'rsync -e -x \'^(?!{sync_format}$).*\' ' + f'{base_dir_path} gs://{self.name}{sub_path}') return sync_command def get_dir_sync_command(src_dir_path, dest_dir_name): @@ -1913,12 +1916,11 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): excludes = '|'.join(excluded_list) gsutil_alias, alias_gen = data_utils.get_gsutil_command() src_dir_path = shlex.quote(src_dir_path) - bucket_sub_path = (f'/{self._bucket_sub_path}' - if self._bucket_sub_path else '') - sync_command = ( - f'{alias_gen}; {gsutil_alias} ' - f'rsync -e -r -x \'({excludes})\' {src_dir_path} ' - f'gs://{self.name}{bucket_sub_path}/{dest_dir_name}') + sub_path = (f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') + sync_command = (f'{alias_gen}; {gsutil_alias} ' + f'rsync -e -r -x \'({excludes})\' {src_dir_path} ' + f'gs://{self.name}{sub_path}/{dest_dir_name}') return sync_command # Generate message for upload @@ -2016,8 +2018,8 @@ def mount_command(self, mount_path: str) -> str: """ install_cmd = mounting_utils.get_gcs_mount_install_cmd() mount_cmd = mounting_utils.get_gcs_mount_cmd(self.bucket.name, - self._bucket_sub_path, - mount_path) + mount_path, + self._bucket_sub_path) version_check_cmd = ( f'gcsfuse --version | grep -q {mounting_utils.GCSFUSE_VERSION}') return mounting_utils.get_mounting_command(mount_path, install_cmd, @@ -2057,22 +2059,25 @@ def _create_gcs_bucket(self, f'{new_bucket.storage_class}{colorama.Style.RESET_ALL}') return new_bucket - def _delete_gcs_bucket(self, - bucket_name: str, - bucket_sub_path: Optional[str] = None) -> bool: + def _delete_gcs_bucket( + self, + bucket_name: str, + # pylint: disable=invalid-name + _bucket_sub_path: Optional[str] = None + ) -> bool: """Deletes objects in GCS bucket Args: bucket_name: str; Name of bucket - bucket_sub_path: str; Sub path in the bucket, if provided only objects - in the sub path will be deleted, else the whole bucket will be - deleted + _bucket_sub_path: str; Sub path in the bucket, if provided only + objects in the sub path will be deleted, else the whole bucket will + be deleted Returns: bool; True if bucket was deleted, False if it was deleted externally. 
""" - if bucket_sub_path is not None: - command_suffix = f'/{bucket_sub_path}' + if _bucket_sub_path is not None: + command_suffix = f'/{_bucket_sub_path}' hint_text = 'objects in ' else: command_suffix = '' @@ -2163,7 +2168,7 @@ def __init__(self, region: Optional[str] = 'eastus', is_sky_managed: Optional[bool] = None, sync_on_reconstruction: bool = True, - bucket_sub_path: Optional[str] = None): + _bucket_sub_path: Optional[str] = None): self.storage_client: 'storage.Client' self.resource_client: 'storage.Client' self.container_name: str @@ -2175,7 +2180,7 @@ def __init__(self, if region is None: region = 'eastus' super().__init__(name, source, region, is_sky_managed, - sync_on_reconstruction, bucket_sub_path) + sync_on_reconstruction, _bucket_sub_path) @classmethod def from_metadata(cls, metadata: AbstractStore.StoreMetadata, @@ -2836,10 +2841,10 @@ def mount_command(self, mount_path: str) -> str: """ install_cmd = mounting_utils.get_az_mount_install_cmd() mount_cmd = mounting_utils.get_az_mount_cmd(self.container_name, - self._bucket_sub_path, self.storage_account_name, mount_path, - self.storage_account_key) + self.storage_account_key, + self._bucket_sub_path) return mounting_utils.get_mounting_command(mount_path, install_cmd, mount_cmd) @@ -2939,11 +2944,11 @@ def __init__(self, region: Optional[str] = 'auto', is_sky_managed: Optional[bool] = None, sync_on_reconstruction: Optional[bool] = True, - bucket_sub_path: Optional[str] = None): + _bucket_sub_path: Optional[str] = None): self.client: 'boto3.client.Client' self.bucket: 'StorageHandle' super().__init__(name, source, region, is_sky_managed, - sync_on_reconstruction, bucket_sub_path) + sync_on_reconstruction, _bucket_sub_path) def _validate(self): if self.source is not None and isinstance(self.source, str): @@ -3095,13 +3100,13 @@ def get_file_sync_command(base_dir_path, file_names): ]) endpoint_url = cloudflare.create_endpoint() base_dir_path = shlex.quote(base_dir_path) - bucket_sub_path = (f'/{self._bucket_sub_path}' - if self._bucket_sub_path else '') + sub_path = (f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') sync_command = ('AWS_SHARED_CREDENTIALS_FILE=' f'{cloudflare.R2_CREDENTIALS_PATH} ' 'aws s3 sync --no-follow-symlinks --exclude="*" ' f'{includes} {base_dir_path} ' - f's3://{self.name}{bucket_sub_path} ' + f's3://{self.name}{sub_path} ' f'--endpoint {endpoint_url} ' f'--profile={cloudflare.R2_PROFILE_NAME}') return sync_command @@ -3116,16 +3121,15 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): ]) endpoint_url = cloudflare.create_endpoint() src_dir_path = shlex.quote(src_dir_path) - bucket_sub_path = (f'/{self._bucket_sub_path}' - if self._bucket_sub_path else '') - sync_command = ( - 'AWS_SHARED_CREDENTIALS_FILE=' - f'{cloudflare.R2_CREDENTIALS_PATH} ' - f'aws s3 sync --no-follow-symlinks {excludes} ' - f'{src_dir_path} ' - f's3://{self.name}{bucket_sub_path}/{dest_dir_name} ' - f'--endpoint {endpoint_url} ' - f'--profile={cloudflare.R2_PROFILE_NAME}') + sub_path = (f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') + sync_command = ('AWS_SHARED_CREDENTIALS_FILE=' + f'{cloudflare.R2_CREDENTIALS_PATH} ' + f'aws s3 sync --no-follow-symlinks {excludes} ' + f'{src_dir_path} ' + f's3://{self.name}{sub_path}/{dest_dir_name} ' + f'--endpoint {endpoint_url} ' + f'--profile={cloudflare.R2_PROFILE_NAME}') return sync_command # Generate message for upload @@ -3249,7 +3253,7 @@ def mount_command(self, mount_path: str) -> str: r2_profile_name = cloudflare.R2_PROFILE_NAME mount_cmd = 
mounting_utils.get_r2_mount_cmd( r2_credential_path, r2_profile_name, endpoint_url, self.bucket.name, - self._bucket_sub_path, mount_path) + mount_path, self._bucket_sub_path) return mounting_utils.get_mounting_command(mount_path, install_cmd, mount_cmd) @@ -3366,11 +3370,11 @@ def __init__(self, region: Optional[str] = 'us-east', is_sky_managed: Optional[bool] = None, sync_on_reconstruction: bool = True, - bucket_sub_path: Optional[str] = None): + _bucket_sub_path: Optional[str] = None): self.client: 'storage.Client' self.bucket: 'StorageHandle' super().__init__(name, source, region, is_sky_managed, - sync_on_reconstruction, bucket_sub_path) + sync_on_reconstruction, _bucket_sub_path) self.bucket_rclone_profile = \ Rclone.generate_rclone_bucket_profile_name( self.name, Rclone.RcloneClouds.IBM) @@ -3568,12 +3572,12 @@ def get_dir_sync_command(src_dir_path, dest_dir_name) -> str: # .git directory is excluded from the sync # wrapping src_dir_path with "" to support path with spaces src_dir_path = shlex.quote(src_dir_path) - bucket_sub_path = (f'/{self._bucket_sub_path}' - if self._bucket_sub_path else '') + sub_path = (f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') sync_command = ( 'rclone copy --exclude ".git/*" ' f'{src_dir_path} ' - f'{self.bucket_rclone_profile}:{self.name}{bucket_sub_path}' + f'{self.bucket_rclone_profile}:{self.name}{sub_path}' f'/{dest_dir_name}') return sync_command @@ -3600,12 +3604,12 @@ def get_file_sync_command(base_dir_path, file_names) -> str: for file_name in file_names ]) base_dir_path = shlex.quote(base_dir_path) - bucket_sub_path = (f'/{self._bucket_sub_path}' - if self._bucket_sub_path else '') + sub_path = (f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') sync_command = ( 'rclone copy ' f'{includes} {base_dir_path} ' - f'{self.bucket_rclone_profile}:{self.name}{bucket_sub_path}') + f'{self.bucket_rclone_profile}:{self.name}{sub_path}') return sync_command # Generate message for upload @@ -3729,8 +3733,8 @@ def mount_command(self, mount_path: str) -> str: Rclone.RCLONE_CONFIG_PATH, self.bucket_rclone_profile, self.bucket.name, - self._bucket_sub_path, - mount_path) + mount_path, + self._bucket_sub_path) return mounting_utils.get_mounting_command(mount_path, install_cmd, mount_cmd) From 8fd2c8c3843c9b0df24eebe04f1f4a747c4439e7 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Thu, 14 Nov 2024 15:52:14 +0800 Subject: [PATCH 25/48] change the config schema --- sky/data/storage.py | 39 +++++++---------------------------- sky/skylet/constants.py | 6 ++++++ sky/task.py | 14 +++++++++++++ sky/utils/controller_utils.py | 28 +++++++------------------ sky/utils/schemas.py | 24 +++++++++++++++++++-- 5 files changed, 58 insertions(+), 53 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index 499f5b48d18..0c29055c328 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -189,29 +189,6 @@ def get_endpoint_url(cls, store: 'AbstractStore', path: str) -> str: return bucket_endpoint_url -class StorePrefix(enum.Enum): - """Enum for the prefix of different stores.""" - S3 = 's3://' - GCS = 'gs://' - AZURE = 'https://' - R2 = 'r2://' - IBM = 'cos://' - - def to_store_type(self) -> StoreType: - if self == StorePrefix.S3: - return StoreType.S3 - elif self == StorePrefix.GCS: - return StoreType.GCS - elif self == StorePrefix.AZURE: - return StoreType.AZURE - elif self == StorePrefix.R2: - return StoreType.R2 - elif self == StorePrefix.IBM: - return StoreType.IBM - else: - raise ValueError(f'Unknown store prefix: {self}') - - 
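The URL-prefix mapping dropped with StorePrefix here resurfaces later in this series as StoreType.from_store_url. For orientation only, a condensed stand-alone approximation of that mapping (the helper name is hypothetical):

    PREFIX_TO_STORE = {
        's3://': 'S3',
        'gs://': 'GCS',
        'https://': 'AZURE',
        'r2://': 'R2',
        'cos://': 'IBM',
    }

    def store_from_url(url: str) -> str:
        # Pick the store type whose URL scheme prefixes the given URL.
        for prefix, store in PREFIX_TO_STORE.items():
            if url.startswith(prefix):
                return store
        raise ValueError(f'Unknown store URL: {url}')

    assert store_from_url('s3://my-bucket/some/path') == 'S3'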
class StorageMode(enum.Enum): MOUNT = 'MOUNT' COPY = 'COPY' @@ -354,7 +331,7 @@ def delete(self) -> None: """Removes the Storage from the cloud.""" raise NotImplementedError - def remove_objects_from_sub_path(self) -> None: + def delete_sub_path(self) -> None: """Removes objects from the sub path in the bucket.""" raise NotImplementedError @@ -1025,7 +1002,7 @@ def delete(self, # Without interfering with the global state. # User should still call storage.delete() to remove the bucket. if only_delete_sub_path_if_exists and store.get_bucket_sub_path(): - store.remove_objects_from_sub_path() + store.delete_sub_path() return is_sky_managed = store.is_sky_managed @@ -1051,7 +1028,7 @@ def delete(self, for key, store in self.stores.items(): if only_delete_sub_path_if_exists and store.get_bucket_sub_path( ): - store.remove_objects_from_sub_path() + store.delete_sub_path() continue if store.is_sky_managed: @@ -1352,7 +1329,7 @@ def delete(self) -> None: logger.info(f'{colorama.Fore.GREEN}{msg_str}' f'{colorama.Style.RESET_ALL}') - def remove_objects_from_sub_path(self) -> None: + def delete_sub_path(self) -> None: assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' deleted_by_skypilot = self._delete_s3_bucket_sub_path( self.name, self._bucket_sub_path) @@ -1822,7 +1799,7 @@ def delete(self) -> None: logger.info(f'{colorama.Fore.GREEN}{msg_str}' f'{colorama.Style.RESET_ALL}') - def remove_objects_from_sub_path(self) -> None: + def delete_sub_path(self) -> None: assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' deleted_by_skypilot = self._delete_gcs_bucket(self.name, self._bucket_sub_path) @@ -2622,7 +2599,7 @@ def delete(self) -> None: logger.info(f'{colorama.Fore.GREEN}{msg_str}' f'{colorama.Style.RESET_ALL}') - def remove_objects_from_sub_path(self) -> None: + def delete_sub_path(self) -> None: assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' try: container_url = data_utils.AZURE_CONTAINER_URL.format( @@ -3056,7 +3033,7 @@ def delete(self) -> None: logger.info(f'{colorama.Fore.GREEN}{msg_str}' f'{colorama.Style.RESET_ALL}') - def remove_objects_from_sub_path(self) -> None: + def delete_sub_path(self) -> None: assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' deleted_by_skypilot = self._delete_r2_bucket_sub_path( self.name, self._bucket_sub_path) @@ -3523,7 +3500,7 @@ def delete(self) -> None: logger.info(f'{colorama.Fore.GREEN}Deleted COS bucket {self.name}.' f'{colorama.Style.RESET_ALL}') - def remove_objects_from_sub_path(self) -> None: + def delete_sub_path(self) -> None: assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' bucket = self.s3_resource.Bucket(self.name) try: diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 0a297dc9f13..c9ae262d6a6 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -245,6 +245,12 @@ FILE_MOUNTS_LOCAL_TMP_DIR = 'skypilot-filemounts-files-{id}' FILE_MOUNTS_REMOTE_TMP_DIR = '/tmp/sky-{}-filemounts-files' +# Used when an managed jobs are created and +# files are synced up to the cloud. +FILE_MOUNTS_WORKDIR_SUBPATH = 'job-{run_id}/workdir' +FILE_MOUNTS_SUBPATH = 'job-{run_id}/local-file-mounts/{i}' +FILE_MOUNTS_TMP_SUBPATH = 'job-{run_id}/tmp-files' + # The default idle timeout for SkyPilot controllers. This include spot # controller and sky serve controller. # TODO(tian): Refactor to controller_utils. Current blocker: circular import. 
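The three sub-path templates added to constants.py above expand per job as shown in this small sketch; the run id value is made up:

    FILE_MOUNTS_WORKDIR_SUBPATH = 'job-{run_id}/workdir'
    FILE_MOUNTS_SUBPATH = 'job-{run_id}/local-file-mounts/{i}'
    FILE_MOUNTS_TMP_SUBPATH = 'job-{run_id}/tmp-files'

    run_id = 'ab12cd34'  # example 8-character run id
    print(FILE_MOUNTS_WORKDIR_SUBPATH.format(run_id=run_id))  # job-ab12cd34/workdir
    print(FILE_MOUNTS_SUBPATH.format(run_id=run_id, i=0))     # job-ab12cd34/local-file-mounts/0
    print(FILE_MOUNTS_TMP_SUBPATH.format(run_id=run_id))      # job-ab12cd34/tmp-files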
diff --git a/sky/task.py b/sky/task.py index 46a479aeaa5..657f6fddfb7 100644 --- a/sky/task.py +++ b/sky/task.py @@ -14,6 +14,7 @@ from sky import clouds from sky import exceptions from sky import sky_logging +from sky import skypilot_config import sky.dag from sky.data import data_utils from sky.data import storage as storage_lib @@ -903,6 +904,19 @@ def update_storage_mounts( task_storage_mounts.update(storage_mounts) return self.set_storage_mounts(task_storage_mounts) + def get_bucket_name_and_store_type_from_job_config( + self) -> Tuple[Optional[str], Optional[str]]: + """Returns the bucket name and store type from the job config.""" + bucket_dict = skypilot_config.get_nested(('jobs', 'bucket'), None) + store_type, _ = self._get_preferred_store() + if store_type.value.lower() in bucket_dict: + bucket_name = bucket_dict[store_type.value.lower()] + elif 'default' in bucket_dict: + bucket_name = bucket_dict['default'] + else: + return None, None + return bucket_name, store_type.value + def _get_preferred_store( self) -> Tuple[storage_lib.StoreType, Optional[str]]: """Returns the preferred store type and region for this task.""" diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 741684240ad..1003e579087 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -6,7 +6,7 @@ import os import tempfile import typing -from typing import Any, Dict, Iterable, List, Optional, Set, Tuple +from typing import Any, Dict, Iterable, List, Optional, Set import colorama @@ -676,21 +676,6 @@ def replace_skypilot_config_path_in_file_mounts( f'with the real path in file mounts: {file_mounts}') -def _get_bucket_name_and_store_type_from_job_config( -) -> Tuple[Optional[str], Optional[str]]: - bucket_wth_prefix = skypilot_config.get_nested(('jobs', 'bucket'), None) - if bucket_wth_prefix is None: - return None, None - - for prefix in storage_lib.StorePrefix: - if bucket_wth_prefix.startswith(prefix.value): - bucket_name = bucket_wth_prefix[len(prefix.value):] - store = prefix.to_store_type().value - return bucket_name, store - - raise ValueError(f'Invalid bucket name with prefix: {bucket_wth_prefix}') - - def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', path: str) -> None: """Translates local->VM mounts into Storage->VM, then syncs up any Storage. @@ -734,7 +719,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', # Get the bucket name for the workdir and file mounts, # we stores all these files in same bucket from config. - bucket_name, store = _get_bucket_name_and_store_type_from_job_config() + bucket_name, store = task.get_bucket_name_and_store_type_from_job_config() if bucket_name is None: bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format( username=common_utils.get_cleaned_username(), id=run_id) @@ -757,7 +742,8 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', 'persistent': False, 'mode': 'COPY', 'store': store, - '_bucket_sub_path': f'job-{run_id}/workdir', + '_bucket_sub_path': + constants.FILE_MOUNTS_WORKDIR_SUBPATH.format(run_id=run_id), }) # Check of the existence of the workdir in file_mounts is done in # the task construction. 
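The lookup performed by get_bucket_name_and_store_type_from_job_config above reduces to a two-level fallback: first a key matching the lower-cased StoreType value, then 'default'. A simplified, self-contained sketch of that fallback (it omits the preferred-store resolution the real method also performs):

    from typing import Dict, Optional

    def pick_job_bucket(bucket_cfg: Optional[Dict[str, str]],
                        store_type_value: str) -> Optional[str]:
        # Returns the configured bucket for this store, the 'default'
        # bucket, or None if the config section is absent.
        if bucket_cfg is None:
            return None
        return bucket_cfg.get(store_type_value.lower(),
                              bucket_cfg.get('default'))

    cfg = {'s3': 'bucket-jobs-s3', 'default': 'bucket-jobs-default'}
    assert pick_job_bucket(cfg, 'S3') == 'bucket-jobs-s3'
    assert pick_job_bucket(cfg, 'GCS') == 'bucket-jobs-default'
    assert pick_job_bucket(None, 'S3') is None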
@@ -782,7 +768,8 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', 'persistent': False, 'mode': 'COPY', 'store': store, - '_bucket_sub_path': f'job-{run_id}/local-file-mounts/{i}', + '_bucket_sub_path': constants.FILE_MOUNTS_SUBPATH.format( + i=i, run_id=run_id), }) logger.info(f' {colorama.Style.DIM}Folder : {src!r} ' f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}') @@ -809,7 +796,8 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', 'persistent': False, 'mode': 'MOUNT', 'store': store, - '_bucket_sub_path': f'job-{run_id}/tmp-files', + '_bucket_sub_path': + constants.FILE_MOUNTS_TMP_SUBPATH.format(run_id=run_id), }) if file_mount_remote_tmp_dir in original_storage_mounts: with ux_utils.print_exception_no_traceback(): diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index b4ddbbbeea9..3d5c52d559d 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -709,9 +709,29 @@ def get_config_schema(): } }, 'bucket': { - 'type': 'string', - 'pattern': '^(https|s3|gs|r2|cos)://.+', + 'type': 'object', 'required': [], + 'additionalProperties': False, + 'properties': { + 'aws': { + 'type': 'string', + }, + 'gcp': { + 'type': 'string', + }, + 'azure': { + 'type': 'string', + }, + 'r2': { + 'type': 'string', + }, + 'ibm': { + 'type': 'string', + }, + 'default': { + 'type': 'string', + } + } } } } From 7d57aef3164a8e7bdbdae76326a1edf610b87edd Mon Sep 17 00:00:00 2001 From: zepingguo Date: Thu, 14 Nov 2024 17:03:04 +0800 Subject: [PATCH 26/48] setter --- sky/data/storage.py | 22 +++++++++++++--------- sky/utils/schemas.py | 4 ++-- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index 0c29055c328..4a027b27634 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -266,18 +266,23 @@ def __init__(self, # To avoid mypy error self._bucket_sub_path: Optional[str] = None - self.set_bucket_sub_path(_bucket_sub_path) + # Trigger the setter to strip any leading/trailing slashes. + self.bucket_sub_path = self._bucket_sub_path # Whether sky is responsible for the lifecycle of the Store. self._validate() self.initialize() - def get_bucket_sub_path(self) -> Optional[str]: + @property + def bucket_sub_path(self) -> Optional[str]: + """Get the bucket_sub_path.""" return self._bucket_sub_path + @bucket_sub_path.setter # pylint: disable=invalid-name - def set_bucket_sub_path(self, _bucket_sub_path: Optional[str]) -> None: - if _bucket_sub_path is not None: - self._bucket_sub_path = _bucket_sub_path.strip('/') + def bucket_sub_path(self, bucket_sub_path: Optional[str]) -> None: + """Set the bucket_sub_path, stripping any leading/trailing slashes.""" + if bucket_sub_path is not None: + self._bucket_sub_path = bucket_sub_path.strip('/') else: self._bucket_sub_path = None @@ -854,7 +859,7 @@ def _add_store_from_metadata( continue # This one can't be retrieved from metadata since its set every time # we create a new storage object. - store.set_bucket_sub_path(self._bucket_sub_path) + store.bucket_sub_path = self._bucket_sub_path self._add_store(store, is_reconstructed=True) @classmethod @@ -1001,7 +1006,7 @@ def delete(self, # We delete the bucket sub path if it exists, and then return. # Without interfering with the global state. # User should still call storage.delete() to remove the bucket. 
- if only_delete_sub_path_if_exists and store.get_bucket_sub_path(): + if only_delete_sub_path_if_exists and store.bucket_sub_path: store.delete_sub_path() return @@ -1026,8 +1031,7 @@ def delete(self, else: keys_to_delete = [] for key, store in self.stores.items(): - if only_delete_sub_path_if_exists and store.get_bucket_sub_path( - ): + if only_delete_sub_path_if_exists and store.bucket_sub_path: store.delete_sub_path() continue diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 3d5c52d559d..875986ba66f 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -713,10 +713,10 @@ def get_config_schema(): 'required': [], 'additionalProperties': False, 'properties': { - 'aws': { + 's3': { 'type': 'string', }, - 'gcp': { + 'gcs': { 'type': 'string', }, 'azure': { From 5da18de35b180da01d3fc0ddc7c49162f6772044 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Thu, 14 Nov 2024 19:27:00 +0800 Subject: [PATCH 27/48] bug fix and test update --- sky/utils/controller_utils.py | 6 +- tests/test_smoke.py | 78 +++++++++---------- tests/test_yamls/use_intermediate_bucket.yaml | 5 ++ 3 files changed, 45 insertions(+), 44 deletions(-) create mode 100644 tests/test_yamls/use_intermediate_bucket.yaml diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 1003e579087..b13cc73ae07 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -776,6 +776,8 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', # Step 3: Translate local file mounts with file in src to SkyPilot storage. # Hard link the files in src to a temporary directory, and upload folder. + file_mounts_tmp_subpath = constants.FILE_MOUNTS_TMP_SUBPATH.format( + run_id=run_id) local_fm_path = os.path.join( tempfile.gettempdir(), constants.FILE_MOUNTS_LOCAL_TMP_DIR.format(id=run_id)) @@ -796,8 +798,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', 'persistent': False, 'mode': 'MOUNT', 'store': store, - '_bucket_sub_path': - constants.FILE_MOUNTS_TMP_SUBPATH.format(run_id=run_id), + '_bucket_sub_path': file_mounts_tmp_subpath, }) if file_mount_remote_tmp_dir in original_storage_mounts: with ux_utils.print_exception_no_traceback(): @@ -858,6 +859,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', store_object = storage_obj.stores[store_type] bucket_url = storage_lib.StoreType.get_endpoint_url( store_object, bucket_name) + bucket_url += f'/{file_mounts_tmp_subpath}' for dst, src in copy_mounts_with_file_in_src.items(): file_id = src_to_file_id[src] new_file_mounts[dst] = bucket_url + f'/file-{file_id}' diff --git a/tests/test_smoke.py b/tests/test_smoke.py index d922be6ac79..ce4ce2e1966 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3230,6 +3230,10 @@ def test_managed_jobs_storage(generic_cloud: str): storage_name = f'sky-test-{timestamp}' output_storage_name = f'sky-test-output-{timestamp}' + yaml_str_user_config = pathlib.Path( + 'tests/test_yamls/use_intermediate_bucket.yaml').read_text() + intermediate_storage_name = f'bucket-jobs-s3-{timestamp}' + # Also perform region testing for bucket creation to validate if buckets are # created in the correct region and correctly mounted in managed jobs. 
# However, we inject this testing only for AWS and GCP since they are the @@ -3284,28 +3288,38 @@ def test_managed_jobs_storage(generic_cloud: str): yaml_str = yaml_str.replace('sky-workdir-zhwu', storage_name) yaml_str = yaml_str.replace('sky-output-bucket', output_storage_name) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(yaml_str) - f.flush() - file_path = f.name - test = Test( - 'managed_jobs_storage', - [ - *STORAGE_SETUP_COMMANDS, - f'sky jobs launch -n {name}{use_spot} --cloud {generic_cloud}{region_flag} {file_path} -y', - region_validation_cmd, # Check if the bucket is created in the correct region - 'sleep 60', # Wait the spot queue to be updated - f'{_GET_JOB_QUEUE} | grep {name} | grep SUCCEEDED', - f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', - # Check if file was written to the mounted output bucket - output_check_cmd - ], - (f'sky jobs cancel -y -n {name}', - f'; sky storage delete {output_storage_name} || true'), - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. - timeout=20 * 60, - ) - run_one_test(test) + yaml_str_user_config = yaml_str_user_config.replace( + 'bucket-jobs-s3', intermediate_storage_name) + with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f_user_config: + f_user_config.write(yaml_str_user_config) + f_user_config.flush() + user_config_path = f_user_config.name + with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f_task: + f_task.write(yaml_str) + f_task.flush() + file_path = f_task.name + test = Test( + 'managed_jobs_storage', + [ + *STORAGE_SETUP_COMMANDS, + f'sky jobs launch -n {name}{use_spot} --cloud {generic_cloud}{region_flag} {file_path} -y', + region_validation_cmd, # Check if the bucket is created in the correct region + 'sleep 60', # Wait the spot queue to be updated + f'{_GET_JOB_QUEUE} | grep {name} | grep SUCCEEDED', + f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', + # check intermediate bucket exists + f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{intermediate_storage_name}\')].Name" --output text | wc -l) -eq 0 ]', + # Check if file was written to the mounted output bucket + output_check_cmd + ], + (f'sky jobs cancel -y -n {name}', + f'; sky storage delete {intermediate_storage_name}', + f'; sky storage delete {output_storage_name} || true'), + env={'SKYPILOT_CONFIG': user_config_path}, + # Increase timeout since sky jobs queue -r can be blocked by other spot tests. + timeout=20 * 60, + ) + run_one_test(test) # ---------- Testing spot TPU ---------- @@ -5772,23 +5786,3 @@ def test_kubernetes_context_failover(): env={'SKYPILOT_CONFIG': f.name}, ) run_one_test(test) - - -@pytest.mark.aws -def test_intermediate_bucket(): - name = _get_cluster_name() - bucket_name = f'sky-bucket-{int(time.time())}' - test = Test( - 'interm-resources', - [ - '[ ! -f ~/.sky/config.yaml ] || mv ~/.sky/config.yaml ~/.sky/config.yaml.bak_intermediate_bucket_test', - f'echo "jobs:\n bucket: \"s3://{bucket_name}\"" > ~/.sky/config.yaml', - f'sky jobs launch -n {name} tests/test_yamls/intermediate_bucket.yaml -y', - f'sky storage ls | grep {bucket_name}' # the bucket name is created - f'{_GET_JOB_QUEUE} | grep {name} | grep SUCCEEDED', - '[ ! 
-f ~/.sky/config.yaml.bak_intermediate_bucket_test ] || mv ~/.sky/config.yaml.bak_intermediate_bucket_test ~/.sky/config.yaml' - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - run_one_test(test) diff --git a/tests/test_yamls/use_intermediate_bucket.yaml b/tests/test_yamls/use_intermediate_bucket.yaml new file mode 100644 index 00000000000..4dbbbfa11fa --- /dev/null +++ b/tests/test_yamls/use_intermediate_bucket.yaml @@ -0,0 +1,5 @@ +jobs: + bucket: + s3: "bucket-jobs-s3" + gcs: "bucket-jobs-gcs" + default: "bucket-jobs-default" From 7c82fd7b4826e5194159eaab3a6bcd654d04eccb Mon Sep 17 00:00:00 2001 From: zepingguo Date: Fri, 15 Nov 2024 15:38:16 +0800 Subject: [PATCH 28/48] delete bucket depends on user config or sky generated --- sky/backends/cloud_vm_ray_backend.py | 10 +++++++++- sky/data/storage.py | 6 +++++- sky/task.py | 2 ++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 8ab64ddd4df..99c529adaf5 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3505,7 +3505,15 @@ def _teardown_ephemeral_storage(self, task: task_lib.Task) -> None: if storage_mounts is not None: for _, storage in storage_mounts.items(): if not storage.persistent: - storage.delete(only_delete_sub_path_if_exists=True) + is_bucket_name_generated_by_sky = ( + storage.is_bucket_name_auto_generated_by_sky()) + # If the bucket name is auto-generated by SkyPilot, we keep + # the original behaviour delete the bucket, otherwise, we + # only delete the sub-path if it exists because miltiple + # jobs might share the same bucket, delete bucket could + # potential cause other jobs to fail during file operation + storage.delete(only_delete_sub_path_if_exists= + not is_bucket_name_generated_by_sky) def _teardown(self, handle: CloudVmRayResourceHandle, diff --git a/sky/data/storage.py b/sky/data/storage.py index 4a027b27634..7f0a008ea7e 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -267,7 +267,7 @@ def __init__(self, # To avoid mypy error self._bucket_sub_path: Optional[str] = None # Trigger the setter to strip any leading/trailing slashes. - self.bucket_sub_path = self._bucket_sub_path + self.bucket_sub_path = _bucket_sub_path # Whether sky is responsible for the lifecycle of the Store. 
self._validate() self.initialize() @@ -983,6 +983,10 @@ def _add_store(self, store: AbstractStore, is_reconstructed: bool = False): global_user_state.add_or_update_storage(self.name, self.handle, StorageStatus.INIT) + def is_bucket_name_auto_generated_by_sky(self) -> bool: + return re.match(r'^skypilot-filemounts-.+-[a-z0-9]{8}$', + self.name) is not None + def delete(self, store_type: Optional[StoreType] = None, only_delete_sub_path_if_exists: bool = False) -> None: diff --git a/sky/task.py b/sky/task.py index 657f6fddfb7..aa93e842575 100644 --- a/sky/task.py +++ b/sky/task.py @@ -908,6 +908,8 @@ def get_bucket_name_and_store_type_from_job_config( self) -> Tuple[Optional[str], Optional[str]]: """Returns the bucket name and store type from the job config.""" bucket_dict = skypilot_config.get_nested(('jobs', 'bucket'), None) + if bucket_dict is None: + return None, None store_type, _ = self._get_preferred_store() if store_type.value.lower() in bucket_dict: bucket_name = bucket_dict[store_type.value.lower()] From cb2a574d1fbbb2ebdc7fcf7dec847f1c60ec91b7 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Fri, 15 Nov 2024 17:41:49 +0800 Subject: [PATCH 29/48] add test case --- docs/source/reference/config.rst | 11 ++- tests/test_smoke.py | 126 ++++++++++++++++++++++++++++++- 2 files changed, 129 insertions(+), 8 deletions(-) diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index 59e7e28cc3f..ab2b96277d6 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -24,10 +24,13 @@ Available fields and semantics: # # Ref: https://skypilot.readthedocs.io/en/latest/examples/managed-jobs.html#customizing-job-controller-resources jobs: - # Bucket to store managed jobs mount files and tmp files. - # Its optional, if not set, SkyPilot will create a new bucket for each managed jobs . - # Support https|s3|gs|r2|cos - bucket: s3://sky-bucket-with-permission + bucket: + # Bucket to store managed jobs mount files and tmp files. + # Its optional, if not set, SkyPilot will create a new bucket for each managed jobs . + # Support s3, gcs, azure, r2, ibm, default. 
+ s3: "bucket-jobs-s3" + gcs: "bucket-jobs-gcs" + default: "bucket-jobs-default" controller: resources: # same spec as 'resources' in a task YAML cloud: gcp diff --git a/tests/test_smoke.py b/tests/test_smoke.py index ce4ce2e1966..5dbc41a6eff 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3307,8 +3307,8 @@ def test_managed_jobs_storage(generic_cloud: str): 'sleep 60', # Wait the spot queue to be updated f'{_GET_JOB_QUEUE} | grep {name} | grep SUCCEEDED', f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', - # check intermediate bucket exists - f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{intermediate_storage_name}\')].Name" --output text | wc -l) -eq 0 ]', + # check intermediate bucket exists, it won't be deletd if its user specific + f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{intermediate_storage_name}\')].Name" --output text | wc -l) -eq 1 ]', # Check if file was written to the mounted output bucket output_check_cmd ], @@ -4803,6 +4803,12 @@ def tmp_source(self, tmp_path): circle_link.symlink_to(tmp_dir, target_is_directory=True) yield str(tmp_dir) + @pytest.fixture + def tmp_sub_path(self): + tmp_dir1 = uuid.uuid4().hex[:8] + tmp_dir2 = uuid.uuid4().hex[:8] + yield "/".join([tmp_dir1, tmp_dir2]) + @staticmethod def generate_bucket_name(): # Creates a temporary bucket name @@ -4822,13 +4828,15 @@ def yield_storage_object( stores: Optional[Dict[storage_lib.StoreType, storage_lib.AbstractStore]] = None, persistent: Optional[bool] = True, - mode: storage_lib.StorageMode = storage_lib.StorageMode.MOUNT): + mode: storage_lib.StorageMode = storage_lib.StorageMode.MOUNT, + _bucket_sub_path: Optional[str] = None): # Creates a temporary storage object. Stores must be added in the test. storage_obj = storage_lib.Storage(name=name, source=source, stores=stores, persistent=persistent, - mode=mode) + mode=mode, + _bucket_sub_path=_bucket_sub_path) yield storage_obj handle = global_user_state.get_handle_from_storage_name( storage_obj.name) @@ -4895,6 +4903,15 @@ def tmp_local_storage_obj(self, tmp_bucket_name, tmp_source): yield from self.yield_storage_object(name=tmp_bucket_name, source=tmp_source) + @pytest.fixture + def tmp_local_storage_obj_with_sub_path(self, tmp_bucket_name, tmp_source, + tmp_sub_path): + # Creates a temporary storage object with sub. Stores must be added in the test. + list_source = [tmp_source, tmp_source + '/tmp-file'] + yield from self.yield_storage_object(name=tmp_bucket_name, + source=list_source, + _bucket_sub_path=tmp_sub_path) + @pytest.fixture def tmp_local_list_storage_obj(self, tmp_bucket_name, tmp_source): # Creates a temp storage object which uses a list of paths as source. 
@@ -5053,6 +5070,107 @@ def test_new_bucket_creation_and_deletion(self, tmp_local_storage_obj, out = subprocess.check_output(['sky', 'storage', 'ls']) assert tmp_local_storage_obj.name not in out.decode('utf-8') + @pytest.mark.no_fluidstack + @pytest.mark.parametrize('store_type', [ + storage_lib.StoreType.S3, + pytest.param(storage_lib.StoreType.GCS, marks=pytest.mark.gcp), + pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), + pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), + pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) + ]) + def test_bucket_sub_path(self, tmp_local_storage_obj_with_sub_path, + store_type): + + def _list_all_files(): + if store_type == storage_lib.StoreType.S3: + # aws s3 ls command, list all files in bucket + cmd = f'aws s3 ls s3://{tmp_local_storage_obj_with_sub_path.name}/ --recursive' + out = subprocess.check_output(cmd, shell=True) + files = [ + line.split()[-1] + for line in out.decode('utf-8').splitlines() + ] + elif store_type == storage_lib.StoreType.GCS: + # gsutil ls command, list all files in bucket + cmd = f'gsutil ls "gs://{tmp_local_storage_obj_with_sub_path.name}/**"' + try: + out = subprocess.check_output(cmd, + shell=True, + stderr=subprocess.PIPE) + files = [ + line[5:] for line in out.decode('utf-8').splitlines() + ] + except subprocess.CalledProcessError as e: + error_output = e.stderr.decode('utf-8') + if "One or more URLs matched no objects" in error_output: + files = [] + else: + raise + elif store_type == storage_lib.StoreType.AZURE: + # az storage file list command, list all files in container + store = tmp_local_storage_obj_with_sub_path.stores[store_type] + container_url = data_utils.AZURE_CONTAINER_URL.format( + storage_account_name=store.storage_account_name, + container_name=store.name) + container_client = data_utils.create_az_client( + client_type='container', + container_url=container_url, + storage_account_name=store.storage_account_name, + resource_group_name=store.resource_group_name) + # List and delete blobs in the specified directory + blobs = container_client.list_blobs() + files = [blob.name for blob in blobs] + elif store_type == storage_lib.StoreType.IBM: + # ibm cos ls command, list all files in bucket + store = tmp_local_storage_obj_with_sub_path.stores[store_type] + bucket = store.s3_resource.Bucket(store.name) + files = [obj.key for obj in bucket.objects.all()] + elif store_type == storage_lib.StoreType.R2: + # r2 ls command, list all files in bucket + cmd = ( + f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls s3://{tmp_local_storage_obj_with_sub_path.name}/ ' + f'--recursive --endpoint {cloudflare.create_endpoint()} --profile=r2' + ) + out = subprocess.check_output(cmd, shell=True) + files = [ + line.split()[-1] + for line in out.decode('utf-8').splitlines() + ] + return files + + # Creates a new bucket with a local source, uploads files to it + # and deletes it. 
+ tmp_local_storage_obj_with_sub_path.add_store(store_type) + + # Check files under bucket and filter by prefix + files = _list_all_files() + assert len(files) > 0 + if store_type == storage_lib.StoreType.GCS: + assert all([ + file.startswith( + tmp_local_storage_obj_with_sub_path.name + '/' + + tmp_local_storage_obj_with_sub_path._bucket_sub_path) + for file in files + ]) + else: + assert all([ + file.startswith( + tmp_local_storage_obj_with_sub_path._bucket_sub_path) + for file in files + ]) + + # check bucket is empty, all files under sub directory should be deleted + tmp_local_storage_obj_with_sub_path.delete( + only_delete_sub_path_if_exists=True) + files = _list_all_files() + + tmp_local_storage_obj_with_sub_path.delete() + + # Run sky storage ls to check if storage object is deleted + out = subprocess.check_output(['sky', 'storage', 'ls']) + assert tmp_local_storage_obj_with_sub_path.name not in out.decode( + 'utf-8') + @pytest.mark.no_fluidstack @pytest.mark.xdist_group('multiple_bucket_deletion') @pytest.mark.parametrize('store_type', [ From df603175d53075c79cb8fe8f31f7a78e163da41a Mon Sep 17 00:00:00 2001 From: zepingguo Date: Fri, 15 Nov 2024 18:24:37 +0800 Subject: [PATCH 30/48] smoke test bug fix --- tests/test_smoke.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 5dbc41a6eff..1d555525db5 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -29,6 +29,7 @@ import json import os import pathlib +import re import shlex import shutil import subprocess @@ -3232,7 +3233,7 @@ def test_managed_jobs_storage(generic_cloud: str): yaml_str_user_config = pathlib.Path( 'tests/test_yamls/use_intermediate_bucket.yaml').read_text() - intermediate_storage_name = f'bucket-jobs-s3-{timestamp}' + intermediate_storage_name = f'bucket-jobs-intermediate-smoke-test-{timestamp}' # Also perform region testing for bucket creation to validate if buckets are # created in the correct region and correctly mounted in managed jobs. 
@@ -3288,8 +3289,9 @@ def test_managed_jobs_storage(generic_cloud: str): yaml_str = yaml_str.replace('sky-workdir-zhwu', storage_name) yaml_str = yaml_str.replace('sky-output-bucket', output_storage_name) - yaml_str_user_config = yaml_str_user_config.replace( - 'bucket-jobs-s3', intermediate_storage_name) + yaml_str_user_config = re.sub(r'bucket-jobs-[\w\d]+', + intermediate_storage_name, + yaml_str_user_config) with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f_user_config: f_user_config.write(yaml_str_user_config) f_user_config.flush() From a2efd8c4f924d74e5397601e457d13b0a831fbaa Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 10:20:49 +0800 Subject: [PATCH 31/48] robust smoke test --- tests/test_smoke.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 1d555525db5..472da40b1fc 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -5161,11 +5161,13 @@ def _list_all_files(): for file in files ]) - # check bucket is empty, all files under sub directory should be deleted + # Check bucket is empty, all files under sub directory should be deleted tmp_local_storage_obj_with_sub_path.delete( only_delete_sub_path_if_exists=True) files = _list_all_files() + assert len(files) == 0 + # Now, delete the entire bucket tmp_local_storage_obj_with_sub_path.delete() # Run sky storage ls to check if storage object is deleted From c43c70564a9b3db87987aefe01e92904dd985c5c Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Tue, 26 Nov 2024 19:26:50 +0800 Subject: [PATCH 32/48] fix comment --- docs/source/reference/config.rst | 11 ++-- sky/data/storage.py | 28 +++++++++ sky/task.py | 16 ----- sky/utils/controller_utils.py | 61 +++++++++++++++---- sky/utils/schemas.py | 24 +------- tests/test_smoke.py | 2 +- tests/test_yamls/use_intermediate_bucket.yaml | 5 -- .../use_intermediate_bucket_config.yaml | 2 + 8 files changed, 87 insertions(+), 62 deletions(-) delete mode 100644 tests/test_yamls/use_intermediate_bucket.yaml create mode 100644 tests/test_yamls/use_intermediate_bucket_config.yaml diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index ab2b96277d6..2d7307c386b 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -24,13 +24,10 @@ Available fields and semantics: # # Ref: https://skypilot.readthedocs.io/en/latest/examples/managed-jobs.html#customizing-job-controller-resources jobs: - bucket: - # Bucket to store managed jobs mount files and tmp files. - # Its optional, if not set, SkyPilot will create a new bucket for each managed jobs . - # Support s3, gcs, azure, r2, ibm, default. - s3: "bucket-jobs-s3" - gcs: "bucket-jobs-gcs" - default: "bucket-jobs-default" + # Bucket to store managed jobs mount files and tmp files. + # Its optional, if not set, SkyPilot will create a new bucket for each managed jobs. 
+ # Support s3://, gs://, https://.blob.core.windows.net/, r2://, cos:/// + bucket: s3://my-bucket/ controller: resources: # same spec as 'resources' in a task YAML cloud: gcp diff --git a/sky/data/storage.py b/sky/data/storage.py index 7f0a008ea7e..36f88e79dc8 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -188,6 +188,34 @@ def get_endpoint_url(cls, store: 'AbstractStore', path: str) -> str: bucket_endpoint_url = f'{store_type.store_prefix()}{path}' return bucket_endpoint_url + @classmethod + def from_store_url(cls, store_url: str) -> Tuple['StoreType', str, str]: + """Returns the store type, bucket name, and sub path from a store URL. + + Args: + store_url: str; The store URL. + """ + for store_type in StoreType: + if store_url.startswith(store_type.store_prefix()): + if store_type == StoreType.AZURE: + _, container_name, sub_path = data_utils.split_az_path( + store_url) + return store_type, container_name, sub_path + elif store_type == StoreType.IBM: + bucket_name, sub_path, _ = data_utils.split_cos_path( + store_url) + return store_type, bucket_name, sub_path + elif store_type == StoreType.R2: + bucket_name, sub_path = data_utils.split_r2_path(store_url) + return store_type, bucket_name, sub_path + elif store_type == StoreType.GCS: + bucket_name, sub_path = data_utils.split_gcs_path(store_url) + return store_type, bucket_name, sub_path + elif store_type == StoreType.S3: + bucket_name, sub_path = data_utils.split_s3_path(store_url) + return store_type, bucket_name, sub_path + raise ValueError(f'Unknown store URL: {store_url}') + class StorageMode(enum.Enum): MOUNT = 'MOUNT' diff --git a/sky/task.py b/sky/task.py index aa93e842575..46a479aeaa5 100644 --- a/sky/task.py +++ b/sky/task.py @@ -14,7 +14,6 @@ from sky import clouds from sky import exceptions from sky import sky_logging -from sky import skypilot_config import sky.dag from sky.data import data_utils from sky.data import storage as storage_lib @@ -904,21 +903,6 @@ def update_storage_mounts( task_storage_mounts.update(storage_mounts) return self.set_storage_mounts(task_storage_mounts) - def get_bucket_name_and_store_type_from_job_config( - self) -> Tuple[Optional[str], Optional[str]]: - """Returns the bucket name and store type from the job config.""" - bucket_dict = skypilot_config.get_nested(('jobs', 'bucket'), None) - if bucket_dict is None: - return None, None - store_type, _ = self._get_preferred_store() - if store_type.value.lower() in bucket_dict: - bucket_name = bucket_dict[store_type.value.lower()] - elif 'default' in bucket_dict: - bucket_name = bucket_dict['default'] - else: - return None, None - return bucket_name, store_type.value - def _get_preferred_store( self) -> Tuple[storage_lib.StoreType, Optional[str]]: """Returns the preferred store type and region for this task.""" diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index b13cc73ae07..2f553532278 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -687,10 +687,16 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', still sync up any storage mounts with local source paths (which do not undergo translation). """ + # ================================================================ # Translate the workdir and local file mounts to cloud file mounts. 
# ================================================================ + def _sub_path_join(sub_path: Optional[str], path: str) -> str: + if sub_path is None: + return path + return os.path.join(sub_path, path).strip('/') + run_id = common_utils.get_usage_run_id()[:8] original_file_mounts = task.file_mounts if task.file_mounts else {} original_storage_mounts = task.storage_mounts if task.storage_mounts else {} @@ -719,10 +725,14 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', # Get the bucket name for the workdir and file mounts, # we stores all these files in same bucket from config. - bucket_name, store = task.get_bucket_name_and_store_type_from_job_config() - if bucket_name is None: + bucket_wth_prefix = skypilot_config.get_nested(('jobs', 'bucket'), None) + if bucket_wth_prefix is None: + store_type = sub_path = None bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format( username=common_utils.get_cleaned_username(), id=run_id) + else: + store_type, bucket_name, sub_path = \ + storage_lib.StoreType.from_store_url(bucket_wth_prefix) # Step 1: Translate the workdir to SkyPilot storage. new_storage_mounts = {} @@ -741,9 +751,11 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', 'source': workdir, 'persistent': False, 'mode': 'COPY', - 'store': store, - '_bucket_sub_path': - constants.FILE_MOUNTS_WORKDIR_SUBPATH.format(run_id=run_id), + 'store': store_type, + '_bucket_sub_path': _sub_path_join( + sub_path, + constants.FILE_MOUNTS_WORKDIR_SUBPATH.format( + run_id=run_id)), }) # Check of the existence of the workdir in file_mounts is done in # the task construction. @@ -767,17 +779,18 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', 'source': src, 'persistent': False, 'mode': 'COPY', - 'store': store, - '_bucket_sub_path': constants.FILE_MOUNTS_SUBPATH.format( - i=i, run_id=run_id), + 'store': store_type, + '_bucket_sub_path': _sub_path_join( + sub_path, + constants.FILE_MOUNTS_SUBPATH.format(i=i, run_id=run_id)), }) logger.info(f' {colorama.Style.DIM}Folder : {src!r} ' f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}') # Step 3: Translate local file mounts with file in src to SkyPilot storage. # Hard link the files in src to a temporary directory, and upload folder. - file_mounts_tmp_subpath = constants.FILE_MOUNTS_TMP_SUBPATH.format( - run_id=run_id) + file_mounts_tmp_subpath = _sub_path_join( + sub_path, constants.FILE_MOUNTS_TMP_SUBPATH.format(run_id=run_id)) local_fm_path = os.path.join( tempfile.gettempdir(), constants.FILE_MOUNTS_LOCAL_TMP_DIR.format(id=run_id)) @@ -797,7 +810,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', 'source': local_fm_path, 'persistent': False, 'mode': 'MOUNT', - 'store': store, + 'store': store_type, '_bucket_sub_path': file_mounts_tmp_subpath, }) if file_mount_remote_tmp_dir in original_storage_mounts: @@ -811,6 +824,32 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', logger.info(f' {colorama.Style.DIM}Files (listed below) ' f' -> storage: {bucket_name}:' f'\n {sources_str}{colorama.Style.RESET_ALL}') + + if bucket_wth_prefix is not None and len(new_storage_mounts) > 0: + # The full path from the user config of IBM COS contains the region, + # and Azure Blob Storage contains the storage account name, we need to + # check the region and storage account name is match with the system + # configured. 
+ if store_type is storage_lib.StoreType.IBM: + store: storage_lib.IBMCosStore = list( # type: ignore + new_storage_mounts.values())[0].stores[store_type] + _, _, region = data_utils.split_cos_path(bucket_wth_prefix) + assert store.region == region, ( + f'The region from job config {bucket_wth_prefix} does ' + f'not match the region of the storage sky supports ' + f'{store.region}') + elif store_type is storage_lib.StoreType.AZURE: + store: storage_lib.AzureBlobStore = list( # type: ignore + new_storage_mounts.values())[0].stores[store_type] + storage_account_name, _, _ = data_utils.split_az_path( + bucket_wth_prefix) + env_storage_account_name = \ + store.storage_account_name # type: ignore + assert storage_account_name == env_storage_account_name, ( + f'The storage_account_name from job config ' + f'{bucket_wth_prefix} does not match the storage_account_name ' + f'of the storage sky configured {env_storage_account_name}') + rich_utils.force_update_status( ux_utils.spinner_message('Uploading translated local files/folders')) task.update_storage_mounts(new_storage_mounts) diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 875986ba66f..b4ddbbbeea9 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -709,29 +709,9 @@ def get_config_schema(): } }, 'bucket': { - 'type': 'object', + 'type': 'string', + 'pattern': '^(https|s3|gs|r2|cos)://.+', 'required': [], - 'additionalProperties': False, - 'properties': { - 's3': { - 'type': 'string', - }, - 'gcs': { - 'type': 'string', - }, - 'azure': { - 'type': 'string', - }, - 'r2': { - 'type': 'string', - }, - 'ibm': { - 'type': 'string', - }, - 'default': { - 'type': 'string', - } - } } } } diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 472da40b1fc..1e92ed395b7 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3232,7 +3232,7 @@ def test_managed_jobs_storage(generic_cloud: str): output_storage_name = f'sky-test-output-{timestamp}' yaml_str_user_config = pathlib.Path( - 'tests/test_yamls/use_intermediate_bucket.yaml').read_text() + 'tests/test_yamls/use_intermediate_bucket_config.yaml').read_text() intermediate_storage_name = f'bucket-jobs-intermediate-smoke-test-{timestamp}' # Also perform region testing for bucket creation to validate if buckets are diff --git a/tests/test_yamls/use_intermediate_bucket.yaml b/tests/test_yamls/use_intermediate_bucket.yaml deleted file mode 100644 index 4dbbbfa11fa..00000000000 --- a/tests/test_yamls/use_intermediate_bucket.yaml +++ /dev/null @@ -1,5 +0,0 @@ -jobs: - bucket: - s3: "bucket-jobs-s3" - gcs: "bucket-jobs-gcs" - default: "bucket-jobs-default" diff --git a/tests/test_yamls/use_intermediate_bucket_config.yaml b/tests/test_yamls/use_intermediate_bucket_config.yaml new file mode 100644 index 00000000000..cdfb5fbabc1 --- /dev/null +++ b/tests/test_yamls/use_intermediate_bucket_config.yaml @@ -0,0 +1,2 @@ +jobs: + bucket: "s3://bucket-jobs-s3" From cc8a8a6de8bb120c8ca89f2663dea548878f8c26 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Tue, 26 Nov 2024 20:03:34 +0800 Subject: [PATCH 33/48] bug fix --- sky/data/storage.py | 13 +++++++------ sky/utils/controller_utils.py | 10 ++++++---- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index 36f88e79dc8..7b4f313c21d 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -189,31 +189,32 @@ def get_endpoint_url(cls, store: 'AbstractStore', path: str) -> str: return bucket_endpoint_url @classmethod - def from_store_url(cls, store_url: str) -> 
Tuple['StoreType', str, str]: + def from_store_url(cls, store_url: str) -> Tuple[str, str, str]: """Returns the store type, bucket name, and sub path from a store URL. Args: store_url: str; The store URL. """ for store_type in StoreType: + store_type_value = store_type.value if store_url.startswith(store_type.store_prefix()): if store_type == StoreType.AZURE: _, container_name, sub_path = data_utils.split_az_path( store_url) - return store_type, container_name, sub_path + return store_type_value, container_name, sub_path elif store_type == StoreType.IBM: bucket_name, sub_path, _ = data_utils.split_cos_path( store_url) - return store_type, bucket_name, sub_path + return store_type_value, bucket_name, sub_path elif store_type == StoreType.R2: bucket_name, sub_path = data_utils.split_r2_path(store_url) - return store_type, bucket_name, sub_path + return store_type_value, bucket_name, sub_path elif store_type == StoreType.GCS: bucket_name, sub_path = data_utils.split_gcs_path(store_url) - return store_type, bucket_name, sub_path + return store_type_value, bucket_name, sub_path elif store_type == StoreType.S3: bucket_name, sub_path = data_utils.split_s3_path(store_url) - return store_type, bucket_name, sub_path + return store_type_value, bucket_name, sub_path raise ValueError(f'Unknown store URL: {store_url}') diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 2f553532278..c083f2b5360 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -830,17 +830,19 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: # and Azure Blob Storage contains the storage account name, we need to # check the region and storage account name is match with the system # configured. - if store_type is storage_lib.StoreType.IBM: + if store_type == storage_lib.StoreType.IBM.value: store: storage_lib.IBMCosStore = list( # type: ignore - new_storage_mounts.values())[0].stores[store_type] + new_storage_mounts.values())[0].stores[storage_lib.StoreType( + store_type)] _, _, region = data_utils.split_cos_path(bucket_wth_prefix) assert store.region == region, ( f'The region from job config {bucket_wth_prefix} does ' f'not match the region of the storage sky supports ' f'{store.region}') - elif store_type is storage_lib.StoreType.AZURE: + elif store_type == storage_lib.StoreType.AZURE.value: store: storage_lib.AzureBlobStore = list( # type: ignore - new_storage_mounts.values())[0].stores[store_type] + new_storage_mounts.values())[0].stores[storage_lib.StoreType( + store_type)] storage_account_name, _, _ = data_utils.split_az_path( bucket_wth_prefix) env_storage_account_name = \ From b939f7fe6ed577d14e185b7186f7481f9b49aa85 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Wed, 27 Nov 2024 09:15:49 +0800 Subject: [PATCH 34/48] set the storage manually --- sky/data/storage.py | 47 ++++++++++----- sky/utils/controller_utils.py | 104 ++++++++++++++++------------------ 2 files changed, 82 insertions(+), 69 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index 7b4f313c21d..23971fc62ec 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -919,9 +919,12 @@ def from_metadata(cls, metadata: StorageMetadata, return storage_obj - def add_store(self, - store_type: Union[str, StoreType], - region: Optional[str] = None) -> AbstractStore: + def add_store( + self, + store_type: Union[str, StoreType], + region: Optional[str] = None, + store_init_kwargs: Optional[Dict[str, + Any]] = None) -> AbstractStore: """Initializes and adds a new store to the 
storage. Invoked by the optimizer after it has selected a store to @@ -931,6 +934,8 @@ def add_store(self, store_type: StoreType; Type of the storage [S3, GCS, AZURE, R2, IBM] region: str; Region to place the bucket in. Caller must ensure that the region is valid for the chosen store_type. + store_init_kwargs: Dict[str, Any]; Additional keyword arguments + to pass to the store constructor.ß """ if isinstance(store_type, str): store_type = StoreType(store_type) @@ -970,7 +975,8 @@ def add_store(self, source=self.source, region=region, sync_on_reconstruction=self.sync_on_reconstruction, - _bucket_sub_path=self._bucket_sub_path) + _bucket_sub_path=self._bucket_sub_path, + **(store_init_kwargs or {})) except exceptions.StorageBucketCreateError: # Creation failed, so this must be sky managed store. Add failure # to state. @@ -1114,7 +1120,10 @@ def warn_for_git_dir(source: str): global_user_state.set_storage_status(self.name, StorageStatus.READY) @classmethod - def from_yaml_config(cls, config: Dict[str, Any]) -> 'Storage': + def from_yaml_config( + cls, + config: Dict[str, Any], + store_init_kwargs: Optional[Dict[str, Any]] = None) -> 'Storage': common_utils.validate_schema(config, schemas.get_storage_schema(), 'Invalid storage YAML: ') @@ -1147,7 +1156,8 @@ def from_yaml_config(cls, config: Dict[str, Any]) -> 'Storage': mode=mode, _bucket_sub_path=_bucket_sub_path) if store is not None: - storage_obj.add_store(StoreType(store.upper())) + storage_obj.add_store(StoreType(store.upper()), + store_init_kwargs=store_init_kwargs or {}) # Add force deletion flag storage_obj.force_delete = force_delete @@ -2341,6 +2351,17 @@ def initialize(self): """ self.storage_client = data_utils.create_az_client('storage') self.resource_client = data_utils.create_az_client('resource') + self._update_storage_account_name_and_resource() + + self.container_name, is_new_bucket = self._get_bucket() + if self.is_sky_managed is None: + # If is_sky_managed is not specified, then this is a new storage + # object (i.e., did not exist in global_user_state) and we should + # set the is_sky_managed property. + # If is_sky_managed is specified, then we take no action. + self.is_sky_managed = is_new_bucket + + def _update_storage_account_name_and_resource(self): self.storage_account_name, self.resource_group_name = ( self._get_storage_account_and_resource_group()) @@ -2351,13 +2372,13 @@ def initialize(self): self.storage_account_name, self.resource_group_name, self.storage_client, self.resource_client) - self.container_name, is_new_bucket = self._get_bucket() - if self.is_sky_managed is None: - # If is_sky_managed is not specified, then this is a new storage - # object (i.e., did not exist in global_user_state) and we should - # set the is_sky_managed property. - # If is_sky_managed is specified, then we take no action. 
- self.is_sky_managed = is_new_bucket + def update_storage_attributes(self, **kwargs: Dict[str, Any]): + assert 'storage_account_name' in kwargs, ( + 'only storage_account_name supported') + assert isinstance(kwargs['storage_account_name'], + str), ('storage_account_name must be a string') + self.storage_account_name = kwargs['storage_account_name'] + self._update_storage_account_name_and_resource() @staticmethod def get_default_storage_account_name(region: Optional[str]) -> str: diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index c083f2b5360..9bd401c8914 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -726,6 +726,7 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: # Get the bucket name for the workdir and file mounts, # we stores all these files in same bucket from config. bucket_wth_prefix = skypilot_config.get_nested(('jobs', 'bucket'), None) + store_init_kwargs = {} if bucket_wth_prefix is None: store_type = sub_path = None bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format( @@ -734,6 +735,18 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: store_type, bucket_name, sub_path = \ storage_lib.StoreType.from_store_url(bucket_wth_prefix) + # The full path from the user config of IBM COS contains the region, + # and Azure Blob Storage contains the storage account name, we need to + # check the region and storage account name is match with the system + # configured. + if store_type == storage_lib.StoreType.IBM.value: + _, _, region = data_utils.split_cos_path(bucket_wth_prefix) + store_init_kwargs['region'] = region + elif store_type == storage_lib.StoreType.AZURE.value: + storage_account_name, _, _ = data_utils.split_az_path( + bucket_wth_prefix) + store_init_kwargs['storage_account_name'] = storage_account_name + # Step 1: Translate the workdir to SkyPilot storage. new_storage_mounts = {} if task.workdir is not None: @@ -746,17 +759,19 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: 'workdir and file_mounts contains it as the target.') new_storage_mounts[ constants. - SKY_REMOTE_WORKDIR] = storage_lib.Storage.from_yaml_config({ - 'name': bucket_name, - 'source': workdir, - 'persistent': False, - 'mode': 'COPY', - 'store': store_type, - '_bucket_sub_path': _sub_path_join( - sub_path, - constants.FILE_MOUNTS_WORKDIR_SUBPATH.format( - run_id=run_id)), - }) + SKY_REMOTE_WORKDIR] = storage_lib.Storage.from_yaml_config( + { + 'name': bucket_name, + 'source': workdir, + 'persistent': False, + 'mode': 'COPY', + 'store': store_type, + '_bucket_sub_path': _sub_path_join( + sub_path, + constants.FILE_MOUNTS_WORKDIR_SUBPATH.format( + run_id=run_id)), + }, + store_init_kwargs=store_init_kwargs) # Check of the existence of the workdir in file_mounts is done in # the task construction. 
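As a rough illustration of the translation step above, with a user config of jobs: bucket: s3://my-jobs-bucket/team-a (hypothetical values), the configured URL is decomposed before the workdir and file mounts are uploaded:

    from sky.data import storage as storage_lib

    # At this point in the series from_store_url returns the StoreType
    # value (a plain string), the bucket name, and an optional sub path.
    store_type, bucket_name, sub_path = storage_lib.StoreType.from_store_url(
        's3://my-jobs-bucket/team-a')
    print(store_type, bucket_name, sub_path)
    # expected: 'S3' 'my-jobs-bucket' 'team-a'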
logger.info(f' {colorama.Style.DIM}Workdir: {workdir!r} ' @@ -774,16 +789,18 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: if os.path.isfile(os.path.abspath(os.path.expanduser(src))): copy_mounts_with_file_in_src[dst] = src continue - new_storage_mounts[dst] = storage_lib.Storage.from_yaml_config({ - 'name': bucket_name, - 'source': src, - 'persistent': False, - 'mode': 'COPY', - 'store': store_type, - '_bucket_sub_path': _sub_path_join( - sub_path, - constants.FILE_MOUNTS_SUBPATH.format(i=i, run_id=run_id)), - }) + new_storage_mounts[dst] = storage_lib.Storage.from_yaml_config( + { + 'name': bucket_name, + 'source': src, + 'persistent': False, + 'mode': 'COPY', + 'store': store_type, + '_bucket_sub_path': _sub_path_join( + sub_path, + constants.FILE_MOUNTS_SUBPATH.format(i=i, run_id=run_id)), + }, + store_init_kwargs=store_init_kwargs) logger.info(f' {colorama.Style.DIM}Folder : {src!r} ' f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}') @@ -805,14 +822,16 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: os.path.join(local_fm_path, f'file-{i}')) new_storage_mounts[ - file_mount_remote_tmp_dir] = storage_lib.Storage.from_yaml_config({ - 'name': bucket_name, - 'source': local_fm_path, - 'persistent': False, - 'mode': 'MOUNT', - 'store': store_type, - '_bucket_sub_path': file_mounts_tmp_subpath, - }) + file_mount_remote_tmp_dir] = storage_lib.Storage.from_yaml_config( + { + 'name': bucket_name, + 'source': local_fm_path, + 'persistent': False, + 'mode': 'MOUNT', + 'store': store_type, + '_bucket_sub_path': file_mounts_tmp_subpath, + }, + store_init_kwargs=store_init_kwargs) if file_mount_remote_tmp_dir in original_storage_mounts: with ux_utils.print_exception_no_traceback(): raise ValueError( @@ -825,33 +844,6 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: f' -> storage: {bucket_name}:' f'\n {sources_str}{colorama.Style.RESET_ALL}') - if bucket_wth_prefix is not None and len(new_storage_mounts) > 0: - # The full path from the user config of IBM COS contains the region, - # and Azure Blob Storage contains the storage account name, we need to - # check the region and storage account name is match with the system - # configured. 
- if store_type == storage_lib.StoreType.IBM.value: - store: storage_lib.IBMCosStore = list( # type: ignore - new_storage_mounts.values())[0].stores[storage_lib.StoreType( - store_type)] - _, _, region = data_utils.split_cos_path(bucket_wth_prefix) - assert store.region == region, ( - f'The region from job config {bucket_wth_prefix} does ' - f'not match the region of the storage sky supports ' - f'{store.region}') - elif store_type == storage_lib.StoreType.AZURE.value: - store: storage_lib.AzureBlobStore = list( # type: ignore - new_storage_mounts.values())[0].stores[storage_lib.StoreType( - store_type)] - storage_account_name, _, _ = data_utils.split_az_path( - bucket_wth_prefix) - env_storage_account_name = \ - store.storage_account_name # type: ignore - assert storage_account_name == env_storage_account_name, ( - f'The storage_account_name from job config ' - f'{bucket_wth_prefix} does not match the storage_account_name ' - f'of the storage sky configured {env_storage_account_name}') - rich_utils.force_update_status( ux_utils.spinner_message('Uploading translated local files/folders')) task.update_storage_mounts(new_storage_mounts) From fb0edf52eddf3e5dd75f3c02222d6c162bc9febd Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Wed, 27 Nov 2024 09:56:54 +0800 Subject: [PATCH 35/48] better structure --- sky/data/storage.py | 23 ++++++++++++++--------- sky/utils/controller_utils.py | 16 ++-------------- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index 23971fc62ec..c2cd53f7bfe 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -189,32 +189,37 @@ def get_endpoint_url(cls, store: 'AbstractStore', path: str) -> str: return bucket_endpoint_url @classmethod - def from_store_url(cls, store_url: str) -> Tuple[str, str, str]: + def from_store_url(cls, + store_url: str) -> Tuple[str, str, str, Dict[str, Any]]: """Returns the store type, bucket name, and sub path from a store URL. Args: store_url: str; The store URL. """ + # The full path from the user config of IBM COS contains the region, + # and Azure Blob Storage contains the storage account name, we need to + # pass these information to the store constructor. 
+ store_init_kwargs = {} for store_type in StoreType: store_type_value = store_type.value if store_url.startswith(store_type.store_prefix()): if store_type == StoreType.AZURE: - _, container_name, sub_path = data_utils.split_az_path( - store_url) - return store_type_value, container_name, sub_path + storage_account_name, bucket_name, sub_path = \ + data_utils.split_az_path(store_url) + store_init_kwargs[ + 'storage_account_name'] = storage_account_name elif store_type == StoreType.IBM: - bucket_name, sub_path, _ = data_utils.split_cos_path( + bucket_name, sub_path, region = data_utils.split_cos_path( store_url) - return store_type_value, bucket_name, sub_path + store_init_kwargs['region'] = region elif store_type == StoreType.R2: bucket_name, sub_path = data_utils.split_r2_path(store_url) - return store_type_value, bucket_name, sub_path elif store_type == StoreType.GCS: bucket_name, sub_path = data_utils.split_gcs_path(store_url) - return store_type_value, bucket_name, sub_path elif store_type == StoreType.S3: bucket_name, sub_path = data_utils.split_s3_path(store_url) - return store_type_value, bucket_name, sub_path + return store_type_value, bucket_name, \ + sub_path, store_init_kwargs raise ValueError(f'Unknown store URL: {store_url}') diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 9bd401c8914..cae4401f952 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -726,27 +726,15 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: # Get the bucket name for the workdir and file mounts, # we stores all these files in same bucket from config. bucket_wth_prefix = skypilot_config.get_nested(('jobs', 'bucket'), None) - store_init_kwargs = {} + store_init_kwargs: Dict[str, Any] = {} if bucket_wth_prefix is None: store_type = sub_path = None bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format( username=common_utils.get_cleaned_username(), id=run_id) else: - store_type, bucket_name, sub_path = \ + store_type, bucket_name, sub_path, store_init_kwargs = \ storage_lib.StoreType.from_store_url(bucket_wth_prefix) - # The full path from the user config of IBM COS contains the region, - # and Azure Blob Storage contains the storage account name, we need to - # check the region and storage account name is match with the system - # configured. - if store_type == storage_lib.StoreType.IBM.value: - _, _, region = data_utils.split_cos_path(bucket_wth_prefix) - store_init_kwargs['region'] = region - elif store_type == storage_lib.StoreType.AZURE.value: - storage_account_name, _, _ = data_utils.split_az_path( - bucket_wth_prefix) - store_init_kwargs['storage_account_name'] = storage_account_name - # Step 1: Translate the workdir to SkyPilot storage. 
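For the Azure path specifically, a sketch of what the four values unpacked above are expected to carry; the storage account and container names are hypothetical.

    from sky.data import storage as storage_lib

    store_type_str, bucket_name, sub_path, store_init_kwargs = \
        storage_lib.StoreType.from_store_url(
            'https://myjobsaccount.blob.core.windows.net/my-container/team-a')
    # expected: 'AZURE', 'my-container', 'team-a',
    #           {'storage_account_name': 'myjobsaccount'}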
new_storage_mounts = {} if task.workdir is not None: From e4619cb3ffb92cb4d76e239521b5353c3e4cc4be Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Wed, 27 Nov 2024 13:17:03 +0800 Subject: [PATCH 36/48] fix mypy --- sky/utils/controller_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 37ea13ed906..a09be64cef8 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -728,11 +728,11 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: bucket_wth_prefix = skypilot_config.get_nested(('jobs', 'bucket'), None) store_init_kwargs: Dict[str, Any] = {} if bucket_wth_prefix is None: - store_type = sub_path = None + store_type_str = sub_path = None bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format( username=common_utils.get_cleaned_username(), id=run_id) else: - store_type, bucket_name, sub_path, store_init_kwargs = \ + store_type_str, bucket_name, sub_path, store_init_kwargs = \ storage_lib.StoreType.from_store_url(bucket_wth_prefix) # Step 1: Translate the workdir to SkyPilot storage. @@ -753,7 +753,7 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: 'source': workdir, 'persistent': False, 'mode': 'COPY', - 'store': store_type, + 'store': store_type_str, '_bucket_sub_path': _sub_path_join( sub_path, constants.FILE_MOUNTS_WORKDIR_SUBPATH.format( @@ -783,7 +783,7 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: 'source': src, 'persistent': False, 'mode': 'COPY', - 'store': store_type, + 'store': store_type_str, '_bucket_sub_path': _sub_path_join( sub_path, constants.FILE_MOUNTS_SUBPATH.format(i=i, run_id=run_id)), @@ -816,7 +816,7 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: 'source': local_fm_path, 'persistent': False, 'mode': 'MOUNT', - 'store': store_type, + 'store': store_type_str, '_bucket_sub_path': file_mounts_tmp_subpath, }, store_init_kwargs=store_init_kwargs) From abe0d9913988d89891e8b708788779afd4da25e6 Mon Sep 17 00:00:00 2001 From: zpoint Date: Mon, 2 Dec 2024 15:52:49 +0800 Subject: [PATCH 37/48] Update docs/source/reference/config.rst Co-authored-by: Romil Bhardwaj --- docs/source/reference/config.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index 110160089fc..a3fbd6137da 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -25,7 +25,7 @@ Available fields and semantics: # Ref: https://skypilot.readthedocs.io/en/latest/examples/managed-jobs.html#customizing-job-controller-resources jobs: # Bucket to store managed jobs mount files and tmp files. - # Its optional, if not set, SkyPilot will create a new bucket for each managed jobs. + # Optional. If not set, SkyPilot will create a new bucket for each managed job launch. 
# Support s3://, gs://, https://.blob.core.windows.net/, r2://, cos:/// bucket: s3://my-bucket/ controller: From 1669fb822f3d2121607fcf604574535c2692fdc3 Mon Sep 17 00:00:00 2001 From: zpoint Date: Mon, 2 Dec 2024 15:53:02 +0800 Subject: [PATCH 38/48] Update docs/source/reference/config.rst Co-authored-by: Romil Bhardwaj --- docs/source/reference/config.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index a3fbd6137da..37c7911b567 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -26,7 +26,7 @@ Available fields and semantics: jobs: # Bucket to store managed jobs mount files and tmp files. # Optional. If not set, SkyPilot will create a new bucket for each managed job launch. - # Support s3://, gs://, https://.blob.core.windows.net/, r2://, cos:/// + # Supports s3://, gs://, https://.blob.core.windows.net/, r2://, cos:/// bucket: s3://my-bucket/ controller: resources: # same spec as 'resources' in a task YAML From 5d7ea0f7e52f0ff60a97d7a18ef4683e553cd745 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 2 Dec 2024 18:53:29 +0800 Subject: [PATCH 39/48] limit creation for bucket and delete sub dir only --- sky/backends/cloud_vm_ray_backend.py | 10 +- sky/data/storage.py | 161 +++++++++++++++++---------- sky/utils/controller_utils.py | 1 + 3 files changed, 104 insertions(+), 68 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 7869da663e6..5682cf24586 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3522,15 +3522,7 @@ def _teardown_ephemeral_storage(self, task: task_lib.Task) -> None: if storage_mounts is not None: for _, storage in storage_mounts.items(): if not storage.persistent: - is_bucket_name_generated_by_sky = ( - storage.is_bucket_name_auto_generated_by_sky()) - # If the bucket name is auto-generated by SkyPilot, we keep - # the original behaviour delete the bucket, otherwise, we - # only delete the sub-path if it exists because miltiple - # jobs might share the same bucket, delete bucket could - # potential cause other jobs to fail during file operation - storage.delete(only_delete_sub_path_if_exists= - not is_bucket_name_generated_by_sky) + storage.delete() def _teardown(self, handle: CloudVmRayResourceHandle, diff --git a/sky/data/storage.py b/sky/data/storage.py index 9d1363096cb..8fa148af646 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -261,13 +261,15 @@ def __repr__(self): f'\n\tregion={self.region},' f'\n\tis_sky_managed={self.is_sky_managed})') - def __init__(self, - name: str, - source: Optional[SourceType], - region: Optional[str] = None, - is_sky_managed: Optional[bool] = None, - sync_on_reconstruction: Optional[bool] = True, - _bucket_sub_path: Optional[str] = None): # pylint: disable=invalid-name + def __init__( + self, + name: str, + source: Optional[SourceType], + region: Optional[str] = None, + is_sky_managed: Optional[bool] = None, + sync_on_reconstruction: Optional[bool] = True, + _bucket_sub_path: Optional[str] = None, # pylint: disable=invalid-name + _allow_bucket_creation: bool = True): # pylint: disable=invalid-name """Initialize AbstractStore Args: @@ -286,7 +288,8 @@ def __init__(self, will be uploaded to s3:///my-dir/. This only works if source is a local directory. # TODO(zpoint): Add support for non-local source. - + _allow_bucket_creation: bool; Whether to allow bucket creation + if the bucket does not exist. 
Raises: StorageBucketCreateError: If bucket creation fails StorageBucketGetError: If fetching existing bucket fails @@ -302,6 +305,7 @@ def __init__(self, self._bucket_sub_path: Optional[str] = None # Trigger the setter to strip any leading/trailing slashes. self.bucket_sub_path = _bucket_sub_path + self._allow_bucket_creation = _allow_bucket_creation # Whether sky is responsible for the lifecycle of the Store. self._validate() self.initialize() @@ -849,29 +853,34 @@ def _add_store_from_metadata( store = S3Store.from_metadata( s_metadata, source=self.source, - sync_on_reconstruction=self.sync_on_reconstruction) + sync_on_reconstruction=self.sync_on_reconstruction, + _bucket_sub_path=self._bucket_sub_path) elif s_type == StoreType.GCS: store = GcsStore.from_metadata( s_metadata, source=self.source, - sync_on_reconstruction=self.sync_on_reconstruction) + sync_on_reconstruction=self.sync_on_reconstruction, + _bucket_sub_path=self._bucket_sub_path) elif s_type == StoreType.AZURE: assert isinstance(s_metadata, AzureBlobStore.AzureBlobStoreMetadata) store = AzureBlobStore.from_metadata( s_metadata, source=self.source, - sync_on_reconstruction=self.sync_on_reconstruction) + sync_on_reconstruction=self.sync_on_reconstruction, + _bucket_sub_path=self._bucket_sub_path) elif s_type == StoreType.R2: store = R2Store.from_metadata( s_metadata, source=self.source, - sync_on_reconstruction=self.sync_on_reconstruction) + sync_on_reconstruction=self.sync_on_reconstruction, + _bucket_sub_path=self._bucket_sub_path) elif s_type == StoreType.IBM: store = IBMCosStore.from_metadata( s_metadata, source=self.source, - sync_on_reconstruction=self.sync_on_reconstruction) + sync_on_reconstruction=self.sync_on_reconstruction, + _bucket_sub_path=self._bucket_sub_path) else: with ux_utils.print_exception_no_traceback(): raise ValueError(f'Unknown store type: {s_type}') @@ -891,9 +900,6 @@ def _add_store_from_metadata( 'to be reconstructed while the corresponding ' 'bucket was externally deleted.') continue - # This one can't be retrieved from metadata since its set every time - # we create a new storage object. - store.bucket_sub_path = self._bucket_sub_path self._add_store(store, is_reconstructed=True) @classmethod @@ -1023,13 +1029,23 @@ def _add_store(self, store: AbstractStore, is_reconstructed: bool = False): global_user_state.add_or_update_storage(self.name, self.handle, StorageStatus.INIT) - def is_bucket_name_auto_generated_by_sky(self) -> bool: - return re.match(r'^skypilot-filemounts-.+-[a-z0-9]{8}$', - self.name) is not None + def _store_is_configured_by_user(self, store: AbstractStore) -> bool: + """Check if the store is configured by user in the config. + + If the bucket is specified in the config, it is managed by user. + """ + bucket_wth_prefix = skypilot_config.get_nested(('jobs', 'bucket'), None) + if bucket_wth_prefix is None: + return False - def delete(self, - store_type: Optional[StoreType] = None, - only_delete_sub_path_if_exists: bool = False) -> None: + store_type_str, bucket_name, _, _ = StoreType.from_store_url( + bucket_wth_prefix) + store_type = StoreType(store_type_str) + if StoreType.from_store(store) != store_type: + return False + return store.name == bucket_name and store.bucket_sub_path is not None + + def delete(self, store_type: Optional[StoreType] = None) -> None: """Deletes data for all sky-managed storage objects. If a storage is not managed by sky, it is not deleted from the cloud. 
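A minimal sketch, assuming AWS credentials are configured, of the behaviour the new _allow_bucket_creation flag is meant to enforce together with the _get_bucket guards added below: attaching to a missing bucket now fails instead of silently creating one. The bucket name and local path are hypothetical.

    from sky import exceptions
    from sky.data import storage as storage_lib

    try:
        store = storage_lib.S3Store(
            name='my-preprovisioned-jobs-bucket',  # hypothetical, must exist
            source='~/my-workdir',                 # hypothetical local path
            _bucket_sub_path='job-1234/workdir',
            _allow_bucket_creation=False)
    except exceptions.StorageBucketCreateError:
        # Raised when the configured bucket is absent and creation is
        # disallowed.
        print('Bucket does not exist and creation is disallowed.')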
@@ -1038,28 +1054,21 @@ def delete(self, Args: store_type: StoreType; Specific cloud store to remove from the list of backing stores. - only_delete_sub_path_if_exists: bool; Whether to delete only the - bucket sub path instead of the whole bucket if bucket sub path - is set. """ - if not self.stores and not only_delete_sub_path_if_exists: + if not self.stores: logger.info('No backing stores found. Deleting storage.') global_user_state.remove_storage(self.name) if store_type: store = self.stores[store_type] - # We delete the bucket sub path if it exists, and then return. - # Without interfering with the global state. - # User should still call storage.delete() to remove the bucket. - if only_delete_sub_path_if_exists and store.bucket_sub_path: - store.delete_sub_path() - return - is_sky_managed = store.is_sky_managed # We delete a store from the cloud if it's sky managed. Else just # remove handle and return - if is_sky_managed: + if is_sky_managed or self._store_is_configured_by_user(store): self.handle.remove_store(store) - store.delete() + if is_sky_managed: + store.delete() + else: + store.delete_sub_path() # Check remaining stores - if none is sky managed, remove # the storage from global_user_state. delete = all( @@ -1073,23 +1082,19 @@ def delete(self, # Remove store from bookkeeping del self.stores[store_type] else: - keys_to_delete = [] - for key, store in self.stores.items(): - if only_delete_sub_path_if_exists and store.bucket_sub_path: - store.delete_sub_path() - continue - - if store.is_sky_managed: + for _, store in self.stores.items(): + if store.is_sky_managed or self._store_is_configured_by_user( + store): self.handle.remove_store(store) - store.delete() + if store.is_sky_managed: + store.delete() + else: + store.delete_sub_path() elif self.force_delete: store.delete() - keys_to_delete.append(key) - for key in keys_to_delete: - del self.stores[key] - if len(self.stores) == 0: - # Remove storage from global_user_state if present - global_user_state.remove_storage(self.name) + self.stores = {} + # Remove storage from global_user_state if present + global_user_state.remove_storage(self.name) def sync_all_stores(self): """Syncs the source and destinations of all stores in the Storage""" @@ -1215,7 +1220,8 @@ def __init__(self, region: Optional[str] = _DEFAULT_REGION, is_sky_managed: Optional[bool] = None, sync_on_reconstruction: bool = True, - _bucket_sub_path: Optional[str] = None): + _bucket_sub_path: Optional[str] = None, + _allow_bucket_creation: bool = True): self.client: 'boto3.client.Client' self.bucket: 'StorageHandle' # TODO(romilb): This is purely a stopgap fix for @@ -1228,7 +1234,8 @@ def __init__(self, f'{self._DEFAULT_REGION} for bucket {name!r}.') region = self._DEFAULT_REGION super().__init__(name, source, region, is_sky_managed, - sync_on_reconstruction, _bucket_sub_path) + sync_on_reconstruction, _bucket_sub_path, + _allow_bucket_creation) def _validate(self): if self.source is not None and isinstance(self.source, str): @@ -1535,6 +1542,11 @@ def _get_bucket(self) -> Tuple[StorageHandle, bool]: f'{self.source}. Consider using `aws s3 ls ' f'{self.source}` to debug.') + if not self._allow_bucket_creation: + with ux_utils.print_exception_no_traceback(): + raise exceptions.StorageBucketCreateError( + f'Configured to use a non-existent bucket: {self.name}') + # If bucket cannot be found in both private and public settings, # the bucket is to be created by Sky. 
However, creation is skipped if # Store object is being reconstructed for deletion or re-mount with @@ -1695,11 +1707,13 @@ def __init__(self, region: Optional[str] = 'us-central1', is_sky_managed: Optional[bool] = None, sync_on_reconstruction: Optional[bool] = True, - _bucket_sub_path: Optional[str] = None): + _bucket_sub_path: Optional[str] = None, + _allow_bucket_creation: bool = True): self.client: 'storage.Client' self.bucket: StorageHandle super().__init__(name, source, region, is_sky_managed, - sync_on_reconstruction, _bucket_sub_path) + sync_on_reconstruction, _bucket_sub_path, + _allow_bucket_creation) def _validate(self): if self.source is not None and isinstance(self.source, str): @@ -2021,6 +2035,11 @@ def _get_bucket(self) -> Tuple[StorageHandle, bool]: 'Attempted to use a non-existent bucket as a source: ' f'{self.source}') from e else: + if not self._allow_bucket_creation: + with ux_utils.print_exception_no_traceback(): + raise exceptions.StorageBucketCreateError( + f'Configured to use a non-existent bucket: ' + f'{self.name}') # If bucket cannot be found (i.e., does not exist), it is to be # created by Sky. However, creation is skipped if Store object # is being reconstructed for deletion or re-mount with @@ -2212,7 +2231,8 @@ def __init__(self, region: Optional[str] = 'eastus', is_sky_managed: Optional[bool] = None, sync_on_reconstruction: bool = True, - _bucket_sub_path: Optional[str] = None): + _bucket_sub_path: Optional[str] = None, + _allow_bucket_creation: bool = True): self.storage_client: 'storage.Client' self.resource_client: 'storage.Client' self.container_name: str @@ -2224,7 +2244,8 @@ def __init__(self, if region is None: region = 'eastus' super().__init__(name, source, region, is_sky_managed, - sync_on_reconstruction, _bucket_sub_path) + sync_on_reconstruction, _bucket_sub_path, + _allow_bucket_creation) @classmethod def from_metadata(cls, metadata: AbstractStore.StoreMetadata, @@ -2868,6 +2889,12 @@ def _get_bucket(self) -> Tuple[str, bool]: f'{self.storage_account_name!r}.' 'Details: ' f'{common_utils.format_exception(e, use_bracket=True)}') + + if not self._allow_bucket_creation: + with ux_utils.print_exception_no_traceback(): + raise exceptions.StorageBucketCreateError( + f'Configured to use a non-existent container: ' + f'{self.name}') # If the container cannot be found in both private and public settings, # the container is to be created by Sky. 
However, creation is skipped # if Store object is being reconstructed for deletion or re-mount with @@ -2999,11 +3026,13 @@ def __init__(self, region: Optional[str] = 'auto', is_sky_managed: Optional[bool] = None, sync_on_reconstruction: Optional[bool] = True, - _bucket_sub_path: Optional[str] = None): + _bucket_sub_path: Optional[str] = None, + _allow_bucket_creation: bool = True): self.client: 'boto3.client.Client' self.bucket: 'StorageHandle' super().__init__(name, source, region, is_sky_managed, - sync_on_reconstruction, _bucket_sub_path) + sync_on_reconstruction, _bucket_sub_path, + _allow_bucket_creation) def _validate(self): if self.source is not None and isinstance(self.source, str): @@ -3269,6 +3298,11 @@ def _get_bucket(self) -> Tuple[StorageHandle, bool]: f'--profile={cloudflare.R2_PROFILE_NAME}\' ' 'to debug.') + if not self._allow_bucket_creation: + with ux_utils.print_exception_no_traceback(): + raise exceptions.StorageBucketCreateError( + f'Configured to use a non-existent bucket: ' + f'{self.name}') # If bucket cannot be found in both private and public settings, # the bucket is to be created by Sky. However, creation is skipped if # Store object is being reconstructed for deletion or re-mount with @@ -3425,11 +3459,13 @@ def __init__(self, region: Optional[str] = 'us-east', is_sky_managed: Optional[bool] = None, sync_on_reconstruction: bool = True, - _bucket_sub_path: Optional[str] = None): + _bucket_sub_path: Optional[str] = None, + _allow_bucket_creation: bool = True): self.client: 'storage.Client' self.bucket: 'StorageHandle' super().__init__(name, source, region, is_sky_managed, - sync_on_reconstruction, _bucket_sub_path) + sync_on_reconstruction, _bucket_sub_path, + _allow_bucket_creation) self.bucket_rclone_profile = \ Rclone.generate_rclone_bucket_profile_name( self.name, Rclone.RcloneClouds.IBM) @@ -3742,6 +3778,13 @@ def _get_bucket(self) -> Tuple[StorageHandle, bool]: Rclone.RcloneClouds.IBM, self.region, # type: ignore ) + + if not self._allow_bucket_creation: + with ux_utils.print_exception_no_traceback(): + raise exceptions.StorageBucketCreateError( + f'Configured to use a non-existent bucket: ' + f'{self.name}') + if not bucket_region and self.sync_on_reconstruction: # bucket doesn't exist return self._create_cos_bucket(self.name, self.region), True diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index a09be64cef8..3536612ee47 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -734,6 +734,7 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: else: store_type_str, bucket_name, sub_path, store_init_kwargs = \ storage_lib.StoreType.from_store_url(bucket_wth_prefix) + store_init_kwargs['_allow_bucket_creation'] = False # Step 1: Translate the workdir to SkyPilot storage. 
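With a bucket configured, every translated mount of a launch lands under a per-run sub path of that one bucket instead of a freshly created bucket. A purely illustrative sketch of the resulting layout; the exact sub path formats come from sky.skylet.constants, and the strings below and the simplified join helper are hypothetical.

    # Hypothetical layout for jobs.bucket: s3://my-jobs-bucket/team-a and
    # run id abcd1234:
    #   s3://my-jobs-bucket/team-a/job-workdir-abcd1234/...  <- workdir
    #   s3://my-jobs-bucket/team-a/job-fm-abcd1234-0/...     <- folder mount 0
    #   s3://my-jobs-bucket/team-a/job-fm-tmp-abcd1234/...   <- small files

    def sub_path_join(sub_path, path):
        # Simplified stand-in for the _sub_path_join helper used above; the
        # real implementation may differ.
        if sub_path is None:
            return path
        return f'{sub_path.strip("/")}/{path.strip("/")}'

    print(sub_path_join('team-a', 'job-workdir-abcd1234'))
    # -> 'team-a/job-workdir-abcd1234'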
new_storage_mounts = {} From fc2d48e4b9e7f94db4afda5cbba2bea1caa670fd Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 2 Dec 2024 22:54:02 +0800 Subject: [PATCH 40/48] resolve comment --- sky/data/storage.py | 144 +++++++++++++++++++++------------- sky/utils/controller_utils.py | 99 ++++++++++++----------- 2 files changed, 139 insertions(+), 104 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index 8fa148af646..e767c7036d1 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -189,9 +189,11 @@ def get_endpoint_url(cls, store: 'AbstractStore', path: str) -> str: return bucket_endpoint_url @classmethod - def from_store_url(cls, - store_url: str) -> Tuple[str, str, str, Dict[str, Any]]: - """Returns the store type, bucket name, and sub path from a store URL. + def from_store_url( + cls, store_url: str + ) -> Tuple['StoreType', str, str, Optional[str], Optional[str]]: + """Returns the store type, bucket name, and sub path from a store URL, + and the storage account name and region if applicable. Args: store_url: str; The store URL. @@ -199,27 +201,57 @@ def from_store_url(cls, # The full path from the user config of IBM COS contains the region, # and Azure Blob Storage contains the storage account name, we need to # pass these information to the store constructor. - store_init_kwargs = {} + storage_account_name = None + region = None for store_type in StoreType: - store_type_value = store_type.value if store_url.startswith(store_type.store_prefix()): if store_type == StoreType.AZURE: storage_account_name, bucket_name, sub_path = \ data_utils.split_az_path(store_url) - store_init_kwargs[ - 'storage_account_name'] = storage_account_name elif store_type == StoreType.IBM: bucket_name, sub_path, region = data_utils.split_cos_path( store_url) - store_init_kwargs['region'] = region elif store_type == StoreType.R2: bucket_name, sub_path = data_utils.split_r2_path(store_url) elif store_type == StoreType.GCS: bucket_name, sub_path = data_utils.split_gcs_path(store_url) elif store_type == StoreType.S3: bucket_name, sub_path = data_utils.split_s3_path(store_url) - return store_type_value, bucket_name, \ - sub_path, store_init_kwargs + return store_type, bucket_name, \ + sub_path, storage_account_name, region + raise ValueError(f'Unknown store URL: {store_url}') + + @classmethod + def get_fields_from_store_url( + cls, store_url: str + ) -> Tuple['StoreType', str, str, Optional[str], Optional[str]]: + """Returns the store type, bucket name, and sub path from a store URL, + and the storage account name and region if applicable. + + Args: + store_url: str; The store URL. + """ + # The full path from the user config of IBM COS contains the region, + # and Azure Blob Storage contains the storage account name, we need to + # pass these information to the store constructor. 
+ storage_account_name = None + region = None + for store_type in StoreType: + if store_url.startswith(store_type.store_prefix()): + if store_type == StoreType.AZURE: + storage_account_name, bucket_name, sub_path = \ + data_utils.split_az_path(store_url) + elif store_type == StoreType.IBM: + bucket_name, sub_path, region = data_utils.split_cos_path( + store_url) + elif store_type == StoreType.R2: + bucket_name, sub_path = data_utils.split_r2_path(store_url) + elif store_type == StoreType.GCS: + bucket_name, sub_path = data_utils.split_gcs_path(store_url) + elif store_type == StoreType.S3: + bucket_name, sub_path = data_utils.split_s3_path(store_url) + return store_type, bucket_name, \ + sub_path, storage_account_name, region raise ValueError(f'Unknown store URL: {store_url}') @@ -930,39 +962,14 @@ def from_metadata(cls, metadata: StorageMetadata, return storage_obj - def add_store( - self, - store_type: Union[str, StoreType], - region: Optional[str] = None, - store_init_kwargs: Optional[Dict[str, - Any]] = None) -> AbstractStore: - """Initializes and adds a new store to the storage. - - Invoked by the optimizer after it has selected a store to - add it to Storage. - - Args: - store_type: StoreType; Type of the storage [S3, GCS, AZURE, R2, IBM] - region: str; Region to place the bucket in. Caller must ensure that - the region is valid for the chosen store_type. - store_init_kwargs: Dict[str, Any]; Additional keyword arguments - to pass to the store constructor.ß - """ - if isinstance(store_type, str): - store_type = StoreType(store_type) - - if store_type in self.stores: - if store_type == StoreType.AZURE: - azure_store_obj = self.stores[store_type] - assert isinstance(azure_store_obj, AzureBlobStore) - storage_account_name = azure_store_obj.storage_account_name - logger.info(f'Storage type {store_type} already exists under ' - f'storage account {storage_account_name!r}.') - else: - logger.info(f'Storage type {store_type} already exists.') - - return self.stores[store_type] - + def construct_store( + self, + store_type: StoreType, + region: Optional[str] = None, + storage_account_name: Optional[str] = None, + _allow_bucket_creation: bool = True # pylint: disable=invalid-name + ) -> AbstractStore: + """Initialize store object and get/create bucket.""" store_cls: Type[AbstractStore] if store_type == StoreType.S3: store_cls = S3Store @@ -978,16 +985,18 @@ def add_store( with ux_utils.print_exception_no_traceback(): raise exceptions.StorageSpecError( f'{store_type} not supported as a Store.') - - # Initialize store object and get/create bucket try: + kwargs: Dict[str, Any] = {} + if storage_account_name is not None: + kwargs['storage_account_name'] = storage_account_name store = store_cls( name=self.name, source=self.source, region=region, sync_on_reconstruction=self.sync_on_reconstruction, _bucket_sub_path=self._bucket_sub_path, - **(store_init_kwargs or {})) + _allow_bucket_creation=_allow_bucket_creation, + **kwargs) except exceptions.StorageBucketCreateError: # Creation failed, so this must be sky managed store. Add failure # to state. @@ -1018,6 +1027,38 @@ def add_store( return store + def add_store(self, + store_type: Union[str, StoreType], + region: Optional[str] = None) -> AbstractStore: + """Initializes and adds a new store to the storage. + + Invoked by the optimizer after it has selected a store to + add it to Storage. + + Args: + store_type: StoreType; Type of the storage [S3, GCS, AZURE, R2, IBM] + region: str; Region to place the bucket in. 
Caller must ensure that + the region is valid for the chosen store_type. + """ + if isinstance(store_type, str): + store_type = StoreType(store_type) + + if store_type in self.stores: + if store_type == StoreType.AZURE: + azure_store_obj = self.stores[store_type] + assert isinstance(azure_store_obj, AzureBlobStore) + storage_account_name = azure_store_obj.storage_account_name + logger.info(f'Storage type {store_type} already exists under ' + f'storage account {storage_account_name!r}.') + else: + logger.info(f'Storage type {store_type} already exists.') + + return self.stores[store_type] + + store = self.construct_store(store_type, region) + + return store + def _add_store(self, store: AbstractStore, is_reconstructed: bool = False): # Adds a store object to the storage store_type = StoreType.from_store(store) @@ -1038,9 +1079,8 @@ def _store_is_configured_by_user(self, store: AbstractStore) -> bool: if bucket_wth_prefix is None: return False - store_type_str, bucket_name, _, _ = StoreType.from_store_url( + store_type, bucket_name, _, _, _ = StoreType.get_fields_from_store_url( bucket_wth_prefix) - store_type = StoreType(store_type_str) if StoreType.from_store(store) != store_type: return False return store.name == bucket_name and store.bucket_sub_path is not None @@ -1130,10 +1170,7 @@ def warn_for_git_dir(source: str): global_user_state.set_storage_status(self.name, StorageStatus.READY) @classmethod - def from_yaml_config( - cls, - config: Dict[str, Any], - store_init_kwargs: Optional[Dict[str, Any]] = None) -> 'Storage': + def from_yaml_config(cls, config: Dict[str, Any]) -> 'Storage': common_utils.validate_schema(config, schemas.get_storage_schema(), 'Invalid storage YAML: ') @@ -1166,8 +1203,7 @@ def from_yaml_config( mode=mode, _bucket_sub_path=_bucket_sub_path) if store is not None: - storage_obj.add_store(StoreType(store.upper()), - store_init_kwargs=store_init_kwargs or {}) + storage_obj.add_store(StoreType(store.upper())) # Add force deletion flag storage_obj.force_delete = force_delete diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 3536612ee47..cbf0b979ffd 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -726,15 +726,16 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: # Get the bucket name for the workdir and file mounts, # we stores all these files in same bucket from config. bucket_wth_prefix = skypilot_config.get_nested(('jobs', 'bucket'), None) - store_init_kwargs: Dict[str, Any] = {} if bucket_wth_prefix is None: - store_type_str = sub_path = None + store_type = sub_path = None + storage_account_name = region = None + allow_bucket_creation = True bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format( username=common_utils.get_cleaned_username(), id=run_id) else: - store_type_str, bucket_name, sub_path, store_init_kwargs = \ - storage_lib.StoreType.from_store_url(bucket_wth_prefix) - store_init_kwargs['_allow_bucket_creation'] = False + store_type, bucket_name, sub_path, storage_account_name, region = \ + storage_lib.StoreType.get_fields_from_store_url(bucket_wth_prefix) + allow_bucket_creation = False # Step 1: Translate the workdir to SkyPilot storage. new_storage_mounts = {} @@ -746,21 +747,19 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: raise ValueError( f'Cannot mount {constants.SKY_REMOTE_WORKDIR} as both the ' 'workdir and file_mounts contains it as the target.') - new_storage_mounts[ - constants. 
- SKY_REMOTE_WORKDIR] = storage_lib.Storage.from_yaml_config( - { - 'name': bucket_name, - 'source': workdir, - 'persistent': False, - 'mode': 'COPY', - 'store': store_type_str, - '_bucket_sub_path': _sub_path_join( - sub_path, - constants.FILE_MOUNTS_WORKDIR_SUBPATH.format( - run_id=run_id)), - }, - store_init_kwargs=store_init_kwargs) + storage_obj = storage_lib.Storage( + name=bucket_name, + source=workdir, + persistent=False, + mode=storage_lib.StorageMode.COPY, + _bucket_sub_path=_sub_path_join( + sub_path, + constants.FILE_MOUNTS_WORKDIR_SUBPATH.format(run_id=run_id))) + if store_type is not None: + storage_obj.construct_store(store_type, region, + storage_account_name, + allow_bucket_creation) + new_storage_mounts[constants.SKY_REMOTE_WORKDIR] = storage_obj # Check of the existence of the workdir in file_mounts is done in # the task construction. logger.info(f' {colorama.Style.DIM}Workdir: {workdir!r} ' @@ -778,18 +777,19 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: if os.path.isfile(os.path.abspath(os.path.expanduser(src))): copy_mounts_with_file_in_src[dst] = src continue - new_storage_mounts[dst] = storage_lib.Storage.from_yaml_config( - { - 'name': bucket_name, - 'source': src, - 'persistent': False, - 'mode': 'COPY', - 'store': store_type_str, - '_bucket_sub_path': _sub_path_join( - sub_path, - constants.FILE_MOUNTS_SUBPATH.format(i=i, run_id=run_id)), - }, - store_init_kwargs=store_init_kwargs) + storage_obj = storage_lib.Storage( + name=bucket_name, + source=src, + persistent=False, + mode=storage_lib.StorageMode.COPY, + _bucket_sub_path=_sub_path_join( + sub_path, + constants.FILE_MOUNTS_SUBPATH.format(i=i, run_id=run_id))) + if store_type is not None: + storage_obj.construct_store(store_type, region, + storage_account_name, + allow_bucket_creation) + new_storage_mounts[dst] = storage_obj logger.info(f' {colorama.Style.DIM}Folder : {src!r} ' f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}') @@ -809,18 +809,17 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: src_to_file_id[src] = i os.link(os.path.abspath(os.path.expanduser(src)), os.path.join(local_fm_path, f'file-{i}')) - - new_storage_mounts[ - file_mount_remote_tmp_dir] = storage_lib.Storage.from_yaml_config( - { - 'name': bucket_name, - 'source': local_fm_path, - 'persistent': False, - 'mode': 'MOUNT', - 'store': store_type_str, - '_bucket_sub_path': file_mounts_tmp_subpath, - }, - store_init_kwargs=store_init_kwargs) + storage_obj = storage_lib.Storage( + name=bucket_name, + source=local_fm_path, + persistent=False, + mode=storage_lib.StorageMode.MOUNT, + _bucket_sub_path=file_mounts_tmp_subpath) + if store_type is not None: + storage_obj.construct_store(store_type, region, + storage_account_name, + allow_bucket_creation) + new_storage_mounts[file_mount_remote_tmp_dir] = storage_obj if file_mount_remote_tmp_dir in original_storage_mounts: with ux_utils.print_exception_no_traceback(): raise ValueError( @@ -878,8 +877,8 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: # file_mount_remote_tmp_dir will only exist when there are files in # the src for copy mounts. 
storage_obj = task.storage_mounts[file_mount_remote_tmp_dir] - store_type = list(storage_obj.stores.keys())[0] - store_object = storage_obj.stores[store_type] + curr_store_type = list(storage_obj.stores.keys())[0] + store_object = storage_obj.stores[curr_store_type] bucket_url = storage_lib.StoreType.get_endpoint_url( store_object, bucket_name) bucket_url += f'/{file_mounts_tmp_subpath}' @@ -899,8 +898,8 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: store_types = list(storage_obj.stores.keys()) assert len(store_types) == 1, ( 'We only support one store type for now.', storage_obj.stores) - store_type = store_types[0] - store_object = storage_obj.stores[store_type] + curr_store_type = store_types[0] + store_object = storage_obj.stores[curr_store_type] storage_obj.source = storage_lib.StoreType.get_endpoint_url( store_object, storage_obj.name) storage_obj.force_delete = True @@ -917,8 +916,8 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: store_types = list(storage_obj.stores.keys()) assert len(store_types) == 1, ( 'We only support one store type for now.', storage_obj.stores) - store_type = store_types[0] - store_object = storage_obj.stores[store_type] + curr_store_type = store_types[0] + store_object = storage_obj.stores[curr_store_type] source = storage_lib.StoreType.get_endpoint_url( store_object, storage_obj.name) new_storage = storage_lib.Storage.from_yaml_config({ From e032fadcf8e133b8f27c90e3659395607eba47d9 Mon Sep 17 00:00:00 2001 From: zpoint Date: Sat, 14 Dec 2024 14:20:00 +0800 Subject: [PATCH 41/48] Update docs/source/reference/config.rst Co-authored-by: Romil Bhardwaj --- docs/source/reference/config.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index 37c7911b567..8dc6ea208a4 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -24,7 +24,7 @@ Available fields and semantics: # # Ref: https://skypilot.readthedocs.io/en/latest/examples/managed-jobs.html#customizing-job-controller-resources jobs: - # Bucket to store managed jobs mount files and tmp files. + # Bucket to store managed jobs mount files and tmp files. Bucket must already exist. # Optional. If not set, SkyPilot will create a new bucket for each managed job launch. # Supports s3://, gs://, https://.blob.core.windows.net/, r2://, cos:/// bucket: s3://my-bucket/ From 284b46d0104284d657c715e7c44b112e7e6b8859 Mon Sep 17 00:00:00 2001 From: zpoint Date: Sat, 14 Dec 2024 14:20:16 +0800 Subject: [PATCH 42/48] Update sky/utils/controller_utils.py Co-authored-by: Romil Bhardwaj --- sky/utils/controller_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index cbf0b979ffd..7108f3d359c 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -724,7 +724,7 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: f'Translating {msg} to SkyPilot Storage...')) # Get the bucket name for the workdir and file mounts, - # we stores all these files in same bucket from config. + # we store all these files in same bucket from config. 
bucket_wth_prefix = skypilot_config.get_nested(('jobs', 'bucket'), None) if bucket_wth_prefix is None: store_type = sub_path = None From 86ffe0196408a0ef12faa93b8566db2e6bb40bfd Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Sat, 14 Dec 2024 18:32:52 +0800 Subject: [PATCH 43/48] resolve PR comment --- sky/data/storage.py | 161 +++++++++++++--------------- sky/utils/controller_utils.py | 85 +++++++++------ tests/test_smoke.py | 194 ++++++++++++++++++++-------------- 3 files changed, 243 insertions(+), 197 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index e767c7036d1..6a9646b7b20 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -224,9 +224,10 @@ def from_store_url( @classmethod def get_fields_from_store_url( cls, store_url: str - ) -> Tuple['StoreType', str, str, Optional[str], Optional[str]]: - """Returns the store type, bucket name, and sub path from a store URL, - and the storage account name and region if applicable. + ) -> Tuple['StoreType', Type['AbstractStore'], str, str, Optional[str], + Optional[str]]: + """Returns the store type, store class, bucket name, and sub path from + a store URL, and the storage account name and region if applicable. Args: store_url: str; The store URL. @@ -241,16 +242,21 @@ def get_fields_from_store_url( if store_type == StoreType.AZURE: storage_account_name, bucket_name, sub_path = \ data_utils.split_az_path(store_url) + store_cls: Type['AbstractStore'] = AzureBlobStore elif store_type == StoreType.IBM: bucket_name, sub_path, region = data_utils.split_cos_path( store_url) + store_cls = IBMCosStore elif store_type == StoreType.R2: bucket_name, sub_path = data_utils.split_r2_path(store_url) + store_cls = R2Store elif store_type == StoreType.GCS: bucket_name, sub_path = data_utils.split_gcs_path(store_url) + store_cls = GcsStore elif store_type == StoreType.S3: bucket_name, sub_path = data_utils.split_s3_path(store_url) - return store_type, bucket_name, \ + store_cls = S3Store + return store_type, store_cls,bucket_name, \ sub_path, storage_account_name, region raise ValueError(f'Unknown store URL: {store_url}') @@ -280,18 +286,21 @@ def __init__(self, name: str, source: Optional[SourceType], region: Optional[str] = None, - is_sky_managed: Optional[bool] = None): + is_sky_managed: Optional[bool] = None, + _bucket_sub_path: Optional[str] = None): self.name = name self.source = source self.region = region self.is_sky_managed = is_sky_managed + self._bucket_sub_path = _bucket_sub_path def __repr__(self): return (f'StoreMetadata(' f'\n\tname={self.name},' f'\n\tsource={self.source},' f'\n\tregion={self.region},' - f'\n\tis_sky_managed={self.is_sky_managed})') + f'\n\tis_sky_managed={self.is_sky_managed},' + f'\n\t_bucket_sub_path={self._bucket_sub_path})') def __init__( self, @@ -363,19 +372,26 @@ def from_metadata(cls, metadata: StoreMetadata, **override_args): Used when reconstructing Storage and Store objects from global_user_state. 
""" - return cls(name=override_args.get('name', metadata.name), - source=override_args.get('source', metadata.source), - region=override_args.get('region', metadata.region), - is_sky_managed=override_args.get('is_sky_managed', - metadata.is_sky_managed), - sync_on_reconstruction=override_args.get( - 'sync_on_reconstruction', True)) + return cls( + name=override_args.get('name', metadata.name), + source=override_args.get('source', metadata.source), + region=override_args.get('region', metadata.region), + is_sky_managed=override_args.get('is_sky_managed', + metadata.is_sky_managed), + sync_on_reconstruction=override_args.get('sync_on_reconstruction', + True), + # backward compatibility + _bucket_sub_path=override_args.get( + '_bucket_sub_path', + metadata._bucket_sub_path # pylint: disable=protected-access + ) if hasattr(metadata, '_bucket_sub_path') else None) def get_metadata(self) -> StoreMetadata: return self.StoreMetadata(name=self.name, source=self.source, region=self.region, - is_sky_managed=self.is_sky_managed) + is_sky_managed=self.is_sky_managed, + _bucket_sub_path=self._bucket_sub_path) def initialize(self): """Initializes the Store object on the cloud. @@ -406,7 +422,7 @@ def delete(self) -> None: """Removes the Storage from the cloud.""" raise NotImplementedError - def delete_sub_path(self) -> None: + def _delete_sub_path(self) -> None: """Removes objects from the sub path in the bucket.""" raise NotImplementedError @@ -962,14 +978,34 @@ def from_metadata(cls, metadata: StorageMetadata, return storage_obj - def construct_store( - self, - store_type: StoreType, - region: Optional[str] = None, - storage_account_name: Optional[str] = None, - _allow_bucket_creation: bool = True # pylint: disable=invalid-name - ) -> AbstractStore: - """Initialize store object and get/create bucket.""" + def add_store(self, + store_type: Union[str, StoreType], + region: Optional[str] = None) -> AbstractStore: + """Initializes and adds a new store to the storage. + + Invoked by the optimizer after it has selected a store to + add it to Storage. + + Args: + store_type: StoreType; Type of the storage [S3, GCS, AZURE, R2, IBM] + region: str; Region to place the bucket in. Caller must ensure that + the region is valid for the chosen store_type. + """ + if isinstance(store_type, str): + store_type = StoreType(store_type) + + if store_type in self.stores: + if store_type == StoreType.AZURE: + azure_store_obj = self.stores[store_type] + assert isinstance(azure_store_obj, AzureBlobStore) + storage_account_name = azure_store_obj.storage_account_name + logger.info(f'Storage type {store_type} already exists under ' + f'storage account {storage_account_name!r}.') + else: + logger.info(f'Storage type {store_type} already exists.') + + return self.stores[store_type] + store_cls: Type[AbstractStore] if store_type == StoreType.S3: store_cls = S3Store @@ -986,17 +1022,12 @@ def construct_store( raise exceptions.StorageSpecError( f'{store_type} not supported as a Store.') try: - kwargs: Dict[str, Any] = {} - if storage_account_name is not None: - kwargs['storage_account_name'] = storage_account_name store = store_cls( name=self.name, source=self.source, region=region, sync_on_reconstruction=self.sync_on_reconstruction, - _bucket_sub_path=self._bucket_sub_path, - _allow_bucket_creation=_allow_bucket_creation, - **kwargs) + _bucket_sub_path=self._bucket_sub_path) except exceptions.StorageBucketCreateError: # Creation failed, so this must be sky managed store. Add failure # to state. 
@@ -1027,38 +1058,6 @@ def construct_store( return store - def add_store(self, - store_type: Union[str, StoreType], - region: Optional[str] = None) -> AbstractStore: - """Initializes and adds a new store to the storage. - - Invoked by the optimizer after it has selected a store to - add it to Storage. - - Args: - store_type: StoreType; Type of the storage [S3, GCS, AZURE, R2, IBM] - region: str; Region to place the bucket in. Caller must ensure that - the region is valid for the chosen store_type. - """ - if isinstance(store_type, str): - store_type = StoreType(store_type) - - if store_type in self.stores: - if store_type == StoreType.AZURE: - azure_store_obj = self.stores[store_type] - assert isinstance(azure_store_obj, AzureBlobStore) - storage_account_name = azure_store_obj.storage_account_name - logger.info(f'Storage type {store_type} already exists under ' - f'storage account {storage_account_name!r}.') - else: - logger.info(f'Storage type {store_type} already exists.') - - return self.stores[store_type] - - store = self.construct_store(store_type, region) - - return store - def _add_store(self, store: AbstractStore, is_reconstructed: bool = False): # Adds a store object to the storage store_type = StoreType.from_store(store) @@ -1070,21 +1069,6 @@ def _add_store(self, store: AbstractStore, is_reconstructed: bool = False): global_user_state.add_or_update_storage(self.name, self.handle, StorageStatus.INIT) - def _store_is_configured_by_user(self, store: AbstractStore) -> bool: - """Check if the store is configured by user in the config. - - If the bucket is specified in the config, it is managed by user. - """ - bucket_wth_prefix = skypilot_config.get_nested(('jobs', 'bucket'), None) - if bucket_wth_prefix is None: - return False - - store_type, bucket_name, _, _, _ = StoreType.get_fields_from_store_url( - bucket_wth_prefix) - if StoreType.from_store(store) != store_type: - return False - return store.name == bucket_name and store.bucket_sub_path is not None - def delete(self, store_type: Optional[StoreType] = None) -> None: """Deletes data for all sky-managed storage objects. @@ -1103,12 +1087,9 @@ def delete(self, store_type: Optional[StoreType] = None) -> None: is_sky_managed = store.is_sky_managed # We delete a store from the cloud if it's sky managed. Else just # remove handle and return - if is_sky_managed or self._store_is_configured_by_user(store): + if is_sky_managed: self.handle.remove_store(store) - if is_sky_managed: - store.delete() - else: - store.delete_sub_path() + store.delete() # Check remaining stores - if none is sky managed, remove # the storage from global_user_state. delete = all( @@ -1123,13 +1104,9 @@ def delete(self, store_type: Optional[StoreType] = None) -> None: del self.stores[store_type] else: for _, store in self.stores.items(): - if store.is_sky_managed or self._store_is_configured_by_user( - store): + if store.is_sky_managed: self.handle.remove_store(store) - if store.is_sky_managed: - store.delete() - else: - store.delete_sub_path() + store.delete() elif self.force_delete: store.delete() self.stores = {} @@ -1430,6 +1407,8 @@ def upload(self): f'Upload failed for store {self.name}') from e def delete(self) -> None: + if self._bucket_sub_path is not None and not self.is_sky_managed: + return self._delete_sub_path() deleted_by_skypilot = self._delete_s3_bucket(self.name) if deleted_by_skypilot: msg_str = f'Deleted S3 bucket {self.name}.' 
@@ -1907,6 +1886,8 @@ def upload(self): f'Upload failed for store {self.name}') from e def delete(self) -> None: + if self._bucket_sub_path is not None and not self.is_sky_managed: + return self._delete_sub_path() deleted_by_skypilot = self._delete_gcs_bucket(self.name) if deleted_by_skypilot: msg_str = f'Deleted GCS bucket {self.name}.' @@ -2735,6 +2716,8 @@ def delete(self) -> None: f'{colorama.Style.RESET_ALL}') def delete_sub_path(self) -> None: + if self._bucket_sub_path is not None and not self.is_sky_managed: + return self._delete_sub_path() assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' try: container_url = data_utils.AZURE_CONTAINER_URL.format( @@ -3177,6 +3160,8 @@ def delete(self) -> None: f'{colorama.Style.RESET_ALL}') def delete_sub_path(self) -> None: + if self._bucket_sub_path is not None and not self.is_sky_managed: + return self._delete_sub_path() assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' deleted_by_skypilot = self._delete_r2_bucket_sub_path( self.name, self._bucket_sub_path) @@ -3651,6 +3636,8 @@ def delete(self) -> None: f'{colorama.Style.RESET_ALL}') def delete_sub_path(self) -> None: + if self._bucket_sub_path is not None and not self.is_sky_managed: + return self._delete_sub_path() assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' bucket = self.s3_resource.Bucket(self.name) try: diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 7108f3d359c..f32717b8001 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -726,16 +726,21 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: # Get the bucket name for the workdir and file mounts, # we store all these files in same bucket from config. bucket_wth_prefix = skypilot_config.get_nested(('jobs', 'bucket'), None) + store_kwargs: Dict[str, Any] = {} if bucket_wth_prefix is None: - store_type = sub_path = None + store_cls = sub_path = None storage_account_name = region = None - allow_bucket_creation = True bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format( username=common_utils.get_cleaned_username(), id=run_id) else: - store_type, bucket_name, sub_path, storage_account_name, region = \ - storage_lib.StoreType.get_fields_from_store_url(bucket_wth_prefix) - allow_bucket_creation = False + store_type, store_cls, bucket_name, sub_path, storage_account_name, \ + region = storage_lib.StoreType.get_fields_from_store_url( + bucket_wth_prefix) + store_kwargs['allow_bucket_creation'] = False + if storage_account_name is not None: + store_kwargs['storage_account_name'] = storage_account_name + if region is not None: + store_kwargs['region'] = region # Step 1: Translate the workdir to SkyPilot storage. 
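Putting this commit's pieces together, a sketch of how a configured s3:// bucket is now resolved and attached without any bucket creation, mirroring the hunks below; the bucket, prefix, sub path, and local path are hypothetical, and running it assumes AWS credentials plus an existing bucket.

    from sky.data import storage as storage_lib

    (store_type, store_cls, bucket_name, sub_path, storage_account_name,
     region) = storage_lib.StoreType.get_fields_from_store_url(
         's3://my-jobs-bucket/team-a')
    # expected: (StoreType.S3, S3Store, 'my-jobs-bucket', 'team-a', None, None)

    store = store_cls(name=bucket_name,
                      source='~/my-workdir',
                      _bucket_sub_path='team-a/job-workdir-abcd1234',
                      _allow_bucket_creation=False)
    storage_obj = storage_lib.Storage(
        name=bucket_name,
        source='~/my-workdir',
        persistent=False,
        mode=storage_lib.StorageMode.COPY,
        stores={store_type: store},
        _bucket_sub_path='team-a/job-workdir-abcd1234')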
new_storage_mounts = {} @@ -747,18 +752,24 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: raise ValueError( f'Cannot mount {constants.SKY_REMOTE_WORKDIR} as both the ' 'workdir and file_mounts contains it as the target.') - storage_obj = storage_lib.Storage( - name=bucket_name, - source=workdir, - persistent=False, - mode=storage_lib.StorageMode.COPY, - _bucket_sub_path=_sub_path_join( - sub_path, - constants.FILE_MOUNTS_WORKDIR_SUBPATH.format(run_id=run_id))) + bucket_sub_path = _sub_path_join( + sub_path, + constants.FILE_MOUNTS_WORKDIR_SUBPATH.format(run_id=run_id)) + stores = None if store_type is not None: - storage_obj.construct_store(store_type, region, - storage_account_name, - allow_bucket_creation) + assert store_cls is not None + stores = { + store_type: store_cls(name=bucket_name, + source=workdir, + _bucket_sub_path=bucket_sub_path, + **store_kwargs) + } + storage_obj = storage_lib.Storage(name=bucket_name, + source=workdir, + persistent=False, + mode=storage_lib.StorageMode.COPY, + stores=stores, + _bucket_sub_path=bucket_sub_path) new_storage_mounts[constants.SKY_REMOTE_WORKDIR] = storage_obj # Check of the existence of the workdir in file_mounts is done in # the task construction. @@ -777,18 +788,23 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: if os.path.isfile(os.path.abspath(os.path.expanduser(src))): copy_mounts_with_file_in_src[dst] = src continue - storage_obj = storage_lib.Storage( - name=bucket_name, - source=src, - persistent=False, - mode=storage_lib.StorageMode.COPY, - _bucket_sub_path=_sub_path_join( - sub_path, - constants.FILE_MOUNTS_SUBPATH.format(i=i, run_id=run_id))) + bucket_sub_path = _sub_path_join( + sub_path, constants.FILE_MOUNTS_SUBPATH.format(i=i, run_id=run_id)) + stores = None if store_type is not None: - storage_obj.construct_store(store_type, region, - storage_account_name, - allow_bucket_creation) + assert store_cls is not None + stores = { + store_type: store_cls(name=bucket_name, + source=src, + _bucket_sub_path=bucket_sub_path, + **store_kwargs) + } + storage_obj = storage_lib.Storage(name=bucket_name, + source=src, + persistent=False, + mode=storage_lib.StorageMode.COPY, + stores=stores, + _bucket_sub_path=bucket_sub_path) new_storage_mounts[dst] = storage_obj logger.info(f' {colorama.Style.DIM}Folder : {src!r} ' f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}') @@ -809,16 +825,23 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: src_to_file_id[src] = i os.link(os.path.abspath(os.path.expanduser(src)), os.path.join(local_fm_path, f'file-{i}')) + stores = None + if store_type is not None: + assert store_cls is not None + stores = { + store_type: store_cls(name=bucket_name, + source=local_fm_path, + _bucket_sub_path=file_mounts_tmp_subpath, + **store_kwargs) + } storage_obj = storage_lib.Storage( name=bucket_name, source=local_fm_path, persistent=False, mode=storage_lib.StorageMode.MOUNT, + stores=stores, _bucket_sub_path=file_mounts_tmp_subpath) - if store_type is not None: - storage_obj.construct_store(store_type, region, - storage_account_name, - allow_bucket_creation) + new_storage_mounts[file_mount_remote_tmp_dir] = storage_obj if file_mount_remote_tmp_dir in original_storage_mounts: with ux_utils.print_exception_no_traceback(): diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 459203e65fc..4e8f9fd1c42 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3574,10 +3574,6 @@ def test_managed_jobs_storage(generic_cloud: str): storage_name = 
f'sky-test-{timestamp}' output_storage_name = f'sky-test-output-{timestamp}' - yaml_str_user_config = pathlib.Path( - 'tests/test_yamls/use_intermediate_bucket_config.yaml').read_text() - intermediate_storage_name = f'bucket-jobs-intermediate-smoke-test-{timestamp}' - # Also perform region testing for bucket creation to validate if buckets are # created in the correct region and correctly mounted in managed jobs. # However, we inject this testing only for AWS and GCP since they are the @@ -3630,11 +3626,56 @@ def test_managed_jobs_storage(generic_cloud: str): output_check_cmd = f'{s3_output_check_cmd} || {gcs_output_check_cmd}' use_spot = ' --no-use-spot' + yaml_str = yaml_str.replace('sky-workdir-zhwu', storage_name) + yaml_str = yaml_str.replace('sky-output-bucket', output_storage_name) + with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: + f.write(yaml_str) + f.flush() + file_path = f.name + test = Test( + 'managed_jobs_storage', + [ + *STORAGE_SETUP_COMMANDS, + f'sky jobs launch -n {name}{use_spot} --cloud {generic_cloud}{region_flag} {file_path} -y', + region_validation_cmd, # Check if the bucket is created in the correct region + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[sky.ManagedJobStatus.SUCCEEDED], + timeout=60 + _BUMP_UP_SECONDS), + # Wait for the job to be cleaned up. + 'sleep 20', + f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', + # Check if file was written to the mounted output bucket + output_check_cmd + ], + (f'sky jobs cancel -y -n {name}' + f'; sky storage delete {output_storage_name} -y || true'), + # Increase timeout since sky jobs queue -r can be blocked by other spot tests. + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.aws +def test_managed_jobs_intermediate_storage(generic_cloud: str): + """Test storage with managed job""" + name = _get_cluster_name() + yaml_str = pathlib.Path( + 'examples/managed_job_with_storage.yaml').read_text() + timestamp = int(time.time()) + storage_name = f'sky-test-{timestamp}' + output_storage_name = f'sky-test-output-{timestamp}' + + yaml_str_user_config = pathlib.Path( + 'tests/test_yamls/use_intermediate_bucket_config.yaml').read_text() + intermediate_storage_name = f'intermediate-smoke-test-{timestamp}' + yaml_str = yaml_str.replace('sky-workdir-zhwu', storage_name) yaml_str = yaml_str.replace('sky-output-bucket', output_storage_name) yaml_str_user_config = re.sub(r'bucket-jobs-[\w\d]+', intermediate_storage_name, yaml_str_user_config) + with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f_user_config: f_user_config.write(yaml_str_user_config) f_user_config.flush() @@ -3643,25 +3684,26 @@ def test_managed_jobs_storage(generic_cloud: str): f_task.write(yaml_str) f_task.flush() file_path = f_task.name + test = Test( - 'managed_jobs_storage', + 'managed_jobs_intermediate_storage', [ *STORAGE_SETUP_COMMANDS, - f'sky jobs launch -n {name}{use_spot} --cloud {generic_cloud}{region_flag} {file_path} -y', - region_validation_cmd, # Check if the bucket is created in the correct region + # Verify command fails with correct error - run only once + f'err=$(sky jobs launch -n {name} --cloud {generic_cloud} {file_path} -y 2>&1); ret=$?; [ $ret -eq 0 ] || ! 
echo "$err" | grep "ValueError: Storage {intermediate_storage_name} not found" > /dev/null && exit 1 || exit 0', + f'aws s3api create-bucket --bucket {intermediate_storage_name}', + f'sky jobs launch -n {name} --cloud {generic_cloud} {file_path} -y', + # fail because the bucket does not exist _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ManagedJobStatus.SUCCEEDED], timeout=60 + _BUMP_UP_SECONDS), - f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', # check intermediate bucket exists, it won't be deletd if its user specific f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{intermediate_storage_name}\')].Name" --output text | wc -l) -eq 1 ]', - # Check if file was written to the mounted output bucket - output_check_cmd ], - (f'sky jobs cancel -y -n {name}', - f'; sky storage delete {intermediate_storage_name}', - f'; sky storage delete {output_storage_name} || true'), + (f'sky jobs cancel -y -n {name}' + f'; aws s3 rb s3://{intermediate_storage_name} --force' + f'; sky storage delete {output_storage_name} -y || true'), env={'SKYPILOT_CONFIG': user_config_path}, # Increase timeout since sky jobs queue -r can be blocked by other spot tests. timeout=20 * 60, @@ -5038,21 +5080,69 @@ def cli_delete_cmd(store_type, bucket_name, Rclone.RcloneClouds.IBM) return f'rclone purge {bucket_rclone_profile}:{bucket_name} && rclone config delete {bucket_rclone_profile}' + @classmethod + def list_all_files(cls, store_type, bucket_name): + cmd = cls.cli_ls_cmd(store_type, bucket_name, recursive=True) + if store_type == storage_lib.StoreType.GCS: + try: + out = subprocess.check_output(cmd, + shell=True, + stderr=subprocess.PIPE) + files = [line[5:] for line in out.decode('utf-8').splitlines()] + except subprocess.CalledProcessError as e: + error_output = e.stderr.decode('utf-8') + if "One or more URLs matched no objects" in error_output: + files = [] + else: + raise + elif store_type == storage_lib.StoreType.AZURE: + out = subprocess.check_output(cmd, shell=True) + try: + blobs = json.loads(out.decode('utf-8')) + files = [blob['name'] for blob in blobs] + except json.JSONDecodeError: + files = [] + elif store_type == storage_lib.StoreType.IBM: + # rclone ls format: " 1234 path/to/file" + out = subprocess.check_output(cmd, shell=True) + files = [] + for line in out.decode('utf-8').splitlines(): + # Skip empty lines + if not line.strip(): + continue + # Split by whitespace and get the file path (last column) + parts = line.strip().split( + None, 1) # Split into max 2 parts (size and path) + if len(parts) == 2: + files.append(parts[1]) + else: + out = subprocess.check_output(cmd, shell=True) + files = [ + line.split()[-1] for line in out.decode('utf-8').splitlines() + ] + return files + @staticmethod - def cli_ls_cmd(store_type, bucket_name, suffix=''): + def cli_ls_cmd(store_type, bucket_name, recursive=False, suffix=''): if store_type == storage_lib.StoreType.S3: if suffix: url = f's3://{bucket_name}/{suffix}' else: url = f's3://{bucket_name}' - return f'aws s3 ls {url}' + cmd = f'aws s3 ls {url}' + if recursive: + cmd += ' --recursive' + return cmd if store_type == storage_lib.StoreType.GCS: if suffix: url = f'gs://{bucket_name}/{suffix}' else: url = f'gs://{bucket_name}' + if recursive: + url = f'"{url}/**"' return f'gsutil ls {url}' if store_type == storage_lib.StoreType.AZURE: + # azure isrecursive by default default_region = 'eastus' config_storage_account = 
skypilot_config.get_nested( ('azure', 'storage_account'), None) @@ -5074,8 +5164,10 @@ def cli_ls_cmd(store_type, bucket_name, suffix=''): url = f's3://{bucket_name}/{suffix}' else: url = f's3://{bucket_name}' - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls {url} --endpoint {endpoint_url} --profile=r2' + recursive_flag = '--recursive' if recursive else '' + return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls {url} --endpoint {endpoint_url} --profile=r2 {recursive_flag}' if store_type == storage_lib.StoreType.IBM: + # rclone ls is recursive by default bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( bucket_name, Rclone.RcloneClouds.IBM) return f'rclone ls {bucket_rclone_profile}:{bucket_name}/{suffix}' @@ -5442,7 +5534,7 @@ def test_new_bucket_creation_and_deletion(self, tmp_local_storage_obj, @pytest.mark.no_fluidstack @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, + pytest.param(storage_lib.StoreType.S3, marks=pytest.mark.aws), pytest.param(storage_lib.StoreType.GCS, marks=pytest.mark.gcp), pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), @@ -5450,70 +5542,13 @@ def test_new_bucket_creation_and_deletion(self, tmp_local_storage_obj, ]) def test_bucket_sub_path(self, tmp_local_storage_obj_with_sub_path, store_type): - - def _list_all_files(): - if store_type == storage_lib.StoreType.S3: - # aws s3 ls command, list all files in bucket - cmd = f'aws s3 ls s3://{tmp_local_storage_obj_with_sub_path.name}/ --recursive' - out = subprocess.check_output(cmd, shell=True) - files = [ - line.split()[-1] - for line in out.decode('utf-8').splitlines() - ] - elif store_type == storage_lib.StoreType.GCS: - # gsutil ls command, list all files in bucket - cmd = f'gsutil ls "gs://{tmp_local_storage_obj_with_sub_path.name}/**"' - try: - out = subprocess.check_output(cmd, - shell=True, - stderr=subprocess.PIPE) - files = [ - line[5:] for line in out.decode('utf-8').splitlines() - ] - except subprocess.CalledProcessError as e: - error_output = e.stderr.decode('utf-8') - if "One or more URLs matched no objects" in error_output: - files = [] - else: - raise - elif store_type == storage_lib.StoreType.AZURE: - # az storage file list command, list all files in container - store = tmp_local_storage_obj_with_sub_path.stores[store_type] - container_url = data_utils.AZURE_CONTAINER_URL.format( - storage_account_name=store.storage_account_name, - container_name=store.name) - container_client = data_utils.create_az_client( - client_type='container', - container_url=container_url, - storage_account_name=store.storage_account_name, - resource_group_name=store.resource_group_name) - # List and delete blobs in the specified directory - blobs = container_client.list_blobs() - files = [blob.name for blob in blobs] - elif store_type == storage_lib.StoreType.IBM: - # ibm cos ls command, list all files in bucket - store = tmp_local_storage_obj_with_sub_path.stores[store_type] - bucket = store.s3_resource.Bucket(store.name) - files = [obj.key for obj in bucket.objects.all()] - elif store_type == storage_lib.StoreType.R2: - # r2 ls command, list all files in bucket - cmd = ( - f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls s3://{tmp_local_storage_obj_with_sub_path.name}/ ' - f'--recursive --endpoint {cloudflare.create_endpoint()} --profile=r2' - ) - out = subprocess.check_output(cmd, shell=True) - files = [ - 
line.split()[-1] - for line in out.decode('utf-8').splitlines() - ] - return files - # Creates a new bucket with a local source, uploads files to it # and deletes it. tmp_local_storage_obj_with_sub_path.add_store(store_type) # Check files under bucket and filter by prefix - files = _list_all_files() + files = self.list_all_files(store_type, + tmp_local_storage_obj_with_sub_path.name) assert len(files) > 0 if store_type == storage_lib.StoreType.GCS: assert all([ @@ -5530,9 +5565,10 @@ def _list_all_files(): ]) # Check bucket is empty, all files under sub directory should be deleted - tmp_local_storage_obj_with_sub_path.delete( - only_delete_sub_path_if_exists=True) - files = _list_all_files() + store = tmp_local_storage_obj_with_sub_path.stores[store_type] + store.delete_sub_path() + files = self.list_all_files(store_type, + tmp_local_storage_obj_with_sub_path.name) assert len(files) == 0 # Now, delete the entire bucket From 0613905db580be4eda00ff3a4832784a82b992c9 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Sat, 14 Dec 2024 18:37:41 +0800 Subject: [PATCH 44/48] bug fix --- sky/utils/controller_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index f32717b8001..f9e60906f2d 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -736,7 +736,7 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: store_type, store_cls, bucket_name, sub_path, storage_account_name, \ region = storage_lib.StoreType.get_fields_from_store_url( bucket_wth_prefix) - store_kwargs['allow_bucket_creation'] = False + store_kwargs['_allow_bucket_creation'] = False if storage_account_name is not None: store_kwargs['storage_account_name'] = storage_account_name if region is not None: From d1ae190ea057cac7bb89ca87bccd6cfcd802a99b Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Sat, 14 Dec 2024 18:53:27 +0800 Subject: [PATCH 45/48] bug fix --- sky/data/storage.py | 10 +++++----- tests/smoke_tests/test_mount_and_storage.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index 6a9646b7b20..a291b18757d 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -1418,7 +1418,7 @@ def delete(self) -> None: logger.info(f'{colorama.Fore.GREEN}{msg_str}' f'{colorama.Style.RESET_ALL}') - def delete_sub_path(self) -> None: + def _delete_sub_path(self) -> None: assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' deleted_by_skypilot = self._delete_s3_bucket_sub_path( self.name, self._bucket_sub_path) @@ -1897,7 +1897,7 @@ def delete(self) -> None: logger.info(f'{colorama.Fore.GREEN}{msg_str}' f'{colorama.Style.RESET_ALL}') - def delete_sub_path(self) -> None: + def _delete_sub_path(self) -> None: assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' deleted_by_skypilot = self._delete_gcs_bucket(self.name, self._bucket_sub_path) @@ -2715,7 +2715,7 @@ def delete(self) -> None: logger.info(f'{colorama.Fore.GREEN}{msg_str}' f'{colorama.Style.RESET_ALL}') - def delete_sub_path(self) -> None: + def _delete_sub_path(self) -> None: if self._bucket_sub_path is not None and not self.is_sky_managed: return self._delete_sub_path() assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' @@ -3159,7 +3159,7 @@ def delete(self) -> None: logger.info(f'{colorama.Fore.GREEN}{msg_str}' f'{colorama.Style.RESET_ALL}') - def delete_sub_path(self) -> None: + def _delete_sub_path(self) -> None: if self._bucket_sub_path 
is not None and not self.is_sky_managed: return self._delete_sub_path() assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' @@ -3635,7 +3635,7 @@ def delete(self) -> None: logger.info(f'{colorama.Fore.GREEN}Deleted COS bucket {self.name}.' f'{colorama.Style.RESET_ALL}') - def delete_sub_path(self) -> None: + def _delete_sub_path(self) -> None: if self._bucket_sub_path is not None and not self.is_sky_managed: return self._delete_sub_path() assert self._bucket_sub_path is not None, 'bucket_sub_path is not set' diff --git a/tests/smoke_tests/test_mount_and_storage.py b/tests/smoke_tests/test_mount_and_storage.py index 71c1cb04762..9e506e5bdcb 100644 --- a/tests/smoke_tests/test_mount_and_storage.py +++ b/tests/smoke_tests/test_mount_and_storage.py @@ -1099,7 +1099,7 @@ def test_bucket_sub_path(self, tmp_local_storage_obj_with_sub_path, # Check bucket is empty, all files under sub directory should be deleted store = tmp_local_storage_obj_with_sub_path.stores[store_type] - store.delete_sub_path() + store._delete_sub_path() files = self.list_all_files(store_type, tmp_local_storage_obj_with_sub_path.name) assert len(files) == 0 From 469cede0207877d93d5c6b8e4faf839bb5685613 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Sat, 14 Dec 2024 23:16:31 +0800 Subject: [PATCH 46/48] fix test case --- tests/smoke_tests/test_managed_job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/smoke_tests/test_managed_job.py b/tests/smoke_tests/test_managed_job.py index 80b3734aa93..86a7c971526 100644 --- a/tests/smoke_tests/test_managed_job.py +++ b/tests/smoke_tests/test_managed_job.py @@ -785,7 +785,7 @@ def test_managed_jobs_intermediate_storage(generic_cloud: str): [ *smoke_tests_utils.STORAGE_SETUP_COMMANDS, # Verify command fails with correct error - run only once - f'err=$(sky jobs launch -n {name} --cloud {generic_cloud} {file_path} -y 2>&1); ret=$?; [ $ret -eq 0 ] || ! echo "$err" | grep "ValueError: Storage {intermediate_storage_name} not found" > /dev/null && exit 1 || exit 0', + f'err=$(sky jobs launch -n {name} --cloud {generic_cloud} {file_path} -y 2>&1); ret=$?; [ $ret -eq 0 ] || ! echo "$err" | grep "StorageBucketCreateError: Configured to use a non-existent bucket: {intermediate_storage_name}" > /dev/null && exit 1 || exit 0', f'aws s3api create-bucket --bucket {intermediate_storage_name}', f'sky jobs launch -n {name} --cloud {generic_cloud} {file_path} -y', # fail because the bucket does not exist From faedd282b13770407bdc0c6da3f164b90d0a8862 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Sun, 15 Dec 2024 00:50:18 +0800 Subject: [PATCH 47/48] bug fix --- sky/task.py | 4 +++- sky/utils/controller_utils.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sky/task.py b/sky/task.py index 46a479aeaa5..8ec77802c77 100644 --- a/sky/task.py +++ b/sky/task.py @@ -948,7 +948,7 @@ def _get_preferred_store( store_type = storage_lib.StoreType.from_cloud(storage_cloud_str) return store_type, storage_region - def sync_storage_mounts(self) -> None: + def sync_storage_mounts(self, force_sync: bool = False) -> None: """(INTERNAL) Eagerly syncs storage mounts to cloud storage. 
After syncing up, COPY-mode storage mounts are translated into regular @@ -960,6 +960,8 @@ def sync_storage_mounts(self) -> None: store_type, store_region = self._get_preferred_store() self.storage_plans[storage] = store_type storage.add_store(store_type, store_region) + elif force_sync: + storage.sync_all_stores() else: # We will download the first store that is added to remote. self.storage_plans[storage] = list(storage.stores.keys())[0] diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index f6645de21c0..3c454890486 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -690,7 +690,7 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: bucket_wth_prefix = skypilot_config.get_nested(('jobs', 'bucket'), None) store_kwargs: Dict[str, Any] = {} if bucket_wth_prefix is None: - store_cls = sub_path = None + store_type = store_cls = sub_path = None storage_account_name = region = None bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format( username=common_utils.get_cleaned_username(), id=run_id) @@ -832,7 +832,7 @@ def _sub_path_join(sub_path: Optional[str], path: str) -> str: ux_utils.spinner_message('Uploading local sources to storage[/] ' '[dim]View storages: sky storage ls')) try: - task.sync_storage_mounts() + task.sync_storage_mounts(force_sync=bucket_wth_prefix is not None) except (ValueError, exceptions.NoCloudAccessError) as e: if 'No enabled cloud for storage' in str(e) or isinstance( e, exceptions.NoCloudAccessError): From 8576e142185eba5811ccf64a5883e6e5702bdb9f Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Sun, 15 Dec 2024 00:53:27 +0800 Subject: [PATCH 48/48] fix --- sky/task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/task.py b/sky/task.py index 8ec77802c77..339f2ddf122 100644 --- a/sky/task.py +++ b/sky/task.py @@ -960,9 +960,9 @@ def sync_storage_mounts(self, force_sync: bool = False) -> None: store_type, store_region = self._get_preferred_store() self.storage_plans[storage] = store_type storage.add_store(store_type, store_region) - elif force_sync: - storage.sync_all_stores() else: + if force_sync: + storage.sync_all_stores() # We will download the first store that is added to remote. self.storage_plans[storage] = list(storage.stores.keys())[0]
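The series above routes a managed job's workdir and file mounts into one user-configured bucket, read from the `jobs.bucket` config entry via `skypilot_config.get_nested(('jobs', 'bucket'), None)`, and isolates each run under its own sub-path of that bucket. Below is a minimal standalone sketch of that layout logic only, assuming simplified sub-path formats and helper names; it is not SkyPilot's actual code, and the real `_sub_path_join` and `FILE_MOUNTS_*_SUBPATH` constants may differ.

    # layout_sketch.py -- illustrative only; the sub-path formats and helper
    # names below are assumptions, not SkyPilot's real constants.
    from typing import Optional
    from urllib.parse import urlsplit

    # Hypothetical stand-ins for the per-run sub-path format constants.
    WORKDIR_SUBPATH = 'job-{run_id}/workdir'
    FILE_MOUNTS_SUBPATH = 'job-{run_id}/local-file-mounts/{i}'


    def sub_path_join(sub_path: Optional[str], path: str) -> str:
        # Prepend the user's bucket prefix (if any) to a per-run sub-path,
        # mirroring the role of _sub_path_join in controller_utils.
        if sub_path is None:
            return path
        return f"{sub_path.strip('/')}/{path}"


    def split_bucket_url(url: str):
        # Rough analogue of parsing a jobs.bucket URL such as
        # 's3://my-jobs-bucket/teams/ml' into (scheme, bucket, optional prefix).
        parts = urlsplit(url)
        return parts.scheme, parts.netloc, parts.path.lstrip('/') or None


    if __name__ == '__main__':
        scheme, bucket, prefix = split_bucket_url('s3://my-jobs-bucket/teams/ml')
        run_id = 'abcd1234'
        print(bucket)  # my-jobs-bucket
        print(sub_path_join(prefix, WORKDIR_SUBPATH.format(run_id=run_id)))
        # -> teams/ml/job-abcd1234/workdir
        print(sub_path_join(prefix, FILE_MOUNTS_SUBPATH.format(i=0, run_id=run_id)))
        # -> teams/ml/job-abcd1234/local-file-mounts/0

Because the bucket is user-owned in this mode, the translation passes `_allow_bucket_creation=False` to the store constructors, and the stores' `delete()` paths fall back to `_delete_sub_path()` when the store is not sky-managed, so only the per-run prefix is removed rather than the whole bucket.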