diff --git a/docs/source/examples/syncing-code-artifacts.rst b/docs/source/examples/syncing-code-artifacts.rst index 814bd00fb25..ded8d03f739 100644 --- a/docs/source/examples/syncing-code-artifacts.rst +++ b/docs/source/examples/syncing-code-artifacts.rst @@ -47,10 +47,30 @@ scripts, access checkpoints, etc.). .. note:: + **Exclude files from syncing** + For large, multi-gigabyte workdirs, uploading may be slow because they - are synced to the remote VM(s) with :code:`rsync`. To exclude large files in - your workdir from being uploaded, add them to the :code:`.gitignore` file - (or a ``.git/info/exclude`` file) under the workdir. + are synced to the remote VM(s). To exclude large files in + your workdir from being uploaded, add them to a :code:`.skyignore` file + under your workdir. :code:`.skyignore` follows RSYNC filter rules. + + Example :code:`.skyignore` file: + + .. code-block:: + + # Files that match pattern under ONLY CURRENT directory + /hello.py + /*.txt + /dir + + # Files that match pattern under ALL directories + *.txt + hello.py + + # Files that match pattern under a directory ./dir/ + /dir/*.txt + + Do NOT use ``.`` to indicate local directory (e.g. ``./hello.py``). .. note:: @@ -101,9 +121,8 @@ pass the ``--no-setup`` flag to ``sky launch``. For example, ``sky launch --no-s .. note:: - Items listed in a :code:`.gitignore` file (or a ``.git/info/exclude`` file) - under a local file_mount source are also ignored (the same behavior as - handling ``workdir``). + Items listed in a :code:`.skyignore` file under the local file_mount source + are also ignored (the same behavior as handling ``workdir``). .. note:: diff --git a/docs/source/reference/yaml-spec.rst b/docs/source/reference/yaml-spec.rst index 228cbd7c88f..c5339bcc184 100644 --- a/docs/source/reference/yaml-spec.rst +++ b/docs/source/reference/yaml-spec.rst @@ -22,8 +22,8 @@ Available fields: # If a relative path is used, it's evaluated relative to the location from # which `sky` is called. # - # If a .gitignore file (or a .git/info/exclude file) exists in the working - # directory, files and directories listed in it will be excluded from syncing. + # To exclude files from syncing, add them to a .skyignore file under your working directory. + # Details: https://skypilot.readthedocs.io/en/latest/examples/syncing-code-artifacts.html#uploading-code-and-project-files workdir: ~/my-task-code # Number of nodes (optional; defaults to 1) to launch including the head node. diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index b83817b9b42..24f638a12b9 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -280,18 +280,22 @@ def path_size_megabytes(path: str) -> int: If successful: the size of 'path' in megabytes, rounded down. Otherwise, -1. """ - resolved_path = pathlib.Path(path).expanduser().resolve() git_exclude_filter = '' - if (resolved_path / command_runner.GIT_EXCLUDE).exists(): - # Ensure file exists; otherwise, rsync will error out. - # - # We shlex.quote() because the path may contain spaces: - # 'my dir/.git/info/exclude' - # Without quoting rsync fails. - git_exclude_filter = command_runner.RSYNC_EXCLUDE_OPTION.format( - shlex.quote(str(resolved_path / command_runner.GIT_EXCLUDE))) + resolved_path = pathlib.Path(path).expanduser().resolve() + if (resolved_path / constants.SKY_IGNORE_FILE).exists(): + rsync_filter = command_runner.RSYNC_FILTER_SKYIGNORE + else: + rsync_filter = command_runner.RSYNC_FILTER_GITIGNORE + if (resolved_path / command_runner.GIT_EXCLUDE).exists(): + # Ensure file exists; otherwise, rsync will error out. + # + # We shlex.quote() because the path may contain spaces: + # 'my dir/.git/info/exclude' + # Without quoting rsync fails. + git_exclude_filter = command_runner.RSYNC_EXCLUDE_OPTION.format( + shlex.quote(str(resolved_path / command_runner.GIT_EXCLUDE))) rsync_command = (f'rsync {command_runner.RSYNC_DISPLAY_OPTION} ' - f'{command_runner.RSYNC_FILTER_OPTION} ' + f'{rsync_filter} ' f'{git_exclude_filter} --dry-run {path!r}') rsync_output = '' try: diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 4d6e0eb4fb7..714e4fc14eb 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3056,7 +3056,7 @@ def _sync_workdir(self, handle: CloudVmRayResourceHandle, logger.warning( f'{fore.YELLOW}The size of workdir {workdir!r} ' f'is {dir_size} MB. Try to keep workdir small or use ' - '.gitignore to exclude large files, as large sizes will slow ' + '.skyignore to exclude large files, as large sizes will slow ' f'down rsync.{style.RESET_ALL}') log_path = os.path.join(self.log_dir, 'workdir_sync.log') @@ -4470,7 +4470,7 @@ def _execute_file_mounts(self, handle: CloudVmRayResourceHandle, logger.warning( f'{fore.YELLOW}The size of file mount src {src!r} ' f'is {src_size} MB. Try to keep src small or use ' - '.gitignore to exclude large files, as large sizes ' + '.skyignore to exclude large files, as large sizes ' f'will slow down rsync. {style.RESET_ALL}') if os.path.islink(full_src): logger.warning( diff --git a/sky/data/storage.py b/sky/data/storage.py index 5214799d2f3..78174ad1ed5 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -1298,8 +1298,7 @@ def get_file_sync_command(base_dir_path, file_names): def get_dir_sync_command(src_dir_path, dest_dir_name): # we exclude .git directory from the sync - excluded_list = storage_utils.get_excluded_files_from_gitignore( - src_dir_path) + excluded_list = storage_utils.get_excluded_files(src_dir_path) excluded_list.append('.git/*') excludes = ' '.join([ f'--exclude {shlex.quote(file_name)}' @@ -1764,8 +1763,7 @@ def get_file_sync_command(base_dir_path, file_names): return sync_command def get_dir_sync_command(src_dir_path, dest_dir_name): - excluded_list = storage_utils.get_excluded_files_from_gitignore( - src_dir_path) + excluded_list = storage_utils.get_excluded_files(src_dir_path) # we exclude .git directory from the sync excluded_list.append(r'^\.git/.*$') excludes = '|'.join(excluded_list) @@ -2490,8 +2488,7 @@ def get_file_sync_command(base_dir_path, file_names) -> str: def get_dir_sync_command(src_dir_path, dest_dir_name) -> str: # we exclude .git directory from the sync - excluded_list = storage_utils.get_excluded_files_from_gitignore( - src_dir_path) + excluded_list = storage_utils.get_excluded_files(src_dir_path) excluded_list.append('.git/') excludes_list = ';'.join( [file_name.rstrip('*') for file_name in excluded_list]) @@ -2895,8 +2892,7 @@ def get_file_sync_command(base_dir_path, file_names): def get_dir_sync_command(src_dir_path, dest_dir_name): # we exclude .git directory from the sync - excluded_list = storage_utils.get_excluded_files_from_gitignore( - src_dir_path) + excluded_list = storage_utils.get_excluded_files(src_dir_path) excluded_list.append('.git/*') excludes = ' '.join([ f'--exclude {shlex.quote(file_name)}' diff --git a/sky/data/storage_utils.py b/sky/data/storage_utils.py index 245325806a3..a1295d5e3ee 100644 --- a/sky/data/storage_utils.py +++ b/sky/data/storage_utils.py @@ -1,4 +1,5 @@ """Utility functions for the storage module.""" +import glob import os import shlex import subprocess @@ -8,6 +9,8 @@ from sky import exceptions from sky import sky_logging +from sky.skylet import constants +from sky.utils import common_utils from sky.utils import log_utils from sky.utils.cli_utils import status_utils @@ -63,6 +66,42 @@ def format_storage_table(storages: List[Dict[str, Any]], return 'No existing storage.' +def get_excluded_files_from_skyignore(src_dir_path: str) -> List[str]: + """List files and patterns ignored by the .skyignore file + in the given source directory. + """ + excluded_list: List[str] = [] + expand_src_dir_path = os.path.expanduser(src_dir_path) + skyignore_path = os.path.join(expand_src_dir_path, + constants.SKY_IGNORE_FILE) + + try: + with open(skyignore_path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if line and not line.startswith('#'): + # Make parsing consistent with rsync. + # Rsync uses '/' as current directory. + if line.startswith('/'): + line = '.' + line + else: + line = '**/' + line + # Find all files matching the pattern. + matching_files = glob.glob(os.path.join( + expand_src_dir_path, line), + recursive=True) + # Process filenames to comply with cloud rsync format. + for i in range(len(matching_files)): + matching_files[i] = os.path.relpath( + matching_files[i], expand_src_dir_path) + excluded_list.extend(matching_files) + except IOError as e: + logger.warning(f'Error reading {skyignore_path}: ' + f'{common_utils.format_exception(e, use_bracket=True)}') + + return excluded_list + + def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]: """ Lists files and patterns ignored by git in the source directory @@ -78,7 +117,8 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]: expand_src_dir_path = os.path.expanduser(src_dir_path) git_exclude_path = os.path.join(expand_src_dir_path, '.git/info/exclude') - gitignore_path = os.path.join(expand_src_dir_path, '.gitignore') + gitignore_path = os.path.join(expand_src_dir_path, + constants.GIT_IGNORE_FILE) git_exclude_exists = os.path.isfile(git_exclude_path) gitignore_exists = os.path.isfile(gitignore_path) @@ -162,3 +202,19 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]: to_be_excluded += '*' excluded_list.append(to_be_excluded) return excluded_list + + +def get_excluded_files(src_dir_path: str) -> List[str]: + # TODO: this could return a huge list of files, + # should think of ways to optimize. + """ List files and directories to be excluded.""" + expand_src_dir_path = os.path.expanduser(src_dir_path) + skyignore_path = os.path.join(expand_src_dir_path, + constants.SKY_IGNORE_FILE) + if os.path.exists(skyignore_path): + logger.info(f'Exclude files to sync to cluster based on ' + f'{constants.SKY_IGNORE_FILE}.') + return get_excluded_files_from_skyignore(src_dir_path) + logger.info(f'Exclude files to sync to cluster based on ' + f'{constants.GIT_IGNORE_FILE}.') + return get_excluded_files_from_gitignore(src_dir_path) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index f23dc8100b5..5729d75c968 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -7,6 +7,8 @@ SKY_LOGS_DIRECTORY = '~/sky_logs' SKY_REMOTE_WORKDIR = '~/sky_workdir' +SKY_IGNORE_FILE = '.skyignore' +GIT_IGNORE_FILE = '.gitignore' # Default Ray port is 6379. Default Ray dashboard port is 8265. # Default Ray tempdir is /tmp/ray. diff --git a/sky/utils/command_runner.py b/sky/utils/command_runner.py index 3d4bcb0af9a..2936e7c5e62 100644 --- a/sky/utils/command_runner.py +++ b/sky/utils/command_runner.py @@ -16,8 +16,6 @@ logger = sky_logging.init_logger(__name__) -# The git exclude file to support. -GIT_EXCLUDE = '.git/info/exclude' # Rsync options # TODO(zhwu): This will print a per-file progress bar (with -P), # shooting a lot of messages to the output. --info=progress2 is used @@ -30,7 +28,10 @@ # Note that "-" is mandatory for rsync and means all patterns in the ignore # files are treated as *exclude* patterns. Non-exclude patterns, e.g., "! # do_not_exclude" doesn't work, even though git allows it. -RSYNC_FILTER_OPTION = '--filter=\'dir-merge,- .gitignore\'' +RSYNC_FILTER_SKYIGNORE = f'--filter=\'dir-merge,- {constants.SKY_IGNORE_FILE}\'' +RSYNC_FILTER_GITIGNORE = f'--filter=\'dir-merge,- {constants.GIT_IGNORE_FILE}\'' +# The git exclude file to support. +GIT_EXCLUDE = '.git/info/exclude' RSYNC_EXCLUDE_OPTION = '--exclude-from={}' _HASH_MAX_LENGTH = 10 @@ -237,21 +238,23 @@ def _rsync( rsync_command += ['rsync', RSYNC_DISPLAY_OPTION] # --filter - rsync_command.append(RSYNC_FILTER_OPTION) - - if up: - # Build --exclude-from argument. - # The source is a local path, so we need to resolve it. - resolved_source = pathlib.Path(source).expanduser().resolve() - if (resolved_source / GIT_EXCLUDE).exists(): - # Ensure file exists; otherwise, rsync will error out. - # - # We shlex.quote() because the path may contain spaces: - # 'my dir/.git/info/exclude' - # Without quoting rsync fails. - rsync_command.append( - RSYNC_EXCLUDE_OPTION.format( - shlex.quote(str(resolved_source / GIT_EXCLUDE)))) + # The source is a local path, so we need to resolve it. + resolved_source = pathlib.Path(source).expanduser().resolve() + if (resolved_source / constants.SKY_IGNORE_FILE).exists(): + rsync_command.append(RSYNC_FILTER_SKYIGNORE) + else: + rsync_command.append(RSYNC_FILTER_GITIGNORE) + if up: + # Build --exclude-from argument. + if (resolved_source / GIT_EXCLUDE).exists(): + # Ensure file exists; otherwise, rsync will error out. + # + # We shlex.quote() because the path may contain spaces: + # 'my dir/.git/info/exclude' + # Without quoting rsync fails. + rsync_command.append( + RSYNC_EXCLUDE_OPTION.format( + shlex.quote(str(resolved_source / GIT_EXCLUDE)))) rsync_command.append(f'-e {shlex.quote(rsh_option)}') diff --git a/sky/utils/command_runner.pyi b/sky/utils/command_runner.pyi index 51b22a259ea..a2c524e4e5d 100644 --- a/sky/utils/command_runner.pyi +++ b/sky/utils/command_runner.pyi @@ -16,7 +16,8 @@ from sky.utils import subprocess_utils as subprocess_utils GIT_EXCLUDE: str RSYNC_DISPLAY_OPTION: str -RSYNC_FILTER_OPTION: str +RSYNC_FILTER_GITIGNORE: str +RSYNC_FILTER_SKYIGNORE: str RSYNC_EXCLUDE_OPTION: str ALIAS_SUDO_TO_EMPTY_FOR_ROOT_CMD: str diff --git a/tests/unit_tests/test_storage_utils.py b/tests/unit_tests/test_storage_utils.py new file mode 100644 index 00000000000..cd1e436390b --- /dev/null +++ b/tests/unit_tests/test_storage_utils.py @@ -0,0 +1,55 @@ +import os +import tempfile + +from sky.data import storage_utils +from sky.skylet import constants + + +def test_get_excluded_files_from_skyignore_no_file(): + excluded_files = storage_utils.get_excluded_files_from_skyignore('.') + assert len(excluded_files) == 0 + + +def test_get_excluded_files_from_skyignore(): + with tempfile.TemporaryDirectory() as temp_dir: + # Create workdir + dirs = ['remove_dir', 'dir', 'dir/subdir', 'dir/subdir/remove_dir'] + files = [ + 'remove.py', 'remove.sh', 'remove.a', 'keep.py', 'remove.a', + 'dir/keep.txt', 'dir/remove.sh', 'dir/keep.a', 'dir/remove.b', + 'dir/remove.a', 'dir/subdir/keep.b', 'dir/subdir/remove.py' + ] + for dir_name in dirs: + os.makedirs(os.path.join(temp_dir, dir_name), exist_ok=True) + for file_path in files: + full_path = os.path.join(temp_dir, file_path) + with open(full_path, 'w') as f: + f.write('test content') + + # Create skyignore file + skyignore_content = """ + # Current directory + /remove.py + /remove_dir + /*.a + /dir/*.b + # Pattern match for all subdirectories + *.sh + remove.a + """ + skyignore_path = os.path.join(temp_dir, constants.SKY_IGNORE_FILE) + with open(skyignore_path, 'w') as f: + f.write(skyignore_content) + + # Test function + excluded_files = storage_utils.get_excluded_files_from_skyignore( + temp_dir) + + # Validate results + expected_excluded_files = [ + 'remove.py', 'remove_dir', 'remove.sh', 'remove.a', 'dir/remove.sh', + 'dir/remove.b', 'remove.a', 'dir/remove.a' + ] + for file_path in expected_excluded_files: + assert file_path in excluded_files + assert len(excluded_files) == len(expected_excluded_files)