Skip to content

Commit

Permalink
[Storage] Add .skyignore support (skypilot-org#4038)
Browse files Browse the repository at this point in the history
* [Storage] Add .skyignore support

* lint fix

* fix lint

* Make sky job launch consistent with sky launch

* remove unused comments

* Don't use .git/info/exclude when .skyignore is present

* Don't use .git/info/exclude when .skyignore is present 2

* Update SkyPilot Reference Page

* address comments

* Handle all files under current dir

* link

* no absolute path

* use / in front of individual files and dirs

* correct **/

---------

Co-authored-by: Yika Luo <[email protected]>
  • Loading branch information
yika-luo and Yika Luo authored Oct 8, 2024
1 parent d5b6d89 commit 3f898ab
Show file tree
Hide file tree
Showing 10 changed files with 184 additions and 48 deletions.
31 changes: 25 additions & 6 deletions docs/source/examples/syncing-code-artifacts.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,30 @@ scripts, access checkpoints, etc.).

.. note::

**Exclude files from syncing**

For large, multi-gigabyte workdirs, uploading may be slow because they
are synced to the remote VM(s) with :code:`rsync`. To exclude large files in
your workdir from being uploaded, add them to the :code:`.gitignore` file
(or a ``.git/info/exclude`` file) under the workdir.
are synced to the remote VM(s). To exclude large files in
your workdir from being uploaded, add them to a :code:`.skyignore` file
under your workdir. :code:`.skyignore` follows rsync filter rules.

Example :code:`.skyignore` file:

.. code-block::
# Files that match pattern under ONLY CURRENT directory
/hello.py
/*.txt
/dir
# Files that match pattern under ALL directories
*.txt
hello.py
# Files that match pattern under a directory ./dir/
/dir/*.txt
Do NOT use ``.`` to indicate the local directory (e.g. ``./hello.py``).

.. note::

Expand Down Expand Up @@ -101,9 +121,8 @@ pass the ``--no-setup`` flag to ``sky launch``. For example, ``sky launch --no-s

.. note::

Items listed in a :code:`.gitignore` file (or a ``.git/info/exclude`` file)
under a local file_mount source are also ignored (the same behavior as
handling ``workdir``).
Items listed in a :code:`.skyignore` file under the local file_mount source
are also ignored (the same behavior as handling ``workdir``).

.. note::

Expand Down
4 changes: 2 additions & 2 deletions docs/source/reference/yaml-spec.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ Available fields:
# If a relative path is used, it's evaluated relative to the location from
# which `sky` is called.
#
# If a .gitignore file (or a .git/info/exclude file) exists in the working
# directory, files and directories listed in it will be excluded from syncing.
# To exclude files from syncing, add them to a .skyignore file under your working directory.
# Details: https://skypilot.readthedocs.io/en/latest/examples/syncing-code-artifacts.html#uploading-code-and-project-files
workdir: ~/my-task-code
# Number of nodes (optional; defaults to 1) to launch including the head node.
Expand Down
24 changes: 14 additions & 10 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,18 +280,22 @@ def path_size_megabytes(path: str) -> int:
If successful: the size of 'path' in megabytes, rounded down. Otherwise,
-1.
"""
resolved_path = pathlib.Path(path).expanduser().resolve()
git_exclude_filter = ''
if (resolved_path / command_runner.GIT_EXCLUDE).exists():
# Ensure file exists; otherwise, rsync will error out.
#
# We shlex.quote() because the path may contain spaces:
# 'my dir/.git/info/exclude'
# Without quoting rsync fails.
git_exclude_filter = command_runner.RSYNC_EXCLUDE_OPTION.format(
shlex.quote(str(resolved_path / command_runner.GIT_EXCLUDE)))
resolved_path = pathlib.Path(path).expanduser().resolve()
if (resolved_path / constants.SKY_IGNORE_FILE).exists():
rsync_filter = command_runner.RSYNC_FILTER_SKYIGNORE
else:
rsync_filter = command_runner.RSYNC_FILTER_GITIGNORE
if (resolved_path / command_runner.GIT_EXCLUDE).exists():
# Ensure file exists; otherwise, rsync will error out.
#
# We shlex.quote() because the path may contain spaces:
# 'my dir/.git/info/exclude'
# Without quoting rsync fails.
git_exclude_filter = command_runner.RSYNC_EXCLUDE_OPTION.format(
shlex.quote(str(resolved_path / command_runner.GIT_EXCLUDE)))
rsync_command = (f'rsync {command_runner.RSYNC_DISPLAY_OPTION} '
f'{command_runner.RSYNC_FILTER_OPTION} '
f'{rsync_filter} '
f'{git_exclude_filter} --dry-run {path!r}')
rsync_output = ''
try:
Expand Down
4 changes: 2 additions & 2 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -3056,7 +3056,7 @@ def _sync_workdir(self, handle: CloudVmRayResourceHandle,
logger.warning(
f'{fore.YELLOW}The size of workdir {workdir!r} '
f'is {dir_size} MB. Try to keep workdir small or use '
'.gitignore to exclude large files, as large sizes will slow '
'.skyignore to exclude large files, as large sizes will slow '
f'down rsync.{style.RESET_ALL}')

log_path = os.path.join(self.log_dir, 'workdir_sync.log')
Expand Down Expand Up @@ -4470,7 +4470,7 @@ def _execute_file_mounts(self, handle: CloudVmRayResourceHandle,
logger.warning(
f'{fore.YELLOW}The size of file mount src {src!r} '
f'is {src_size} MB. Try to keep src small or use '
'.gitignore to exclude large files, as large sizes '
'.skyignore to exclude large files, as large sizes '
f'will slow down rsync. {style.RESET_ALL}')
if os.path.islink(full_src):
logger.warning(
Expand Down
12 changes: 4 additions & 8 deletions sky/data/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -1298,8 +1298,7 @@ def get_file_sync_command(base_dir_path, file_names):

def get_dir_sync_command(src_dir_path, dest_dir_name):
# we exclude .git directory from the sync
excluded_list = storage_utils.get_excluded_files_from_gitignore(
src_dir_path)
excluded_list = storage_utils.get_excluded_files(src_dir_path)
excluded_list.append('.git/*')
excludes = ' '.join([
f'--exclude {shlex.quote(file_name)}'
Expand Down Expand Up @@ -1764,8 +1763,7 @@ def get_file_sync_command(base_dir_path, file_names):
return sync_command

def get_dir_sync_command(src_dir_path, dest_dir_name):
excluded_list = storage_utils.get_excluded_files_from_gitignore(
src_dir_path)
excluded_list = storage_utils.get_excluded_files(src_dir_path)
# we exclude .git directory from the sync
excluded_list.append(r'^\.git/.*$')
excludes = '|'.join(excluded_list)
Expand Down Expand Up @@ -2490,8 +2488,7 @@ def get_file_sync_command(base_dir_path, file_names) -> str:

def get_dir_sync_command(src_dir_path, dest_dir_name) -> str:
# we exclude .git directory from the sync
excluded_list = storage_utils.get_excluded_files_from_gitignore(
src_dir_path)
excluded_list = storage_utils.get_excluded_files(src_dir_path)
excluded_list.append('.git/')
excludes_list = ';'.join(
[file_name.rstrip('*') for file_name in excluded_list])
Expand Down Expand Up @@ -2895,8 +2892,7 @@ def get_file_sync_command(base_dir_path, file_names):

def get_dir_sync_command(src_dir_path, dest_dir_name):
# we exclude .git directory from the sync
excluded_list = storage_utils.get_excluded_files_from_gitignore(
src_dir_path)
excluded_list = storage_utils.get_excluded_files(src_dir_path)
excluded_list.append('.git/*')
excludes = ' '.join([
f'--exclude {shlex.quote(file_name)}'
Expand Down
58 changes: 57 additions & 1 deletion sky/data/storage_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Utility functions for the storage module."""
import glob
import os
import shlex
import subprocess
Expand All @@ -8,6 +9,8 @@

from sky import exceptions
from sky import sky_logging
from sky.skylet import constants
from sky.utils import common_utils
from sky.utils import log_utils
from sky.utils.cli_utils import status_utils

Expand Down Expand Up @@ -63,6 +66,42 @@ def format_storage_table(storages: List[Dict[str, Any]],
return 'No existing storage.'


def get_excluded_files_from_skyignore(src_dir_path: str) -> List[str]:
    """List the paths excluded by the .skyignore file in a source directory.

    Each non-empty, non-comment line of the .skyignore file is treated as a
    pattern and expanded with ``glob`` against the files that currently exist
    under the source directory:

    * A pattern starting with ``/`` is anchored at the source directory
      itself (e.g. ``/*.txt`` only matches top-level ``.txt`` files).
    * Any other pattern is matched at every directory depth
      (e.g. ``*.txt`` matches ``a.txt`` and ``dir/a.txt``).

    Args:
        src_dir_path: Directory containing the .skyignore file. A leading
            ``~`` is expanded.

    Returns:
        The matched paths, relative to the source directory, in the order
        the patterns appear in the file. A path matched by more than one
        pattern appears once per matching pattern. If the .skyignore file
        cannot be read, a warning is logged and the paths collected so far
        (possibly none) are returned.
    """
    excluded_list: List[str] = []
    expand_src_dir_path = os.path.expanduser(src_dir_path)
    skyignore_path = os.path.join(expand_src_dir_path,
                                  constants.SKY_IGNORE_FILE)

    try:
        with open(skyignore_path, 'r', encoding='utf-8') as f:
            for line in f:
                pattern = line.strip()
                # Skip blank lines and comments.
                if not pattern or pattern.startswith('#'):
                    continue
                # Make parsing consistent with rsync.
                # Rsync uses '/' as current directory.
                if pattern.startswith('/'):
                    pattern = '.' + pattern
                else:
                    pattern = '**/' + pattern
                # Find all files matching the pattern.
                matching_files = glob.glob(os.path.join(
                    expand_src_dir_path, pattern),
                                           recursive=True)
                # Report paths relative to the source directory to comply
                # with the cloud rsync format.
                excluded_list.extend(
                    os.path.relpath(matched_path, expand_src_dir_path)
                    for matched_path in matching_files)
    except IOError as e:
        logger.warning(f'Error reading {skyignore_path}: '
                       f'{common_utils.format_exception(e, use_bracket=True)}')

    return excluded_list


def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
""" Lists files and patterns ignored by git in the source directory
Expand All @@ -78,7 +117,8 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
expand_src_dir_path = os.path.expanduser(src_dir_path)

git_exclude_path = os.path.join(expand_src_dir_path, '.git/info/exclude')
gitignore_path = os.path.join(expand_src_dir_path, '.gitignore')
gitignore_path = os.path.join(expand_src_dir_path,
constants.GIT_IGNORE_FILE)

git_exclude_exists = os.path.isfile(git_exclude_path)
gitignore_exists = os.path.isfile(gitignore_path)
Expand Down Expand Up @@ -162,3 +202,19 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
to_be_excluded += '*'
excluded_list.append(to_be_excluded)
return excluded_list


def get_excluded_files(src_dir_path: str) -> List[str]:
    """List files and directories to be excluded from syncing.

    When a .skyignore file exists in the (user-expanded) source directory,
    the exclusions come from get_excluded_files_from_skyignore(); otherwise
    they come from get_excluded_files_from_gitignore().
    """
    # TODO: this could return a huge list of files,
    # should think of ways to optimize.
    skyignore_path = os.path.join(os.path.expanduser(src_dir_path),
                                  constants.SKY_IGNORE_FILE)
    if not os.path.exists(skyignore_path):
        # No .skyignore present: fall back to the git ignore rules.
        logger.info(f'Exclude files to sync to cluster based on '
                    f'{constants.GIT_IGNORE_FILE}.')
        return get_excluded_files_from_gitignore(src_dir_path)
    logger.info(f'Exclude files to sync to cluster based on '
                f'{constants.SKY_IGNORE_FILE}.')
    return get_excluded_files_from_skyignore(src_dir_path)
2 changes: 2 additions & 0 deletions sky/skylet/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

SKY_LOGS_DIRECTORY = '~/sky_logs'
SKY_REMOTE_WORKDIR = '~/sky_workdir'
SKY_IGNORE_FILE = '.skyignore'
GIT_IGNORE_FILE = '.gitignore'

# Default Ray port is 6379. Default Ray dashboard port is 8265.
# Default Ray tempdir is /tmp/ray.
Expand Down
39 changes: 21 additions & 18 deletions sky/utils/command_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

logger = sky_logging.init_logger(__name__)

# The git exclude file to support.
GIT_EXCLUDE = '.git/info/exclude'
# Rsync options
# TODO(zhwu): This will print a per-file progress bar (with -P),
# shooting a lot of messages to the output. --info=progress2 is used
Expand All @@ -30,7 +28,10 @@
# Note that "-" is mandatory for rsync and means all patterns in the ignore
# files are treated as *exclude* patterns. Non-exclude patterns, e.g., "!
# do_not_exclude" don't work, even though git allows them.
RSYNC_FILTER_OPTION = '--filter=\'dir-merge,- .gitignore\''
RSYNC_FILTER_SKYIGNORE = f'--filter=\'dir-merge,- {constants.SKY_IGNORE_FILE}\''
RSYNC_FILTER_GITIGNORE = f'--filter=\'dir-merge,- {constants.GIT_IGNORE_FILE}\''
# The git exclude file to support.
GIT_EXCLUDE = '.git/info/exclude'
RSYNC_EXCLUDE_OPTION = '--exclude-from={}'

_HASH_MAX_LENGTH = 10
Expand Down Expand Up @@ -237,21 +238,23 @@ def _rsync(
rsync_command += ['rsync', RSYNC_DISPLAY_OPTION]

# --filter
rsync_command.append(RSYNC_FILTER_OPTION)

if up:
# Build --exclude-from argument.
# The source is a local path, so we need to resolve it.
resolved_source = pathlib.Path(source).expanduser().resolve()
if (resolved_source / GIT_EXCLUDE).exists():
# Ensure file exists; otherwise, rsync will error out.
#
# We shlex.quote() because the path may contain spaces:
# 'my dir/.git/info/exclude'
# Without quoting rsync fails.
rsync_command.append(
RSYNC_EXCLUDE_OPTION.format(
shlex.quote(str(resolved_source / GIT_EXCLUDE))))
# The source is a local path, so we need to resolve it.
resolved_source = pathlib.Path(source).expanduser().resolve()
if (resolved_source / constants.SKY_IGNORE_FILE).exists():
rsync_command.append(RSYNC_FILTER_SKYIGNORE)
else:
rsync_command.append(RSYNC_FILTER_GITIGNORE)
if up:
# Build --exclude-from argument.
if (resolved_source / GIT_EXCLUDE).exists():
# Ensure file exists; otherwise, rsync will error out.
#
# We shlex.quote() because the path may contain spaces:
# 'my dir/.git/info/exclude'
# Without quoting rsync fails.
rsync_command.append(
RSYNC_EXCLUDE_OPTION.format(
shlex.quote(str(resolved_source / GIT_EXCLUDE))))

rsync_command.append(f'-e {shlex.quote(rsh_option)}')

Expand Down
3 changes: 2 additions & 1 deletion sky/utils/command_runner.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ from sky.utils import subprocess_utils as subprocess_utils

GIT_EXCLUDE: str
RSYNC_DISPLAY_OPTION: str
RSYNC_FILTER_OPTION: str
RSYNC_FILTER_GITIGNORE: str
RSYNC_FILTER_SKYIGNORE: str
RSYNC_EXCLUDE_OPTION: str
ALIAS_SUDO_TO_EMPTY_FOR_ROOT_CMD: str

Expand Down
55 changes: 55 additions & 0 deletions tests/unit_tests/test_storage_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import os
import tempfile

from sky.data import storage_utils
from sky.skylet import constants


def test_get_excluded_files_from_skyignore_no_file():
    """A directory without a .skyignore file yields no exclusions."""
    # Use a fresh, empty directory instead of the current working directory:
    # the latter may itself contain a .skyignore file, which would make the
    # outcome depend on where the test suite is launched from.
    with tempfile.TemporaryDirectory() as temp_dir:
        excluded_files = storage_utils.get_excluded_files_from_skyignore(
            temp_dir)
    assert excluded_files == []


def test_get_excluded_files_from_skyignore():
    """Anchored and unanchored .skyignore patterns select the right paths."""
    with tempfile.TemporaryDirectory() as temp_dir:
        # Create workdir
        dirs = ['remove_dir', 'dir', 'dir/subdir', 'dir/subdir/remove_dir']
        files = [
            'remove.py', 'remove.sh', 'remove.a', 'keep.py', 'dir/keep.txt',
            'dir/remove.sh', 'dir/keep.a', 'dir/remove.b', 'dir/remove.a',
            'dir/subdir/keep.b', 'dir/subdir/remove.py'
        ]
        for dir_name in dirs:
            os.makedirs(os.path.join(temp_dir, dir_name), exist_ok=True)
        for file_path in files:
            full_path = os.path.join(temp_dir, file_path)
            with open(full_path, 'w', encoding='utf-8') as f:
                f.write('test content')

        # Create skyignore file
        skyignore_lines = [
            '',
            '# Current directory',
            '/remove.py',
            '/remove_dir',
            '/*.a',
            '/dir/*.b',
            '# Pattern match for all subdirectories',
            '*.sh',
            'remove.a',
            '',
        ]
        skyignore_path = os.path.join(temp_dir, constants.SKY_IGNORE_FILE)
        with open(skyignore_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(skyignore_lines))

        # Test function
        excluded_files = storage_utils.get_excluded_files_from_skyignore(
            temp_dir)

        # Validate results
        # 'remove.a' is listed twice on purpose: it is matched by both the
        # anchored '/*.a' pattern and the unanchored 'remove.a' pattern, and
        # the function reports one entry per matching pattern.
        expected_excluded_files = [
            'remove.py', 'remove_dir', 'remove.sh', 'remove.a', 'dir/remove.sh',
            'dir/remove.b', 'remove.a', 'dir/remove.a'
        ]
        # Compare as multisets so that wrong multiplicities are caught, not
        # just membership and total length.
        assert sorted(excluded_files) == sorted(expected_excluded_files)

0 comments on commit 3f898ab

Please sign in to comment.