diff --git a/README.md b/README.md index 95e6b38..797df4d 100644 --- a/README.md +++ b/README.md @@ -100,6 +100,8 @@ Reading a sequence repository requires several Python packages, all of which are available from pypi. Installation should be as simple as `pip install biocommons.seqrepo`. +Acquiring SeqRepo snapshots using the CLI requires an [rsync](https://github.com/RsyncProject/rsync) binary to be available on the system $PATH. Note that [openrsync](https://www.openrsync.org/), which now ships with new MacOS installs, does not support all required functions. Mac users should install rsync from [HomeBrew](https://formulae.brew.sh/formula/rsync) and ensure that it's available on the $PATH. + *Writing* sequence files also requires `bgzip`, which provided in the [htslib](https://github.com/samtools/htslib) repo. Ubuntu users should install the `tabix` package with `sudo apt install tabix`. diff --git a/src/biocommons/seqrepo/cli.py b/src/biocommons/seqrepo/cli.py index a767d34..f57d699 100644 --- a/src/biocommons/seqrepo/cli.py +++ b/src/biocommons/seqrepo/cli.py @@ -49,6 +49,28 @@ _logger = logging.getLogger(__name__) +class RsyncExeError(Exception): + """Raise for issues relating to rsync executable.""" + + +def _raise_for_rsync_binary_error(e: subprocess.CalledProcessError, opts: argparse.Namespace): + """Check if error appears to be rooted in a known problem with recent (as of 2024) + MacOS distributions, which ship with an incompatible rsync clone. + + Raise a custom error type if so. Do nothing otherwise. Calling contexts should + presumably raise the original error in that case. + + :param e: originating subprocess error + :param opts: CLI args given + :raise RsyncExeError: if issue appears to be related to openrsync + """ + if e.returncode == 1: + result = subprocess.check_output([opts.rsync_exe, "--version"]) + if result is not None and ("openrsync" in result.decode()): + msg = f"Binary located at {opts.rsync_exe} appears to be an `openrsync` instance, but the SeqRepo CLI requires `rsync` (NOT `openrsync`). Please install `rsync` and either make it available on the $PATH variable or manually provide its location with the `--rsync-exe` option. See README for more information." # noqa: E501 + raise RsyncExeError(msg) + + def _get_remote_instances(opts: argparse.Namespace) -> list[str]: line_re = re.compile(r"d[-rwx]{9}\s+[\d,]+ \d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2} (.+)") rsync_cmd = [ @@ -58,7 +80,12 @@ def _get_remote_instances(opts: argparse.Namespace) -> list[str]: opts.remote_host + "::seqrepo", ] _logger.debug("Executing `" + " ".join(rsync_cmd) + "`") - lines = subprocess.check_output(rsync_cmd).decode().splitlines()[1:] + try: + result = subprocess.check_output(rsync_cmd) + except subprocess.CalledProcessError as e: + _raise_for_rsync_binary_error(e, opts) + raise + lines = result.decode().splitlines()[1:] dirs = (m.group(1) for m in (line_re.match(line) for line in lines) if m) return sorted(list(filter(instance_name_new_re.match, dirs))) @@ -571,7 +598,11 @@ def pull(opts: argparse.Namespace) -> None: _logger.debug("Executing: " + " ".join(cmd)) if not opts.dry_run: - subprocess.check_call(cmd) + try: + subprocess.check_call(cmd) + except subprocess.CalledProcessError as e: + _raise_for_rsync_binary_error(e, opts) + raise dst_dir = os.path.join(opts.root_directory, instance_name) os.rename(tmp_dir, dst_dir) _logger.info("{}: successfully updated ({})".format(instance_name, dst_dir))