From 54f531b68c2168179668e526935a40c63c771438 Mon Sep 17 00:00:00 2001 From: nsheff Date: Mon, 30 Mar 2020 12:33:52 -0400 Subject: [PATCH 01/29] add plugin functionality --- refgenconf/__init__.py | 9 ++++++++- refgenconf/plugins.py | 13 +++++++++++++ refgenconf/refgenconf.py | 25 +++++++++++++++++++++++-- 3 files changed, 44 insertions(+), 3 deletions(-) create mode 100644 refgenconf/plugins.py diff --git a/refgenconf/__init__.py b/refgenconf/__init__.py index 51557765..62d59d9f 100644 --- a/refgenconf/__init__.py +++ b/refgenconf/__init__.py @@ -1,11 +1,18 @@ from ._version import __version__ +import pkg_resources from .const import * from .exceptions import * from .helpers import * +from .plugins import * from .refgenconf import * + + __all__ = ["RefGenConf", "select_genome_config", "GenomeConfigFormatError", "MissingAssetError", "MissingConfigDataError", "MissingGenomeError", - "RefgenconfError", "UnboundEnvironmentVariablesError"] + \ + "RefgenconfError", "UnboundEnvironmentVariablesError", + "discovered_plugins"] + \ ["DEFAULT_SERVER"] + CFG_KEY_NAMES + + diff --git a/refgenconf/plugins.py b/refgenconf/plugins.py new file mode 100644 index 00000000..a4956285 --- /dev/null +++ b/refgenconf/plugins.py @@ -0,0 +1,13 @@ +""" Plugin functions """ + +import pkg_resources + +__all__ = ["plugins"] + +# HOOKS is a list of all available plugin entry points +HOOKS = ["post_update", "pre_pull", "pre_tag", "pre_list"] + +plugins = {} +for hook in HOOKS: + plugins[hook] = { entry_point.name: entry_point.load() for entry_point + in pkg_resources.iter_entry_points('refgenie.hooks.' 
+ hook) } diff --git a/refgenconf/refgenconf.py b/refgenconf/refgenconf.py index 81ff8044..d81d1ad5 100755 --- a/refgenconf/refgenconf.py +++ b/refgenconf/refgenconf.py @@ -33,7 +33,7 @@ from .const import * from .helpers import unbound_env_vars, asciify_json_dict, select_genome_config from .exceptions import * - +from .plugins import plugins _LOGGER = logging.getLogger(__name__) @@ -160,6 +160,7 @@ def list(self, genome=None, order=None, include_tags=False): :return Mapping[str, Iterable[str]]: mapping from assembly name to collection of available asset names. """ + self.run_plugins("pre_list") refgens = _select_genomes(sorted(self[CFG_GENOMES_KEY].keys(), key=order), genome) if include_tags: return OrderedDict( @@ -245,6 +246,7 @@ def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, :raise refgenconf.MissingAssetError: if the names assembly is known to this configuration instance, but the requested asset is unknown """ + tag_name = tag_name or self.get_default_tag(genome_name, asset_name) _LOGGER.debug( "getting asset: '{}/{}.{}:{}'".format(genome_name, asset_name, seek_key, @@ -475,6 +477,7 @@ def tag(self, genome, asset, tag, new_tag, files=True): :raise ValueError: when the original tag is not specified :return bool: a logical indicating whether the tagging was successful """ + self.run_plugins("pre_tag") ori_path = self.seek(genome, asset, tag, enclosing_dir=True, strict_exists=True) new_path = os.path.abspath(os.path.join(ori_path, os.pardir, new_tag)) if self.file_path: @@ -504,6 +507,7 @@ def tag(self, genome, asset, tag, new_tag, files=True): _LOGGER.debug("Original asset has been moved from '{}' to '{}'". format(ori_path, new_path)) + def cfg_tag_asset(self, genome, asset, tag, new_tag): """ Retags the asset selected by the tag with the new_tag. 
@@ -621,6 +625,7 @@ def pull(self, genome, asset, tag, unpack=True, force=None, :raise refgenconf.RefGenConfError: if the object update is requested in a non-writable state """ + self.run_plugins("pre_pull") missing_vars = unbound_env_vars(self[CFG_FOLDER_KEY]) if missing_vars: raise UnboundEnvironmentVariablesError(", ".join(missing_vars)) @@ -764,6 +769,7 @@ def msg_overwrite(): self.set_default_pointer(*gat) return gat, archive_data, server_url + def remove_asset_from_relatives(self, genome, asset, tag): """ Remove any relationship links associated with the selected asset @@ -809,7 +815,7 @@ def update_relatives_assets(self, genome, asset, tag=None, data=None, children=F def update_seek_keys(self, genome, asset, tag=None, keys=None): """ - A convenience method which wraps the update assets and uses it to + A convenience method which wraps the updated assets and uses it to update the seek keys for a tagged asset. :param str genome: genome to be added/updated @@ -1238,6 +1244,21 @@ def id(self, genome, asset, tag=None): raise MissingConfigDataError("Digest does not exist for: {}/{}:{}". format(genome, asset, tag)) + def run_plugins(self, hook): + """ + Runs all installed plugins for the specified hook. 
+ + :param str hook: hook idenfier + """ + _LOGGER.debug("Activating plugins...") + for name, func in plugins[hook].items(): + _LOGGER.debug("Running {} plugin: {}".format(hook, name)) + func(self) + + def write(self): + super(RefGenConf, self).write() + self.run_plugins("post_update") + class DownloadProgressBar(tqdm): """ From 98092d603fe70d2b7ed7ab52f08e4cab10827b00 Mon Sep 17 00:00:00 2001 From: nsheff Date: Mon, 30 Mar 2020 12:38:16 -0400 Subject: [PATCH 02/29] cleanup --- refgenconf/__init__.py | 3 +-- refgenconf/refgenconf.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/refgenconf/__init__.py b/refgenconf/__init__.py index 62d59d9f..f40b09e0 100644 --- a/refgenconf/__init__.py +++ b/refgenconf/__init__.py @@ -11,8 +11,7 @@ __all__ = ["RefGenConf", "select_genome_config", "GenomeConfigFormatError", "MissingAssetError", "MissingConfigDataError", "MissingGenomeError", - "RefgenconfError", "UnboundEnvironmentVariablesError", - "discovered_plugins"] + \ + "RefgenconfError", "UnboundEnvironmentVariablesError"] + \ ["DEFAULT_SERVER"] + CFG_KEY_NAMES diff --git a/refgenconf/refgenconf.py b/refgenconf/refgenconf.py index d81d1ad5..96908e3a 100755 --- a/refgenconf/refgenconf.py +++ b/refgenconf/refgenconf.py @@ -1250,7 +1250,6 @@ def run_plugins(self, hook): :param str hook: hook idenfier """ - _LOGGER.debug("Activating plugins...") for name, func in plugins[hook].items(): _LOGGER.debug("Running {} plugin: {}".format(hook, name)) func(self) From 9a92a70aa08f4958389951505225255dd4cf5857 Mon Sep 17 00:00:00 2001 From: nsheff Date: Mon, 30 Mar 2020 12:41:53 -0400 Subject: [PATCH 03/29] cleanup --- refgenconf/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/refgenconf/__init__.py b/refgenconf/__init__.py index f40b09e0..f59b9a92 100644 --- a/refgenconf/__init__.py +++ b/refgenconf/__init__.py @@ -1,5 +1,4 @@ from ._version import __version__ -import pkg_resources from .const import * from .exceptions import * From 
e5dffe39f9a8f2833a75a6588df77b0872aa71f9 Mon Sep 17 00:00:00 2001 From: nsheff Date: Mon, 30 Mar 2020 12:42:18 -0400 Subject: [PATCH 04/29] cleanup --- refgenconf/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/refgenconf/__init__.py b/refgenconf/__init__.py index f59b9a92..177de8ce 100644 --- a/refgenconf/__init__.py +++ b/refgenconf/__init__.py @@ -6,11 +6,7 @@ from .plugins import * from .refgenconf import * - - __all__ = ["RefGenConf", "select_genome_config", "GenomeConfigFormatError", "MissingAssetError", "MissingConfigDataError", "MissingGenomeError", "RefgenconfError", "UnboundEnvironmentVariablesError"] + \ ["DEFAULT_SERVER"] + CFG_KEY_NAMES - - From 775065e301fa549b0485fcedf0d88435185be1e5 Mon Sep 17 00:00:00 2001 From: nsheff Date: Mon, 30 Mar 2020 12:43:12 -0400 Subject: [PATCH 05/29] cleanup --- refgenconf/refgenconf.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/refgenconf/refgenconf.py b/refgenconf/refgenconf.py index 96908e3a..2c351aa7 100755 --- a/refgenconf/refgenconf.py +++ b/refgenconf/refgenconf.py @@ -246,7 +246,6 @@ def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, :raise refgenconf.MissingAssetError: if the names assembly is known to this configuration instance, but the requested asset is unknown """ - tag_name = tag_name or self.get_default_tag(genome_name, asset_name) _LOGGER.debug( "getting asset: '{}/{}.{}:{}'".format(genome_name, asset_name, seek_key, @@ -507,7 +506,6 @@ def tag(self, genome, asset, tag, new_tag, files=True): _LOGGER.debug("Original asset has been moved from '{}' to '{}'". format(ori_path, new_path)) - def cfg_tag_asset(self, genome, asset, tag, new_tag): """ Retags the asset selected by the tag with the new_tag. 
@@ -769,7 +767,6 @@ def msg_overwrite(): self.set_default_pointer(*gat) return gat, archive_data, server_url - def remove_asset_from_relatives(self, genome, asset, tag): """ Remove any relationship links associated with the selected asset From eb5e2725567df6ab77628760cf5a28f41580aa80 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 30 Mar 2020 16:34:38 -0400 Subject: [PATCH 06/29] update version and changelog --- docs/changelog.md | 5 +++++ refgenconf/_version.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index f8c8a55f..a1fc0c75 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,11 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. +## [0.7.1] - unreleased + +### Added +- plugins functionality + ## [0.7.0] - 2020-03-17 ### Added diff --git a/refgenconf/_version.py b/refgenconf/_version.py index 49e0fc1e..fcf769f5 100644 --- a/refgenconf/_version.py +++ b/refgenconf/_version.py @@ -1 +1 @@ -__version__ = "0.7.0" +__version__ = "0.7.1-dev" From 5aba3fac15bda2e3d9cfe5e285c1274f4c91e352 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 30 Mar 2020 16:46:14 -0400 Subject: [PATCH 07/29] centralize hooks ids definition --- refgenconf/const.py | 14 +++++++++++++- refgenconf/plugins.py | 9 ++++----- refgenconf/refgenconf.py | 8 ++++---- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/refgenconf/const.py b/refgenconf/const.py index 6e851861..ff6c4b50 100644 --- a/refgenconf/const.py +++ b/refgenconf/const.py @@ -94,6 +94,18 @@ "CFG_ASSET_CHILDREN_KEY", "CFG_TAG_DESC_KEY", "CFG_ASSET_CHECKSUM_KEY", "CFG_ASSET_TAGS_KEY", "CFG_ASSET_RELATIVES_KEYS", "CFG_ARCHIVE_CONFIG_KEY", "CFG_ARCHIVE_KEY_OLD"] +# hook identifiers, A_* (after/post) B_* (before/pre) +B_UPDATE_HOOK = "pre_update" +A_UPDATE_HOOK = "post_update" +B_PULL_HOOK = "pre_pull" +A_PULL_HOOK = 
"post_pull" +B_TAG_HOOK = "pre_tag" +A_TAG_HOOK = "post_tag" +B_LIST_HOOK = "pre_list" +A_LIST_HOOK = "post_list" +# HOOKS is a list of all available plugin entry points +HOOKS = [A_LIST_HOOK, A_PULL_HOOK, A_TAG_HOOK, A_UPDATE_HOOK, B_TAG_HOOK, + B_LIST_HOOK, B_PULL_HOOK, B_UPDATE_HOOK] # other consts REQ_CFG_VERSION = 0.3 @@ -106,7 +118,7 @@ __all__ = ["DEFAULT_SERVER", "CFG_ASSET_DEFAULT_TAG_KEY", "CFG_KEY_NAMES", "CFG_GENOME_DESC_KEY", "REQ_CFG_VERSION", "CFG_ASSETS_KEY", "CFG_GENOME_ATTRS_KEYS", "REFGENIE_BY_CFG", "DEFAULT_TAG", "ATTRS_COPY_PULL", "RGC_REQ_KEYS", "REQ_TAG_ATTRS", "CUSTOM_BAR_FMT", "API_VERSION", "CONF_STRUCTURE", "OPERATION_IDS", - "CUSTOM_PFX"] + FILE_DIR_NAMES + CFG_CONST + CFG_KEY_NAMES + API_IDS + "CUSTOM_PFX", "HOOKS"] + FILE_DIR_NAMES + CFG_CONST + CFG_KEY_NAMES + API_IDS CONF_STRUCTURE = """ # example genome configuration structure diff --git a/refgenconf/plugins.py b/refgenconf/plugins.py index a4956285..65b9b080 100644 --- a/refgenconf/plugins.py +++ b/refgenconf/plugins.py @@ -1,13 +1,12 @@ """ Plugin functions """ import pkg_resources +from .const import HOOKS __all__ = ["plugins"] -# HOOKS is a list of all available plugin entry points -HOOKS = ["post_update", "pre_pull", "pre_tag", "pre_list"] - plugins = {} for hook in HOOKS: - plugins[hook] = { entry_point.name: entry_point.load() for entry_point - in pkg_resources.iter_entry_points('refgenie.hooks.' + hook) } + plugins[hook] = \ + {entry_point.name: entry_point.load() for entry_point in + pkg_resources.iter_entry_points('refgenie.hooks.' + hook)} diff --git a/refgenconf/refgenconf.py b/refgenconf/refgenconf.py index 2c351aa7..be444bf4 100755 --- a/refgenconf/refgenconf.py +++ b/refgenconf/refgenconf.py @@ -160,7 +160,7 @@ def list(self, genome=None, order=None, include_tags=False): :return Mapping[str, Iterable[str]]: mapping from assembly name to collection of available asset names. 
""" - self.run_plugins("pre_list") + self.run_plugins(B_LIST_HOOK) refgens = _select_genomes(sorted(self[CFG_GENOMES_KEY].keys(), key=order), genome) if include_tags: return OrderedDict( @@ -476,7 +476,7 @@ def tag(self, genome, asset, tag, new_tag, files=True): :raise ValueError: when the original tag is not specified :return bool: a logical indicating whether the tagging was successful """ - self.run_plugins("pre_tag") + self.run_plugins(B_TAG_HOOK) ori_path = self.seek(genome, asset, tag, enclosing_dir=True, strict_exists=True) new_path = os.path.abspath(os.path.join(ori_path, os.pardir, new_tag)) if self.file_path: @@ -623,7 +623,7 @@ def pull(self, genome, asset, tag, unpack=True, force=None, :raise refgenconf.RefGenConfError: if the object update is requested in a non-writable state """ - self.run_plugins("pre_pull") + self.run_plugins(B_PULL_HOOK) missing_vars = unbound_env_vars(self[CFG_FOLDER_KEY]) if missing_vars: raise UnboundEnvironmentVariablesError(", ".join(missing_vars)) @@ -1253,7 +1253,7 @@ def run_plugins(self, hook): def write(self): super(RefGenConf, self).write() - self.run_plugins("post_update") + self.run_plugins(A_UPDATE_HOOK) class DownloadProgressBar(tqdm): From 5dac949a35683425504d34a43a43e1e8b0f9bce0 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 30 Mar 2020 16:53:46 -0400 Subject: [PATCH 08/29] allow access to individuall hook consts --- refgenconf/const.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/refgenconf/const.py b/refgenconf/const.py index ff6c4b50..b3c88de7 100644 --- a/refgenconf/const.py +++ b/refgenconf/const.py @@ -104,8 +104,9 @@ B_LIST_HOOK = "pre_list" A_LIST_HOOK = "post_list" # HOOKS is a list of all available plugin entry points -HOOKS = [A_LIST_HOOK, A_PULL_HOOK, A_TAG_HOOK, A_UPDATE_HOOK, B_TAG_HOOK, - B_LIST_HOOK, B_PULL_HOOK, B_UPDATE_HOOK] +HOOK_NAMES = ["A_LIST_HOOK", "A_PULL_HOOK", "A_TAG_HOOK", "A_UPDATE_HOOK", + "B_TAG_HOOK", "B_LIST_HOOK", "B_PULL_HOOK", 
"B_UPDATE_HOOK"] +HOOKS = JSON_METADATA_VALUES = [eval(x) for x in HOOK_NAMES] # other consts REQ_CFG_VERSION = 0.3 @@ -118,7 +119,7 @@ __all__ = ["DEFAULT_SERVER", "CFG_ASSET_DEFAULT_TAG_KEY", "CFG_KEY_NAMES", "CFG_GENOME_DESC_KEY", "REQ_CFG_VERSION", "CFG_ASSETS_KEY", "CFG_GENOME_ATTRS_KEYS", "REFGENIE_BY_CFG", "DEFAULT_TAG", "ATTRS_COPY_PULL", "RGC_REQ_KEYS", "REQ_TAG_ATTRS", "CUSTOM_BAR_FMT", "API_VERSION", "CONF_STRUCTURE", "OPERATION_IDS", - "CUSTOM_PFX", "HOOKS"] + FILE_DIR_NAMES + CFG_CONST + CFG_KEY_NAMES + API_IDS + "CUSTOM_PFX", "HOOKS"] + FILE_DIR_NAMES + CFG_CONST + CFG_KEY_NAMES + API_IDS + HOOK_NAMES CONF_STRUCTURE = """ # example genome configuration structure From db4974013baa3c6cec23e21b97f069e14a272357 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 30 Mar 2020 17:26:03 -0400 Subject: [PATCH 09/29] update const names, add missing plugin calls --- refgenconf/const.py | 23 ++++++++++++----------- refgenconf/refgenconf.py | 39 +++++++++++++++++++++++++-------------- 2 files changed, 37 insertions(+), 25 deletions(-) diff --git a/refgenconf/const.py b/refgenconf/const.py index b3c88de7..68bb384a 100644 --- a/refgenconf/const.py +++ b/refgenconf/const.py @@ -95,18 +95,19 @@ "CFG_ASSET_RELATIVES_KEYS", "CFG_ARCHIVE_CONFIG_KEY", "CFG_ARCHIVE_KEY_OLD"] # hook identifiers, A_* (after/post) B_* (before/pre) -B_UPDATE_HOOK = "pre_update" -A_UPDATE_HOOK = "post_update" -B_PULL_HOOK = "pre_pull" -A_PULL_HOOK = "post_pull" -B_TAG_HOOK = "pre_tag" -A_TAG_HOOK = "post_tag" -B_LIST_HOOK = "pre_list" -A_LIST_HOOK = "post_list" +PRE_UPDATE_HOOK = "pre_update" +POST_UPDATE_HOOK = "post_update" +PRE_PULL_HOOK = "pre_pull" +POST_PULL_HOOK = "post_pull" +PRE_TAG_HOOK = "pre_tag" +POST_TAG_HOOK = "post_tag" +PRE_LIST_HOOK = "pre_list" +POST_LIST_HOOK = "post_list" # HOOKS is a list of all available plugin entry points -HOOK_NAMES = ["A_LIST_HOOK", "A_PULL_HOOK", "A_TAG_HOOK", "A_UPDATE_HOOK", - "B_TAG_HOOK", "B_LIST_HOOK", "B_PULL_HOOK", "B_UPDATE_HOOK"] 
-HOOKS = JSON_METADATA_VALUES = [eval(x) for x in HOOK_NAMES] +HOOK_NAMES = [ + "PRE_LIST_HOOK", "PRE_PULL_HOOK", "PRE_TAG_HOOK", "PRE_UPDATE_HOOK", + "POST_TAG_HOOK", "POST_LIST_HOOK", "POST_PULL_HOOK", "POST_UPDATE_HOOK"] +HOOKS = [eval(x) for x in HOOK_NAMES] # other consts REQ_CFG_VERSION = 0.3 diff --git a/refgenconf/refgenconf.py b/refgenconf/refgenconf.py index be444bf4..d6e1b33f 100755 --- a/refgenconf/refgenconf.py +++ b/refgenconf/refgenconf.py @@ -160,12 +160,14 @@ def list(self, genome=None, order=None, include_tags=False): :return Mapping[str, Iterable[str]]: mapping from assembly name to collection of available asset names. """ - self.run_plugins(B_LIST_HOOK) + self.run_plugins(PRE_LIST_HOOK) refgens = _select_genomes(sorted(self[CFG_GENOMES_KEY].keys(), key=order), genome) if include_tags: + self.run_plugins(POST_LIST_HOOK) return OrderedDict( [(g, sorted(_make_asset_tags_product(self[CFG_GENOMES_KEY][g][CFG_ASSETS_KEY], ":"), key=order)) for g in refgens]) + self.run_plugins(POST_LIST_HOOK) return OrderedDict([(g, sorted(list(self[CFG_GENOMES_KEY][g][CFG_ASSETS_KEY].keys()), key=order)) for g in refgens]) @@ -476,7 +478,7 @@ def tag(self, genome, asset, tag, new_tag, files=True): :raise ValueError: when the original tag is not specified :return bool: a logical indicating whether the tagging was successful """ - self.run_plugins(B_TAG_HOOK) + self.run_plugins(PRE_TAG_HOOK) ori_path = self.seek(genome, asset, tag, enclosing_dir=True, strict_exists=True) new_path = os.path.abspath(os.path.join(ori_path, os.pardir, new_tag)) if self.file_path: @@ -487,6 +489,7 @@ def tag(self, genome, asset, tag, new_tag, files=True): if not self.cfg_tag_asset(genome, asset, tag, new_tag): sys.exit(0) if not files: + self.run_plugins(POST_TAG_HOOK) return try: if os.path.exists(new_path): @@ -505,6 +508,7 @@ def tag(self, genome, asset, tag, new_tag, files=True): " the genome config".format(genome, asset, tag)) _LOGGER.debug("Original asset has been moved from '{}' to 
'{}'". format(ori_path, new_path)) + self.run_plugins(POST_TAG_HOOK) def cfg_tag_asset(self, genome, asset, tag, new_tag): """ @@ -623,12 +627,16 @@ def pull(self, genome, asset, tag, unpack=True, force=None, :raise refgenconf.RefGenConfError: if the object update is requested in a non-writable state """ - self.run_plugins(B_PULL_HOOK) + self.run_plugins(PRE_PULL_HOOK) missing_vars = unbound_env_vars(self[CFG_FOLDER_KEY]) if missing_vars: raise UnboundEnvironmentVariablesError(", ".join(missing_vars)) - def raise_unpack_error(): + def _null_return(): + self.run_plugins(POST_PULL_HOOK) + return gat, None, None + + def _raise_unpack_error(): raise NotImplementedError("Option to not extract tarballs is not yet supported.") num_servers = 0 @@ -646,7 +654,7 @@ def raise_unpack_error(): else: determined_tag = str(determined_tag) _LOGGER.debug("Determined tag: {}".format(determined_tag)) - unpack or raise_unpack_error() + unpack or _raise_unpack_error() gat = [genome, asset, determined_tag] url_attrs = get_json_url(server_url, API_ID_ASSET_ATTRS).format(genome=genome, asset=asset) url_archive = get_json_url(server_url, API_ID_ARCHIVE).format(genome=genome, asset=asset) @@ -659,7 +667,7 @@ def raise_unpack_error(): if num_servers == len(self[CFG_SERVERS_KEY]): _LOGGER.error("Asset '{}/{}:{}' not available on any of the following servers: {}". 
format(genome, asset, determined_tag, ", ".join(no_asset_json))) - return gat, None, None + return _null_return() continue if sys.version_info[0] == 2: @@ -675,7 +683,7 @@ def raise_unpack_error(): if os.path.exists(tag_dir): def preserve(): _LOGGER.debug("Preserving existing: {}".format(tag_dir)) - return gat, None, None + return _null_return() def msg_overwrite(): _LOGGER.debug("Overwriting: {}".format(tag_dir)) @@ -699,7 +707,7 @@ def msg_overwrite(): _LOGGER.debug("'{}' archive size: {}".format(bundle_name, archsize)) if _is_large_archive(archsize) and not query_yes_no("Are you sure you want to download this large archive?"): _LOGGER.info("pull action aborted by user") - return gat, None, None + return _null_return() if not os.path.exists(genome_dir_path): _LOGGER.debug("Creating directory: {}".format(genome_dir_path)) @@ -714,7 +722,7 @@ def msg_overwrite(): _LOGGER.error("Asset archive '{}/{}:{}' is missing on the server: {s}".format(*gat, s=server_url)) if server_url == self[CFG_SERVERS_KEY][-1]: # it this was the last server on the list, return - return gat, None, None + return _null_return() else: _LOGGER.info("Trying next server") # set the tag value back to what user requested @@ -724,11 +732,11 @@ def msg_overwrite(): _LOGGER.error(str(e)) _LOGGER.error("Server {}/{} refused download. 
Check your internet settings".format(server_url, API_VERSION)) - return gat, None, None + return _null_return() except ContentTooShortError as e: _LOGGER.error(str(e)) _LOGGER.error("'{}' download incomplete".format(bundle_name)) - return gat, None, None + return _null_return() else: _LOGGER.info("Download complete: {}".format(filepath)) @@ -736,7 +744,7 @@ def msg_overwrite(): old_checksum = archive_data and archive_data.get(CFG_ARCHIVE_CHECKSUM_KEY) if old_checksum and new_checksum != old_checksum: _LOGGER.error("Checksum mismatch: ({}, {})".format(new_checksum, old_checksum)) - return gat, None, None + return _null_return() else: _LOGGER.debug("Matched checksum: '{}'".format(old_checksum)) import tempfile @@ -759,12 +767,14 @@ def msg_overwrite(): rgc.update_tags(*gat, data={attr: archive_data[attr] for attr in ATTRS_COPY_PULL if attr in archive_data}) rgc.set_default_pointer(*gat) + self.run_plugins(POST_PULL_HOOK) return gat, archive_data, server_url [self.chk_digest_update_child(gat[0], x, "{}/{}:{}".format(*gat), server_url) for x in archive_data[CFG_ASSET_PARENTS_KEY] if CFG_ASSET_PARENTS_KEY in archive_data] self.update_tags(*gat, data={attr: archive_data[attr] for attr in ATTRS_COPY_PULL if attr in archive_data}) self.set_default_pointer(*gat) + self.run_plugins(POST_PULL_HOOK) return gat, archive_data, server_url def remove_asset_from_relatives(self, genome, asset, tag): @@ -1245,15 +1255,16 @@ def run_plugins(self, hook): """ Runs all installed plugins for the specified hook. 
- :param str hook: hook idenfier + :param str hook: hook identifier """ for name, func in plugins[hook].items(): _LOGGER.debug("Running {} plugin: {}".format(hook, name)) func(self) def write(self): + self.run_plugins(PRE_UPDATE_HOOK) super(RefGenConf, self).write() - self.run_plugins(A_UPDATE_HOOK) + self.run_plugins(POST_UPDATE_HOOK) class DownloadProgressBar(tqdm): From b2c05d93d2b18056aa9ed5d72ca3226318a552ca Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 2 Jun 2020 15:39:55 -0400 Subject: [PATCH 10/29] add filepath argument to the write method; https://github.com/databio/refgenieserver/issues/80 --- refgenconf/refgenconf.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/refgenconf/refgenconf.py b/refgenconf/refgenconf.py index d6e1b33f..e11702a9 100755 --- a/refgenconf/refgenconf.py +++ b/refgenconf/refgenconf.py @@ -1261,10 +1261,23 @@ def run_plugins(self, hook): _LOGGER.debug("Running {} plugin: {}".format(hook, name)) func(self) - def write(self): + def write(self, filepath=None): + """ + Write the contents to a file. + If pre- and post-update plugins are defined, they will be executed automatically + + :param str filepath: a file path to write to + :raise OSError: when the object has been created in a read only mode or other process has locked the file + :raise TypeError: when the filepath cannot be determined. + This takes place only if YacAttMap initialized with a Mapping as an input, not read from file. 
+ :raise OSError: when the write is called on an object with no write capabilities + or when writing to a file that is locked by a different object + :return str: the path to the created files + """ self.run_plugins(PRE_UPDATE_HOOK) - super(RefGenConf, self).write() + path = super(RefGenConf, self).write(filepath=filepath) self.run_plugins(POST_UPDATE_HOOK) + return path class DownloadProgressBar(tqdm): From ce3cdb3ee7e065ec38e968bb9204c7afb03cd798 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 3 Jun 2020 15:32:15 -0400 Subject: [PATCH 11/29] install unreleased deps, test more pythons --- .travis.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.travis.yml b/.travis.yml index 2782aba3..7fa1bbcf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,8 +3,12 @@ python: - "2.7" - "3.5" - "3.6" + - "3.8" os: - linux +before_install: #installing unreleased packages + - pip install git+https://github.com/databio/henge.git + - pip install git+https://github.com/databio/refget-py install: - pip install . 
- pip install -r requirements/requirements-dev.txt From 21f94029ec86747ba0bdf4c62e1afa7ac55f6a5b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 3 Jun 2020 15:36:14 -0400 Subject: [PATCH 12/29] implement genomes compatibility level checking functionality --- refgenconf/refgenconf.py | 119 ++++++++++++++++++++---------- requirements/requirements-all.txt | 3 +- 2 files changed, 80 insertions(+), 42 deletions(-) diff --git a/refgenconf/refgenconf.py b/refgenconf/refgenconf.py index e11702a9..9e6ee4bb 100755 --- a/refgenconf/refgenconf.py +++ b/refgenconf/refgenconf.py @@ -23,7 +23,9 @@ import signal import warnings import shutil +import json +from refget import RefGetHenge from attmap import PathExAttMap as PXAM from ubiquerg import checksum, is_url, query_yes_no, parse_registry_path as prp, untar, is_writable from tqdm import tqdm @@ -282,39 +284,6 @@ def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, warnings.warn(msg, RuntimeWarning) return path - def get_asset(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=True, - check_exist=lambda p: os.path.exists(p) or is_url(p), enclosing_dir=False): - """ - Get a path to the a specified genome-asset-tag. - Note: enforces file existence checks by default - - :param str genome_name: name of a reference genome assembly of interest - :param str asset_name: name of the particular asset to fetch - :param str tag_name: name of the particular asset tag to fetch - :param str seek_key: name of the particular subasset to fetch - :param bool | NoneType strict_exists: how to handle case in which - path doesn't exist; True to raise IOError, False to raise - RuntimeWarning, and None to do nothing at all - :param function(callable) -> bool check_exist: how to check for - asset/path existence - :param bool enclosing_dir: whether a path to the entire enclosing directory should be returned, e.g. 
- for a fasta asset that has 3 seek_keys pointing to 3 files in an asset dir, that asset dir is returned - :return str: path to the asset - :raise TypeError: if the existence check is not a one-arg function - :raise refgenconf.MissingGenomeError: if the named assembly isn't known - to this configuration instance - :raise refgenconf.MissingAssetError: if the names assembly is known to - this configuration instance, but the requested asset is unknown - """ - warnings.warn( - "Please use seek method instead; get_asset will be removed " - "in the next release.", category=DeprecationWarning - ) - return self.seek(genome_name=genome_name, asset_name=asset_name, - tag_name=tag_name, seek_key=seek_key, - strict_exists=strict_exists, check_exist=check_exist, - enclosing_dir=enclosing_dir) - def get_default_tag(self, genome, asset, use_existing=True): """ Determine the asset tag to use as default. The one indicated by the 'default_tag' key in the asset @@ -642,6 +611,9 @@ def _raise_unpack_error(): num_servers = 0 bad_servers = [] no_asset_json = [] + if CFG_SERVERS_KEY not in self or self[CFG_SERVERS_KEY] is None: + _LOGGER.error("You are not subscribed to any asset servers") + return _null_return() for server_url in self[CFG_SERVERS_KEY]: num_servers += 1 try: @@ -767,16 +739,46 @@ def msg_overwrite(): rgc.update_tags(*gat, data={attr: archive_data[attr] for attr in ATTRS_COPY_PULL if attr in archive_data}) rgc.set_default_pointer(*gat) - self.run_plugins(POST_PULL_HOOK) - return gat, archive_data, server_url - [self.chk_digest_update_child(gat[0], x, "{}/{}:{}".format(*gat), server_url) - for x in archive_data[CFG_ASSET_PARENTS_KEY] if CFG_ASSET_PARENTS_KEY in archive_data] - self.update_tags(*gat, data={attr: archive_data[attr] - for attr in ATTRS_COPY_PULL if attr in archive_data}) - self.set_default_pointer(*gat) + else: + [self.chk_digest_update_child(gat[0], x, "{}/{}:{}".format(*gat), server_url) + for x in archive_data[CFG_ASSET_PARENTS_KEY] if 
CFG_ASSET_PARENTS_KEY in archive_data] + self.update_tags(*gat, data={attr: archive_data[attr] + for attr in ATTRS_COPY_PULL if attr in archive_data}) + self.set_default_pointer(*gat) + if gat[1] == "fasta": + self.initialize_genome(gat) self.run_plugins(POST_PULL_HOOK) return gat, archive_data, server_url + def initialize_genome(self, gat): + """ + Initialize a genome + + Create a JSON file with Derived Recursive Unique Indentifiers (DRUIDs) + for the FASTA file in the genome directory. + + :param list[str] gat: list of genome, asset and tag names + :return + """ + g = gat[0] + _LOGGER.info("Initializing genome: {}".format(g)) + d, c = RefGetHenge({}).load_fasta(self.seek(*gat, strict_exists=True)) + pth = self.get_asds_path(g) + with open(pth, "w") as jfp: + json.dump(c, jfp) + _LOGGER.debug("Saved DRUIDs to JSON: {}".format(pth)) + return d, c + + def get_asds_path(self, genome): + """ + Get path to the Annotated Sequence Digests JSON file for a given genome. + Note that the path and/or genome may not exist. + + :param str genome: genome name + :return str: ASDs path + """ + return os.path.join(self[CFG_FOLDER_KEY], genome, genome + "__ASDs.json") + def remove_asset_from_relatives(self, genome, asset, tag): """ Remove any relationship links associated with the selected asset @@ -1251,6 +1253,42 @@ def id(self, genome, asset, tag=None): raise MissingConfigDataError("Digest does not exist for: {}/{}:{}". format(genome, asset, tag)) + def compare(self, genome1, genome2, explain=False): + """ + Check genomes compatibility level. 
+ + Compares Annotated Sequence Digests (ASDs) -- digested sequences and metadata + + :param str genome1: name of the first genome to compare + :param str genome2: name of the first genome to compare + :param bool explain: whether the returned code explanation should + be displayed + :return int: compatibility code + """ + def _get_asds_for_genome(rgc, genome): + """ + Read JSON file containing ASDs for a specified genome + + :param refgenconf.RefGenConf rgc: object to find the genome for + :param str genome: genome to find the file for + :return list[dict]: list of ASDs, ready to compare + """ + rgc.seek(genome, "fasta", strict_exists=True) + json_file = rgc.get_asds_path(genome) + if not os.path.exists(json_file): + raise OSError( + "File containing Annotated Sequence Digests (ASDs) not " + "found for genome: {g}. Pull or build '{g}/fasta' again to " + "check the compatibility.".format(g=genome)) + with open(json_file, "r") as jfp: + return json.load(jfp) + + return RefGetHenge({}).compare_asds( + _get_asds_for_genome(self, genome1), + _get_asds_for_genome(self, genome2), + explain=explain + ) + def run_plugins(self, hook): """ Runs all installed plugins for the specified hook. 
@@ -1501,7 +1539,6 @@ def _read_remote_data(url): :param str url: data request :return dict: JSON parsed from the response from given URL request """ - import json with urllib.request.urlopen(url) as response: encoding = response.info().get_content_charset('utf8') return json.loads(response.read().decode(encoding)) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 8d17ea5f..21f78c1a 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -4,4 +4,5 @@ requests tqdm>=4.38.0 ubiquerg>=0.5.0 yacman>=0.6.6 -future \ No newline at end of file +future +refget>=0.0.1 \ No newline at end of file From ff759f3d93e621f42073932e78be6861e74ab737 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 4 Jun 2020 08:53:34 -0400 Subject: [PATCH 13/29] test compatibility checks --- .travis.yml | 4 ++-- refgenconf/refgenconf.py | 11 +++++++---- tests/test_1pull_asset.py | 18 ++++++++++++++++-- tests/test_compare.py | 27 +++++++++++++++++++++++++++ 4 files changed, 52 insertions(+), 8 deletions(-) create mode 100644 tests/test_compare.py diff --git a/.travis.yml b/.travis.yml index 7fa1bbcf..2bd5fa2c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,8 +7,8 @@ python: os: - linux before_install: #installing unreleased packages - - pip install git+https://github.com/databio/henge.git - - pip install git+https://github.com/databio/refget-py + - pip install git+https://github.com/databio/henge.git + - pip install git+https://github.com/databio/refget-py install: - pip install . 
- pip install -r requirements/requirements-dev.txt diff --git a/refgenconf/refgenconf.py b/refgenconf/refgenconf.py index 9e6ee4bb..99871828 100755 --- a/refgenconf/refgenconf.py +++ b/refgenconf/refgenconf.py @@ -739,14 +739,16 @@ def msg_overwrite(): rgc.update_tags(*gat, data={attr: archive_data[attr] for attr in ATTRS_COPY_PULL if attr in archive_data}) rgc.set_default_pointer(*gat) + if gat[1] == "fasta": + self.initialize_genome(gat) else: [self.chk_digest_update_child(gat[0], x, "{}/{}:{}".format(*gat), server_url) for x in archive_data[CFG_ASSET_PARENTS_KEY] if CFG_ASSET_PARENTS_KEY in archive_data] self.update_tags(*gat, data={attr: archive_data[attr] for attr in ATTRS_COPY_PULL if attr in archive_data}) self.set_default_pointer(*gat) - if gat[1] == "fasta": - self.initialize_genome(gat) + if gat[1] == "fasta": + self.initialize_genome(gat) self.run_plugins(POST_PULL_HOOK) return gat, archive_data, server_url @@ -754,7 +756,7 @@ def initialize_genome(self, gat): """ Initialize a genome - Create a JSON file with Derived Recursive Unique Indentifiers (DRUIDs) + Create a JSON file with Annotated Sequence Digests (ASDs) for the FASTA file in the genome directory. 
:param list[str] gat: list of genome, asset and tag names @@ -766,7 +768,8 @@ def initialize_genome(self, gat): pth = self.get_asds_path(g) with open(pth, "w") as jfp: json.dump(c, jfp) - _LOGGER.debug("Saved DRUIDs to JSON: {}".format(pth)) + _LOGGER.debug("Saved ASDs to JSON: {}".format(pth)) + self[CFG_GENOMES_KEY][g][CFG_CHECKSUM_KEY] = d return d, c def get_asds_path(self, genome): diff --git a/tests/test_1pull_asset.py b/tests/test_1pull_asset.py index 62a69380..d47c9a31 100644 --- a/tests/test_1pull_asset.py +++ b/tests/test_1pull_asset.py @@ -16,7 +16,7 @@ from .conftest import remove_asset_and_file -__author__ = "Vince Reuter" +__author__ = "Vince Reuter, Michal Stolarczyk" __email__ = "vreuter@virginia.edu" @@ -122,4 +122,18 @@ def test_pull_asset_works_with_nonwritable_and_writable_rgc(cfg_file, gname, ana remove_asset_and_file(rgc, gname, aname, tname) with mock.patch("refgenconf.refgenconf.query_yes_no", return_value=True): print("\nPulling; genome: {}, asset: {}, tag: {}\n".format(gname, aname, tname)) - rgc.pull(gname, aname, tname) \ No newline at end of file + rgc.pull(gname, aname, tname) + + +@pytest.mark.parametrize("gname", ["rCRSd", "mouse_chrM2x", "human_repeats"]) +def test_pull_initializes_genomes(cfg_file, gname): + """ + Test for existence of a JSON files containing ASDs and genome digests in the + cfg after a fasta asset has been pulled for a genome + """ + rgc = RefGenConf(filepath=cfg_file, writable=False) + assert os.path.exists(rgc.get_asds_path(gname)) + assert CFG_CHECKSUM_KEY in rgc.get_genome_attributes(gname) + assert isinstance(rgc.get_genome_attributes(gname)[CFG_CHECKSUM_KEY], str) + + diff --git a/tests/test_compare.py b/tests/test_compare.py new file mode 100644 index 00000000..1f84b007 --- /dev/null +++ b/tests/test_compare.py @@ -0,0 +1,27 @@ +""" Tests for RefGenConf.compare. 
These tests depend on successful completion of tests is test_1pull_asset.py """ + +import pytest +import os + +__author__ = "Michal Stolarczyk" +__email__ = "michal@virginia.edu" + + +class TestCompare: + @pytest.mark.parametrize(["gname1", "gname2", "result"], + [("rCRSd", "rCRSd", 63), + ("mouse_chrM2x", "mouse_chrM2x", 63), + ("rCRSd", "mouse_chrM2x", 0)]) + def test_compare_result(self, ro_rgc, gname1, gname2, result): + assert ro_rgc.compare(gname1, gname2) == result + + @pytest.mark.parametrize(["gname1", "gname2"], + [("rCRSd", "rCRSd"), + ("mouse_chrM2x", "mouse_chrM2x"), + ("rCRSd", "mouse_chrM2x")]) + def test_compare_errors_when_no_asd_json(self, ro_rgc, gname1, gname2): + jfp = ro_rgc.get_asds_path(gname1) + os.rename(jfp, jfp + "_renamed") + with pytest.raises(OSError): + ro_rgc.compare(gname1, gname2) + os.rename(jfp + "_renamed", jfp) From 5759a9ebf7c88b1333419ed58add89391e1e63c3 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 8 Jun 2020 14:04:01 -0400 Subject: [PATCH 14/29] make plugins mapping an rgc property --- refgenconf/__init__.py | 1 - refgenconf/const.py | 2 +- refgenconf/refgenconf.py | 15 +++++++++++++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/refgenconf/__init__.py b/refgenconf/__init__.py index 177de8ce..51557765 100644 --- a/refgenconf/__init__.py +++ b/refgenconf/__init__.py @@ -3,7 +3,6 @@ from .const import * from .exceptions import * from .helpers import * -from .plugins import * from .refgenconf import * __all__ = ["RefGenConf", "select_genome_config", "GenomeConfigFormatError", diff --git a/refgenconf/const.py b/refgenconf/const.py index 68bb384a..87ad4c16 100644 --- a/refgenconf/const.py +++ b/refgenconf/const.py @@ -94,7 +94,7 @@ "CFG_ASSET_CHILDREN_KEY", "CFG_TAG_DESC_KEY", "CFG_ASSET_CHECKSUM_KEY", "CFG_ASSET_TAGS_KEY", "CFG_ASSET_RELATIVES_KEYS", "CFG_ARCHIVE_CONFIG_KEY", "CFG_ARCHIVE_KEY_OLD"] -# hook identifiers, A_* (after/post) B_* (before/pre) +# hook identifiers PRE_UPDATE_HOOK = 
"pre_update" POST_UPDATE_HOOK = "post_update" PRE_PULL_HOOK = "pre_pull" diff --git a/refgenconf/refgenconf.py b/refgenconf/refgenconf.py index 99871828..83695384 100755 --- a/refgenconf/refgenconf.py +++ b/refgenconf/refgenconf.py @@ -29,13 +29,13 @@ from attmap import PathExAttMap as PXAM from ubiquerg import checksum, is_url, query_yes_no, parse_registry_path as prp, untar, is_writable from tqdm import tqdm +from pkg_resources import iter_entry_points import yacman from .const import * from .helpers import unbound_env_vars, asciify_json_dict, select_genome_config from .exceptions import * -from .plugins import plugins _LOGGER = logging.getLogger(__name__) @@ -126,6 +126,17 @@ def __bool__(self): __nonzero__ = __bool__ + @property + def plugins(self): + """ + Plugins registered by entry points in the current Python env + + :return dict[dict[function(refgenconf.RefGenConf)]]: dict which keys + are names of all possible hooks and values are dicts mapping + registered funcions names to their values + """ + return {h: {ep.name: ep.load() for ep in iter_entry_points('refgenie.hooks.' 
+ h)} for h in HOOKS} + def initialize_config_file(self, filepath=None): """ Initialize genome configuration file on disk @@ -1298,7 +1309,7 @@ def run_plugins(self, hook): :param str hook: hook identifier """ - for name, func in plugins[hook].items(): + for name, func in self.plugins[hook].items(): _LOGGER.debug("Running {} plugin: {}".format(hook, name)) func(self) From 63cd397195507b8e487bb5179382728a838062c1 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 8 Jun 2020 16:45:11 -0400 Subject: [PATCH 15/29] test plugins --- refgenconf/plugins.py | 12 ------- refgenconf/refgenconf.py | 6 ++-- requirements/requirements-test.txt | 1 + tests/test_compare.py | 2 ++ tests/test_plugins.py | 53 ++++++++++++++++++++++++++++++ tests/test_removal.py | 1 + 6 files changed, 60 insertions(+), 15 deletions(-) delete mode 100644 refgenconf/plugins.py create mode 100644 tests/test_plugins.py diff --git a/refgenconf/plugins.py b/refgenconf/plugins.py deleted file mode 100644 index 65b9b080..00000000 --- a/refgenconf/plugins.py +++ /dev/null @@ -1,12 +0,0 @@ -""" Plugin functions """ - -import pkg_resources -from .const import HOOKS - -__all__ = ["plugins"] - -plugins = {} -for hook in HOOKS: - plugins[hook] = \ - {entry_point.name: entry_point.load() for entry_point in - pkg_resources.iter_entry_points('refgenie.hooks.' + hook)} diff --git a/refgenconf/refgenconf.py b/refgenconf/refgenconf.py index 83695384..099d6234 100755 --- a/refgenconf/refgenconf.py +++ b/refgenconf/refgenconf.py @@ -133,7 +133,7 @@ def plugins(self): :return dict[dict[function(refgenconf.RefGenConf)]]: dict which keys are names of all possible hooks and values are dicts mapping - registered funcions names to their values + registered functions names to their values """ return {h: {ep.name: ep.load() for ep in iter_entry_points('refgenie.hooks.' 
+ h)} for h in HOOKS} @@ -730,11 +730,11 @@ def msg_overwrite(): return _null_return() else: _LOGGER.debug("Matched checksum: '{}'".format(old_checksum)) - import tempfile + from tempfile import mkdtemp # successfully downloaded and moved tarball; untar it if unpack and filepath.endswith(".tgz"): _LOGGER.info("Extracting asset tarball and saving to: {}".format(tag_dir)) - tmpdir = tempfile.mkdtemp(dir=genome_dir_path) # TODO: use context manager here when we drop support for py2 + tmpdir = mkdtemp(dir=genome_dir_path) # TODO: use context manager here when we drop support for py2 untar(filepath, tmpdir) # here we suspect the unarchived asset to be an asset-named directory # the asset data inside diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt index 1371faf1..62fd5580 100644 --- a/requirements/requirements-test.txt +++ b/requirements/requirements-test.txt @@ -1,3 +1,4 @@ coveralls>=1.1 pytest-cov==2.6.1 pytest-remotedata +git+git://github.com/databio/refgenie_myplugin@master#egg=refgenie_myplugin diff --git a/tests/test_compare.py b/tests/test_compare.py index 1f84b007..e6f2c1d7 100644 --- a/tests/test_compare.py +++ b/tests/test_compare.py @@ -25,3 +25,5 @@ def test_compare_errors_when_no_asd_json(self, ro_rgc, gname1, gname2): with pytest.raises(OSError): ro_rgc.compare(gname1, gname2) os.rename(jfp + "_renamed", jfp) + + diff --git a/tests/test_plugins.py b/tests/test_plugins.py new file mode 100644 index 00000000..e7f639d7 --- /dev/null +++ b/tests/test_plugins.py @@ -0,0 +1,53 @@ +import os +import mock +from refgenconf import RefGenConf + +__author__ = "Michal Stolarczyk" +__email__ = "michal@virginia.edu" + + +def set_flag(rgc): + """ + Creates a flag file next to the genome configuration file. 
+ + Useful for plugin system testing if one does not want to rely + on printed messages to check plugin effect + + :param refgenconf.RefGenConf rgc: object to create the flag for + """ + pth = os.path.join(os.path.dirname(rgc.file_path), "plugin.flag") + if not os.path.exists(pth): + write_flags = os.O_CREAT | os.O_EXCL | os.O_WRONLY + fd = os.open(pth, write_flags) + os.close(fd) + assert os.path.exists(pth) + print("Created flag file: {}".format(pth)) + else: + raise FileExistsError("Flag file already exists: {}".format(pth)) + + +PLUGINS_DICT = {'pre_list': {'my_func': set_flag}, 'pre_pull': {}, + 'pre_tag': {}, 'pre_update': {}, 'post_tag': {}, + 'post_list': {}, 'post_pull': {}, 'post_update': {}} + + +class TestPlugins: + def test_prelist_plugins_called(self, cfg_file): + with mock.patch("refgenconf.refgenconf.RefGenConf.plugins", + new_callable=mock.PropertyMock) as mock_plugins: + mock_plugins.return_value = PLUGINS_DICT + rgc = RefGenConf(cfg_file, writable=False) + rgc.list() + assert os.path.exists(os.path.join(os.path.dirname(rgc.file_path), + "plugin.flag")) + os.remove(os.path.join(os.path.dirname(rgc.file_path), "plugin.flag")) + assert not os.path.exists( + os.path.join(os.path.dirname(rgc.file_path), "plugin.flag")) + + def test_plugin_entrypoints_scanning(self, ro_rgc): + """ + Plugins property dynamically scans defined entrypoints in the packages + in current Python environment. 
Properly defined ones are included in + the plugins property return value + """ + assert any([len(fun) > 0 for plugin, fun in ro_rgc.plugins.items()]) \ No newline at end of file diff --git a/tests/test_removal.py b/tests/test_removal.py index 3a8b1770..6fb25ff3 100644 --- a/tests/test_removal.py +++ b/tests/test_removal.py @@ -5,6 +5,7 @@ from refgenconf.exceptions import * from refgenconf.const import * + class TestRemoveAssets: @pytest.mark.parametrize(["gname", "aname", "tname"], [("rCRSd", "fasta", None), ("mouse_chrM2x", "fasta", None)]) def test_default_tag_removal(self, my_rgc, gname, aname, tname): From b4137446df35c1780aa1701b691464e475b5b183 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 8 Jun 2020 16:52:57 -0400 Subject: [PATCH 16/29] wrap repeated code in the function --- tests/test_plugins.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/test_plugins.py b/tests/test_plugins.py index e7f639d7..88e10542 100644 --- a/tests/test_plugins.py +++ b/tests/test_plugins.py @@ -6,6 +6,10 @@ __email__ = "michal@virginia.edu" +def get_flag_pth(rgc): + return os.path.join(os.path.dirname(rgc.file_path), "plugin.flag") + + def set_flag(rgc): """ Creates a flag file next to the genome configuration file. 
@@ -15,7 +19,7 @@ def set_flag(rgc): :param refgenconf.RefGenConf rgc: object to create the flag for """ - pth = os.path.join(os.path.dirname(rgc.file_path), "plugin.flag") + pth = get_flag_pth(rgc) if not os.path.exists(pth): write_flags = os.O_CREAT | os.O_EXCL | os.O_WRONLY fd = os.open(pth, write_flags) @@ -38,11 +42,9 @@ def test_prelist_plugins_called(self, cfg_file): mock_plugins.return_value = PLUGINS_DICT rgc = RefGenConf(cfg_file, writable=False) rgc.list() - assert os.path.exists(os.path.join(os.path.dirname(rgc.file_path), - "plugin.flag")) - os.remove(os.path.join(os.path.dirname(rgc.file_path), "plugin.flag")) - assert not os.path.exists( - os.path.join(os.path.dirname(rgc.file_path), "plugin.flag")) + assert get_flag_pth(rgc) + os.remove(get_flag_pth(rgc)) + assert not os.path.exists(get_flag_pth(rgc)) def test_plugin_entrypoints_scanning(self, ro_rgc): """ From 1eac275874c8022ceefe357e995dce070bfe9245 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 16 Jun 2020 16:04:56 -0400 Subject: [PATCH 17/29] add url base key const --- refgenconf/const.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/refgenconf/const.py b/refgenconf/const.py index 87ad4c16..eed4c7f2 100644 --- a/refgenconf/const.py +++ b/refgenconf/const.py @@ -54,6 +54,7 @@ CFG_ARCHIVE_KEY = "genome_archive_folder" CFG_ARCHIVE_KEY_OLD = "genome_archive" CFG_ARCHIVE_CONFIG_KEY = "genome_archive_config" +CFG_REMOTE_URL_BASE_KEY = "remote_url_base" CFG_VERSION_KEY = "config_version" CFG_GENOMES_KEY = "genomes" @@ -78,7 +79,7 @@ CFG_TOP_LEVEL_KEYS = [ CFG_FOLDER_KEY, CFG_SERVER_KEY, CFG_SERVERS_KEY, CFG_ARCHIVE_KEY, CFG_GENOMES_KEY, - CFG_VERSION_KEY, CFG_ARCHIVE_CONFIG_KEY, CFG_ARCHIVE_KEY_OLD] + CFG_VERSION_KEY, CFG_ARCHIVE_CONFIG_KEY, CFG_ARCHIVE_KEY_OLD, CFG_REMOTE_URL_BASE_KEY] CFG_GENOME_KEYS = [ CFG_GENOME_DESC_KEY, CFG_ASSETS_KEY, CFG_CHECKSUM_KEY] CFG_GENOME_ATTRS_KEYS = [CFG_GENOME_DESC_KEY, CFG_CHECKSUM_KEY] @@ -92,7 +93,7 @@ "CFG_ASSET_PATH_KEY", 
"CFG_ASSET_DESC_KEY", "CFG_ARCHIVE_KEY", "CFG_ARCHIVE_SIZE_KEY", "CFG_SEEK_KEYS_KEY", "CFG_ASSET_SIZE_KEY", "CFG_CHECKSUM_KEY", "CFG_ARCHIVE_CHECKSUM_KEY", "CFG_VERSION_KEY", "CFG_ASSET_PARENTS_KEY", "CFG_ASSET_CHILDREN_KEY", "CFG_TAG_DESC_KEY", "CFG_ASSET_CHECKSUM_KEY", "CFG_ASSET_TAGS_KEY", - "CFG_ASSET_RELATIVES_KEYS", "CFG_ARCHIVE_CONFIG_KEY", "CFG_ARCHIVE_KEY_OLD"] + "CFG_ASSET_RELATIVES_KEYS", "CFG_ARCHIVE_CONFIG_KEY", "CFG_ARCHIVE_KEY_OLD", "CFG_REMOTE_URL_BASE_KEY"] # hook identifiers PRE_UPDATE_HOOK = "pre_update" From 6caa403260dcef3ecfd976b82a49427682632317 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 17 Jun 2020 16:19:14 -0400 Subject: [PATCH 18/29] add aliases-related code for future release --- refgenconf/_version.py | 2 +- refgenconf/const.py | 12 ++++--- refgenconf/exceptions.py | 7 ++++- refgenconf/refgenconf.py | 67 ++++++++++++++++++++++++++++++++++++++-- 4 files changed, 79 insertions(+), 9 deletions(-) diff --git a/refgenconf/_version.py b/refgenconf/_version.py index fcf769f5..b4e087d8 100644 --- a/refgenconf/_version.py +++ b/refgenconf/_version.py @@ -1 +1 @@ -__version__ = "0.7.1-dev" +__version__ = "0.8.0-dev" diff --git a/refgenconf/const.py b/refgenconf/const.py index eed4c7f2..fac15cad 100644 --- a/refgenconf/const.py +++ b/refgenconf/const.py @@ -57,11 +57,13 @@ CFG_REMOTE_URL_BASE_KEY = "remote_url_base" CFG_VERSION_KEY = "config_version" CFG_GENOMES_KEY = "genomes" +CFG_ALIASES_KEY = "genome_aliases" CFG_CHECKSUM_KEY = "genome_digest" CFG_GENOME_DESC_KEY = "genome_description" CFG_ASSETS_KEY = "assets" +CFG_GENOME_MASK_KEY = "genome_mask" CFG_ASSET_PATH_KEY = "asset_path" CFG_ASSET_SIZE_KEY = "asset_size" CFG_ASSET_DESC_KEY = "asset_description" @@ -78,18 +80,18 @@ CFG_ASSET_RELATIVES_KEYS = [CFG_ASSET_CHILDREN_KEY, CFG_ASSET_PARENTS_KEY] CFG_TOP_LEVEL_KEYS = [ - CFG_FOLDER_KEY, CFG_SERVER_KEY, CFG_SERVERS_KEY, CFG_ARCHIVE_KEY, CFG_GENOMES_KEY, + CFG_FOLDER_KEY, CFG_SERVER_KEY, CFG_SERVERS_KEY, CFG_ARCHIVE_KEY, 
CFG_GENOMES_KEY, CFG_ALIASES_KEY, CFG_VERSION_KEY, CFG_ARCHIVE_CONFIG_KEY, CFG_ARCHIVE_KEY_OLD, CFG_REMOTE_URL_BASE_KEY] CFG_GENOME_KEYS = [ CFG_GENOME_DESC_KEY, CFG_ASSETS_KEY, CFG_CHECKSUM_KEY] CFG_GENOME_ATTRS_KEYS = [CFG_GENOME_DESC_KEY, CFG_CHECKSUM_KEY] CFG_SINGLE_ASSET_SECTION_KEYS = [CFG_ASSET_PATH_KEY, CFG_ASSET_DESC_KEY, CFG_ASSET_SIZE_KEY, CFG_ARCHIVE_SIZE_KEY, - CFG_ARCHIVE_CHECKSUM_KEY, CFG_SEEK_KEYS_KEY] + CFG_ARCHIVE_CHECKSUM_KEY, CFG_SEEK_KEYS_KEY, CFG_GENOME_MASK_KEY] -RGC_REQ_KEYS = [CFG_SERVERS_KEY, CFG_FOLDER_KEY, CFG_GENOMES_KEY, CFG_VERSION_KEY] +RGC_REQ_KEYS = [CFG_SERVERS_KEY, CFG_FOLDER_KEY, CFG_GENOMES_KEY, CFG_VERSION_KEY, CFG_ALIASES_KEY] CFG_KEY_NAMES = [ - "CFG_FOLDER_KEY", "CFG_SERVER_KEY", "CFG_SERVERS_KEY", "CFG_GENOMES_KEY", + "CFG_FOLDER_KEY", "CFG_SERVER_KEY", "CFG_SERVERS_KEY", "CFG_GENOMES_KEY", "CFG_GENOME_MASK_KEY", "CFG_ALIASES_KEY", "CFG_ASSET_PATH_KEY", "CFG_ASSET_DESC_KEY", "CFG_ARCHIVE_KEY", "CFG_ARCHIVE_SIZE_KEY", "CFG_SEEK_KEYS_KEY", "CFG_ASSET_SIZE_KEY", "CFG_CHECKSUM_KEY", "CFG_ARCHIVE_CHECKSUM_KEY", "CFG_VERSION_KEY", "CFG_ASSET_PARENTS_KEY", "CFG_ASSET_CHILDREN_KEY", "CFG_TAG_DESC_KEY", "CFG_ASSET_CHECKSUM_KEY", "CFG_ASSET_TAGS_KEY", @@ -112,7 +114,7 @@ # other consts REQ_CFG_VERSION = 0.3 -REFGENIE_BY_CFG = {"0.3": "0.7.0", "0.2": "0.6.0"} +REFGENIE_BY_CFG = {"0.4": "0.8.0", "0.3": "0.7.0", "0.2": "0.6.0"} ATTRS_COPY_PULL = [CFG_ASSET_DESC_KEY, CFG_SEEK_KEYS_KEY, CFG_ASSET_PARENTS_KEY, CFG_ASSET_PATH_KEY, CFG_ASSET_CHECKSUM_KEY, CFG_TAG_DESC_KEY] REQ_TAG_ATTRS = [CFG_ASSET_PATH_KEY, CFG_SEEK_KEYS_KEY] diff --git a/refgenconf/exceptions.py b/refgenconf/exceptions.py index 762e5ad7..a8e88042 100644 --- a/refgenconf/exceptions.py +++ b/refgenconf/exceptions.py @@ -5,7 +5,7 @@ __all__ = ["DownloadJsonError", "GenomeConfigFormatError", "MissingAssetError", "MissingRecipeError", "MissingConfigDataError", "MissingGenomeError", "MissingSeekKeyError", "MissingTagError", "RefgenconfError", 
"UnboundEnvironmentVariablesError", "ConfigNotCompliantError", - "RemoteDigestMismatchError"] + "RemoteDigestMismatchError", "UndefinedAliasError"] DOC_URL = "http://refgenie.databio.org/en/latest/genome_config/" @@ -80,3 +80,8 @@ def __init__(self, asset, local_digest, remote_digest): "because the remote version was not built from the same parent asset you have locally." \ .format(asset, local_digest, remote_digest) super(RemoteDigestMismatchError, self).__init__(msg) + + +class UndefinedAliasError(RefgenconfError): + """ Alias is is not defined. """ + pass diff --git a/refgenconf/refgenconf.py b/refgenconf/refgenconf.py index 099d6234..6167ab0e 100755 --- a/refgenconf/refgenconf.py +++ b/refgenconf/refgenconf.py @@ -763,6 +763,68 @@ def msg_overwrite(): self.run_plugins(POST_PULL_HOOK) return gat, archive_data, server_url + def get_genome_digest(self, alias): + """ + Get the genome digest for human readable alias + + :param str alias: human-readable alias to get the genome digest for + :return str: genome digest + :raise GenomeConfigFormatError: if "genome_digests" section does + not exist in the config + :raise UndefinedAliasError: if a no digest has been defined for the + requested alias + """ + if CFG_ALIASES_KEY not in self: + raise GenomeConfigFormatError( + "'{}' not in genome config".format(CFG_ALIASES_KEY)) + if alias not in self[CFG_ALIASES_KEY].keys(): + raise UndefinedAliasError("No digest defined for '{}'".format(alias)) + return self[CFG_ALIASES_KEY][alias] + + def get_alias(self, digest): + """ + Get the human readable alias for a genome digest + + :param str digest: digest to find human-readable alias for + :return str: human-readable alias + :raise GenomeConfigFormatError: if "genome_digests" section does + not exist in the config + :raise UndefinedAliasError: if a no alias has been defined for the + requested digest + """ + if CFG_ALIASES_KEY not in self: + raise GenomeConfigFormatError( + "'{}' not in genome config".format(CFG_ALIASES_KEY)) + 
for a, d in self[CFG_ALIASES_KEY].items(): + if d == digest: + return a + raise UndefinedAliasError("No alias defined for '{}'".format(digest)) + + def set_alias(self, genome, digest=None): + """ + Assign a human-readable alias to a genome identifier. + + Genomes are identified by a unique identifier which is derived from the + FASTA file (fasta asset). This way we can ensure genome provenance and + compatibility with the server. This function maps a human-readable + identifier to make referring to the genomes easier. + + :param str genome: name of the genome to assign to an identifier + :param str digest: identifier to use + :return bool: whether the alias has been established + """ + if not digest: + raise NotImplementedError("Digest lookup from server is not implemented yet") + with self as r: + r.setdefault(CFG_ALIASES_KEY, {}) + if genome in r[CFG_ALIASES_KEY]: + _LOGGER.warning("'{}' already in aliases ({})". + format(genome, r[CFG_ALIASES_KEY][genome])) + return False + r[CFG_ALIASES_KEY][genome] = digest + _LOGGER.info("Added new alias ({}: {})".format(genome, digest)) + return True + def initialize_genome(self, gat): """ Initialize a genome @@ -771,7 +833,8 @@ def initialize_genome(self, gat): for the FASTA file in the genome directory. 
:param list[str] gat: list of genome, asset and tag names - :return + :return str, list[dict[str]]: a pair of genome digest and list of + annotated sequence digests """ g = gat[0] _LOGGER.info("Initializing genome: {}".format(g)) @@ -780,7 +843,7 @@ def initialize_genome(self, gat): with open(pth, "w") as jfp: json.dump(c, jfp) _LOGGER.debug("Saved ASDs to JSON: {}".format(pth)) - self[CFG_GENOMES_KEY][g][CFG_CHECKSUM_KEY] = d + self.set_alias(g, d) return d, c def get_asds_path(self, genome): From 4b6e773464785de376fb65fad02a94f142145811 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 17 Jun 2020 17:01:05 -0400 Subject: [PATCH 19/29] decrease version for plugins only --- refgenconf/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/refgenconf/_version.py b/refgenconf/_version.py index b4e087d8..fcf769f5 100644 --- a/refgenconf/_version.py +++ b/refgenconf/_version.py @@ -1 +1 @@ -__version__ = "0.8.0-dev" +__version__ = "0.7.1-dev" From 6770601cdffd2488aba3daeb380220f6ce944cd9 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 17 Jun 2020 17:14:33 -0400 Subject: [PATCH 20/29] remove compatibility funcitonalities --- .travis.yml | 3 - refgenconf/refgenconf.py | 142 ++---------------------------- requirements/requirements-all.txt | 3 +- 3 files changed, 6 insertions(+), 142 deletions(-) diff --git a/.travis.yml b/.travis.yml index 2bd5fa2c..4b0bff89 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,9 +6,6 @@ python: - "3.8" os: - linux -before_install: #installing unreleased packages - - pip install git+https://github.com/databio/henge.git - - pip install git+https://github.com/databio/refget-py install: - pip install . 
- pip install -r requirements/requirements-dev.txt diff --git a/refgenconf/refgenconf.py b/refgenconf/refgenconf.py index 6167ab0e..359854e3 100755 --- a/refgenconf/refgenconf.py +++ b/refgenconf/refgenconf.py @@ -25,7 +25,6 @@ import shutil import json -from refget import RefGetHenge from attmap import PathExAttMap as PXAM from ubiquerg import checksum, is_url, query_yes_no, parse_registry_path as prp, untar, is_writable from tqdm import tqdm @@ -713,8 +712,9 @@ def msg_overwrite(): continue except ConnectionRefusedError as e: _LOGGER.error(str(e)) - _LOGGER.error("Server {}/{} refused download. Check your internet settings".format(server_url, - API_VERSION)) + _LOGGER.error("Server {}/{} refused download. " + "Check your internet settings". + format(server_url, API_VERSION)) return _null_return() except ContentTooShortError as e: _LOGGER.error(str(e)) @@ -750,112 +750,15 @@ def msg_overwrite(): rgc.update_tags(*gat, data={attr: archive_data[attr] for attr in ATTRS_COPY_PULL if attr in archive_data}) rgc.set_default_pointer(*gat) - if gat[1] == "fasta": - self.initialize_genome(gat) else: [self.chk_digest_update_child(gat[0], x, "{}/{}:{}".format(*gat), server_url) for x in archive_data[CFG_ASSET_PARENTS_KEY] if CFG_ASSET_PARENTS_KEY in archive_data] self.update_tags(*gat, data={attr: archive_data[attr] - for attr in ATTRS_COPY_PULL if attr in archive_data}) + for attr in ATTRS_COPY_PULL if attr in archive_data}) self.set_default_pointer(*gat) - if gat[1] == "fasta": - self.initialize_genome(gat) self.run_plugins(POST_PULL_HOOK) return gat, archive_data, server_url - def get_genome_digest(self, alias): - """ - Get the genome digest for human readable alias - - :param str alias: human-readable alias to get the genome digest for - :return str: genome digest - :raise GenomeConfigFormatError: if "genome_digests" section does - not exist in the config - :raise UndefinedAliasError: if a no digest has been defined for the - requested alias - """ - if CFG_ALIASES_KEY not 
in self: - raise GenomeConfigFormatError( - "'{}' not in genome config".format(CFG_ALIASES_KEY)) - if alias not in self[CFG_ALIASES_KEY].keys(): - raise UndefinedAliasError("No digest defined for '{}'".format(alias)) - return self[CFG_ALIASES_KEY][alias] - - def get_alias(self, digest): - """ - Get the human readable alias for a genome digest - - :param str digest: digest to find human-readable alias for - :return str: human-readable alias - :raise GenomeConfigFormatError: if "genome_digests" section does - not exist in the config - :raise UndefinedAliasError: if a no alias has been defined for the - requested digest - """ - if CFG_ALIASES_KEY not in self: - raise GenomeConfigFormatError( - "'{}' not in genome config".format(CFG_ALIASES_KEY)) - for a, d in self[CFG_ALIASES_KEY].items(): - if d == digest: - return a - raise UndefinedAliasError("No alias defined for '{}'".format(digest)) - - def set_alias(self, genome, digest=None): - """ - Assign a human-readable alias to a genome identifier. - - Genomes are identified by a unique identifier which is derived from the - FASTA file (fasta asset). This way we can ensure genome provenance and - compatibility with the server. This function maps a human-readable - identifier to make referring to the genomes easier. - - :param str genome: name of the genome to assign to an identifier - :param str digest: identifier to use - :return bool: whether the alias has been established - """ - if not digest: - raise NotImplementedError("Digest lookup from server is not implemented yet") - with self as r: - r.setdefault(CFG_ALIASES_KEY, {}) - if genome in r[CFG_ALIASES_KEY]: - _LOGGER.warning("'{}' already in aliases ({})". 
- format(genome, r[CFG_ALIASES_KEY][genome])) - return False - r[CFG_ALIASES_KEY][genome] = digest - _LOGGER.info("Added new alias ({}: {})".format(genome, digest)) - return True - - def initialize_genome(self, gat): - """ - Initialize a genome - - Create a JSON file with Annotated Sequence Digests (ASDs) - for the FASTA file in the genome directory. - - :param list[str] gat: list of genome, asset and tag names - :return str, list[dict[str]]: a pair of genome digest and list of - annotated sequence digests - """ - g = gat[0] - _LOGGER.info("Initializing genome: {}".format(g)) - d, c = RefGetHenge({}).load_fasta(self.seek(*gat, strict_exists=True)) - pth = self.get_asds_path(g) - with open(pth, "w") as jfp: - json.dump(c, jfp) - _LOGGER.debug("Saved ASDs to JSON: {}".format(pth)) - self.set_alias(g, d) - return d, c - - def get_asds_path(self, genome): - """ - Get path to the Annotated Sequence Digests JSON file for a given genome. - Note that the path and/or genome may not exist. - - :param str genome: genome name - :return str: ASDs path - """ - return os.path.join(self[CFG_FOLDER_KEY], genome, genome + "__ASDs.json") - def remove_asset_from_relatives(self, genome, asset, tag): """ Remove any relationship links associated with the selected asset @@ -1330,42 +1233,6 @@ def id(self, genome, asset, tag=None): raise MissingConfigDataError("Digest does not exist for: {}/{}:{}". format(genome, asset, tag)) - def compare(self, genome1, genome2, explain=False): - """ - Check genomes compatibility level. 
- - Compares Annotated Sequence Digests (ASDs) -- digested sequences and metadata - - :param str genome1: name of the first genome to compare - :param str genome2: name of the first genome to compare - :param bool explain: whether the returned code explanation should - be displayed - :return int: compatibility code - """ - def _get_asds_for_genome(rgc, genome): - """ - Read JSON file containing ASDs for a specified genome - - :param refgenconf.RefGenConf rgc: object to find the genome for - :param str genome: genome to find the file for - :return list[dict]: list of ASDs, ready to compare - """ - rgc.seek(genome, "fasta", strict_exists=True) - json_file = rgc.get_asds_path(genome) - if not os.path.exists(json_file): - raise OSError( - "File containing Annotated Sequence Digests (ASDs) not " - "found for genome: {g}. Pull or build '{g}/fasta' again to " - "check the compatibility.".format(g=genome)) - with open(json_file, "r") as jfp: - return json.load(jfp) - - return RefGetHenge({}).compare_asds( - _get_asds_for_genome(self, genome1), - _get_asds_for_genome(self, genome2), - explain=explain - ) - def run_plugins(self, hook): """ Runs all installed plugins for the specified hook. 
@@ -1749,6 +1616,7 @@ def map_paths_by_id(json_dict): raise ValueError("The provided mapping is not a valid representation of a JSON openAPI description") return {values["get"]["operationId"]: endpoint for endpoint, values in json_dict["paths"].items()} + def _remove(path): """ remove asset if it is a dir or a file diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 21f78c1a..8d17ea5f 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -4,5 +4,4 @@ requests tqdm>=4.38.0 ubiquerg>=0.5.0 yacman>=0.6.6 -future -refget>=0.0.1 \ No newline at end of file +future \ No newline at end of file From 8678791c9e26d243d0871c2dfc9327bda5934ac2 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 17 Jun 2020 17:50:17 -0400 Subject: [PATCH 21/29] update tests and consts --- refgenconf/const.py | 12 ++++------ tests/data/genomes.yaml | 48 ++++++++++++++++++++++++++++++++++++++- tests/test_1pull_asset.py | 14 ------------ tests/test_compare.py | 29 ----------------------- 4 files changed, 52 insertions(+), 51 deletions(-) delete mode 100644 tests/test_compare.py diff --git a/refgenconf/const.py b/refgenconf/const.py index fac15cad..eed4c7f2 100644 --- a/refgenconf/const.py +++ b/refgenconf/const.py @@ -57,13 +57,11 @@ CFG_REMOTE_URL_BASE_KEY = "remote_url_base" CFG_VERSION_KEY = "config_version" CFG_GENOMES_KEY = "genomes" -CFG_ALIASES_KEY = "genome_aliases" CFG_CHECKSUM_KEY = "genome_digest" CFG_GENOME_DESC_KEY = "genome_description" CFG_ASSETS_KEY = "assets" -CFG_GENOME_MASK_KEY = "genome_mask" CFG_ASSET_PATH_KEY = "asset_path" CFG_ASSET_SIZE_KEY = "asset_size" CFG_ASSET_DESC_KEY = "asset_description" @@ -80,18 +78,18 @@ CFG_ASSET_RELATIVES_KEYS = [CFG_ASSET_CHILDREN_KEY, CFG_ASSET_PARENTS_KEY] CFG_TOP_LEVEL_KEYS = [ - CFG_FOLDER_KEY, CFG_SERVER_KEY, CFG_SERVERS_KEY, CFG_ARCHIVE_KEY, CFG_GENOMES_KEY, CFG_ALIASES_KEY, + CFG_FOLDER_KEY, CFG_SERVER_KEY, CFG_SERVERS_KEY, CFG_ARCHIVE_KEY, CFG_GENOMES_KEY, 
CFG_VERSION_KEY, CFG_ARCHIVE_CONFIG_KEY, CFG_ARCHIVE_KEY_OLD, CFG_REMOTE_URL_BASE_KEY] CFG_GENOME_KEYS = [ CFG_GENOME_DESC_KEY, CFG_ASSETS_KEY, CFG_CHECKSUM_KEY] CFG_GENOME_ATTRS_KEYS = [CFG_GENOME_DESC_KEY, CFG_CHECKSUM_KEY] CFG_SINGLE_ASSET_SECTION_KEYS = [CFG_ASSET_PATH_KEY, CFG_ASSET_DESC_KEY, CFG_ASSET_SIZE_KEY, CFG_ARCHIVE_SIZE_KEY, - CFG_ARCHIVE_CHECKSUM_KEY, CFG_SEEK_KEYS_KEY, CFG_GENOME_MASK_KEY] + CFG_ARCHIVE_CHECKSUM_KEY, CFG_SEEK_KEYS_KEY] -RGC_REQ_KEYS = [CFG_SERVERS_KEY, CFG_FOLDER_KEY, CFG_GENOMES_KEY, CFG_VERSION_KEY, CFG_ALIASES_KEY] +RGC_REQ_KEYS = [CFG_SERVERS_KEY, CFG_FOLDER_KEY, CFG_GENOMES_KEY, CFG_VERSION_KEY] CFG_KEY_NAMES = [ - "CFG_FOLDER_KEY", "CFG_SERVER_KEY", "CFG_SERVERS_KEY", "CFG_GENOMES_KEY", "CFG_GENOME_MASK_KEY", "CFG_ALIASES_KEY", + "CFG_FOLDER_KEY", "CFG_SERVER_KEY", "CFG_SERVERS_KEY", "CFG_GENOMES_KEY", "CFG_ASSET_PATH_KEY", "CFG_ASSET_DESC_KEY", "CFG_ARCHIVE_KEY", "CFG_ARCHIVE_SIZE_KEY", "CFG_SEEK_KEYS_KEY", "CFG_ASSET_SIZE_KEY", "CFG_CHECKSUM_KEY", "CFG_ARCHIVE_CHECKSUM_KEY", "CFG_VERSION_KEY", "CFG_ASSET_PARENTS_KEY", "CFG_ASSET_CHILDREN_KEY", "CFG_TAG_DESC_KEY", "CFG_ASSET_CHECKSUM_KEY", "CFG_ASSET_TAGS_KEY", @@ -114,7 +112,7 @@ # other consts REQ_CFG_VERSION = 0.3 -REFGENIE_BY_CFG = {"0.4": "0.8.0", "0.3": "0.7.0", "0.2": "0.6.0"} +REFGENIE_BY_CFG = {"0.3": "0.7.0", "0.2": "0.6.0"} ATTRS_COPY_PULL = [CFG_ASSET_DESC_KEY, CFG_SEEK_KEYS_KEY, CFG_ASSET_PARENTS_KEY, CFG_ASSET_PATH_KEY, CFG_ASSET_CHECKSUM_KEY, CFG_TAG_DESC_KEY] REQ_TAG_ATTRS = [CFG_ASSET_PATH_KEY, CFG_SEEK_KEYS_KEY] diff --git a/tests/data/genomes.yaml b/tests/data/genomes.yaml index a103da3f..f72ff849 100644 --- a/tests/data/genomes.yaml +++ b/tests/data/genomes.yaml @@ -1,4 +1,50 @@ config_version: 0.3 genome_folder: /tmp genome_servers: ['http://refgenomes.databio.org'] -genomes: null \ No newline at end of file +genomes: + human_repeats: + assets: + fasta: + tags: + default: + seek_keys: + fasta: human_repeats.fa + fai: human_repeats.fa.fai + chrom_sizes: 
human_repeats.chrom.sizes + asset_parents: [] + asset_path: fasta + asset_digest: 4a749d4e74b057d0efa0c8398ebcb871 + default_tag: default + mouse_chrM2x: + assets: + bwa_index: + tags: + default: + seek_keys: + bwa_index: mouse_chrM2x.fa + asset_parents: [] + asset_path: bwa_index + asset_digest: 914dec83dcfab73e056717d33ecfd465 + default_tag: default + rCRSd: + assets: + bowtie2_index: + tags: + default: + seek_keys: + bowtie2_index: rCRSd + asset_parents: [] + asset_path: bowtie2_index + asset_digest: 1262e30d4a87db9365d501de8559b3b4 + default_tag: default + fasta: + tags: + default: + seek_keys: + fasta: rCRSd.fa + fai: rCRSd.fa.fai + chrom_sizes: rCRSd.chrom.sizes + asset_parents: [] + asset_path: fasta + asset_digest: 4eb430296bc02ed7e4006624f1d5ac53 + default_tag: default diff --git a/tests/test_1pull_asset.py b/tests/test_1pull_asset.py index d47c9a31..3fba59e6 100644 --- a/tests/test_1pull_asset.py +++ b/tests/test_1pull_asset.py @@ -123,17 +123,3 @@ def test_pull_asset_works_with_nonwritable_and_writable_rgc(cfg_file, gname, ana with mock.patch("refgenconf.refgenconf.query_yes_no", return_value=True): print("\nPulling; genome: {}, asset: {}, tag: {}\n".format(gname, aname, tname)) rgc.pull(gname, aname, tname) - - -@pytest.mark.parametrize("gname", ["rCRSd", "mouse_chrM2x", "human_repeats"]) -def test_pull_initializes_genomes(cfg_file, gname): - """ - Test for existence of a JSON files containing ASDs and genome digests in the - cfg after a fasta asset has been pulled for a genome - """ - rgc = RefGenConf(filepath=cfg_file, writable=False) - assert os.path.exists(rgc.get_asds_path(gname)) - assert CFG_CHECKSUM_KEY in rgc.get_genome_attributes(gname) - assert isinstance(rgc.get_genome_attributes(gname)[CFG_CHECKSUM_KEY], str) - - diff --git a/tests/test_compare.py b/tests/test_compare.py deleted file mode 100644 index e6f2c1d7..00000000 --- a/tests/test_compare.py +++ /dev/null @@ -1,29 +0,0 @@ -""" Tests for RefGenConf.compare. 
These tests depend on successful completion of tests is test_1pull_asset.py """ - -import pytest -import os - -__author__ = "Michal Stolarczyk" -__email__ = "michal@virginia.edu" - - -class TestCompare: - @pytest.mark.parametrize(["gname1", "gname2", "result"], - [("rCRSd", "rCRSd", 63), - ("mouse_chrM2x", "mouse_chrM2x", 63), - ("rCRSd", "mouse_chrM2x", 0)]) - def test_compare_result(self, ro_rgc, gname1, gname2, result): - assert ro_rgc.compare(gname1, gname2) == result - - @pytest.mark.parametrize(["gname1", "gname2"], - [("rCRSd", "rCRSd"), - ("mouse_chrM2x", "mouse_chrM2x"), - ("rCRSd", "mouse_chrM2x")]) - def test_compare_errors_when_no_asd_json(self, ro_rgc, gname1, gname2): - jfp = ro_rgc.get_asds_path(gname1) - os.rename(jfp, jfp + "_renamed") - with pytest.raises(OSError): - ro_rgc.compare(gname1, gname2) - os.rename(jfp + "_renamed", jfp) - - From 23db73e9d074fae59be51d8cf28b188c479630ac Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 17 Jun 2020 17:58:30 -0400 Subject: [PATCH 22/29] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index cfd358a5..b8686b0a 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ # refgenconf [![Build Status](https://travis-ci.org/databio/refgenconf.svg?branch=master)](https://travis-ci.org/databio/refgenconf) -[![Coverage Status](https://coveralls.io/repos/github/vreuter/refgenconf/badge.svg?branch=master)](https://coveralls.io/github/vreuter/refgenconf?branch=master) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/refgenconf/README.html) Configuration object for refgenie *et al.* From db9d019d3ff7d9d73bcb8833f14381d0f7615b06 Mon Sep 17 00:00:00 2001 From: nsheff Date: Fri, 19 Jun 2020 09:13:20 -0400 Subject: [PATCH 23/29] increase max wait --- refgenconf/refgenconf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/refgenconf/refgenconf.py 
b/refgenconf/refgenconf.py index 359854e3..82a868dd 100755 --- a/refgenconf/refgenconf.py +++ b/refgenconf/refgenconf.py @@ -58,7 +58,7 @@ def handle(sig, frame): class RefGenConf(yacman.YacAttMap): """ A sort of oracle of available reference genome assembly assets """ - def __init__(self, filepath=None, entries=None, writable=False, wait_max=10): + def __init__(self, filepath=None, entries=None, writable=False, wait_max=60): """ Create the config instance by with a filepath or key-value pairs. From b362d4141d963738aabc80be7537a6cc8572c9d4 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 22 Jun 2020 08:57:55 -0400 Subject: [PATCH 24/29] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b8686b0a..a3a96368 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # refgenconf [![Build Status](https://travis-ci.org/databio/refgenconf.svg?branch=master)](https://travis-ci.org/databio/refgenconf) +[![Coverage Status](https://coveralls.io/repos/github/refgenie/refgenconf/badge.svg?branch=master)](https://coveralls.io/github/refgenie/refgenconf?branch=master) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/refgenconf/README.html) Configuration object for refgenie *et al.* From 26a239fe36a039d7deb9d3809d55d2afea6beae1 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 22 Jun 2020 09:03:07 -0400 Subject: [PATCH 25/29] Update .travis.yml --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 4b0bff89..e53e2d48 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,6 +11,8 @@ install: - pip install -r requirements/requirements-dev.txt - pip install -r requirements/requirements-test.txt script: pytest --remote-data --cov=refgenconf +after_success: + - coveralls branches: only: - dev From da3b50a3ecd40d73f57a39d4abd56cffbce85ec5 Mon Sep 17 00:00:00 2001 From: Michal 
Stolarczyk Date: Mon, 22 Jun 2020 09:39:22 -0400 Subject: [PATCH 26/29] setup pypi release workflow --- .github/workflows/python-publish.yml | 31 ++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 .github/workflows/python-publish.yml diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 00000000..4e1ef42d --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,31 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries + +name: Upload Python Package + +on: + release: + types: [created] + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel twine + - name: Build and publish + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: | + python setup.py sdist bdist_wheel + twine upload dist/* From 95ba7a498d9ab6abacd64cc90881bc487d8553ab Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 23 Jun 2020 10:00:36 -0400 Subject: [PATCH 27/29] prep release --- docs/changelog.md | 5 ++++- refgenconf/_version.py | 2 +- requirements/requirements-all.txt | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index a1fc0c75..1337e4fc 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,11 +2,14 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format.
-## [0.7.1] - unreleased +## [0.7.1] - 2020-06-23 ### Added - plugins functionality +### Removed +- previously deprecated `get_asset` method. Use `seek` instead + ## [0.7.0] - 2020-03-17 ### Added diff --git a/refgenconf/_version.py b/refgenconf/_version.py index fcf769f5..a5f830a2 100644 --- a/refgenconf/_version.py +++ b/refgenconf/_version.py @@ -1 +1 @@ -__version__ = "0.7.1-dev" +__version__ = "0.7.1" diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 8d17ea5f..99c62f2c 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -2,6 +2,6 @@ attmap>=0.12.5 pyyaml requests tqdm>=4.38.0 -ubiquerg>=0.5.0 -yacman>=0.6.6 +ubiquerg>=0.6.0 +yacman>=0.6.8 future \ No newline at end of file From 6ef3dbe69199ee50d6b8b555715e882cca0eed6c Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 25 Jun 2020 09:39:26 -0400 Subject: [PATCH 28/29] prep release --- .travis.yml | 2 +- docs/changelog.md | 5 ++++- refgenconf/_version.py | 2 +- setup.py | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index e53e2d48..65e609c6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,8 @@ language: python python: - - "2.7" - "3.5" - "3.6" + - "3.7" - "3.8" os: - linux diff --git a/docs/changelog.md b/docs/changelog.md index 1337e4fc..800f3f32 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,11 +2,14 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. -## [0.7.1] - 2020-06-23 +## [0.8.0] - 2020-06-25 ### Added - plugins functionality +### Changed +- dropped Python 2 support + ### Removed - previously deprecated `get_asset` method.
Use `seek` instead diff --git a/refgenconf/_version.py b/refgenconf/_version.py index a5f830a2..777f190d 100644 --- a/refgenconf/_version.py +++ b/refgenconf/_version.py @@ -1 +1 @@ -__version__ = "0.7.1" +__version__ = "0.8.0" diff --git a/setup.py b/setup.py index fa303ee8..98a129dc 100644 --- a/setup.py +++ b/setup.py @@ -40,9 +40,10 @@ classifiers=[ "Development Status :: 4 - Beta", "License :: OSI Approved :: BSD License", - "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", "Topic :: Scientific/Engineering :: Bio-Informatics" ], license="BSD2", From 709693c9c41d0d88e3901da1664ef1a3503622f0 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 25 Jun 2020 10:04:50 -0400 Subject: [PATCH 29/29] use context manager for tempdir creation, simplify imports --- refgenconf/refgenconf.py | 43 +++++++++++++++------------------------- 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/refgenconf/refgenconf.py b/refgenconf/refgenconf.py index 82a868dd..96700bcb 100755 --- a/refgenconf/refgenconf.py +++ b/refgenconf/refgenconf.py @@ -1,21 +1,6 @@ #!/usr/bin/env python -from collections import Iterable, Mapping, OrderedDict -from functools import partial - -# Some short-term hacks to get at least 1 working version on python 2.7 import sys -if sys.version_info >= (3, ): - from inspect import getfullargspec as finspect - from urllib.error import HTTPError, ContentTooShortError -else: - from future.standard_library import install_aliases - install_aliases() - from inspect import getargspec as finspect - from urllib2 import HTTPError - from urllib.error import ContentTooShortError - ConnectionRefusedError = Exception - import urllib.request import itertools import logging @@ -25,12 +10,19 @@ import shutil import json -from attmap import PathExAttMap as PXAM -from ubiquerg import checksum, is_url, 
query_yes_no, parse_registry_path as prp, untar, is_writable +import yacman + +from collections import Iterable, Mapping, OrderedDict +from functools import partial +from inspect import getfullargspec as finspect +from urllib.error import HTTPError, ContentTooShortError from tqdm import tqdm from pkg_resources import iter_entry_points +from tempfile import TemporaryDirectory -import yacman +from attmap import PathExAttMap as PXAM +from ubiquerg import checksum, is_url, query_yes_no, \ + parse_registry_path as prp, untar, is_writable from .const import * from .helpers import unbound_env_vars, asciify_json_dict, select_genome_config @@ -38,7 +30,6 @@ _LOGGER = logging.getLogger(__name__) - __all__ = ["RefGenConf"] @@ -730,17 +721,15 @@ def msg_overwrite(): return _null_return() else: _LOGGER.debug("Matched checksum: '{}'".format(old_checksum)) - from tempfile import mkdtemp # successfully downloaded and moved tarball; untar it if unpack and filepath.endswith(".tgz"): _LOGGER.info("Extracting asset tarball and saving to: {}".format(tag_dir)) - tmpdir = mkdtemp(dir=genome_dir_path) # TODO: use context manager here when we drop support for py2 - untar(filepath, tmpdir) - # here we suspect the unarchived asset to be an asset-named directory - # the asset data inside - # and we transfer it to the tag-named subdirectory - shutil.move(os.path.join(tmpdir, asset), tag_dir) - shutil.rmtree(tmpdir) + with TemporaryDirectory(dir=genome_dir_path) as tmpdir: + # here we suspect the unarchived asset to be an asset-named + # directory with the asset data inside and we transfer it + # to the tag-named subdirectory + untar(filepath, tmpdir) + shutil.move(os.path.join(tmpdir, asset), tag_dir) if os.path.isfile(filepath): os.remove(filepath) if self.file_path: