From b4dc0def47d5fcd34699712bc9fc854ff3e80b64 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Mon, 21 Oct 2024 10:08:18 -0400 Subject: [PATCH 1/2] skip root dirs --- CHANGELOG.rst | 2 +- src/xscen/catutils.py | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 3523581c..78a5e5bc 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,7 +10,7 @@ New features and enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ * ``xs.io.make_toc`` now includes the global attributes of the dataset after the information about the variables. (:pull:`473`). * New function ``xs.get_warming_level_from_period`` to get the warming level associated with a given time horizon. (:pull:`474`). -* Added ability to skip whole folders to ``xs.parse_directory`` with argument ``skip_dirs``. +* Added ability to skip whole folders to ``xs.parse_directory`` with argument ``skip_dirs``. (:pull:`478`, :pull:`479`). Breaking changes ^^^^^^^^^^^^^^^^ diff --git a/src/xscen/catutils.py b/src/xscen/catutils.py index 5ed56357..0a0d5e8f 100644 --- a/src/xscen/catutils.py +++ b/src/xscen/catutils.py @@ -148,10 +148,12 @@ def _find_assets( # Split zarr subdirectories from next iteration zarrs = [] for dr in deepcopy(alldirs): + fdr = Path(top).joinpath(dr) if dr.endswith(".zarr"): zarrs.append(dr) alldirs.remove(dr) - if Path(top).joinpath(dr) in skip_dirs: + if fdr in skip_dirs: + logger.debug("Skipping %s", fdr) alldirs.remove(dr) if ( @@ -323,6 +325,12 @@ def _parse_dir( # noqa: C901 exts = {Path(patt).suffix for patt in patterns} comp_patterns = list(map(_compile_pattern, patterns)) checks = checks or [] + parsed = [] + + root = Path(root) + if any([(skd in root.parents) or (skd == root) for skd in skip_dirs]): + logger.debug("Skipping %s", root) + return parsed # Multithread, communicating via FIFO queues. # This thread walks the directory @@ -334,7 +342,6 @@ def _parse_dir( # noqa: C901 # Usually, the walking is the bottleneck. q_found = queue.Queue() q_checked = queue.Queue() - parsed = [] def check_worker(): # Worker that processes the checks. @@ -517,7 +524,7 @@ def parse_directory( # noqa: C901 A glob pattern for path matching to accelerate the parsing of a directory tree if only a subtree is needed. Only folders matching the pattern are parsed to find datasets. skip_dirs : list of str or Paths, optional - A list of folders that will be removed from the search. + A list of folders that will be removed from the search, should be absolute. xr_open_kwargs: dict If needed, arguments to send xr.open_dataset() when opening the file to read the attributes. only_official_columns: bool From 531a09d288af4f7662b40d4dc809a933644ab5b5 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Mon, 21 Oct 2024 10:26:28 -0400 Subject: [PATCH 2/2] Fix _parse dir for none skip_dirs --- src/xscen/catutils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xscen/catutils.py b/src/xscen/catutils.py index 0a0d5e8f..7737a5bc 100644 --- a/src/xscen/catutils.py +++ b/src/xscen/catutils.py @@ -328,7 +328,7 @@ def _parse_dir( # noqa: C901 parsed = [] root = Path(root) - if any([(skd in root.parents) or (skd == root) for skd in skip_dirs]): + if any([(skd in root.parents) or (skd == root) for skd in (skip_dirs or [])]): logger.debug("Skipping %s", root) return parsed