Skip to content

Commit

Permalink
Skip dirs in parse_directory (#478)
Browse files Browse the repository at this point in the history
<!-- Please ensure the PR fulfills the following requirements! -->
<!-- If this is your first PR, make sure to add your details to the
AUTHORS.rst! -->
### Pull Request Checklist:
- [ ] This PR addresses an already opened issue (for bug fixes /
features)
    - This PR fixes #xyz
- [x] (If applicable) Documentation has been added / updated (for bug
fixes / features).
- [x] (If applicable) Tests have been added.
- [x] This PR does not seem to break the templates.
- [x] CHANGELOG.rst has been updated (with summary of main changes).
- [ ] Link to issue (:issue:`number`) and pull request (:pull:`number`)
has been added.

### What kind of change does this PR introduce?

* `parse_directory` : New argument `skip_dirs`, a list of folders to not
go into when parsing.

### Does this PR introduce a breaking change?
No.

### Other information:
The MRCC5 catalog (him again) has two issues this helps fix:
- Some simulations are copied in two sub-folders of the list of
`directories` to parse. This is because I want to give some time to
people to adapt to the new disk. The entries are exactly the same except
for the path, so there's no way to filter them out currently with the
other arguments of `parse_directory`.
- There are so many files... Skipping will reduce the parsing time
by not walking into folders I _know_ are garbage and would be rejected
anyway by other filters.
  • Loading branch information
aulemahal authored Oct 18, 2024
2 parents 5085476 + 94037f8 commit 937326f
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 2 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@ Changelog

v0.11.0 (unreleased)
--------------------
Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`).
Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`), Pascal Bourgault (:user:`aulemahal`).

New features and enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* ``xs.io.make_toc`` now includes the global attributes of the dataset after the information about the variables. (:pull:`473`).
* New function ``xs.get_warming_level_from_period`` to get the warming level associated with a given time horizon. (:pull:`474`).
* Added ability to skip whole folders to ``xs.parse_directory`` with argument ``skip_dirs``. (:pull:`478`).

Breaking changes
^^^^^^^^^^^^^^^^
Expand Down
15 changes: 14 additions & 1 deletion src/xscen/catutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def _find_assets(
exts: set[str],
lengths: set[int],
dirglob: str | None = None,
skip_dirs: list[os.PathLike] | None = None,
):
"""Walk recursively over files in a directory, filtering according to a glob pattern, path depth and extensions.
Expand All @@ -138,7 +139,10 @@ def _find_assets(
dirglob : str, optional
A glob pattern. If given, only parent folders matching this pattern are walked through.
This pattern can not include the asset's basename.
skip_dirs : list of Paths, optional
A list of directories to skip on the walk.
"""
skip_dirs = skip_dirs or []
root = str(Path(root)) # to be sure
for top, alldirs, files in os.walk(root):
# Split zarr subdirectories from next iteration
Expand All @@ -147,6 +151,8 @@ def _find_assets(
if dr.endswith(".zarr"):
zarrs.append(dr)
alldirs.remove(dr)
if Path(top).joinpath(dr) in skip_dirs:
alldirs.remove(dr)

if (
top != root
Expand Down Expand Up @@ -270,6 +276,7 @@ def _parse_dir( # noqa: C901
root: os.PathLike | str,
patterns: list[str],
dirglob: str | None = None,
skip_dirs: list[os.PathLike] | None = None,
checks: list[str] | None = None,
read_from_file: list[str] | dict | None = None,
attrs_map: dict | None = None,
Expand All @@ -289,6 +296,8 @@ def _parse_dir( # noqa: C901
dirglob : str
A glob pattern. If given, only parent folders matching this pattern are walked through.
This pattern can not include the asset's basename.
skip_dirs : list of strings or Paths, optional
A list of directories to skip in the walk.
checks: list of strings, optional
A list of checks to perform, available values are:
- "readable" : Check that the file is readable by the current user.
Expand Down Expand Up @@ -389,7 +398,7 @@ def parse_worker():

# Skip the checks if none are requested (save some overhead)
q = q_found if checks else q_checked
for path in _find_assets(Path(root), exts, lengths, dirglob):
for path in _find_assets(Path(root), exts, lengths, dirglob, skip_dirs):
q.put(path)

q_found.join()
Expand Down Expand Up @@ -465,6 +474,7 @@ def parse_directory( # noqa: C901
homogenous_info: dict | None = None,
cvs: str | os.PathLike | dict | None = None,
dirglob: str | None = None,
skip_dirs: list[str | os.PathLike] | None = None,
xr_open_kwargs: Mapping[str, Any] | None = None,
only_official_columns: bool = True,
progress: bool = False,
Expand Down Expand Up @@ -506,6 +516,8 @@ def parse_directory( # noqa: C901
dirglob : str, optional
A glob pattern for path matching to accelerate the parsing of a directory tree if only a subtree is needed.
Only folders matching the pattern are parsed to find datasets.
skip_dirs : list of str or Paths, optional
A list of folders that will be removed from the search.
xr_open_kwargs: dict
If needed, arguments to send xr.open_dataset() when opening the file to read the attributes.
only_official_columns: bool
Expand Down Expand Up @@ -597,6 +609,7 @@ def parse_directory( # noqa: C901
parse_kwargs = dict(
patterns=patterns,
dirglob=dirglob,
skip_dirs=[Path(d) for d in (skip_dirs or [])],
read_from_file=read_from_file if not read_file_groups else None,
attrs_map=attrs_map,
xr_open_kwargs=xr_open_kwargs,
Expand Down
37 changes: 37 additions & 0 deletions tests/test_catutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,43 @@ def test_parse_directory_idcols():
assert (df["id"] == "example-region_NCC").all()


@pytest.mark.requires_netcdf
def test_parse_directory_skipdirs():
    """Check that folders listed in ``skip_dirs`` are left out of the parsed catalog.

    Two experiment folders (ssp126 and ssp245) are passed as plain strings,
    one without and one with a trailing slash. Only the entries of the
    remaining experiments (ssp370 and ssp585) are expected in the result.
    """
    samples = str(SAMPLES_DIR)
    # One path without and one with a trailing slash: both forms must be skipped.
    skipped = [
        samples + "/ScenarioMIP/example-region/NCC/NorESM2-MM/ssp126",
        samples + "/ScenarioMIP/example-region/NCC/NorESM2-MM/ssp245/",
    ]
    pattern = "{activity}/{domain}/{institution}/{source}/{experiment}/{member:rev}/{frequency}/{?:_}.nc"
    common_info = {
        "mip_era": "CMIP6",
        "type": "simulation",
        "processing_level": "raw",
    }
    vocabularies = {
        "domain": {"example-region": "exreg"},
        "attributes": {"version_id": "version"},
    }

    df = cu.parse_directory(
        directories=[samples],
        skip_dirs=skipped,
        patterns=[pattern],
        homogenous_info=common_info,
        read_from_file=["variable", "date_start", "date_end", "version"],
        xr_open_kwargs={"engine": "h5netcdf"},
        cvs=vocabularies,
        file_checks=["readable", "ncvalid"],
    )

    assert len(df) == 4
    assert (df["activity"] == "ScenarioMIP").all()
    assert (df["mip_era"] == "CMIP6").all()
    # Simple controlled-vocabulary replacement of the domain.
    assert (df["domain"] == "exreg").all()
    # The variable column is read from the file itself.
    fixed_fields = df[df["frequency"] == "fx"]
    assert (fixed_fields["variable"] == ("sftlf",)).all()
    assert df.date_start.dtype == "<M8[ms]"
    assert df.date_end.dtype == "<M8[ms]"
    # Only the experiments that were not skipped remain.
    assert set(df.experiment.unique()) == {"ssp370", "ssp585"}


def test_parse_from_ds():
# Real ds
ds = xr.tutorial.open_dataset("air_temperature")
Expand Down

0 comments on commit 937326f

Please sign in to comment.