diff --git a/bids2table/__main__.py b/bids2table/__main__.py
index d8791b3..83bb0c7 100644
--- a/bids2table/__main__.py
+++ b/bids2table/__main__.py
@@ -54,6 +54,12 @@ def main():
         default=0,
         help="Increase verbosity level.",
     )
+    parser.add_argument(
+        "--exclude",
+        nargs="+",
+        default=None,
+        help="List of directory names or glob patterns to exclude from indexing.",
+    )
 
     args = parser.parse_args()
 
@@ -68,6 +74,7 @@ def main():
         overwrite=args.overwrite,
         workers=args.workers,
         worker_id=args.worker_id,
+        exclude=args.exclude,
         return_table=False,
     )
 
diff --git a/bids2table/_b2t.py b/bids2table/_b2t.py
index 1749245..1f83d1e 100644
--- a/bids2table/_b2t.py
+++ b/bids2table/_b2t.py
@@ -1,7 +1,7 @@
 import logging
 from functools import partial
 from pathlib import Path
-from typing import Optional
+from typing import List, Optional
 
 from elbow.builders import build_parquet, build_table
 from elbow.sources.filesystem import Crawler
@@ -19,6 +19,7 @@ def bids2table(
     with_meta: bool = True,
     persistent: bool = False,
     index_path: Optional[StrOrPath] = None,
+    exclude: Optional[List[str]] = None,
     incremental: bool = False,
     overwrite: bool = False,
     workers: Optional[int] = None,
@@ -35,6 +36,7 @@ def bids2table(
         persistent: whether to save index to disk as a Parquet dataset
         index_path: path to BIDS Parquet index to generate or load. Defaults to `root /
             "index.b2t"`. Index generation requires `persistent=True`.
+        exclude: Optional list of directory names or glob patterns to exclude from indexing.
         incremental: update index incrementally with only new or changed files.
         overwrite: overwrite previous index.
         workers: number of parallel processes. If `None` or 1, run in the main
@@ -57,14 +59,17 @@ def bids2table(
     if not root.is_dir():
         raise FileNotFoundError(f"root directory {root} does not exists")
 
+    if exclude is None:
+        exclude = []
+
     source = Crawler(
         root=root,
         include=["sub-*"],  # find subject dirs
-        skip=["sub-*"],  # but don't crawl into subject dirs
+        skip=["sub-*"] + exclude,  # but don't crawl into subject dirs
         dirs_only=True,
         follow_links=True,
     )
-    extract = partial(extract_bids_subdir, with_meta=with_meta)
+    extract = partial(extract_bids_subdir, exclude=exclude, with_meta=with_meta)
 
     if index_path is None:
         index_path = root / "index.b2t"
diff --git a/bids2table/extractors/bids.py b/bids2table/extractors/bids.py
index 40e0fbb..a272561 100644
--- a/bids2table/extractors/bids.py
+++ b/bids2table/extractors/bids.py
@@ -1,10 +1,10 @@
 import logging
-from glob import iglob
 from pathlib import Path
-from typing import Generator, Optional
+from typing import Generator, List, Optional
 
 from elbow.extractors import extract_file_meta
 from elbow.record import Record, concat
+from elbow.sources.filesystem import Crawler
 from elbow.typing import StrOrPath
 
 from bids2table.entities import BIDSEntities
@@ -42,12 +42,21 @@ def extract_bids_file(path: StrOrPath, with_meta: bool = True) -> Optional[Recor
 
 
 def extract_bids_subdir(
-    path: StrOrPath, with_meta: bool = True
+    path: StrOrPath, with_meta: bool = True, exclude: Optional[List[str]] = None
 ) -> Generator[Optional[Record], None, None]:
     """
     Extract BIDS records recursively for all files in a sub-directory.
+
+    Args:
+        path: sub-directory to crawl.
+        with_meta: passed through to `extract_bids_file`.
+        exclude: directory names or glob patterns passed to the crawler as both
+            `skip` and `exclude`. `None` is treated as an empty list.
     """
-    for path in iglob(str(Path(path) / "**"), recursive=True):
+    if exclude is None:
+        exclude = []
+
+    for path in Crawler(root=path, skip=exclude, exclude=exclude, follow_links=True):
         yield extract_bids_file(path, with_meta=with_meta)
 
 
diff --git a/tests/test_bids2table.py b/tests/test_bids2table.py
index d97cb06..1e0a625 100644
--- a/tests/test_bids2table.py
+++ b/tests/test_bids2table.py
@@ -62,5 +62,23 @@ def test_bids2table_nonexist(tmp_path: Path):
         bids2table(root=tmp_path / "nonexistent_dataset")
 
 
+def test_bids2table_exclude(tmp_path: Path):
+    root = BIDS_EXAMPLES / "ds001"
+    index_path = tmp_path / "index_exclude.b2t"
+    exclude_list = ["anat"]
+
+    tab = bids2table(
+        root=root,
+        with_meta=True,
+        persistent=True,
+        index_path=index_path,
+        exclude=exclude_list,
+    )
+
+    # Check that the excluded strings are not in the indexed table
+    assert "ent__datatype" in tab.columns
+    assert "anat" not in tab["ent__datatype"].values
+
+
 if __name__ == "__main__":
     pytest.main([__file__])