Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add 'exclude' field #35

Merged
merged 15 commits into from
Jun 22, 2024
7 changes: 7 additions & 0 deletions bids2table/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,12 @@ def main():
default=0,
help="Increase verbosity level.",
)
parser.add_argument(
"--exclude",
nargs='*',
e-kenneally marked this conversation as resolved.
Show resolved Hide resolved
default=None,
help="List of directory names or glob patterns to exclude from indexing.",
)

args = parser.parse_args()

Expand All @@ -68,6 +74,7 @@ def main():
overwrite=args.overwrite,
workers=args.workers,
worker_id=args.worker_id,
exclude=args.exclude,
return_table=False,
)

Expand Down
13 changes: 9 additions & 4 deletions bids2table/_b2t.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
from functools import partial
from pathlib import Path
from typing import Optional
from typing import List, Optional

from elbow.builders import build_parquet, build_table
from elbow.sources.filesystem import Crawler
Expand All @@ -19,6 +19,7 @@ def bids2table(
with_meta: bool = True,
persistent: bool = False,
index_path: Optional[StrOrPath] = None,
exclude: Optional[List[str]] = None,
incremental: bool = False,
overwrite: bool = False,
workers: Optional[int] = None,
Expand All @@ -35,6 +36,7 @@ def bids2table(
persistent: whether to save index to disk as a Parquet dataset
index_path: path to BIDS Parquet index to generate or load. Defaults to `root /
"index.b2t"`. Index generation requires `persistent=True`.
    exclude: optional list of directory names or glob patterns to exclude
e-kenneally marked this conversation as resolved.
Show resolved Hide resolved
incremental: update index incrementally with only new or changed files.
overwrite: overwrite previous index.
workers: number of parallel processes. If `None` or 1, run in the main
Expand All @@ -56,15 +58,18 @@ def bids2table(
root = Path(root).expanduser().resolve()
if not root.is_dir():
raise FileNotFoundError(f"root directory {root} does not exists")


if exclude is None:
exclude = []

source = Crawler(
root=root,
include=["sub-*"], # find subject dirs
skip=["sub-*"], # but don't crawl into subject dirs
skip=["sub-*"] + exclude, # but don't crawl into subject dirs
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably we should also pass exclude to the Crawler's own exclude arg? But this should only matter in the odd case that someone tries to exclude a particular subject directory (which to be fair I'm not sure why anyone would do).

dirs_only=True,
follow_links=True,
)
extract = partial(extract_bids_subdir, with_meta=with_meta)
extract = partial(extract_bids_subdir, exclude=exclude, with_meta=with_meta)

if index_path is None:
index_path = root / "index.b2t"
Expand Down
16 changes: 13 additions & 3 deletions bids2table/extractors/bids.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import logging
from glob import iglob
from pathlib import Path
from typing import Generator, Optional
from typing import Generator, List, Optional

from elbow.extractors import extract_file_meta
from elbow.record import Record, concat
from elbow.sources.filesystem import Crawler
from elbow.typing import StrOrPath

from bids2table.entities import BIDSEntities
Expand Down Expand Up @@ -42,12 +43,21 @@ def extract_bids_file(path: StrOrPath, with_meta: bool = True) -> Optional[Recor


def extract_bids_subdir(
    path: StrOrPath,
    exclude: Optional[List[str]] = None,
    with_meta: bool = True,
) -> Generator[Optional[Record], None, None]:
    """
    Extract BIDS records recursively for all files in a sub-directory.

    Args:
        path: sub-directory to crawl (typically a ``sub-*`` directory).
        exclude: optional list of directory names or glob patterns to skip
            while crawling. Defaults to excluding nothing, which preserves
            the behavior of the previous glob-based implementation.
        with_meta: forwarded to ``extract_bids_file``; include sidecar
            metadata in each record.

    Yields:
        One record per crawled path (or ``None`` when a path yields no
        record), as produced by ``extract_bids_file``.
    """
    # `exclude` must default to None (not a required positional) so existing
    # callers of extract_bids_subdir(path, with_meta=...) keep working.
    if exclude is None:
        exclude = []

    # Crawler replaces the old glob.iglob(path / "**", recursive=True) walk
    # so that `skip` patterns can prune excluded directories during traversal.
    source = Crawler(
        root=path,
        skip=exclude,
        follow_links=True,
    )

    for subpath in source:
        yield extract_bids_file(subpath, with_meta=with_meta)


Expand Down
12 changes: 12 additions & 0 deletions tests/test_bids2table.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,18 @@ def test_bids2table_nonexist(tmp_path: Path):
with pytest.raises(FileNotFoundError):
bids2table(root=tmp_path / "nonexistent_dataset")

def test_bids2table_exclude(tmp_path: Path):
    """Indexing with ``exclude=["anat"]`` must keep anat records out of the table."""
    dataset_root = BIDS_EXAMPLES / "ds001"
    out_index = tmp_path / "index_exclude.b2t"

    table = bids2table(
        root=dataset_root,
        with_meta=True,
        persistent=True,
        index_path=out_index,
        exclude=["anat"],
    )

    # The datatype entity column should exist, and none of the rows should
    # carry the excluded datatype.
    assert 'ent__datatype' in table.columns
    assert 'anat' not in table['ent__datatype'].values

if __name__ == "__main__":
pytest.main([__file__])
Loading