From a4f45e9659d0f8a8c1c24063d7a81f0dddbe7746 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paul=20M=C3=BCller?= Date: Tue, 28 May 2024 07:45:07 +0200 Subject: [PATCH] feat: output file taps input file, introduce --drain-basins CLI option --- CHANGELOG | 3 +++ chipstream/cli/cli_main.py | 9 +++++++-- chipstream/cli/cli_proc.py | 7 ++++--- tests/test_cli.py | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 5 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 71342bd..41b67bf 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,7 @@ 0.4.0 + - feat: CLI by default creates output file that contains no redundant + data and uses the input file as a basin; old behavior can be brought + back with the "--drain-basins" command-line option - fix: show correct data PPID in CLI - setup: bump dcnum from 0.19.1 to 0.20.1 0.3.1 diff --git a/chipstream/cli/cli_main.py b/chipstream/cli/cli_main.py index 6ee1641..52410b6 100644 --- a/chipstream/cli/cli_main.py +++ b/chipstream/cli/cli_main.py @@ -104,6 +104,10 @@ "You can also specify a step size (e.g. '5000-7000-2' for " "every second event). The convention follows Python slices " "with 'n' substituting for 'None'.") +@click.option("--drain-basins", type=str, is_flag=True, + help="Write all basin features from input to output file. 
This " + "option trades computation time and small file size for " + "an output file that contains all available features.") @click.option("-r", "--recursive", is_flag=True, help="Recurse into subdirectories.") @click.option("--num-cpus", @@ -129,6 +133,7 @@ def chipstream_cli( gate_kwargs=None, pixel_size=0, limit_events="0", + drain_basins=False, recursive=False, num_cpus=None, dry_run=False, @@ -142,7 +147,6 @@ def chipstream_cli( verbose = True # Parse limit_frames to get the HDF5Data index_mapping - if limit_events == "0": index_mapping = None elif limit_events.count("-"): @@ -175,8 +179,9 @@ def chipstream_cli( feature_kwargs=feature_kwargs, gate_kwargs=gate_kwargs, pixel_size=pixel_size, - # Below this line are arguments that do not define the pipeline ID index_mapping=index_mapping, + # Below this line are arguments that do not define the pipeline ID + basin_strategy="drain" if drain_basins else "tap", num_cpus=num_cpus or mp.cpu_count(), dry_run=dry_run, debug=debug, diff --git a/chipstream/cli/cli_proc.py b/chipstream/cli/cli_proc.py index 41992c6..3ea2d34 100644 --- a/chipstream/cli/cli_proc.py +++ b/chipstream/cli/cli_proc.py @@ -1,6 +1,6 @@ import pathlib import time -from typing import List +from typing import List, Literal import click import dcnum.logic @@ -25,8 +25,9 @@ def process_dataset( feature_kwargs: List[str], gate_kwargs: List[str], pixel_size: float, - # Below this line are arguments that do not affect the pipeline ID index_mapping: int | slice | None, + # Below this line are arguments that do not affect the pipeline ID + basin_strategy: Literal["drain", "tap"], num_cpus: int, dry_run: bool, debug: bool, @@ -99,7 +100,7 @@ def process_dataset( feature_kwargs=feat_kwargs, gate_code=gate_cls.get_ppid_code(), gate_kwargs=gate_kwargs, - basin_strategy="drain", + basin_strategy=basin_strategy, num_procs=num_cpus, debug=debug, ) diff --git a/tests/test_cli.py b/tests/test_cli.py index 1a34ba4..6c3cde1 100644 --- a/tests/test_cli.py +++ 
b/tests/test_cli.py @@ -12,6 +12,39 @@ from chipstream.cli import cli_main # noqa: E402 +@pytest.mark.parametrize("drain", [True, False]) +def test_cli_basins(cli_runner, drain): + path_temp = retrieve_data( + "fmt-hdf5_cytoshot_full-features_legacy_allev_2023.zip") + path = path_temp.with_name("input_path.rtdc") + + # create a test file for more than 100 events + with dcnum.read.concatenated_hdf5_data( + paths=3*[path_temp], + path_out=path, + compute_frame=True): + pass + + path_out = path.with_name("with_pixel_size_dcn.rtdc") + args = [str(path), + str(path_out), + "-s", "thresh", + ] + if drain: + args.append("--drain-basins") + result = cli_runner.invoke(cli_main.chipstream_cli, args) + assert result.exit_code == 0 + + with h5py.File(path_out) as h5: + for feat in ["image", "frame"]: + if drain: + assert feat in h5["events"] + else: + assert feat not in h5["events"] + for feat in ["mask", "deform", "aspect"]: + assert feat in h5["events"] + + @pytest.mark.parametrize("limit_events,dcnum_mapping,dcnum_yield,f0", [ # this is the default ["0", "0", 36, 1], @@ -46,6 +79,7 @@ def test_cli_limit_events(cli_runner, limit_events, dcnum_yield, str(path_out), "-s", "thresh", "--limit-events", limit_events, + "--drain-basins", ]) assert result.exit_code == 0