From 2a08ab8fa8f89ab5a07b4a5c4d9887982e2e2f30 Mon Sep 17 00:00:00 2001
From: Michelle Wang
Date: Fri, 17 Nov 2023 16:38:04 -0500
Subject: [PATCH] Update trackers to use manifest and handle tar archives (#181)

* use manifest instead of doughnut to get participants to track
* fix identical bagel still being written
* check `tar` and `tar.gz` extensions
* add HAS_DATATYPE__{} columns to bagel
* fix warning in pd.DataFrame.compare()
* change logic for UNAVAILABLE status based on manifest
* commit dashboard schema
* fix warning about df_bagel_old_full not being defined
* refactor based on Nikhil comments in Slack meeting
---
 nipoppy/trackers/bagel_schema.json |  94 ++++++++++++++++++
 nipoppy/trackers/run_tracker.py    | 152 +++++++++++++++++++----------
 2 files changed, 195 insertions(+), 51 deletions(-)
 create mode 100644 nipoppy/trackers/bagel_schema.json

diff --git a/nipoppy/trackers/bagel_schema.json b/nipoppy/trackers/bagel_schema.json
new file mode 100644
index 00000000..a2f1a411
--- /dev/null
+++ b/nipoppy/trackers/bagel_schema.json
@@ -0,0 +1,94 @@
+{
+    "GLOBAL_COLUMNS": {
+        "participant_id": {
+            "Description": "Participant identifier within a given dataset.",
+            "dtype": "str",
+            "IsRequired": true,
+            "IsPrefixedColumn": false
+        },
+        "bids_id": {
+            "Description": "BIDS dataset identifier for a participant, if available/different from the participant_id.",
+            "dtype": "str",
+            "IsRequired": false,
+            "IsPrefixedColumn": false
+        },
+        "session": {
+            "Description": "Participant session ID.",
+            "dtype": "str",
+            "IsRequired": true,
+            "IsPrefixedColumn": false
+        },
+        "has_mri_data": {
+            "Description": "Whether or not the participant had MRI data acquired in a given session.",
+            "dtype": "bool",
+            "IsRequired": false,
+            "Range": [true, false],
+            "IsPrefixedColumn": false
+        },
+        "HAS_DATATYPE__": {
+            "Description": "Whether or not the participant session has the specified raw BIDS datatype. The column suffix should correspond to a specific BIDS subdirectory, e.g. 'HAS_DATATYPE__anat'.",
+            "dtype": "bool",
+            "IsRequired": false,
+            "Range": [true, false],
+            "IsPrefixedColumn": true
+        },
+        "HAS_IMAGE__": {
+            "Description": "Whether or not the participant session has the specified imaging file. The column suffix should correspond to a BIDS file suffix, e.g. 'HAS_IMAGE__T1w'.",
+            "dtype": "bool",
+            "IsRequired": false,
+            "Range": [true, false],
+            "IsPrefixedColumn": true
+        },
+        "pipeline_name": {
+            "Description": "Name of a pipeline that was run for the participant, if applicable. Example value: 'freesurfer'.",
+            "dtype": "str",
+            "IsRequired": true,
+            "MissingValue": "UNAVAILABLE",
+            "IsPrefixedColumn": false
+        },
+        "pipeline_version": {
+            "Description": "Version of the pipeline that was run. Must have a value if the value for 'pipeline_name' is not 'UNAVAILABLE'. Example value: '7.3.0'.",
+            "dtype": "str",
+            "IsRequired": true,
+            "MissingValue": "UNAVAILABLE",
+            "IsPrefixedColumn": false
+        },
+        "pipeline_starttime": {
+            "Description": "Date/time that the pipeline run was started, in 'YYYY-MM-DD HH:MM:SS' format.",
+            "dtype": "str",
+            "IsRequired": true,
+            "MissingValue": "UNAVAILABLE",
+            "IsPrefixedColumn": false
+        },
+        "pipeline_endtime": {
+            "Description": "Date/time that the pipeline run ended, in 'YYYY-MM-DD HH:MM:SS' format.",
+            "dtype": "str",
+            "IsRequired": false,
+            "MissingValue": "UNAVAILABLE",
+            "IsPrefixedColumn": false
+        }
+    },
+    "PIPELINE_STATUS_COLUMNS": {
+        "pipeline_complete": {
+            "Description": "Status of the pipeline run. 'SUCCESS': All stages of the pipeline (as configured by the user) finished successfully (all expected pipeline output files are present). 'FAIL': At least one stage of the pipeline failed. 'INCOMPLETE': The pipeline has not yet been run for the participant, or at least one stage is unfinished/still running. 'UNAVAILABLE': The data modality relevant to the pipeline is not available for the participant.",
+            "dtype": "str",
+            "IsRequired": true,
+            "Range": ["SUCCESS", "FAIL", "INCOMPLETE", "UNAVAILABLE"],
+            "IsPrefixedColumn": false
+        },
+        "PHASE__": {
+            "Description": "Completion status of a tracker-specified phase/subworkflow of a pipeline. This prefix must be followed by a second prefix that is a composite of {pipeline_name}-{pipeline_version}, so that the column can be grouped with the relevant pipeline, e.g. 'PHASE__fmriprep-20.2.7__func'. Each phase may correspond to a specific output subdirectory and may be associated with multiple related output files. If phase and stage columns are both present, each phase is expected to correspond to >= 1 stage. 'SUCCESS': All output files corresponding to the phase are present. 'FAIL': At least one output file of the phase is missing. This status may be used to indicate that the phase crashed. 'INCOMPLETE': Output files for the phase are not present. This status may be used to indicate that the phase was not configured for the run (e.g., if it corresponds to a specific derivative type). 'UNAVAILABLE': The data modality relevant to the pipeline is not available for the participant. '' (no value): The specified phase is not part of the pipeline described by the current row/record.",
+            "dtype": "str",
+            "IsRequired": false,
+            "Range": ["SUCCESS", "FAIL", "INCOMPLETE", "UNAVAILABLE", ""],
+            "IsPrefixedColumn": true
+        },
+        "STAGE__": {
+            "Description": "Completion status of a tracker-specified stage of a pipeline. This prefix must be followed by a second prefix that is a composite of {pipeline_name}-{pipeline_version}, so that the column can be grouped with the relevant pipeline, e.g. 'STAGE__fmriprep-20.2.7__space-MNI152Lin_res-1'. Each stage may correspond to a single output file or to a few linked outputs that are expected to always coexist. If phase and stage columns are both present, each phase is expected to correspond to >= 1 stage. 'SUCCESS': All output files corresponding to the stage are present. 'FAIL': At least one output file of the stage is missing. This status may be used to indicate that the stage crashed. 'INCOMPLETE': Output files for the stage are not present. This status may be used to indicate that the stage was not configured for the run. 'UNAVAILABLE': The data modality relevant to the pipeline is not available for the participant. '' (no value): The specified stage is not part of the pipeline described by the current row/record.",
+            "dtype": "str",
+            "IsRequired": false,
+            "Range": ["SUCCESS", "FAIL", "INCOMPLETE", "UNAVAILABLE", ""],
+            "IsPrefixedColumn": true
+        }
+    }
+}
\ No newline at end of file
diff --git a/nipoppy/trackers/run_tracker.py b/nipoppy/trackers/run_tracker.py
index 1e06e10a..0b93c0d2 100755
--- a/nipoppy/trackers/run_tracker.py
+++ b/nipoppy/trackers/run_tracker.py
@@ -1,28 +1,26 @@
 #!/usr/bin/env python
 import argparse
-import json
 import bids
 import json
+import tarfile
 import warnings
 from pathlib import Path
 
 import pandas as pd
 import nipoppy.workflow.logger as my_logger
-from nipoppy.trackers.tracker import Tracker, get_start_time, get_end_time, UNAVAILABLE, INCOMPLETE, TRUE
+from nipoppy.trackers.tracker import Tracker, get_start_time, get_end_time, SUCCESS, UNAVAILABLE, INCOMPLETE, TRUE
 from nipoppy.trackers import bids_tracker, fs_tracker, fmriprep_tracker, mriqc_tracker, tractoflow_tracker
 from nipoppy.workflow.utils import (
-    BIDS_SUBJECT_PREFIX,
     BIDS_SESSION_PREFIX,
+    COL_DATATYPE_MANIFEST,
     COL_SUBJECT_MANIFEST,
     COL_BIDS_ID_MANIFEST,
     COL_SESSION_MANIFEST,
-    COL_CONV_STATUS,
     DNAME_BACKUPS_BAGEL,
     FNAME_BAGEL,
-    FNAME_DOUGHNUT,
-    load_doughnut,
+    FNAME_MANIFEST,
+    load_manifest,
     save_backup,
-    session_id_to_bids_session,
 )
 
 # Globals
@@ -34,6 +32,14 @@
     "mriqc": mriqc_tracker.tracker_configs,
     "tractoflow": tractoflow_tracker.tracker_configs,
 }
+PIPELINE_REQUIRED_DATATYPES = {
+    "heudiconv": [],
+    "freesurfer": ["anat"],
+    "fmriprep": ["anat"],
+    "mriqc": ["anat"],
+    "tractoflow": ["anat", "dwi"],
+}
+ALL_DATATYPES = sorted(["anat", "dwi", "func", "fmap"])
 BIDS_PIPES = ["mriqc","fmriprep"]
 NO_TRACKER_PIPES = ["maget_brain"]
 
@@ -45,10 +51,6 @@ def run(global_configs, dash_schema_file, pipelines, session_id="ALL", run_id="1
     # for bids tracker
     bids_dir = f"{DATASET_ROOT}/bids/"
 
-    # Grab BIDS participants from the doughnut
-    doughnut_file = f"{DATASET_ROOT}/scratch/raw_dicom/doughnut.csv"
-    doughnut_df = load_doughnut(doughnut_file)
-
     # logging
     log_dir = f"{DATASET_ROOT}/scratch/logs/"
 
@@ -56,6 +58,7 @@ def run(global_configs, dash_schema_file, pipelines, session_id="ALL", run_id="1
         logger = my_logger.get_logger(log_file, level=log_level)
 
     logger.info(f"Tracking pipelines: {pipelines}")
+    logger.info(f"Tracking run: {run_id} and acq_label: {acq_label}")
 
     if session_id == "ALL":
         sessions = global_configs["SESSIONS"]
@@ -69,7 +72,6 @@ def run(global_configs, dash_schema_file, pipelines, session_id="ALL", run_id="1
         pipe_tracker = Tracker(global_configs, dash_schema_file, pipeline)
 
         # TODO revise tracker class
-        # DATASET_ROOT, session_ids, version = pipe_tracker.get_global_configs()
         if pipeline == "heudiconv":
             version = global_configs["BIDS"][pipeline]["VERSION"]
         else:
@@ -82,15 +84,15 @@ def run(global_configs, dash_schema_file, pipelines, session_id="ALL", run_id="1
         else:
             logger.warning(f"Skipping pipeline: {pipeline}. Tracker not listed in the config")
 
-        # Grab BIDS participants from the doughnut
-        doughnut_file = f"{DATASET_ROOT}/scratch/raw_dicom/{FNAME_DOUGHNUT}"
-        doughnut_df = load_doughnut(doughnut_file)
-        participants_total = doughnut_df[doughnut_df[COL_CONV_STATUS]][COL_BIDS_ID_MANIFEST].unique()
-        n_participants_total = len(participants_total)
+        # Grab BIDS participants from the manifest
+        fpath_manifest = f"{DATASET_ROOT}/tabular/{FNAME_MANIFEST}"
+        df_manifest = load_manifest(fpath_manifest)
+        df_manifest_imaging = df_manifest.loc[df_manifest[COL_DATATYPE_MANIFEST].apply(lambda x: len(x) != 0)]
+        n_participants_with_imaging = len(df_manifest_imaging[COL_BIDS_ID_MANIFEST].unique())
 
         logger.info("-"*50)
         logger.info(f"pipeline: {pipeline}, version: {version}")
-        logger.info(f"n_participants_total: {n_participants_total}, sessions: {sessions}")
+        logger.info(f"n_participants_with_imaging: {n_participants_with_imaging}, sessions: {sessions}")
         logger.info("-"*50)
 
         status_check_dict = pipe_tracker.get_pipe_tasks(tracker_configs, PIPELINE_STATUS_COLUMNS, pipeline, version)
@@ -99,13 +101,16 @@ def run(global_configs, dash_schema_file, pipelines, session_id="ALL", run_id="1
         # for prefixed columns we need to generate the column name
         dash_col_list = list(key for key, value in schema["GLOBAL_COLUMNS"].items() if not value["IsPrefixedColumn"])
         # status_check_dict will typically only have minimal pipeline_complete key
+        for datatype in ALL_DATATYPES:
+            dash_col_list.append(f"HAS_DATATYPE__{datatype}")
         dash_col_list = dash_col_list + list(status_check_dict.keys())
 
         for session in sessions:
             session_id = session.removeprefix(BIDS_SESSION_PREFIX)
             logger.info(f"Checking session: {session}")
 
-            participants_session = doughnut_df[(doughnut_df[COL_BIDS_ID_MANIFEST].isin(participants_total)) & (doughnut_df[COL_SESSION_MANIFEST] == session)][COL_BIDS_ID_MANIFEST].drop_duplicates().astype(str).str.strip().values
+            df_manifest_session = df_manifest_imaging.loc[df_manifest_imaging[COL_SESSION_MANIFEST] == session]
+            participants_session = df_manifest_session[COL_BIDS_ID_MANIFEST].unique()
             n_participants_session = len(participants_session)
             logger.info(f"n_participants_session: {n_participants_session}")
 
@@ -113,9 +118,9 @@ def run(global_configs, dash_schema_file, pipelines, session_id="ALL", run_id="1
             _df[COL_SESSION_MANIFEST] = session
             _df["pipeline_name"] = pipeline
             _df["pipeline_version"] = version
-            _df["has_mri_data"] = TRUE # everyone in the doughnut file has MRI data
+            _df["has_mri_data"] = TRUE # everyone in participants_session has MRI data
 
-            # Set correct dtype based on dash schema to avoid panads warning
+            # Set correct dtype based on dash schema to avoid pandas warning
             # i.e. "FutureWarning: Setting an item of incompatible dtype"
             dash_col_dtype = "str"
             for dash_col, _ in status_check_dict.items():
@@ -139,7 +144,7 @@ def run(global_configs, dash_schema_file, pipelines, session_id="ALL", run_id="1
                 # make sure the number of participants is consistent across pipelines
                 if set(participants_session) != old_participants_session and not old_pipelines_session.issubset(set(pipelines)):
                     warnings.warn(
-                        f'The existing bagel file might be obsolete (participant list does not match the doughnut file for session {session})'
+                        f'The existing bagel file might be obsolete (participant list does not match the manifest file for session {session})'
                         f'. Rerun the tracker script with --pipelines {" ".join(old_pipelines_session.union(pipelines))}'
                     )
 
@@ -154,61 +159,106 @@ def run(global_configs, dash_schema_file, pipelines, session_id="ALL", run_id="1
             else:
                 df_bagel_old = None
 
-            for bids_id in participants_session:
-                participant_id = doughnut_df[doughnut_df[COL_BIDS_ID_MANIFEST]==bids_id][COL_SUBJECT_MANIFEST].values[0]
+            for bids_id, participant_id, available_datatypes in df_manifest_session[[COL_BIDS_ID_MANIFEST, COL_SUBJECT_MANIFEST, COL_DATATYPE_MANIFEST]].itertuples(index=False):
                 _df.loc[bids_id, COL_SUBJECT_MANIFEST] = participant_id
                 _df.loc[bids_id, COL_BIDS_ID_MANIFEST] = bids_id
 
+                # TODO eventually we should move these to the {pipeline}_tracker.py files
                 if pipeline == "heudiconv":
                     subject_dir = f"{DATASET_ROOT}/bids/{bids_id}"
                     subject_ses_dir = f"{subject_dir}/{session}"
-                elif pipeline in ["freesurfer","tractoflow"]:
-                    subject_dir = f"{DATASET_ROOT}/derivatives/{pipeline}/v{version}/output/{session}/{bids_id}"
+                elif pipeline in ["freesurfer", "tractoflow"]:
+                    subject_dir = f"{DATASET_ROOT}/derivatives/{pipeline}/{version}/output/{session}/{bids_id}"
                     subject_ses_dir = subject_dir
                 elif pipeline in BIDS_PIPES:
-                    subject_dir = f"{DATASET_ROOT}/derivatives/{pipeline}/v{version}/output/{bids_id}"
+                    subject_dir = f"{DATASET_ROOT}/derivatives/{pipeline}/{version}/output/{bids_id}"
+                    # NOTE temporary solution while we refactor tracker configs to be version-specific
+                    if pipeline == "fmriprep":
+                        subject_ses_dir = f"{subject_dir}/{session}"
+                        subject_ses_tar_paths = [
+                            Path(subject_ses_dir).with_suffix('.tar'),
+                            Path(subject_ses_dir).with_suffix('.tar.gz'),
+                        ]
+                        if (
+                            not Path(subject_dir).is_dir() and
+                            not any([path.exists() for path in subject_ses_tar_paths])
+                        ):
+                            subject_dir = f"{DATASET_ROOT}/derivatives/{pipeline}/{version}/output/fmriprep/{bids_id}"
                     subject_ses_dir = f"{subject_dir}/{session}"
                 elif pipeline in NO_TRACKER_PIPES:
                     logger.warning(f"pipeline: {pipeline} does not have a tracker yet...")
                 else:
                     logger.error(f"unknown pipeline: {pipeline}")
-
-                subject_ses_dir_status = Path(subject_ses_dir).is_dir()
-                logger.debug(f"subject_dir:{subject_ses_dir_status}, dir_status: {subject_ses_dir_status}")
-                # TODO incorporate manifest datatype availability
-                if subject_ses_dir_status:
-                    for name, func in status_check_dict.items():
-                        if pipeline == "heudiconv":
-                            status = func(bids_layout, participant_id, session_id, run_id, acq_label)
-                        else:
-                            status = func(subject_dir, session_id, run_id, acq_label)
+
+                # populate HAS_DATATYPE__ columns
+                # and check if all required datatypes are available
+                required_datatypes = PIPELINE_REQUIRED_DATATYPES[pipeline]
+                has_required_datatypes = True
+                for datatype in ALL_DATATYPES:
+                    _df.loc[bids_id, f"HAS_DATATYPE__{datatype}"] = datatype in available_datatypes
+                    if (datatype in required_datatypes) and (datatype not in available_datatypes):
+                        has_required_datatypes = False
+
+                if has_required_datatypes:
+
+                    subject_ses_dir_status = Path(subject_ses_dir).is_dir()
+                    subject_ses_tar_paths = [
+                        Path(subject_ses_dir).with_suffix('.tar'),
+                        Path(subject_ses_dir).with_suffix('.tar.gz'),
+                    ]
+                    subject_ses_tar_status = any([path.exists() for path in subject_ses_tar_paths])
+                    logger.debug(f"subject_ses_dir: {subject_ses_dir}, dir_status: {subject_ses_dir_status}, subject_ses_tar_status: {subject_ses_tar_status}")
+
+                    if subject_ses_tar_status:
+                        logger.debug(f"subject_ses_dir: {subject_ses_dir} is a tar file")
+                        for name in status_check_dict.keys():
+                            if name == 'pipeline_complete':
+                                _df.loc[bids_id,name] = SUCCESS
+                            else:
+                                # here, UNAVAILABLE refers to the functionality not being implemented yet for phases/stages,
+                                # unrelated to pipeline_complete being UNAVAILABLE, which is based on the datatypes column in the manifest
+                                _df.loc[bids_id,name] = UNAVAILABLE # TODO check if files are available in the tar file
+                        _df.loc[bids_id,"pipeline_starttime"] = UNAVAILABLE
+                        _df.loc[bids_id,"pipeline_endtime"] = UNAVAILABLE
+                    elif subject_ses_dir_status:
+                        for name, func in status_check_dict.items():
+                            if pipeline == "heudiconv":
+                                status = func(bids_layout, participant_id, session_id, run_id, acq_label)
+                            else:
+                                status = func(subject_dir, session_id, run_id, acq_label)
 
-                        logger.debug(f"task_name: {name}, status: {status}")
+                            logger.debug(f"task_name: {name}, status: {status}")
 
-                        _df.loc[bids_id,name] = status
-                        _df.loc[bids_id,"pipeline_starttime"] = get_start_time(subject_dir)
-                        # TODO only check files listed in the tracker config
-                        _df.loc[bids_id,"pipeline_endtime"] = UNAVAILABLE # get_end_time(subject_dir)
+                            _df.loc[bids_id,name] = status
+                            _df.loc[bids_id,"pipeline_starttime"] = get_start_time(subject_dir)
+                            # TODO only check files listed in the tracker config
+                            _df.loc[bids_id,"pipeline_endtime"] = UNAVAILABLE # get_end_time(subject_dir)
+                    else:
+                        logger.debug(f"{pipeline} output is expected based on manifest but not found for bids_id: {bids_id}, session: {session}")
+                        for name in status_check_dict.keys():
+                            _df.loc[bids_id, name] = INCOMPLETE
+                        _df.loc[bids_id, "pipeline_starttime"] = UNAVAILABLE
+                        _df.loc[bids_id, "pipeline_endtime"] = UNAVAILABLE
                 else:
-                    logger.debug(f"{pipeline} output is expected based on manifest but not found for bids_id: {bids_id}, session: {session}")
-                    for name in status_check_dict.keys():
-                        _df.loc[bids_id,name] = INCOMPLETE
-                        _df.loc[bids_id,"pipeline_starttime"] = UNAVAILABLE
-                        _df.loc[bids_id,"pipeline_endtime"] = UNAVAILABLE
+                    logger.debug(f"{pipeline} output is not expected based on manifest for bids_id: {bids_id}, session: {session}")
+                    for name in status_check_dict.keys():
+                        _df.loc[bids_id, name] = UNAVAILABLE
+                    _df.loc[bids_id, "pipeline_starttime"] = UNAVAILABLE
+                    _df.loc[bids_id, "pipeline_endtime"] = UNAVAILABLE
 
             _df = _df.reset_index(drop=True)
 
             # add old rows from other pipelines/sessions and sort for consistent order
-            df_bagel: pd.DataFrame = pd.concat([df_bagel_old, _df], axis='index')
-            df_bagel = df_bagel.sort_values(["pipeline_name", "pipeline_version", COL_BIDS_ID_MANIFEST], ignore_index=True)
+            df_bagel: pd.DataFrame = pd.concat([df_bagel_old, _df], axis='index', ignore_index=True)
+            df_bagel = df_bagel.sort_values(["pipeline_name", "pipeline_version", COL_BIDS_ID_MANIFEST, COL_SESSION_MANIFEST], ignore_index=True)
 
             # don't write a new file if no changes
             try:
-                if len(df_bagel.compare(df_bagel_old_full)) == 0:
+                if (df_bagel_old is not None) and (df_bagel.shape == df_bagel_old_full.shape) and (set(df_bagel.columns) == set(df_bagel_old_full.columns)) and (len(df_bagel.compare(df_bagel_old_full)) == 0):
                    logger.info(f'No change in bagel file for pipeline {pipeline}, session {session}')
                     continue
-            except Exception:
+            except Exception as exception:
+                logger.warning(exception)
                 pass
 
             # save bagel
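
Not part of the patch: the tar handling above only tests whether a `.tar`/`.tar.gz` archive exists where the session output directory would be; the `# TODO check if files are available in the tar file` is left open. A minimal sketch of how that check might use the `tarfile` module this patch imports — `tar_contains_outputs` and `expected_suffixes` are hypothetical names, not nipoppy API:

    import tarfile
    from pathlib import Path

    def tar_contains_outputs(tar_path: Path, expected_suffixes: list) -> bool:
        # hypothetical helper, not part of nipoppy: mode="r:*" lets tarfile
        # transparently open both plain .tar and gzipped .tar.gz archives
        with tarfile.open(tar_path, mode="r:*") as tar:
            members = tar.getnames()
        # every expected output suffix must match at least one archive member
        return all(
            any(member.endswith(suffix) for member in members)
            for suffix in expected_suffixes
        )

    # usage sketch: mark the archive SUCCESS only if the expected outputs are inside,
    # e.g. tar_contains_outputs(Path(subject_ses_dir + ".tar.gz"), ["desc-confounds_timeseries.tsv"])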