Replace legacy validator with schema validator #337

Merged on Dec 13, 2024 (10 commits)
6 changes: 2 additions & 4 deletions .circleci/config.yml
@@ -31,10 +31,8 @@ jobs:
source activate cubids
conda install -c conda-forge -y datalad

# Add nodejs and the validator
conda install nodejs
npm install -g yarn && \
npm install -g [email protected]
# Add deno to run the schema validator
conda install deno

# Install CuBIDS
pip install -e .[tests]
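As context for this change, here is a minimal sketch (not part of the diff) of checking from Python that the deno-hosted schema validator is reachable. It reuses the same `jsr:@bids/validator` invocation that `cubids/validator.py` builds below, and assumes deno is on the PATH, as installed by the CI step above.

import subprocess

# Ask the schema validator for its version; a successful call confirms that
# deno can resolve and run jsr:@bids/validator in this environment.
result = subprocess.run(
    ["deno", "run", "-A", "jsr:@bids/validator", "--version"],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
print(result.stdout.decode("utf-8").strip())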
36 changes: 36 additions & 0 deletions cubids/cli.py
@@ -107,6 +107,41 @@ def _enter_validate(argv=None):
workflows.validate(**args)


def _parse_bids_version():
parser = argparse.ArgumentParser(
description="cubids bids-version: Get BIDS Validator and Schema version",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
PathExists = partial(_path_exists, parser=parser)

parser.add_argument(
"bids_dir",
type=PathExists,
action="store",
help=(
"the root of a BIDS dataset. It should contain "
"sub-X directories and dataset_description.json"
),
)
parser.add_argument(
"--write",
action="store_true",
default=False,
help=(
"Save the validator and schema version to 'dataset_description.json' "
"when using `cubids bids-version /bids/path --write`. "
"By default, `cubids bids-version /bids/path` prints to the terminal."
),
)
return parser


def _enter_bids_version(argv=None):
options = _parse_bids_version().parse_args(argv)
args = vars(options).copy()
workflows.bids_version(**args)


def _parse_bids_sidecar_merge():
parser = argparse.ArgumentParser(
description=("bids-sidecar-merge: merge critical keys from one sidecar to another"),
@@ -655,6 +690,7 @@ def _enter_print_metadata_fields(argv=None):

COMMANDS = [
("validate", _parse_validate, workflows.validate),
("bids-version", _parse_bids_version, workflows.bids_version),
("sidecar-merge", _parse_bids_sidecar_merge, workflows.bids_sidecar_merge),
("group", _parse_group, workflows.group),
("apply", _parse_apply, workflows.apply),
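A short usage sketch for the new entry point (not part of the diff; "/data/bids" is a placeholder and must be an existing directory, because the PathExists type rejects missing paths):

from cubids.cli import _parse_bids_version

# Parse arguments the same way `cubids bids-version /data/bids --write` would.
parser = _parse_bids_version()
args = parser.parse_args(["/data/bids", "--write"])

# vars(args) holds {"bids_dir": ..., "write": True}, which _enter_bids_version
# forwards to workflows.bids_version(**args). Without --write, the versions are
# only printed to the terminal instead of being saved to dataset_description.json.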
16 changes: 13 additions & 3 deletions cubids/cubids.py
@@ -1336,9 +1336,19 @@ def get_all_metadata_fields(self):
found_fields = set()
for json_file in Path(self.path).rglob("*.json"):
if ".git" not in str(json_file):
with open(json_file, "r") as jsonr:
metadata = json.load(jsonr)
found_fields.update(metadata.keys())
# Handle empty or invalid JSON, e.g., if `print-metadata-fields` is run before `validate`
try:
with open(json_file, "r", encoding="utf-8") as jsonr:
content = jsonr.read().strip()
if not content:
print(f"Empty file: {json_file}")
continue
metadata = json.loads(content)
found_fields.update(metadata.keys())
except json.JSONDecodeError as e:
print(f"Error decoding JSON in {json_file}: {e}")
except Exception as e:
print(f"Unexpected error with file {json_file}: {e}")
return sorted(found_fields)

def remove_metadata_fields(self, fields_to_remove):
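For illustration, a standalone sketch of the defensive pattern added above, showing why an empty or malformed sidecar no longer aborts the metadata scan (the helper name and file path are hypothetical):

import json

def sidecar_keys(json_path):
    # Return top-level sidecar keys, or an empty set if the file is unusable.
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            content = f.read().strip()
        if not content:
            print(f"Empty file: {json_path}")
            return set()
        return set(json.loads(content).keys())
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in {json_path}: {e}")
        return set()

# sidecar_keys("sub-01/anat/sub-01_T1w.json") might return {"EchoTime", "RepetitionTime"}.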
3 changes: 2 additions & 1 deletion cubids/tests/test_cli.py
@@ -14,9 +14,10 @@
"""

import argparse

import pytest

from cubids.cli import _path_exists, _is_file, _get_parser, _main
from cubids.cli import _get_parser, _is_file, _main, _path_exists


def _test_path_exists():
228 changes: 167 additions & 61 deletions cubids/validator.py
@@ -5,6 +5,7 @@
import logging
import os
import pathlib
import re
import subprocess

import pandas as pd
@@ -14,16 +15,33 @@

def build_validator_call(path, ignore_headers=False):
"""Build a subprocess command to the bids validator."""
# build docker call
# CuBIDS automatically ignores subject consistency.
command = ["bids-validator", path, "--verbose", "--json", "--ignoreSubjectConsistency"]
# The new schema-based BIDS validator doesn't have an option to ignore subject consistency.
# Build the deno command to run the BIDS validator.
command = ["deno", "run", "-A", "jsr:@bids/validator", path, "--verbose", "--json"]

if ignore_headers:
command.append("--ignoreNiftiHeaders")

return command


def get_bids_validator_version():
"""Get the version of the BIDS validator.

Returns
-------
:obj:`dict`
Dictionary containing the validator version, stored under the ``ValidatorVersion`` key.
"""
command = ["deno", "run", "-A", "jsr:@bids/validator", "--version"]
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output = result.stdout.decode("utf-8").strip()
version = output.split()[-1]
# Remove ANSI color codes
clean_ver = re.sub(r"\x1b\[[0-9;]*m", "", version)
return {"ValidatorVersion": clean_ver}


def build_subject_paths(bids_dir):
"""Build a list of BIDS dirs with 1 subject each."""
bids_dir = str(bids_dir)
@@ -52,6 +70,26 @@ def build_subject_paths(bids_dir):
return subjects_dict


def build_first_subject_path(bids_dir, subject):
"""Build a list of BIDS dirs with 1 subject each."""
bids_dir = str(bids_dir)
if not bids_dir.endswith("/"):
bids_dir += "/"

root_files = [x for x in glob.glob(bids_dir + "*") if os.path.isfile(x)]

subject_dict = {}

purepath = pathlib.PurePath(subject)
sub_label = purepath.name

files = [x for x in glob.glob(subject + "**", recursive=True) if os.path.isfile(x)]
files.extend(root_files)
subject_dict[sub_label] = files

return subject_dict


def run_validator(call):
"""Run the validator with subprocess.

@@ -87,32 +125,6 @@ def parse_validator_output(output):
Dataframe of validator output.
"""

def get_nested(dct, *keys):
"""Get a nested value from a dictionary.

Parameters
----------
dct : :obj:`dict`
Dictionary to get value from.
keys : :obj:`list`
List of keys to get value from.

Returns
-------
:obj:`dict`
The nested value.
"""
for key in keys:
try:
dct = dct[key]
except (KeyError, TypeError):
return None
return dct

data = json.loads(output)

issues = data["issues"]

def parse_issue(issue_dict):
"""Parse a single issue from the validator output.

@@ -126,30 +138,30 @@ def parse_issue(issue_dict):
return_dict : :obj:`dict`
Dictionary of parsed issue.
"""
return_dict = {}
return_dict["files"] = [
get_nested(x, "file", "relativePath") for x in issue_dict.get("files", "")
]
return_dict["type"] = issue_dict.get("key", "")
return_dict["severity"] = issue_dict.get("severity", "")
return_dict["description"] = issue_dict.get("reason", "")
return_dict["code"] = issue_dict.get("code", "")
return_dict["url"] = issue_dict.get("helpUrl", "")

return return_dict

df = pd.DataFrame()

for warn in issues["warnings"]:
parsed = parse_issue(warn)
parsed = pd.DataFrame(parsed)
df = pd.concat([df, parsed], ignore_index=True)

for err in issues["errors"]:
parsed = parse_issue(err)
parsed = pd.DataFrame(parsed)
df = pd.concat([df, parsed], ignore_index=True)
return {
"location": issue_dict.get("location", ""),
"code": issue_dict.get("code", ""),
"issueMessage": issue_dict.get("issueMessage", ""),
"subCode": issue_dict.get("subCode", ""),
"severity": issue_dict.get("severity", ""),
"rule": issue_dict.get("rule", ""),
}

# Load JSON data
data = json.loads(output)

# Extract issues
issues = data.get("issues", {}).get("issues", [])
if not issues:
return pd.DataFrame(
columns=["location", "code", "issueMessage", "subCode", "severity", "rule"]
)

# Parse all issues
parsed_issues = [parse_issue(issue) for issue in issues]

# Convert to DataFrame
df = pd.DataFrame(parsed_issues)
return df


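As an aside, a minimal sketch of what the rewritten parser consumes and produces; the JSON below is a hand-written stand-in for real validator output, assuming the nested issues.issues layout handled above:

import json

from cubids.validator import parse_validator_output

# Hand-crafted stand-in for the schema validator's JSON output.
fake_output = json.dumps(
    {
        "issues": {
            "issues": [
                {
                    "location": "/sub-01/anat/sub-01_T1w.nii.gz",
                    "code": "NIFTI_HEADER_UNREADABLE",
                    "issueMessage": "NIfTI header could not be parsed.",
                    "subCode": "",
                    "severity": "error",
                    "rule": "",
                }
            ]
        }
    }
)

df = parse_validator_output(fake_output)
print(df.columns.tolist())
# ['location', 'code', 'issueMessage', 'subCode', 'severity', 'rule']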
@@ -161,12 +173,106 @@ def get_val_dictionary():
val_dict : dict
Dictionary of values.
"""
val_dict = {}
val_dict["files"] = {"Description": "File with warning orerror"}
val_dict["type"] = {"Description": "BIDS validation warning or error"}
val_dict["severity"] = {"Description": "gravity of problem (warning/error"}
val_dict["description"] = {"Description": "Description of warning/error"}
val_dict["code"] = {"Description": "BIDS validator issue code number"}
val_dict["url"] = {"Description": "Link to the issue's neurostars thread"}

return val_dict
return {
"location": {"Description": "File with the validation issue."},
"code": {"Description": "Code of the validation issue."},
"issueMessage": {"Description": "Validation issue message."},
"subCode": {"Description": "Subcode providing additional issue details."},
"severity": {"Description": "Severity of the issue (e.g., warning, error)."},
"rule": {"Description": "Validation rule that triggered the issue."},
}


def extract_summary_info(output):
"""Extract summary information from the JSON output.

Parameters
----------
output : str
JSON string of BIDS validator output.

Returns
-------
dict
Dictionary containing SchemaVersion and other summary info.
"""
try:
data = json.loads(output)
except json.JSONDecodeError as e:
raise ValueError("Invalid JSON provided to get SchemaVersion.") from e

summary = data.get("summary", {})

return {"SchemaVersion": summary.get("schemaVersion", "")}


def update_dataset_description(path, new_info):
"""Update or append information to dataset_description.json.

Parameters
----------
path : :obj:`str`
Path to the dataset.
new_info : :obj:`dict`
Information to add or update.
"""
description_path = os.path.join(path, "dataset_description.json")

# Load existing data if the file exists
if os.path.exists(description_path):
with open(description_path, "r") as f:
existing_data = json.load(f)
else:
existing_data = {}

# Update the existing data with the new info
existing_data.update(new_info)

# Write the updated data back to the file
with open(description_path, "w") as f:
json.dump(existing_data, f, indent=4)
print(f"Updated dataset_description.json at: {description_path}")

# Check if .datalad directory exists before running the DataLad save command
datalad_dir = os.path.join(path, ".datalad")
if os.path.exists(datalad_dir) and os.path.isdir(datalad_dir):
try:
subprocess.run(
[
"datalad",
"save",
"-m",
"Save BIDS validator and schema version to dataset_description",
description_path,
],
check=True,
)
print("Changes saved with DataLad.")
except subprocess.CalledProcessError as e:
print(f"Error running DataLad save: {e}")


def bids_validator_version(output, path, write=False):
"""Save BIDS validator and schema version.

Parameters
----------
output : :obj:`str`
JSON string of BIDS validator output.
path : :obj:`str`
Path to the dataset.
write : :obj:`bool`
If True, write to dataset_description.json. If False, print to terminal.
"""
# Get the BIDS validator version
validator_version = get_bids_validator_version()
# Extract schemaVersion
summary_info = extract_summary_info(output)

combined_info = {**validator_version, **summary_info}

if write:
# Update the dataset_description.json file
update_dataset_description(path, combined_info)
elif not write:
print(combined_info)
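Tying the new helpers together, a rough end-to-end sketch (the dataset path is a placeholder; this mirrors how the workflow layer is expected to chain these functions and assumes run_validator returns the completed subprocess with captured stdout, as in the existing CuBIDS code):

from cubids.validator import (
    bids_validator_version,
    build_validator_call,
    parse_validator_output,
    run_validator,
)

bids_dir = "/data/bids"  # placeholder path to a BIDS dataset
proc = run_validator(build_validator_call(bids_dir))
output = proc.stdout.decode("utf-8")

# DataFrame of validation issues, one row per issue.
issues = parse_validator_output(output)

# Save ValidatorVersion and SchemaVersion into dataset_description.json.
bids_validator_version(output, bids_dir, write=True)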