From 4fb0d280362923188a4b8875ec35065f79c6dea1 Mon Sep 17 00:00:00 2001 From: April Shen Date: Thu, 5 Sep 2024 12:00:16 +0100 Subject: [PATCH] add filtering to trait name parsing and evidence string generation --- bin/trait_mapping/parse_traits.py | 3 +-- cmat/clinvar_xml_io/filtering.py | 12 +++++++++ .../clinvar_to_evidence_strings.py | 25 ++++++++++++------- cmat/output_generation/report.py | 5 +++- cmat/trait_mapping/trait_names_parsing.py | 14 ++--------- 5 files changed, 35 insertions(+), 24 deletions(-) create mode 100644 cmat/clinvar_xml_io/filtering.py diff --git a/bin/trait_mapping/parse_traits.py b/bin/trait_mapping/parse_traits.py index e49a7ca5..d1ab119f 100644 --- a/bin/trait_mapping/parse_traits.py +++ b/bin/trait_mapping/parse_traits.py @@ -5,8 +5,7 @@ if __name__ == '__main__': parser = argparse.ArgumentParser(description="Parse traits from ClinVar XML") - parser.add_argument("-i", dest="input_filepath", required=True, - help="ClinVar XML dump file. One record per line.") + parser.add_argument("-i", dest="input_filepath", required=True, help="ClinVar XML dump file.") parser.add_argument("-o", dest="output_traits_filepath", required=True, help="path to output file for all traits for downstream processing") parser.add_argument("-u", dest="output_for_platform", required=False, diff --git a/cmat/clinvar_xml_io/filtering.py b/cmat/clinvar_xml_io/filtering.py new file mode 100644 index 00000000..d16714dc --- /dev/null +++ b/cmat/clinvar_xml_io/filtering.py @@ -0,0 +1,12 @@ +# Filtering functions that can be used in multiple pipelines. + +# Identified as problematic submissions, e.g. too many unmappable trait names. +submission_names_to_exclude = ['SUB14299258'] + + +def filter_by_submission_name(clinvar_set): + """Return False (i.e. filter out) if every submitted record in the set has submission_name in the exclusion list.""" + for submitted_record in clinvar_set.scvs: + if submitted_record.submission_name not in submission_names_to_exclude: + return True + return False diff --git a/cmat/output_generation/clinvar_to_evidence_strings.py b/cmat/output_generation/clinvar_to_evidence_strings.py index 85572a44..36d2c991 100644 --- a/cmat/output_generation/clinvar_to_evidence_strings.py +++ b/cmat/output_generation/clinvar_to_evidence_strings.py @@ -10,6 +10,7 @@ from cmat.clinvar_xml_io import ClinVarDataset from cmat.clinvar_xml_io.clinical_classification import MultipleClinicalClassificationsError +from cmat.clinvar_xml_io.filtering import filter_by_submission_name from cmat.output_generation import consequence_type as CT from cmat.output_generation.report import Report @@ -64,8 +65,8 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings logger.info('Processing ClinVar records') i = -1 - # TODO filter here - for clinvar_record in ClinVarDataset(clinvar_xml): + dataset = ClinVarDataset(clinvar_xml) + for clinvar_set in dataset.iter_cvs(): # If start & end provided, only process records in the range [start, end) i += 1 if start and i < start: @@ -79,7 +80,13 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings # Catch any exceptions for this record so we can continue processing. try: - # Failure mode 0 (skip). Contains multiple clinical classification annotations. + # Failure mode 1 (fatal). Record is only supported by submissions deemed to be unusable. + if not filter_by_submission_name(clinvar_set): + report.clinvar_fatal_excluded_submission += 1 + continue + clinvar_record = clinvar_set.rcv + + # Failure mode 2 (skip). Contains multiple clinical classification annotations. # This is new as of V2 of the ClinVar XSD and should definitely be supported at some point, # but as it can cause parsing complications we catch these cases first. # See GH issue for context: https://github.com/EBIvariation/CMAT/issues/396 @@ -88,18 +95,18 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings report.clinvar_skip_multiple_clinical_classifications += 1 continue - # Failure mode 1 (fatal). A ClinVar record contains no valid traits (traits which have at least one valid, + # Failure mode 3 (fatal). A ClinVar record contains no valid traits (traits which have at least one valid, # potentially mappable name). if not clinvar_record.traits_with_valid_names: report.clinvar_fatal_no_valid_traits += 1 continue - # Failure mode 2 (fatal). A ClinVar record contains no valid clinical significance terms, likely due to + # Failure mode 4 (fatal). A ClinVar record contains no valid clinical significance terms, likely due to # submissions being flagged. if not clinvar_record.valid_clinical_significances: report.clinvar_fatal_no_clinical_significance += 1 continue - # Failure mode 3 (skip). A ClinVar record contains an unsupported variation type. + # Failure mode 5 (skip). A ClinVar record contains an unsupported variation type. if clinvar_record.measure is None: report.clinvar_skip_unsupported_variation += 1 continue @@ -111,7 +118,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names, string_to_efo_mappings) - # Failure mode 4 (skip). No functional consequences are available. + # Failure mode 6 (skip). No functional consequences are available. if not consequence_types: report.clinvar_skip_no_functional_consequences += 1 continue @@ -122,7 +129,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings if is_structural_variant(clinvar_record.measure): report.structural_variants += len(consequence_types) - # Failure mode 5 (skip). A ClinVar record has at least one trait with at least one valid name, but no + # Failure mode 7 (skip). A ClinVar record has at least one trait with at least one valid name, but no # suitable EFO mappings were found in the database. This will still generate an evidence string, but is # tracked as a failure so we can continue to measure mapping coverage. if not contains_mapping(grouped_diseases): @@ -176,7 +183,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings except Exception as e: # We catch exceptions but record when one is thrown, so that the pipeline will crash after processing all # records and printing the report. - logger.error(f'Problem generating evidence for {clinvar_record.accession}') + logger.error(f'Problem generating evidence for {clinvar_set.rcv.accession}') logger.error(f'Error: {e}') exception_raised = True continue diff --git a/cmat/output_generation/report.py b/cmat/output_generation/report.py index 714a502e..c9b45005 100644 --- a/cmat/output_generation/report.py +++ b/cmat/output_generation/report.py @@ -27,6 +27,7 @@ def __init__(self, trait_mappings=None, consequence_mappings=None): self.clinvar_total = 0 self.clinvar_fatal_no_valid_traits = 0 self.clinvar_fatal_no_clinical_significance = 0 + self.clinvar_fatal_excluded_submission = 0 self.clinvar_skip_unsupported_variation = 0 self.clinvar_skip_no_functional_consequences = 0 self.clinvar_skip_missing_efo_mapping = 0 @@ -88,7 +89,8 @@ def load_from_file(self, filename): def compute_record_tallies(self): """Compute tallies of records fatal/skipped/done based on the more granular counts.""" - self.clinvar_fatal = self.clinvar_fatal_no_valid_traits + self.clinvar_fatal_no_clinical_significance + self.clinvar_fatal = (self.clinvar_fatal_no_valid_traits + self.clinvar_fatal_no_clinical_significance + + self.clinvar_fatal_excluded_submission) self.clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences + self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string + self.clinvar_skip_multiple_clinical_classifications) @@ -115,6 +117,7 @@ def print_report(self): Fatal: Cannot produce evidence\t{self.clinvar_fatal} No traits with valid names\t{self.clinvar_fatal_no_valid_traits} No clinical significance\t{self.clinvar_fatal_no_clinical_significance} + Excluded submissions\t{self.clinvar_fatal_excluded_submission} Skipped: Can be rescued by future improvements\t{self.clinvar_skipped} Unsupported variation type\t{self.clinvar_skip_unsupported_variation} No functional consequences\t{self.clinvar_skip_no_functional_consequences} diff --git a/cmat/trait_mapping/trait_names_parsing.py b/cmat/trait_mapping/trait_names_parsing.py index 39497e21..682dfa56 100644 --- a/cmat/trait_mapping/trait_names_parsing.py +++ b/cmat/trait_mapping/trait_names_parsing.py @@ -1,8 +1,7 @@ from collections import Counter -from typing import Iterable from cmat.clinvar_xml_io import ClinVarDataset -from cmat.clinvar_xml_io.clinvar_set import ClinVarSet +from cmat.clinvar_xml_io.filtering import filter_by_submission_name from cmat.trait_mapping.trait import Trait @@ -31,8 +30,7 @@ def parse_trait_names(filepath: str) -> list: dataset = ClinVarDataset(filepath) for clinvar_set in dataset.iter_cvs(): - # TODO where to put this logic (both the method & the exclusion list)? - if should_exclude_record(clinvar_set, ['SUB14299258']): + if not filter_by_submission_name(clinvar_set): continue clinvar_record = clinvar_set.rcv trait_names_and_ids = set((trait.preferred_or_other_valid_name.lower(), trait.identifier) @@ -53,11 +51,3 @@ def parse_trait_names(filepath: str) -> list: associated_with_nt_expansion=associated_with_nt_expansion)) return traits - - -def should_exclude_record(clinvar_set: ClinVarSet, names_to_exclude: Iterable) -> bool: - """Return True if every submitted record in the set has submission_name in the exclusion list.""" - for submitted_record in clinvar_set.scvs: - if submitted_record.submission_name not in names_to_exclude: - return False - return True