Skip to content

Commit

Permalink
Merge pull request #443 from apriltuesday/issue-435_2
Browse files Browse the repository at this point in the history
Issue 435: Filter gene-related disorder submission from curation and evidence generation
  • Loading branch information
apriltuesday authored Sep 9, 2024
2 parents 86cc604 + facb45d commit 269479e
Show file tree
Hide file tree
Showing 12 changed files with 63 additions and 32 deletions.
3 changes: 1 addition & 2 deletions bin/trait_mapping/parse_traits.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@

if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Parse traits from ClinVar XML")
parser.add_argument("-i", dest="input_filepath", required=True,
help="ClinVar XML dump file. One record per line.")
parser.add_argument("-i", dest="input_filepath", required=True, help="ClinVar XML dump file.")
parser.add_argument("-o", dest="output_traits_filepath", required=True,
help="path to output file for all traits for downstream processing")
parser.add_argument("-u", dest="output_for_platform", required=False,
Expand Down
7 changes: 6 additions & 1 deletion cmat/clinvar_xml_io/clinvar_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
from datetime import date

from cmat.clinvar_xml_io.clinvar_reference_record import ClinVarReferenceRecord
from cmat.clinvar_xml_io.xml_parsing import iterate_rcv_from_xml, parse_header_attributes
from cmat.clinvar_xml_io.clinvar_set import ClinVarSet
from cmat.clinvar_xml_io.xml_parsing import iterate_rcv_from_xml, parse_header_attributes, iterate_cvs_from_xml

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
Expand All @@ -22,6 +23,10 @@ def __iter__(self):
for rcv in iterate_rcv_from_xml(self.clinvar_xml):
yield ClinVarReferenceRecord(rcv, self.xsd_version)

def iter_cvs(self):
    """Lazily yield a ClinVarSet for each element produced by iterate_cvs_from_xml on this dataset's XML."""
    yield from (
        ClinVarSet(cvs_element, self.xsd_version)
        for cvs_element in iterate_cvs_from_xml(self.clinvar_xml)
    )

def get_xsd_version(self):
# For format, see https://github.com/ncbi/clinvar/blob/master/FTPSiteXsdChanges.md
if 'xsi:noNamespaceSchemaLocation' in self.header_attr:
Expand Down
1 change: 0 additions & 1 deletion cmat/clinvar_xml_io/clinvar_reference_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from functools import cached_property

from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification

from cmat.clinvar_xml_io.clinvar_record import ClinVarRecord
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_elements

Expand Down
4 changes: 2 additions & 2 deletions cmat/clinvar_xml_io/clinvar_set.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from cmat.clinvar_xml_io import ClinVarRecord
from cmat.clinvar_xml_io.clinvar_reference_record import ClinVarReferenceRecord
from cmat.clinvar_xml_io.clinvar_submitted_record import ClinVarSubmittedRecord
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_elements

Expand All @@ -12,7 +12,7 @@ def __init__(self, cvs_xml, xsd_version):
self.cvs_xml = cvs_xml

rcv_elem = find_mandatory_unique_element(self.cvs_xml, 'ReferenceClinVarAssertion')
self.rcv = ClinVarRecord(rcv_elem, xsd_version)
self.rcv = ClinVarReferenceRecord(rcv_elem, xsd_version)

scv_elems = find_elements(self.cvs_xml, 'ClinVarAssertion', allow_zero=False, allow_multiple=True)
self.scvs = [ClinVarSubmittedRecord(elem, xsd_version, self.rcv) for elem in scv_elems]
Expand Down
2 changes: 1 addition & 1 deletion cmat/clinvar_xml_io/clinvar_submitted_record.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
from functools import cached_property

from cmat.clinvar_xml_io import ClinVarRecord
from cmat.clinvar_xml_io.clinvar_record import ClinVarRecord
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element

logger = logging.getLogger(__name__)
Expand Down
12 changes: 12 additions & 0 deletions cmat/clinvar_xml_io/filtering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Filtering functions that can be used in multiple pipelines.

# Identified as problematic submissions, e.g. too many unmappable trait names.
submission_names_to_exclude = ['SUB14299258']


def filter_by_submission_name(clinvar_set):
    """Decide whether a ClinVar set should be kept, based on the submission names of its submitted records.

    Returns True (keep) when at least one record in `clinvar_set.scvs` has a submission_name that is not in
    the exclusion list. Returns False (filter out) when every submitted record is in the exclusion list,
    which is also the result when there are no submitted records at all.
    """
    return any(
        scv.submission_name not in submission_names_to_exclude
        for scv in clinvar_set.scvs
    )
26 changes: 17 additions & 9 deletions cmat/output_generation/clinvar_to_evidence_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from cmat.clinvar_xml_io import ClinVarDataset
from cmat.clinvar_xml_io.clinical_classification import MultipleClinicalClassificationsError
from cmat.clinvar_xml_io.filtering import filter_by_submission_name
from cmat.output_generation import consequence_type as CT
from cmat.output_generation.report import Report

Expand Down Expand Up @@ -64,7 +65,8 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings

logger.info('Processing ClinVar records')
i = -1
for clinvar_record in ClinVarDataset(clinvar_xml):
dataset = ClinVarDataset(clinvar_xml)
for clinvar_set in dataset.iter_cvs():
# If start & end provided, only process records in the range [start, end)
i += 1
if start and i < start:
Expand All @@ -78,7 +80,13 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings

# Catch any exceptions for this record so we can continue processing.
try:
# Failure mode 0 (skip). Contains multiple clinical classification annotations.
# Failure mode 1 (fatal). Record is only supported by submissions deemed to be unusable.
if not filter_by_submission_name(clinvar_set):
report.clinvar_fatal_excluded_submission += 1
continue
clinvar_record = clinvar_set.rcv

# Failure mode 2 (skip). Contains multiple clinical classification annotations.
# This is new as of V2 of the ClinVar XSD and should definitely be supported at some point,
# but as it can cause parsing complications we catch these cases first.
# See GH issue for context: https://github.com/EBIvariation/CMAT/issues/396
Expand All @@ -87,18 +95,18 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
report.clinvar_skip_multiple_clinical_classifications += 1
continue

# Failure mode 1 (fatal). A ClinVar record contains no valid traits (traits which have at least one valid,
# Failure mode 3 (fatal). A ClinVar record contains no valid traits (traits which have at least one valid,
# potentially mappable name).
if not clinvar_record.traits_with_valid_names:
report.clinvar_fatal_no_valid_traits += 1
continue
# Failure mode 2 (fatal). A ClinVar record contains no valid clinical significance terms, likely due to
# Failure mode 4 (fatal). A ClinVar record contains no valid clinical significance terms, likely due to
# submissions being flagged.
if not clinvar_record.valid_clinical_significances:
report.clinvar_fatal_no_clinical_significance += 1
continue

# Failure mode 3 (skip). A ClinVar record contains an unsupported variation type.
# Failure mode 5 (skip). A ClinVar record contains an unsupported variation type.
if clinvar_record.measure is None:
report.clinvar_skip_unsupported_variation += 1
continue
Expand All @@ -110,7 +118,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names,
string_to_efo_mappings)

# Failure mode 4 (skip). No functional consequences are available.
# Failure mode 6 (skip). No functional consequences are available.
if not consequence_types:
report.clinvar_skip_no_functional_consequences += 1
continue
Expand All @@ -121,7 +129,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
if is_structural_variant(clinvar_record.measure):
report.structural_variants += len(consequence_types)

# Failure mode 5 (skip). A ClinVar record has at least one trait with at least one valid name, but no
# Failure mode 7 (skip). A ClinVar record has at least one trait with at least one valid name, but no
# suitable EFO mappings were found in the database. This will still generate an evidence string, but is
# tracked as a failure so we can continue to measure mapping coverage.
if not contains_mapping(grouped_diseases):
Expand Down Expand Up @@ -175,8 +183,8 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
except Exception as e:
# We catch exceptions but record when one is thrown, so that the pipeline will crash after processing all
# records and printing the report.
logger.error(f'Problem generating evidence for {clinvar_record.accession}')
logger.error(f'Error: {e}')
logger.error(f'Problem generating evidence for {clinvar_set.rcv.accession}')
logger.error(f'Error: {repr(e)}')
exception_raised = True
continue

Expand Down
5 changes: 4 additions & 1 deletion cmat/output_generation/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def __init__(self, trait_mappings=None, consequence_mappings=None):
self.clinvar_total = 0
self.clinvar_fatal_no_valid_traits = 0
self.clinvar_fatal_no_clinical_significance = 0
self.clinvar_fatal_excluded_submission = 0
self.clinvar_skip_unsupported_variation = 0
self.clinvar_skip_no_functional_consequences = 0
self.clinvar_skip_missing_efo_mapping = 0
Expand Down Expand Up @@ -88,7 +89,8 @@ def load_from_file(self, filename):

def compute_record_tallies(self):
"""Compute tallies of records fatal/skipped/done based on the more granular counts."""
self.clinvar_fatal = self.clinvar_fatal_no_valid_traits + self.clinvar_fatal_no_clinical_significance
self.clinvar_fatal = (self.clinvar_fatal_no_valid_traits + self.clinvar_fatal_no_clinical_significance +
self.clinvar_fatal_excluded_submission)
self.clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences +
self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string +
self.clinvar_skip_multiple_clinical_classifications)
Expand All @@ -115,6 +117,7 @@ def print_report(self):
Fatal: Cannot produce evidence\t{self.clinvar_fatal}
No traits with valid names\t{self.clinvar_fatal_no_valid_traits}
No clinical significance\t{self.clinvar_fatal_no_clinical_significance}
Excluded submissions\t{self.clinvar_fatal_excluded_submission}
Skipped: Can be rescued by future improvements\t{self.clinvar_skipped}
Unsupported variation type\t{self.clinvar_skip_unsupported_variation}
No functional consequences\t{self.clinvar_skip_no_functional_consequences}
Expand Down
9 changes: 7 additions & 2 deletions cmat/trait_mapping/trait_names_parsing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from collections import Counter

from cmat import clinvar_xml_io
from cmat.clinvar_xml_io import ClinVarDataset
from cmat.clinvar_xml_io.filtering import filter_by_submission_name
from cmat.trait_mapping.trait import Trait


Expand All @@ -27,7 +28,11 @@ def parse_trait_names(filepath: str) -> list:
# Their curation is of highest importance regardless of how many records they are actually associated with.
nt_expansion_traits = set()

for clinvar_record in clinvar_xml_io.ClinVarDataset(filepath):
dataset = ClinVarDataset(filepath)
for clinvar_set in dataset.iter_cvs():
if not filter_by_submission_name(clinvar_set):
continue
clinvar_record = clinvar_set.rcv
trait_names_and_ids = set((trait.preferred_or_other_valid_name.lower(), trait.identifier)
for trait in clinvar_record.traits_with_valid_names)
for trait_tuple in trait_names_and_ids:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ chédiak-higashi syndrome http://www.orpha.net/ORDO/Orphanet_167 chédiak-higash
cobalamin c disease http://purl.obolibrary.org/obo/MONDO_0010184 methylmalonic aciduria and homocystinuria type cblC
cobalamin c disease http://www.orpha.net/ORDO/Orphanet_26 Methylmalonic acidemia with homocystinuria
cobalamin c disease http://www.orpha.net/ORDO/Orphanet_79282 Methylmalonic acidemia with homocystinuria, type cblC
coffin-siris syndrome 1 http://purl.obolibrary.org/obo/MONDO_0015452 Coffin-Siris syndrome
coffin-siris syndrome 1 http://purl.obolibrary.org/obo/MONDO_0007617 coffin-siris syndrome 1
cog1 congenital disorder of glycosylation http://purl.obolibrary.org/obo/MONDO_0012637 COG1-congenital disorder of glycosylation
cog7 congenital disorder of glycosylation http://purl.obolibrary.org/obo/MONDO_0012118 COG7-congenital disorder of glycosylation
cohen syndrome http://purl.obolibrary.org/obo/MONDO_0008999 cohen syndrome
Expand Down Expand Up @@ -278,7 +278,7 @@ hepatoencephalopathy due to combined oxidative phosphorylation defect type 1 htt
hereditary breast ovarian cancer syndrome http://purl.obolibrary.org/obo/MONDO_0003582 hereditary breast ovarian cancer syndrome
hereditary cancer-predisposing syndrome http://purl.obolibrary.org/obo/MONDO_0015356 hereditary neoplastic syndrome
hereditary diffuse gastric adenocarcinoma http://purl.obolibrary.org/obo/MONDO_0007648 hereditary diffuse gastric adenocarcinoma
hereditary diffuse leukoencephalopathy with spheroids http://www.orpha.net/ORDO/Orphanet_313808 Hereditary diffuse leukoencephalopathy with axonal spheroids and pigmented glia
hereditary diffuse leukoencephalopathy with spheroids http://www.orpha.net/ORDO/Orphanet_313808 Adult-onset leukoencephalopathy with axonal spheroids and pigmented glia
hereditary hemorrhagic telangiectasia http://purl.obolibrary.org/obo/MONDO_0019180 hereditary hemorrhagic telangiectasia
hereditary insensitivity to pain with anhidrosis http://purl.obolibrary.org/obo/MONDO_0009746 hereditary sensory and autonomic neuropathy type 4
hereditary nonpolyposis colorectal neoplasms http://www.ebi.ac.uk/efo/EFO_0009911 hereditary nonpolyposis colorectal carcinoma
Expand Down Expand Up @@ -338,7 +338,7 @@ inflammatory skin and bowel disease, neonatal, 1 http://purl.obolibrary.org/obo/
intellectual developmental disorder, autosomal dominant 64 http://purl.obolibrary.org/obo/MONDO_0030934 intellectual developmental disorder, autosomal dominant 64
intellectual disability http://purl.obolibrary.org/obo/HP_0001249 intellectual disability
intellectual disability, autosomal dominant 1 http://purl.obolibrary.org/obo/MONDO_0016459 2q23.1 microdeletion syndrome
intellectual disability, autosomal dominant 20 http://purl.obolibrary.org/obo/MONDO_0016456 5q14.3 microdeletion syndrome
intellectual disability, autosomal dominant 20 http://purl.obolibrary.org/obo/MONDO_0013266 intellectual disability, autosomal dominant 20
intellectual disability, autosomal dominant 5 http://purl.obolibrary.org/obo/MONDO_0012960 intellectual disability, autosomal dominant 5
intellectual disability, autosomal dominant 6 http://purl.obolibrary.org/obo/MONDO_0100172 intellectual disability, autosomal dominant
intellectual disability, autosomal dominant 9 http://purl.obolibrary.org/obo/MONDO_0013656 intellectual disability, autosomal dominant 9
Expand Down Expand Up @@ -508,7 +508,7 @@ retinitis pigmentosa-deafness syndrome http://purl.obolibrary.org/obo/MONDO_0019
retinoblastoma http://purl.obolibrary.org/obo/MONDO_0008380 retinoblastoma
rett syndrome http://purl.obolibrary.org/obo/MONDO_0010726 rett syndrome
rett syndrome, congenital variant http://purl.obolibrary.org/obo/MONDO_0010726 Rett syndrome
rhabdoid tumor predisposition syndrome 2 http://purl.obolibrary.org/obo/MONDO_0016473 familial rhabdoid tumor
rhabdoid tumor predisposition syndrome 2 http://purl.obolibrary.org/obo/MONDO_0013224 rhabdoid tumor predisposition syndrome 2
rod-cone dystrophy http://www.orpha.net/ORDO/Orphanet_1872 Cone rod dystrophy
rubinstein-taybi syndrome http://purl.obolibrary.org/obo/MONDO_0019188 rubinstein-taybi syndrome
ryr1-related disorders http://www.ebi.ac.uk/efo/EFO_0009143 ryr1-related disorders
Expand Down
Loading

0 comments on commit 269479e

Please sign in to comment.