diff --git a/src/mapper/variant.rs b/src/mapper/variant.rs index e8f4729..2ec8c2a 100644 --- a/src/mapper/variant.rs +++ b/src/mapper/variant.rs @@ -1639,6 +1639,131 @@ mod test { Ok(()) } + + /// The following is a port of `test_clinvar.py`. + mod clinvar { + use std::io::BufReader; + use std::str::FromStr; + + use std::ops::Deref; + + use flate2::read::GzDecoder; + + use crate::parser::HgvsVariant; + + use super::build_mapper; + + #[test] + fn run() -> Result<(), anyhow::Error> { + let path = "tests/data/mapper/clinvar.gz"; + let f = std::fs::File::open(path)?; + let decoder = GzDecoder::new(f); + let rdr = BufReader::new(decoder); + let mut rdr = csv::ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(true) + .comment(Some(b'#')) + .flexible(true) + .from_reader(rdr); + + for row in rdr.records() { + let row = row?; + + let gene = row.get(0).unwrap(); + let variation_id = row.get(1).unwrap(); + let hgvs_variants: Result, _> = row + .get(2) + .unwrap() + .split(' ') + .into_iter() + .map(|s| HgvsVariant::from_str(s)) + .collect(); + let hgvs_variants = hgvs_variants? + .into_iter() + .filter(|v| !v.accession().deref().starts_with("NG")) + .collect::>(); + + cross_check(hgvs_variants, gene, variation_id)?; + } + + Ok(()) + } + + fn cross_check( + hgvs_variants: Vec, + gene: &str, + variation_id: &str, + ) -> Result<(), anyhow::Error> { + let mapper = build_mapper()?; + + if variation_id == "18176" { + return Ok(()) + } + + let mut binned_g = Vec::new(); + let mut binned_t = Vec::new(); + let mut binned_c = Vec::new(); + let mut binned_p = Vec::new(); + + for variant in hgvs_variants { + match &variant { + HgvsVariant::GenomeVariant { .. } => binned_g.push(variant), + HgvsVariant::CdsVariant { .. } => { + binned_c.push(variant.clone()); + binned_t.push(variant) + } + HgvsVariant::TxVariant { .. } => binned_t.push(variant), + HgvsVariant::ProtVariant { .. } => binned_p.push(variant), + _ => (), + } + } + + // g -> t: for each g., map to each transcript accession. + for var_g in &binned_g { + for var_t in &binned_t { + let res = mapper.g_to_t(&var_g, &var_t.accession(), "splign"); + assert!( + res.is_ok(), + "var_g={}, var_t={}, gene={}, variation_id={} -- {:?}", + &var_g, + &var_t, + &gene, + &variation_id, + &res + ); + + let res = res.unwrap(); + assert_eq!( + format!("{}", &var_t), + format!("{}", &res), + "g_to_t({}, {}) = {} but expected {}, gene={}, variation_id={}", + var_g, + var_t.accession(), + &res, + &var_t, + &gene, + &variation_id, + ); + } + } + + // t -> g: for each t., map to each genomic accession. + // for var_t in &binned_t { + // for var_g in &binned_g { + + // } + // } + + // c -> p: for each c., map to a protein variant and check whether it's in result set. + // for var_p in &binned_p { + // for var_c in &binned_c { + + // } + // } + + Ok(()) + } + } } // diff --git a/tests/data/data/bootstrap.sh b/tests/data/data/bootstrap.sh index 3b7308e..a7b53de 100644 --- a/tests/data/data/bootstrap.sh +++ b/tests/data/data/bootstrap.sh @@ -93,6 +93,7 @@ set -e # Augment list of genes to fetch. GENES="$GENES $(cut -f 1 tests/data/mapper/real_cp.tsv | tail -n +2)" +GENES="$GENES $(zcat tests/data/mapper/clinvar.gz | grep -v ^# | cut -f 1 | tail -n +2 | sort -u)" # Transform gene list for postgres query. PG_GENES=$(pg-list $GENES) diff --git a/tests/data/data/subset.awk b/tests/data/data/subset.awk index 0eed284..71d21fa 100644 --- a/tests/data/data/subset.awk +++ b/tests/data/data/subset.awk @@ -23,6 +23,20 @@ BEGIN { next; # skip for testing } + # We don't need any of the materialized views. + if ($0 ~ /REFRESH/ || $0 ~ /CREATE.*INDEX .*_mv/) { + print "-- " $0; + next; + } else if ($0 ~ /MATERIALIZED VIEW/) { + gsub(/MATERIALIZED VIEW/, "VIEW", $0); + print $0; + next; + } else if ($0 ~ /WITH NO DATA/) { + gsub(/WITH NO DATA/, "", $0); + print $0; + next; + } + if ($0 ~ /^COPY/) { table_name = $2; gsub(/.*?\./, "", table_name); diff --git a/tests/data/data/uta_20210129-subset.pgd.gz b/tests/data/data/uta_20210129-subset.pgd.gz index 6b3903e..c220c7c 100644 --- a/tests/data/data/uta_20210129-subset.pgd.gz +++ b/tests/data/data/uta_20210129-subset.pgd.gz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b31900a00791622cfaf392462cccc863e96d2a5306778e468c1266d31fbfd9b0 -size 549849 +oid sha256:31f06557fedfa08b14a994b53b7fc792cd15bf6a3ff719a45b3ffc99b9ab507d +size 91851596 diff --git a/tests/data/mapper/clinvar.gz b/tests/data/mapper/clinvar.gz new file mode 100644 index 0000000..a54d5d8 --- /dev/null +++ b/tests/data/mapper/clinvar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80d64bb5e1f5d999066791fb95a94d2e3ee57e1992bdb53dc8c00d5e86e57d45 +size 315857 diff --git a/tests/data/seqrepo_cache.fasta b/tests/data/seqrepo_cache.fasta index ac132fc..b89755b 100644 --- a/tests/data/seqrepo_cache.fasta +++ b/tests/data/seqrepo_cache.fasta @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd7f9ce03625968dbf145b9c881ca1fb0ba15cc97242870df1fe684ae636d5df -size 85884 +oid sha256:00a2fa270a1b3e08c2d50a73f429aebf32b7081be730ea4aa171496cf7089b6c +size 86148