From 3d10c759baa0c6eda579299d789ecb87aaee4d2e Mon Sep 17 00:00:00 2001 From: egor-dolzhenko Date: Mon, 8 Apr 2024 12:49:18 -0400 Subject: [PATCH] Update to version 0.9.0 --- README.md | 3 +++ repeats/pathogenic_repeats.hg38.bed | 18 +++++++++--------- trgt/Cargo.toml | 2 +- trgt/src/hmm/builder.rs | 7 ++++--- trgt/src/hmm/events.rs | 12 +++++++++++- trgt/src/hmm/purity.rs | 11 +++++++++++ trgt/src/hmm/utils.rs | 12 ++++++++---- trgt/src/workflows/tr.rs | 4 ++-- trvz/Cargo.toml | 2 +- trvz/src/hmm/builder.rs | 3 ++- trvz/src/hmm/events.rs | 12 +++++++++++- 11 files changed, 63 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 4729165..0825fb3 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,9 @@ tandem repeats at genome scale. 2024](https://www.nature.com/articles/s41587-023 fixed - Optimizations to the `--genotyper=cluster` mode, including haploid genotyping of the X chromosome when `--karyotype` is set to `XY` +- 0.9.0 + - Add support for polyalanine repeats (by allowing characters `N` in the motif sequence) + - Fix a bug causing TRVZ to error out on polyalanine repeats ### DISCLAIMER diff --git a/repeats/pathogenic_repeats.hg38.bed b/repeats/pathogenic_repeats.hg38.bed index db56978..b3bc8c2 100644 --- a/repeats/pathogenic_repeats.hg38.bed +++ b/repeats/pathogenic_repeats.hg38.bed @@ -1,4 +1,4 @@ -chr1 57367043 57367119 ID=DAB1;MOTIFS=AAAAT,GAAAT;STRUC=(AAAAT)n(GAAAT)n(AAAAT)n +chr1 57367043 57367119 ID=DAB1;MOTIFS=AAAAT,GAAAT;STRUC= chr1 146228800 146228821 ID=NOTCH2NLA;MOTIFS=GCC;STRUC=(GCC)n chr1 149390802 149390841 ID=NOTCH2NLC;MOTIFS=GGC;STRUC=(GGC)n chr10 79826383 79826404 ID=NUTM2B-AS1;MOTIFS=CGG;STRUC=(CGG)n @@ -14,15 +14,15 @@ chr14 23321472 23321490 ID=PABPN1;MOTIFS=GCG;STRUC=(GCG)n chr14 92071009 92071042 ID=ATXN3;MOTIFS=GCT;STRUC=(GCT)n chr15 22786677 22786701 ID=NIPA1;MOTIFS=GCG;STRUC=(GCG)n chr16 17470907 17470923 ID=XYLT1;MOTIFS=GCC;STRUC=(GCC)n -chr16 24613439 24613530 ID=TNRC6A;MOTIFS=TTTTA,TTTCA;STRUC=(TTTTA)n(TTTCA)n(TTTTA)n -chr16 66490398 66490467 ID=BEAN1;MOTIFS=TGGAA,TAAAA;STRUC=(TGGAA)n(TAAAA)n +chr16 24613439 24613530 ID=TNRC6A;MOTIFS=TTTTA,TTTCA;STRUC= +chr16 66490398 66490467 ID=BEAN1;MOTIFS=TGGAA,TAAAA;STRUC= chr16 87604287 87604329 ID=JPH3;MOTIFS=CTG;STRUC=(CTG)n chr18 55586155 55586227 ID=TCF4;MOTIFS=CAG;STRUC=(CAG)n chr19 13207858 13207897 ID=CACNA1A;MOTIFS=CTG;STRUC=(CTG)n chr19 14496041 14496074 ID=GIPC1;MOTIFS=CCG;STRUC=(CCG)n chr19 18786034 18786050 ID=COMP;MOTIFS=GTC;STRUC=(GTC)n chr19 45770204 45770264 ID=DMPK;MOTIFS=CAG;STRUC=(CAG)n -chr2 96197066 96197122 ID=STARD7;MOTIFS=TGAAA,TAAAA;STRUC=(TGAAA)n(TAAAA)n +chr2 96197066 96197122 ID=STARD7;MOTIFS=TGAAA,TAAAA;STRUC= chr2 100104799 100104824 ID=AFF3;MOTIFS=GCC;STRUC=(GCC)n chr2 176093058 176093104 ID=HOXD13;MOTIFS=GCN;STRUC=(GCN)n chr2 190880872 190880920 ID=GLS;MOTIFS=GCA;STRUC=(GCA)n @@ -30,14 +30,14 @@ chr20 2652733 2652775 ID=NOP56;MOTIFS=GGCCTG,CGCCTG;STRUC=(GGCCTG)n(CGCCTG)n chr21 43776443 43776479 ID=CSTB;MOTIFS=CGCGGGGCGGGG;STRUC=(CGCGGGGCGGGG)n chr22 45795354 45795424 ID=ATXN10;MOTIFS=ATTCT;STRUC=(ATTCT)n chr3 63912684 63912726 ID=ATXN7;MOTIFS=GCA,GCC;STRUC=(GCA)n(GCC)n -chr3 129172576 129172732 ID=CNBP;MOTIFS=CAGG,CAGA,CA;STRUC=(CAGG)n(CAGA)n(CA)n +chr3 129172576 129172732 ID=CNBP;MOTIFS=CAGG,CAGA,CA;STRUC= chr3 138946020 138946063 ID=FOXL2;MOTIFS=NGC;STRUC=(NGC)n -chr3 183712187 183712223 ID=YEATS2;MOTIFS=TTTTA,TTTCA;STRUC=(TTTTA)n(TTTCA)n +chr3 183712187 183712223 ID=YEATS2;MOTIFS=TTTTA,TTTCA;STRUC= chr4 3074876 3074966 ID=HTT;MOTIFS=CAG,CCG;STRUC=(CAG)nCAACAG(CCG)n -chr4 39348424 39348479 ID=RFC1;MOTIFS=AAAAG,AAAGG,AAGGG,AAGAG,AGAGG,AACGG,GGGAC,AAAGGG;STRUC= +chr4 39348424 39348479 ID=RFC1;MOTIFS=AAAAG,AAAGG,AAGGG,AAGAG,AGAGG,AACGG,GGGAC,AAAGGG,CAGGA,CAAGA;STRUC= chr4 41745972 41746032 ID=PHOX2B;MOTIFS=GCN;STRUC=(GCN)n -chr4 159342526 159342617 ID=RAPGEF2;MOTIFS=TTTTA,TTTCA;STRUC=(TTTTA)n(TTTCA)n(TTTTA)n -chr5 10356346 10356412 ID=MARCHF6;MOTIFS=TTTTA,TTTCA;STRUC=(TTTTA)n(TTTCA)n +chr4 159342526 159342617 ID=RAPGEF2;MOTIFS=TTTTA,TTTCA;STRUC= +chr5 10356346 10356412 ID=MARCHF6;MOTIFS=TTTTA,TTTCA;STRUC= chr5 146878727 146878757 ID=PPP2R2B;MOTIFS=GCT;STRUC=(GCT)n chr6 16327633 16327723 ID=ATXN1;MOTIFS=TGC;STRUC=(TGC)n chr6 45422750 45422802 ID=RUNX2;MOTIFS=GCN;STRUC=(GCN)n diff --git a/trgt/Cargo.toml b/trgt/Cargo.toml index 842b20f..7371cf5 100644 --- a/trgt/Cargo.toml +++ b/trgt/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "trgt" -version = "0.8.0" +version = "0.9.0" edition = "2021" build = "build.rs" diff --git a/trgt/src/hmm/builder.rs b/trgt/src/hmm/builder.rs index ca50f73..85644e2 100644 --- a/trgt/src/hmm/builder.rs +++ b/trgt/src/hmm/builder.rs @@ -173,13 +173,14 @@ fn define_motif_block(hmm: &mut Hmm, ms: usize, motif: &Vec) { } } -fn get_match_emissions(char: u8) -> Vec { - match char { +fn get_match_emissions(base: u8) -> Vec { + match base { b'A' => vec![0.00, 0.90, 0.03, 0.03, 0.03], b'T' => vec![0.00, 0.03, 0.90, 0.03, 0.03], b'C' => vec![0.00, 0.03, 0.03, 0.90, 0.03], b'G' => vec![0.00, 0.03, 0.03, 0.03, 0.90], - _ => panic!("Encountered unknown base {char}"), + b'N' => vec![0.00, 0.25, 0.25, 0.25, 0.25], + _ => panic!("Encountered unknown base {}", base as char), } } diff --git a/trgt/src/hmm/events.rs b/trgt/src/hmm/events.rs index 7b0d405..b0aa817 100644 --- a/trgt/src/hmm/events.rs +++ b/trgt/src/hmm/events.rs @@ -67,7 +67,8 @@ pub fn get_events(hmm: &Hmm, motifs: &[Vec], states: &[usize], query: &[u8]) let event = match offset.div_euclid(motif_len) { 0 => { let base = query[base_index]; - if base == get_base_match(&hmm, state) { + let expected_base = get_base_match(&hmm, state); + if base == expected_base || expected_base == b'N' { HmmEvent::Match } else { HmmEvent::Mismatch @@ -110,6 +111,8 @@ pub fn get_base_match(hmm: &Hmm, state: usize) -> u8 { 4 => b'G', _ => handle_error_and_exit(format!("Unexpected base match event")), } + } else if top_indexes.len() == 4 { + b'N' } else { b' ' } @@ -127,6 +130,13 @@ mod tests { assert_eq!(get_base_match(&hmm, 3), b'A'); } + #[test] + fn states_with_equal_emission_probs_match_n_base() { + let motifs = vec!["N".as_bytes().to_vec()]; + let hmm = build_hmm(&motifs); + assert_eq!(get_base_match(&hmm, 3), b'N'); + } + #[test] fn silent_states_match_a_blank_character() { let mut hmm = Hmm::new(3); diff --git a/trgt/src/hmm/purity.rs b/trgt/src/hmm/purity.rs index 7db830b..599bd8f 100644 --- a/trgt/src/hmm/purity.rs +++ b/trgt/src/hmm/purity.rs @@ -74,6 +74,17 @@ mod tests { assert_eq!(purity, 18.0 / 26.0); } + #[test] + fn calculate_purity_of_polyalanine_repeat() { + let motifs = vec!["GCN".as_bytes().to_vec()]; + let hmm = build_hmm(&motifs); + // GCNGCNGCNGXN + let query = "GCAGCCGCTGAG"; + let states = hmm.label(&query); + let purity = calc_purity(query.as_bytes(), &hmm, &motifs, &states); + assert_eq!(purity, 11.0 / 12.0); + } + #[test] fn calculate_purity_of_empty_query() { let motifs = vec!["CAG".as_bytes().to_vec(), "CCG".as_bytes().to_vec()]; diff --git a/trgt/src/hmm/utils.rs b/trgt/src/hmm/utils.rs index 7ebe68e..38bf54c 100644 --- a/trgt/src/hmm/utils.rs +++ b/trgt/src/hmm/utils.rs @@ -26,13 +26,17 @@ pub fn collapse_labels(spans: Vec) -> Vec { collapsed } -pub fn replace_invalid_bases(seq: &str) -> String { +pub fn replace_invalid_bases(seq: &str, allowed_bases: &[char]) -> String { seq.as_bytes() .iter() .enumerate() - .map(|(i, b)| match b { - b'A' | b'T' | b'C' | b'G' => *b as char, - _ => ['A', 'T', 'C', 'G'][i % 4], + .map(|(index, base)| { + let base = *base as char; + if allowed_bases.contains(&base) { + base + } else { + allowed_bases[index % allowed_bases.len()] + } }) .collect() } diff --git a/trgt/src/workflows/tr.rs b/trgt/src/workflows/tr.rs index b0df031..6369d2e 100644 --- a/trgt/src/workflows/tr.rs +++ b/trgt/src/workflows/tr.rs @@ -345,14 +345,14 @@ fn label_with_hmm(locus: &Locus, seqs: &Vec) -> Vec { let motifs = locus .motifs .iter() - .map(|m| replace_invalid_bases(m)) + .map(|m| replace_invalid_bases(m, &['A', 'T', 'C', 'G', 'N'])) .map(|m| m.as_bytes().to_vec()) .collect_vec(); let hmm = build_hmm(&motifs); let mut annotations = Vec::new(); for seq in seqs { - let seq = replace_invalid_bases(seq); + let seq = replace_invalid_bases(seq, &['A', 'T', 'C', 'G']); let labels = hmm.label(&seq); let purity = calc_purity(&seq.as_bytes(), &hmm, &motifs, &labels); let labels = hmm.label_motifs(&labels); diff --git a/trvz/Cargo.toml b/trvz/Cargo.toml index fabf470..c4a650b 100644 --- a/trvz/Cargo.toml +++ b/trvz/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "trvz" -version = "0.8.0" +version = "0.9.0" edition = "2021" build = "build.rs" diff --git a/trvz/src/hmm/builder.rs b/trvz/src/hmm/builder.rs index ca50f73..6be0c2c 100644 --- a/trvz/src/hmm/builder.rs +++ b/trvz/src/hmm/builder.rs @@ -179,7 +179,8 @@ fn get_match_emissions(char: u8) -> Vec { b'T' => vec![0.00, 0.03, 0.90, 0.03, 0.03], b'C' => vec![0.00, 0.03, 0.03, 0.90, 0.03], b'G' => vec![0.00, 0.03, 0.03, 0.03, 0.90], - _ => panic!("Encountered unknown base {char}"), + b'N' => vec![0.00, 0.25, 0.25, 0.25, 0.25], + _ => panic!("Encountered unknown base {}", char as char), } } diff --git a/trvz/src/hmm/events.rs b/trvz/src/hmm/events.rs index 7b0d405..b0aa817 100644 --- a/trvz/src/hmm/events.rs +++ b/trvz/src/hmm/events.rs @@ -67,7 +67,8 @@ pub fn get_events(hmm: &Hmm, motifs: &[Vec], states: &[usize], query: &[u8]) let event = match offset.div_euclid(motif_len) { 0 => { let base = query[base_index]; - if base == get_base_match(&hmm, state) { + let expected_base = get_base_match(&hmm, state); + if base == expected_base || expected_base == b'N' { HmmEvent::Match } else { HmmEvent::Mismatch @@ -110,6 +111,8 @@ pub fn get_base_match(hmm: &Hmm, state: usize) -> u8 { 4 => b'G', _ => handle_error_and_exit(format!("Unexpected base match event")), } + } else if top_indexes.len() == 4 { + b'N' } else { b' ' } @@ -127,6 +130,13 @@ mod tests { assert_eq!(get_base_match(&hmm, 3), b'A'); } + #[test] + fn states_with_equal_emission_probs_match_n_base() { + let motifs = vec!["N".as_bytes().to_vec()]; + let hmm = build_hmm(&motifs); + assert_eq!(get_base_match(&hmm, 3), b'N'); + } + #[test] fn silent_states_match_a_blank_character() { let mut hmm = Hmm::new(3);