Skip to content

Commit

Permalink
Update to version 0.9.0
Browse files Browse the repository at this point in the history
  • Loading branch information
egor-dolzhenko committed Apr 8, 2024
1 parent 3987f61 commit 3d10c75
Show file tree
Hide file tree
Showing 11 changed files with 63 additions and 23 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,9 @@ tandem repeats at genome scale. 2024](https://www.nature.com/articles/s41587-023
fixed
- Optimizations to the `--genotyper=cluster` mode, including haploid genotyping
of the X chromosome when `--karyotype` is set to `XY`
- 0.9.0
- Add support for polyalanine repeats (by allowing characters `N` in the motif sequence)
- Fix a bug causing TRVZ to error out on polyalanine repeats

### DISCLAIMER

Expand Down
18 changes: 9 additions & 9 deletions repeats/pathogenic_repeats.hg38.bed
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
chr1 57367043 57367119 ID=DAB1;MOTIFS=AAAAT,GAAAT;STRUC=(AAAAT)n(GAAAT)n(AAAAT)n
chr1 57367043 57367119 ID=DAB1;MOTIFS=AAAAT,GAAAT;STRUC=<DAB1>
chr1 146228800 146228821 ID=NOTCH2NLA;MOTIFS=GCC;STRUC=(GCC)n
chr1 149390802 149390841 ID=NOTCH2NLC;MOTIFS=GGC;STRUC=(GGC)n
chr10 79826383 79826404 ID=NUTM2B-AS1;MOTIFS=CGG;STRUC=(CGG)n
Expand All @@ -14,30 +14,30 @@ chr14 23321472 23321490 ID=PABPN1;MOTIFS=GCG;STRUC=(GCG)n
chr14 92071009 92071042 ID=ATXN3;MOTIFS=GCT;STRUC=(GCT)n
chr15 22786677 22786701 ID=NIPA1;MOTIFS=GCG;STRUC=(GCG)n
chr16 17470907 17470923 ID=XYLT1;MOTIFS=GCC;STRUC=(GCC)n
chr16 24613439 24613530 ID=TNRC6A;MOTIFS=TTTTA,TTTCA;STRUC=(TTTTA)n(TTTCA)n(TTTTA)n
chr16 66490398 66490467 ID=BEAN1;MOTIFS=TGGAA,TAAAA;STRUC=(TGGAA)n(TAAAA)n
chr16 24613439 24613530 ID=TNRC6A;MOTIFS=TTTTA,TTTCA;STRUC=<TNRC6A>
chr16 66490398 66490467 ID=BEAN1;MOTIFS=TGGAA,TAAAA;STRUC=<BEAN1>
chr16 87604287 87604329 ID=JPH3;MOTIFS=CTG;STRUC=(CTG)n
chr18 55586155 55586227 ID=TCF4;MOTIFS=CAG;STRUC=(CAG)n
chr19 13207858 13207897 ID=CACNA1A;MOTIFS=CTG;STRUC=(CTG)n
chr19 14496041 14496074 ID=GIPC1;MOTIFS=CCG;STRUC=(CCG)n
chr19 18786034 18786050 ID=COMP;MOTIFS=GTC;STRUC=(GTC)n
chr19 45770204 45770264 ID=DMPK;MOTIFS=CAG;STRUC=(CAG)n
chr2 96197066 96197122 ID=STARD7;MOTIFS=TGAAA,TAAAA;STRUC=(TGAAA)n(TAAAA)n
chr2 96197066 96197122 ID=STARD7;MOTIFS=TGAAA,TAAAA;STRUC=<STARD7>
chr2 100104799 100104824 ID=AFF3;MOTIFS=GCC;STRUC=(GCC)n
chr2 176093058 176093104 ID=HOXD13;MOTIFS=GCN;STRUC=(GCN)n
chr2 190880872 190880920 ID=GLS;MOTIFS=GCA;STRUC=(GCA)n
chr20 2652733 2652775 ID=NOP56;MOTIFS=GGCCTG,CGCCTG;STRUC=(GGCCTG)n(CGCCTG)n
chr21 43776443 43776479 ID=CSTB;MOTIFS=CGCGGGGCGGGG;STRUC=(CGCGGGGCGGGG)n
chr22 45795354 45795424 ID=ATXN10;MOTIFS=ATTCT;STRUC=(ATTCT)n
chr3 63912684 63912726 ID=ATXN7;MOTIFS=GCA,GCC;STRUC=(GCA)n(GCC)n
chr3 129172576 129172732 ID=CNBP;MOTIFS=CAGG,CAGA,CA;STRUC=(CAGG)n(CAGA)n(CA)n
chr3 129172576 129172732 ID=CNBP;MOTIFS=CAGG,CAGA,CA;STRUC=<CNBP>
chr3 138946020 138946063 ID=FOXL2;MOTIFS=NGC;STRUC=(NGC)n
chr3 183712187 183712223 ID=YEATS2;MOTIFS=TTTTA,TTTCA;STRUC=(TTTTA)n(TTTCA)n
chr3 183712187 183712223 ID=YEATS2;MOTIFS=TTTTA,TTTCA;STRUC=<YEATS2>
chr4 3074876 3074966 ID=HTT;MOTIFS=CAG,CCG;STRUC=(CAG)nCAACAG(CCG)n
chr4 39348424 39348479 ID=RFC1;MOTIFS=AAAAG,AAAGG,AAGGG,AAGAG,AGAGG,AACGG,GGGAC,AAAGGG;STRUC=<RFC1>
chr4 39348424 39348479 ID=RFC1;MOTIFS=AAAAG,AAAGG,AAGGG,AAGAG,AGAGG,AACGG,GGGAC,AAAGGG,CAGGA,CAAGA;STRUC=<RFC1>
chr4 41745972 41746032 ID=PHOX2B;MOTIFS=GCN;STRUC=(GCN)n
chr4 159342526 159342617 ID=RAPGEF2;MOTIFS=TTTTA,TTTCA;STRUC=(TTTTA)n(TTTCA)n(TTTTA)n
chr5 10356346 10356412 ID=MARCHF6;MOTIFS=TTTTA,TTTCA;STRUC=(TTTTA)n(TTTCA)n
chr4 159342526 159342617 ID=RAPGEF2;MOTIFS=TTTTA,TTTCA;STRUC=<RAPGEF2>
chr5 10356346 10356412 ID=MARCHF6;MOTIFS=TTTTA,TTTCA;STRUC=<MARCHF6>
chr5 146878727 146878757 ID=PPP2R2B;MOTIFS=GCT;STRUC=(GCT)n
chr6 16327633 16327723 ID=ATXN1;MOTIFS=TGC;STRUC=(TGC)n
chr6 45422750 45422802 ID=RUNX2;MOTIFS=GCN;STRUC=(GCN)n
Expand Down
2 changes: 1 addition & 1 deletion trgt/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "trgt"
version = "0.8.0"
version = "0.9.0"
edition = "2021"
build = "build.rs"

Expand Down
7 changes: 4 additions & 3 deletions trgt/src/hmm/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -173,13 +173,14 @@ fn define_motif_block(hmm: &mut Hmm, ms: usize, motif: &Vec<u8>) {
}
}

fn get_match_emissions(char: u8) -> Vec<f64> {
match char {
fn get_match_emissions(base: u8) -> Vec<f64> {
match base {
b'A' => vec![0.00, 0.90, 0.03, 0.03, 0.03],
b'T' => vec![0.00, 0.03, 0.90, 0.03, 0.03],
b'C' => vec![0.00, 0.03, 0.03, 0.90, 0.03],
b'G' => vec![0.00, 0.03, 0.03, 0.03, 0.90],
_ => panic!("Encountered unknown base {char}"),
b'N' => vec![0.00, 0.25, 0.25, 0.25, 0.25],
_ => panic!("Encountered unknown base {}", base as char),
}
}

Expand Down
12 changes: 11 additions & 1 deletion trgt/src/hmm/events.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ pub fn get_events(hmm: &Hmm, motifs: &[Vec<u8>], states: &[usize], query: &[u8])
let event = match offset.div_euclid(motif_len) {
0 => {
let base = query[base_index];
if base == get_base_match(&hmm, state) {
let expected_base = get_base_match(&hmm, state);
if base == expected_base || expected_base == b'N' {
HmmEvent::Match
} else {
HmmEvent::Mismatch
Expand Down Expand Up @@ -110,6 +111,8 @@ pub fn get_base_match(hmm: &Hmm, state: usize) -> u8 {
4 => b'G',
_ => handle_error_and_exit(format!("Unexpected base match event")),
}
} else if top_indexes.len() == 4 {
b'N'
} else {
b' '
}
Expand All @@ -127,6 +130,13 @@ mod tests {
assert_eq!(get_base_match(&hmm, 3), b'A');
}

#[test]
fn states_with_equal_emission_probs_match_n_base() {
let motifs = vec!["N".as_bytes().to_vec()];
let hmm = build_hmm(&motifs);
assert_eq!(get_base_match(&hmm, 3), b'N');
}

#[test]
fn silent_states_match_a_blank_character() {
let mut hmm = Hmm::new(3);
Expand Down
11 changes: 11 additions & 0 deletions trgt/src/hmm/purity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,17 @@ mod tests {
assert_eq!(purity, 18.0 / 26.0);
}

#[test]
fn calculate_purity_of_polyalanine_repeat() {
let motifs = vec!["GCN".as_bytes().to_vec()];
let hmm = build_hmm(&motifs);
// GCNGCNGCNGXN
let query = "GCAGCCGCTGAG";
let states = hmm.label(&query);
let purity = calc_purity(query.as_bytes(), &hmm, &motifs, &states);
assert_eq!(purity, 11.0 / 12.0);
}

#[test]
fn calculate_purity_of_empty_query() {
let motifs = vec!["CAG".as_bytes().to_vec(), "CCG".as_bytes().to_vec()];
Expand Down
12 changes: 8 additions & 4 deletions trgt/src/hmm/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,17 @@ pub fn collapse_labels(spans: Vec<Span>) -> Vec<Span> {
collapsed
}

pub fn replace_invalid_bases(seq: &str) -> String {
pub fn replace_invalid_bases(seq: &str, allowed_bases: &[char]) -> String {
seq.as_bytes()
.iter()
.enumerate()
.map(|(i, b)| match b {
b'A' | b'T' | b'C' | b'G' => *b as char,
_ => ['A', 'T', 'C', 'G'][i % 4],
.map(|(index, base)| {
let base = *base as char;
if allowed_bases.contains(&base) {
base
} else {
allowed_bases[index % allowed_bases.len()]
}
})
.collect()
}
4 changes: 2 additions & 2 deletions trgt/src/workflows/tr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -345,14 +345,14 @@ fn label_with_hmm(locus: &Locus, seqs: &Vec<String>) -> Vec<Annotation> {
let motifs = locus
.motifs
.iter()
.map(|m| replace_invalid_bases(m))
.map(|m| replace_invalid_bases(m, &['A', 'T', 'C', 'G', 'N']))
.map(|m| m.as_bytes().to_vec())
.collect_vec();
let hmm = build_hmm(&motifs);

let mut annotations = Vec::new();
for seq in seqs {
let seq = replace_invalid_bases(seq);
let seq = replace_invalid_bases(seq, &['A', 'T', 'C', 'G']);
let labels = hmm.label(&seq);
let purity = calc_purity(&seq.as_bytes(), &hmm, &motifs, &labels);
let labels = hmm.label_motifs(&labels);
Expand Down
2 changes: 1 addition & 1 deletion trvz/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "trvz"
version = "0.8.0"
version = "0.9.0"
edition = "2021"
build = "build.rs"

Expand Down
3 changes: 2 additions & 1 deletion trvz/src/hmm/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,8 @@ fn get_match_emissions(char: u8) -> Vec<f64> {
b'T' => vec![0.00, 0.03, 0.90, 0.03, 0.03],
b'C' => vec![0.00, 0.03, 0.03, 0.90, 0.03],
b'G' => vec![0.00, 0.03, 0.03, 0.03, 0.90],
_ => panic!("Encountered unknown base {char}"),
b'N' => vec![0.00, 0.25, 0.25, 0.25, 0.25],
_ => panic!("Encountered unknown base {}", char as char),
}
}

Expand Down
12 changes: 11 additions & 1 deletion trvz/src/hmm/events.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ pub fn get_events(hmm: &Hmm, motifs: &[Vec<u8>], states: &[usize], query: &[u8])
let event = match offset.div_euclid(motif_len) {
0 => {
let base = query[base_index];
if base == get_base_match(&hmm, state) {
let expected_base = get_base_match(&hmm, state);
if base == expected_base || expected_base == b'N' {
HmmEvent::Match
} else {
HmmEvent::Mismatch
Expand Down Expand Up @@ -110,6 +111,8 @@ pub fn get_base_match(hmm: &Hmm, state: usize) -> u8 {
4 => b'G',
_ => handle_error_and_exit(format!("Unexpected base match event")),
}
} else if top_indexes.len() == 4 {
b'N'
} else {
b' '
}
Expand All @@ -127,6 +130,13 @@ mod tests {
assert_eq!(get_base_match(&hmm, 3), b'A');
}

#[test]
fn states_with_equal_emission_probs_match_n_base() {
let motifs = vec!["N".as_bytes().to_vec()];
let hmm = build_hmm(&motifs);
assert_eq!(get_base_match(&hmm, 3), b'N');
}

#[test]
fn silent_states_match_a_blank_character() {
let mut hmm = Hmm::new(3);
Expand Down

0 comments on commit 3d10c75

Please sign in to comment.