Skip to content

Commit

Permalink
Update to version 1.4.0
Browse files Browse the repository at this point in the history
  • Loading branch information
pbsena committed Nov 20, 2024
1 parent 881174c commit 475bf52
Show file tree
Hide file tree
Showing 11 changed files with 80 additions and 38 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "trgt"
version = "1.3.0"
version = "1.4.0"
edition = "2021"
build = "build.rs"

Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,10 @@ tandem repeats at genome scale. 2024](https://www.nature.com/articles/s41587-023
- Plotting code has been refactored as we prepare to revamp repeat visualizations
- The maximum number of reads per allele to plot can now be specified by `--max-allele-reads`
- bugfix: repeat identifiers are now permitted to contain commas
- 1.4.0
- Parameters appropriate for targeted sequencing can now be set with `--preset targeted` option
- Waterfall plots no longer panic when there are no reads in a locus
- Algorithmic changes to `--genotyper cluster` allow fewer reads to be assigned to an allele; this may result in minor changes to consensus sequence and read assignment

### DISCLAIMER

Expand Down
4 changes: 1 addition & 3 deletions docs/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,14 @@ Options:
of HiFi reads overlapping the repeats (`<OUTPUT_PREFIX>.spanning.bam`).
- `-k, --karyotype <KARYOTYPE>` Sample karyotype (XX or XY) [default: XX].
- `-t, --threads <THREADS>` Number of threads [default: 1].
`--preset <PRESET> ` Parameter preset (wgs or targeted)

Advanced:
- `--sample-name <SAMPLE_NAME>` Sample name. If it is not provided, the sample name is extracted from the input BAM or file stem.
- `--genotyper <GENOTYPER>` Genotyping algorithm (size or cluster), [default: size].
- `--aln-scoring <SCORING>` Scoring function to align to flanks (non-negative values): MATCH,MISM,GAPO,GAPE,KMERLEN,BANDWIDTH.
- `--min-flank-id-frac <PERC>` Minimum fraction of matches in a flank sequence to consider it 'found'.
- `--flank-len <FLANK_LEN>` Minimum length of the flanking sequence [default: 250].
- `--output-flank-len <FLANK_LEN>` Length of flanking sequence to report on output [default: 50].
- `--fixed-flanks` Keep flank length fixed.
- `--min-read-quality <MIN_RQ>` Minimum HiFi rq value required to use a read for genotyping [default: 0.98].
- `--disable-bam-output` Disable BAM output.
- `--max-depth <MAX_DEPTH>` Maximum locus depth [default: 250].

Expand Down
19 changes: 17 additions & 2 deletions src/cli.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use crate::{
merge::vcf_writer::OutputType,
utils::{Genotyper, Result, TrgtScoring},
utils::{Genotyper, Result, TrgtPreset, TrgtScoring},
};
use chrono::Datelike;
use clap::{ArgAction, ArgGroup, Parser, Subcommand};
Expand Down Expand Up @@ -150,7 +150,7 @@ pub struct MergeArgs {
pub contigs: Option<Vec<String>>,
}

#[derive(Parser, Debug)]
#[derive(Parser, Debug, Clone)]
#[command(group(ArgGroup::new("genotype")))]
#[command(arg_required_else_help(true))]
pub struct GenotypeArgs {
Expand Down Expand Up @@ -201,6 +201,12 @@ pub struct GenotypeArgs {
#[arg(value_parser = threads_in_range)]
pub num_threads: usize,

#[clap(long = "preset")]
#[clap(value_name = "PRESET")]
#[clap(help = "Parameter preset (wgs or targeted)")]
#[clap(default_value = "wgs")]
pub preset: TrgtPreset,

#[clap(help_heading("Advanced"))]
#[clap(long = "sample-name")]
#[clap(value_name = "SAMPLE_NAME")]
Expand All @@ -214,23 +220,28 @@ pub struct GenotypeArgs {
#[clap(value_name = "GENOTYPER")]
#[clap(help = "Genotyping algorithm (size or cluster)")]
#[clap(default_value = "size")]
#[clap(default_value_if("preset", "targeted", "cluster"))]
pub genotyper: Genotyper,

#[clap(hide = true)]
#[clap(help_heading("Advanced"))]
#[clap(long = "aln-scoring")]
#[clap(value_name = "SCORING")]
#[clap(
help = "Scoring function to align to flanks (non-negative values): MATCH,MISM,GAPO,GAPE,KMERLEN,BANDWIDTH"
)]
#[clap(default_value = "1,1,5,1,8,6")]
#[clap(default_value_if("preset", "targeted", "1,1,0,1,1,5000"))]
#[arg(value_parser = scoring_from_string)]
pub aln_scoring: TrgtScoring,

#[clap(hide = true)]
#[clap(help_heading("Advanced"))]
#[clap(long = "min-flank-id-frac")]
#[clap(value_name = "PERC")]
#[clap(help = "Minimum fraction of matches in a flank sequence to consider it 'found'")]
#[clap(default_value = "0.7")]
#[clap(default_value_if("preset", "targeted", "0.8"))]
#[arg(value_parser = ensure_unit_float)]
pub min_flank_id_frac: f64,

Expand All @@ -239,6 +250,7 @@ pub struct GenotypeArgs {
#[clap(value_name = "FLANK_LEN")]
#[clap(help = "Minimum length of the flanking sequence")]
#[clap(default_value = "250")]
#[clap(default_value_if("preset", "targeted", "200"))]
pub flank_len: usize,

#[clap(help_heading("Advanced"))]
Expand All @@ -254,11 +266,13 @@ pub struct GenotypeArgs {
#[clap(help = "Keep flank length fixed")]
pub fixed_flanks: bool,

#[clap(hide = true)]
#[clap(help_heading("Advanced"))]
#[clap(long = "min-read-quality")]
#[clap(value_name = "MIN_RQ")]
#[clap(help = "Minimum HiFi rq value required to use a read for genotyping")]
#[clap(default_value = "0.98")]
#[clap(default_value_if("preset", "targeted", "-1.0"))]
// #[arg(value_parser = ensure_unit_float)]
pub min_hifi_read_qual: f64,

Expand All @@ -272,6 +286,7 @@ pub struct GenotypeArgs {
#[clap(value_name = "MAX_DEPTH")]
#[clap(help = "Maximum locus depth")]
#[clap(default_value = "250")]
#[clap(default_value_if("preset", "targeted", "10000"))]
pub max_depth: usize,
}

Expand Down
1 change: 0 additions & 1 deletion src/commands/genotype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ const CHANNEL_BUFFER_SIZE: usize = 2048;

pub fn trgt(args: GenotypeArgs) -> Result<()> {
let start_timer = time::Instant::now();

let karyotype = Karyotype::new(&args.karyotype)?;

let bam_header = get_bam_header(&args.reads_path)?;
Expand Down
53 changes: 29 additions & 24 deletions src/trgt/genotype/genotype_cluster.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,24 +37,28 @@ pub fn central_read(num_seqs: usize, group: &[usize], dists: &[f64]) -> usize {

pub fn make_consensus(
num_seqs: usize,
trs: &[&str],
trs: &[&[u8]],
dists: &[f64],
group: &[usize],
) -> (String, TrSize) {
let seqs = group.iter().map(|&i| trs[i]).collect_vec();
let backbone = trs[central_read(num_seqs, group, dists)];
let seqs = group
.iter()
.map(|&i| std::str::from_utf8(trs[i]).unwrap())
.collect_vec();
let backbone = std::str::from_utf8(trs[central_read(num_seqs, group, dists)]).unwrap();
let aligns = align(backbone, &seqs);
let allele = consensus::repair_consensus(backbone, &seqs, &aligns);
let size = TrSize::new(allele.len(), get_ci(&seqs));
(allele, size)
}

pub fn genotype(ploidy: Ploidy, seqs: &[&[u8]], trs: &[&str]) -> (Gt, Vec<String>, Vec<i32>) {
let mut dists = get_dist_matrix(seqs);
let num_seqs = seqs.len();
pub fn genotype(ploidy: Ploidy, trs: &[&str]) -> (Gt, Vec<String>, Vec<i32>) {
let trs: Vec<&[u8]> = trs.iter().map(|x| x.as_bytes()).collect();
let mut dists = get_dist_matrix(&trs);
let num_seqs = trs.len();
if ploidy == Ploidy::One || num_seqs == 1 {
let group: Vec<usize> = (0..num_seqs).collect();
let (allele, size) = make_consensus(num_seqs, trs, &dists, &group);
let (allele, size) = make_consensus(num_seqs, &trs, &dists, &group);
let classifications = vec![0; num_seqs];
if ploidy == Ploidy::One {
let gt = Gt::from(size);
Expand All @@ -73,8 +77,8 @@ pub fn genotype(ploidy: Ploidy, seqs: &[&[u8]], trs: &[&str]) -> (Gt, Vec<String
let group1 = groups.pop().unwrap();
let group2 = groups.pop().unwrap();

let (allele1, size1) = make_consensus(num_seqs, trs, &dists, &group1);
let (allele2, size2) = make_consensus(num_seqs, trs, &dists, &group2);
let (allele1, size1) = make_consensus(num_seqs, &trs, &dists, &group1);
let (allele2, size2) = make_consensus(num_seqs, &trs, &dists, &group2);

// GS: this should be handled better, but for now we just check for small
// differences in size and large differences in cov
Expand All @@ -90,8 +94,8 @@ pub fn genotype(ploidy: Ploidy, seqs: &[&[u8]], trs: &[&str]) -> (Gt, Vec<String
// redo the homozygous case
let group1: Vec<usize> = (0..num_seqs).step_by(2).collect();
let group2: Vec<usize> = (1..num_seqs).step_by(2).collect();
let (allele1, size1) = make_consensus(num_seqs, trs, &dists, &group1);
let (allele2, size2) = make_consensus(num_seqs, trs, &dists, &group2);
let (allele1, size1) = make_consensus(num_seqs, &trs, &dists, &group1);
let (allele2, size2) = make_consensus(num_seqs, &trs, &dists, &group2);
let mut classifications: Vec<i32> = (0..num_seqs).map(|x| (x % 2) as i32).collect();
let (gt, alleles) = if allele1.len() > allele2.len() {
classifications = classifications.iter().map(|x| 1 - x).collect();
Expand All @@ -112,11 +116,14 @@ pub fn genotype(ploidy: Ploidy, seqs: &[&[u8]], trs: &[&str]) -> (Gt, Vec<String
}

// assign outlier reads (discarded in cluster()) to the closest consensus
// avoid constant conversion
let a1 = allele1.as_bytes();
let a2 = allele2.as_bytes();
for i in 0..num_seqs {
let mut tie_breaker = 1;
if classifications[i] == 2 {
let dist1 = get_dist(trs[i].as_bytes(), allele1.as_bytes());
let dist2 = get_dist(trs[i].as_bytes(), allele2.as_bytes());
let dist1 = get_dist(trs[i], a1);
let dist2 = get_dist(trs[i], a2);
if dist1 < dist2 {
classifications[i] = 0;
} else if dist2 < dist1 {
Expand Down Expand Up @@ -150,14 +157,13 @@ pub fn cluster(num_seqs: usize, dists: &mut [f64]) -> Vec<Vec<usize>> {
let steps = dendrogram.steps();
let mut cutoff = 0.0;

// for low-coverage: at least 10% of the reads must be on the smaller allele
const MIN_SMALLER_FRAC: f64 = 0.1;
const MIN_SMALLER_FRAC: f64 = 0.01;

// for high-coverage: at least 10 reads must be on the smaller allele
const MIN_CLUSTER_SIZE: usize = 10;
const MIN_CLUSTER_SIZE: usize = 2;

// choose whichever is most liberal
let min_cluster_size = std::cmp::min(
let min_cluster_size = std::cmp::max(
MIN_CLUSTER_SIZE,
(MIN_SMALLER_FRAC * (num_seqs as f64)).round() as usize,
);
Expand Down Expand Up @@ -222,24 +228,23 @@ fn get_ci(seqs: &[&str]) -> (usize, usize) {

fn get_dist(seq1: &[u8], seq2: &[u8]) -> f64 {
// we'll skip ED in cases we already know it will be too costly to do so.
const MAX_K: u32 = 2000;
const MAX_K: u32 = 10000;

let seq_diff = seq1.len().abs_diff(seq2.len()) as u32;
let dist = if seq_diff <= MAX_K {
bounded_levenshtein(seq1, seq2, MAX_K).unwrap_or(MAX_K)
} else {
seq_diff // lower bound on ED
};

(dist as f64).sqrt()
}

fn get_dist_matrix(seqs: &[&[u8]]) -> Vec<f64> {
let dist_len = seqs.len() * (seqs.len() - 1) / 2;
fn get_dist_matrix(trs: &[&[u8]]) -> Vec<f64> {
let dist_len = trs.len() * (trs.len() - 1) / 2;
let mut dists = Vec::with_capacity(dist_len);
for (index1, seq1) in seqs.iter().enumerate() {
for (_index2, seq2) in seqs.iter().enumerate().skip(index1 + 1) {
let dist = get_dist(seq1, seq2);
for (index1, tr1) in trs.iter().enumerate() {
for (_index2, tr2) in trs.iter().enumerate().skip(index1 + 1) {
let dist = get_dist(tr1, tr2);
dists.push(dist);
}
}
Expand Down
9 changes: 3 additions & 6 deletions src/trgt/workflows/tr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,7 @@ pub fn analyze(

let (mut gt, mut allele_seqs, mut classification) = match locus.genotyper {
Genotyper::Size => genotype_size::genotype(locus.ploidy, &trs),
Genotyper::Cluster => {
let spanning = reads.iter().map(|r| &r.bases[..]).collect_vec();
genotype_cluster::genotype(locus.ploidy, &spanning, &trs)
}
Genotyper::Cluster => genotype_cluster::genotype(locus.ploidy, &trs),
};

// Attempt flank re-genotyping only if alleles have similar length
Expand Down Expand Up @@ -404,11 +401,11 @@ fn filter_impure_trs(
spans: &[(usize, usize)],
rq_cutoff: f64,
) -> (Vec<HiFiRead>, Vec<(usize, usize)>) {
let max_filter = std::cmp::max(1_usize, (0.02 * (reads.len() as f64)).round() as usize);
let max_filter = std::cmp::max(1_usize, (0.1 * (reads.len() as f64)).round() as usize);
let mut num_filtered = 0;
let mut hmm = Hmm::new(0);
let mut motifs = Vec::new();
const PURITY_CUTOFF: f64 = 0.7;
const PURITY_CUTOFF: f64 = 0.9;
izip!(reads, spans)
.filter(|(read, span)| {
if num_filtered == max_filter {
Expand Down
4 changes: 4 additions & 0 deletions src/trvz/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@ pub fn get_reads(
seqs
};

if seqs.is_empty() {
return Err(format!("No reads found for TRID={}", locus.id));
}

Ok(seqs)
}

Expand Down
2 changes: 2 additions & 0 deletions src/utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ mod io_utils;
mod karyotype;
mod math;
mod ploidy;
mod presets;
mod readers;
mod region;
pub mod util;
Expand All @@ -16,6 +17,7 @@ pub use io_utils::create_writer;
pub use karyotype::Karyotype;
pub use math::median;
pub use ploidy::Ploidy;
pub use presets::TrgtPreset;
pub use readers::{open_catalog_reader, open_genome_reader};
pub use region::GenomicRegion;
pub use util::{format_number_with_commas, handle_error_and_exit, Result};
18 changes: 18 additions & 0 deletions src/utils/presets.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
use std::str::FromStr;

#[derive(Debug, Clone, Copy, PartialEq)]
pub enum TrgtPreset {
WGS,
Targeted,
}

impl FromStr for TrgtPreset {
type Err = &'static str;
fn from_str(preset: &str) -> Result<Self, Self::Err> {
match preset {
"wgs" => Ok(TrgtPreset::WGS),
"targeted" => Ok(TrgtPreset::Targeted),
_ => Err("Invalid preset. Options are: wgs, targeted"),
}
}
}

0 comments on commit 475bf52

Please sign in to comment.