Skip to content

Commit

Permalink
Update to version 1.1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
tmokveld committed Jul 17, 2024
1 parent e231f95 commit 4b7576d
Show file tree
Hide file tree
Showing 21 changed files with 1,400 additions and 56 deletions.
11 changes: 9 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "trgt"
version = "1.0.0"
version = "1.1.0"
edition = "2021"
build = "build.rs"

Expand All @@ -26,4 +26,5 @@ tiny-skia = "0.11"
svg2pdf = "0.9"
tempfile = "3"
rayon = "1.10"
crossbeam-channel = "0.5"
crossbeam-channel = "0.5"
semver = "1.0"
20 changes: 12 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,14 @@ changes to the input and output file formats of TRGT.
- Repeat definition files are available in [this Zenodo repository](https://zenodo.org/record/8329210)
and definitions of known pathogenic repeats are [also available here](repeats/).

## TRGTdb
## Joint analysis of multiple samples

TRGT outputs VCFs containing TR alleles from each region in the repeat catalog.
To facilitate analysis of alleles across multiple samples, we provide the TRGTdb
which can be found [here](https://github.com/PacificBiosciences/trgt/pull/6).
After cloning that fork, the TRGTdb can be installed by running
`python3 -m pip install trgt/`. See the fork's `notebooks/` directory for tutorials
converting results into TRGTdb as well as example analyses. TRGTdb was developed by
[Adam English](https://github.com/ACEnglish).
TRGT outputs VCFs containing repeat alleles from each region in the repeat
catalog. To facilitate analysis of repeats across multiple samples, VCFs can be
either merged into a multi-sample VCF using the `merge` sub-command or converted
into a database using the [TDB tool](https://github.com/ACEnglish/tdb) (formerly
called TRGTdb). TDB offers many advantages over multi-sample VCFs, including
simpler data extraction, support for queries, and reduced file sizes.

## Documentation

Expand Down Expand Up @@ -119,6 +118,11 @@ tandem repeats at genome scale. 2024](https://www.nature.com/articles/s41587-023
- Lower memory footprint: Better memory management significantly reduces memory usage with large repeat catalogs.
- Updated error handling: Malformed entries are now logged as errors without terminating the program.
- Added shorthand CLI options to simplify command usage.
- 1.1.0
- Added a new subcommand `trgt merge`. This command merges VCF files generated by `trgt genotype` into a joint VCF file. **Works with VCFs generated by all versions of TRGT** (the resulting joint VCF will always be in the TRGT v1.0+ format which includes padding bases).
- Added subsampling of regions with ultra-high coverage (`>MAX_DEPTH * 3`, by default 750); implemented via reservoir sampling.
- Fixed a cluster genotyper bug that occurred when only a single read covered a locus.
- Added new logic for filtering non-HiFi reads: remove up to 3% of lower quality reads that do not match the expected repeat sequence.

### DISCLAIMER

Expand Down
143 changes: 142 additions & 1 deletion src/cli.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
use crate::utils::{Genotyper, Result, TrgtScoring};
use crate::{
merge::vcf_writer::OutputType,
utils::{Genotyper, Result, TrgtScoring},
};
use chrono::Datelike;
use clap::{ArgAction, ArgGroup, Parser, Subcommand};
use env_logger::fmt::Color;
Expand Down Expand Up @@ -46,6 +49,91 @@ pub enum Command {
Plot(PlotArgs),
#[clap(about = "Tandem Repeat Catalog Validator")]
Validate(ValidateArgs),
#[clap(about = "Tandem Repeat VCF Merger")]
Merge(MergeArgs),
}

#[derive(Parser, Debug)]
#[command(group(ArgGroup::new("merge")))]
#[command(arg_required_else_help(true))]
pub struct MergeArgs {
#[clap(required = true)]
#[clap(short = 'v')]
#[clap(long = "vcf")]
#[clap(help = "VCF files to merge")]
#[clap(value_name = "VCF")]
#[clap(num_args = 1..)]
pub vcfs: Vec<PathBuf>,

#[clap(short = 'g')]
#[clap(long = "genome")]
#[clap(help = "Path to reference genome FASTA")]
#[clap(value_name = "FASTA")]
#[arg(value_parser = check_file_exists)]
pub genome_path: PathBuf,

#[clap(short = 'o')]
#[clap(long = "output")]
#[clap(value_name = "FILE")]
#[clap(help = "Write output to a file [standard output]")]
#[arg(value_parser = check_prefix_path)]
pub output: Option<String>,

#[clap(help_heading("Advanced"))]
#[clap(short = 'O')]
#[clap(long = "output-type")]
#[clap(value_name = "OUTPUT_TYPE")]
#[clap(help = "Output type: u|b|v|z, u/b: un/compressed BCF, v/z: un/compressed VCF")]
#[clap(value_parser = merge_validate_output_type)]
pub output_type: Option<OutputType>,

#[clap(help_heading("Advanced"))]
#[clap(long = "skip-n")]
#[clap(value_name = "SKIP_N")]
#[clap(help = "Skip the first N records")]
pub skip_n: Option<usize>,

#[clap(help_heading("Advanced"))]
#[clap(long = "process-n")]
#[clap(value_name = "process_N")]
#[clap(help = "Only process N records")]
pub process_n: Option<usize>,

#[clap(help_heading("Advanced"))]
#[clap(long = "print-header")]
#[clap(help = "Print only the merged header and exit")]
pub print_header: bool,

#[clap(help_heading("Advanced"))]
#[clap(long = "force-single")]
#[clap(help = "Run even if there is only one file on input")]
pub force_single: bool,

#[clap(help_heading("Advanced"))]
#[clap(hide = true)]
#[clap(long = "force-samples")]
#[clap(help = "Resolve duplicate sample names")]
pub force_samples: bool,

#[clap(help_heading("Advanced"))]
#[clap(long = "no-version")]
#[clap(help = "Do not append version and command line to the header")]
pub no_version: bool,

#[clap(help_heading("Advanced"))]
#[clap(hide = true)]
#[clap(long = "missing-to-ref")]
#[clap(help = "Assume genotypes at missing sites are 0/0")]
pub missing_to_ref: bool,

#[clap(help_heading("Advanced"))]
#[clap(hide = true)]
#[clap(long = "strategy")]
#[clap(value_name = "STRATEGY")]
#[clap(help = "Set variant merging strategy to use")]
#[clap(value_parser(["exact"]))]
#[clap(default_value = "exact")]
pub merge_strategy: String,
}

#[derive(Parser, Debug)]
Expand Down Expand Up @@ -391,3 +479,56 @@ fn scoring_from_string(s: &str) -> Result<TrgtScoring> {
bandwidth: values[5] as usize,
})
}

fn merge_validate_output_type(s: &str) -> Result<OutputType> {
let valid_prefixes = ["u", "b", "v", "z"];
if valid_prefixes.contains(&s) {
return match s {
"u" => Ok(OutputType::Bcf {
is_uncompressed: true,
level: None,
}),
"v" => Ok(OutputType::Vcf {
is_uncompressed: true,
level: None,
}),
"b" => Ok(OutputType::Bcf {
is_uncompressed: false,
level: None,
}),
"z" => Ok(OutputType::Vcf {
is_uncompressed: false,
level: None,
}),
_ => unreachable!(),
};
}

// NOTE: Can't actually set compression level in rust/htslib at the moment
// if s.len() == 2 {
// let (prefix, suffix) = s.split_at(1);
// if (prefix == "b" || prefix == "z") && suffix.chars().all(|c| c.is_digit(10)) {
// return match prefix {
// "b" => Ok(OutputType::Bcf {
// is_uncompressed: false,
// level: Some(suffix.parse().unwrap()),
// }),
// "z" => Ok(OutputType::Vcf {
// is_uncompressed: false,
// level: Some(suffix.parse().unwrap()),
// }),
// _ => unreachable!(),
// };
// } else if (prefix == "u" || prefix == "v") && suffix.chars().all(|c| c.is_digit(10)) {
// return Err(format!(
// "Error: compression level ({}) cannot be set on uncompressed streams ({})",
// suffix, prefix
// ));
// }
// }

Err(format!(
"Invalid output type: {}. Must be one of u, b, v, z.",
s
))
}
1 change: 1 addition & 0 deletions src/commands.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pub mod genotype;
pub mod merge;
pub mod plot;
pub mod validate;
4 changes: 2 additions & 2 deletions src/commands/genotype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ struct ThreadLocalData {
}

thread_local! {
static LOCAL_BAM_READER: ThreadLocalData = ThreadLocalData {
static LOCAL_BAM_READER: ThreadLocalData = const { ThreadLocalData {
bam: RefCell::new(None),
};
} };
}

const CHANNEL_BUFFER_SIZE: usize = 2048;
Expand Down
20 changes: 20 additions & 0 deletions src/commands/merge.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
use crate::cli::MergeArgs;
use crate::merge::vcf_processor::VcfProcessor;
use crate::utils::Result;
use std::time;

pub fn merge(args: MergeArgs) -> Result<()> {
let start_timer = time::Instant::now();

let mut vcf_processor = VcfProcessor::new(&args)?;

if args.print_header {
return Ok(());
}

vcf_processor.merge_variants();

// TODO: If --output, --write-index is set and the output is compressed, index the file
log::info!("Total execution time: {:.2?}", start_timer.elapsed());
Ok(())
}
2 changes: 1 addition & 1 deletion src/hmm/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ mod tests {
use super::*;
use crate::hmm::spans::Span;

fn summarize(spans: &Vec<Span>) -> Vec<(usize, usize, usize)> {
fn summarize(spans: &[Span]) -> Vec<(usize, usize, usize)> {
let mut summary = Vec::new();
for (motif_index, group) in &spans
.iter()
Expand Down
6 changes: 3 additions & 3 deletions src/hmm/purity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ mod tests {
let hmm = build_hmm(&motifs);
// GCNGCNGCNGXN
let query = "GCAGCCGCTGAG";
let states = hmm.label(&query);
let states = hmm.label(query);
let purity = calc_purity(query.as_bytes(), &hmm, &motifs, &states);
assert_eq!(purity, 11.0 / 12.0);
}
Expand All @@ -90,8 +90,8 @@ mod tests {
let motifs = vec!["CAG".as_bytes().to_vec(), "CCG".as_bytes().to_vec()];
let hmm = build_hmm(&motifs);
let query = "";
let states = hmm.label(&query);
let purity = calc_purity(&query.as_bytes(), &hmm, &motifs, &states);
let states = hmm.label(query);
let purity = calc_purity(query.as_bytes(), &hmm, &motifs, &states);
assert!(purity.is_nan());
}
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
pub mod cli;
pub mod commands;
pub mod hmm;
pub mod merge;
pub mod trgt;
pub mod trvz;
pub mod utils;
4 changes: 3 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use clap::Parser;
use trgt::{
cli::{init_verbose, Cli, Command, FULL_VERSION},
commands::{genotype, plot, validate},
commands::{genotype, merge, plot, validate},
utils::{handle_error_and_exit, Result},
};

Expand All @@ -12,6 +12,7 @@ fn runner() -> Result<()> {
Command::Genotype(_) => "genotype",
Command::Plot(_) => "plot",
Command::Validate(_) => "validate",
Command::Merge(_) => "merge",
};

log::info!(
Expand All @@ -24,6 +25,7 @@ fn runner() -> Result<()> {
Command::Genotype(args) => genotype::trgt(args)?,
Command::Plot(args) => plot::trvz(args)?,
Command::Validate(args) => validate::validate(args)?,
Command::Merge(args) => merge::merge(args)?,
}
log::info!("{} end", env!("CARGO_PKG_NAME"));
Ok(())
Expand Down
4 changes: 4 additions & 0 deletions src/merge/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
pub mod strategy;
pub mod vcf_processor;
pub mod vcf_reader;
pub mod vcf_writer;
Loading

0 comments on commit 4b7576d

Please sign in to comment.