diff --git a/README.md b/README.md index d5e6f93..628be52 100644 --- a/README.md +++ b/README.md @@ -238,18 +238,24 @@ plots, flat plots, genome plots and Circos plots. - `--features FILE` add an additional track containing features to plot alongside the duplications. - - `--restrict-fragments A B ...` only plots fragments whose names are given + - `--restrict-fragments A B ...` only plots fragments whose names + are given - - `--exclude-fragments A B ...` do not plot fragments whose names are given + - `--exclude-fragments A B ...` do not plot fragments whose names + are given - `--filter-features DISTANCE` don't plot duplications that are farther away then `DISTANCE` bp from the features in the track. - - `--colorize TYPE` set the method used to colorize the duplicons. Options are - `by-type` (different colors for direct and palindromic duplications); `by-position` (color - depends on the duplication position within the input file(s)); `by-fragment` (each - duplication is colorized according to its left-most duplicons); `none` (all are drawn in - medium grey). + - `--min-thickness` set the minimal graphical width of a duplicon + (default: 0.1) + + - `--colorize TYPE` set the method used to colorize the duplicons. + Options are `by-type` (different colors for direct and palindromic + duplications); `by-position` (color depends on the duplication + position within the input file(s)); `by-fragment` (each + duplication is colorized according to its left-most duplicons); + `none` (all are drawn in medium grey). ### Features File Format @@ -340,13 +346,31 @@ manually replaced. _Please note that ASGART following the [semver](https://semver.org/) versioning scheme, where an increase in the major version number reflects a non backward-compatible update._ -## v2.1.0 +## v2.1.1 + +- Various minor refactoring & bug-fixes + +## v2.1.1 - `asgart-concat` has been renamed to `asgart-cat` - `asgart-cat` now offers filtering options - `asgart-cat` now takes advantage of multi-cores CPU when possible - `asgart-plot` now offers more filtering options +- `asgart-plot` now let the user customizes the minimal graphical + width of a duplicon with `--min-thickness` - `asgart-plot` now offer several algorithms to set duplicons colors +- Various bug-fixes + +## v2.1.1 +- Fix manifest file + +## v2.1.0 + +- Ensure that multiple fragments in a mFASTA file are processed separately +- Add a flag to specify the minimum width of a chord +- Add filtering options +- Add tooltips to chord graphs +- Fix output files naming scheme ## v2.0.2 diff --git a/src/automaton.rs b/src/automaton.rs index 6dda5df..1295163 100644 --- a/src/automaton.rs +++ b/src/automaton.rs @@ -1,15 +1,14 @@ -extern crate rayon; extern crate indicatif; +extern crate rayon; use std::cmp; use std::fmt; use super::divsufsort::*; -use super::structs::{RunSettings, ProtoSD, ProtoSDsFamily}; use super::searcher::Searcher; -use std::sync::atomic::{AtomicUsize, Ordering}; +use super::structs::{ProtoSD, ProtoSDsFamily, RunSettings}; use rayon::prelude::*; - +use std::sync::atomic::{AtomicUsize, Ordering}; #[derive(Clone)] pub struct Segment { @@ -20,11 +19,13 @@ pub struct Segment { impl fmt::Debug for Segment { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, - "S{{{} -> {}}} ({})", - self.start, - self.end, - self.end - self.start) + write!( + f, + "S{{{} -> {}}} ({})", + self.start, + self.end, + self.end - self.start + ) } } impl Segment { @@ -33,15 +34,13 @@ impl Segment { } } -#[derive(Debug,Clone)] +#[derive(Debug, Clone)] pub struct ProtoProtoSD { bottom: usize, top: usize, matches: Vec, } - - #[derive(Debug)] struct Arm { left: Segment, @@ -53,48 +52,82 @@ struct Arm { } enum Operation { - ExtendArm {i: usize, l_end: usize, r_end: usize}, - NewArm {i: usize, m_start: usize, m_end: usize}, + ExtendArm { + i: usize, + l_end: usize, + r_end: usize, + }, + NewArm { + i: usize, + m_start: usize, + m_end: usize, + }, } #[allow(clippy::too_many_arguments)] pub fn search_duplications( id: usize, - needle: &[u8], needle_offset: usize, - strand: &[u8], sa: &[SAIdx], + needle: &[u8], + needle_offset: usize, + strand: &[u8], + sa: &[SAIdx], searcher: &Searcher, progress: &AtomicUsize, - settings: RunSettings + settings: RunSettings, ) -> Vec { fn try_extend_arms(arms: &[Arm], m: &Segment, e: i64, i: usize, ps: usize) -> Operation { for (j, a) in arms.iter().enumerate() { - if a.active && d_ss(&a.right, m) < cmp::max(e, (0.1*a.left.len() as f64) as i64) as i64 && m.end > a.right.end { - return Operation::ExtendArm {i: j, l_end: i + ps, r_end: m.end} + if a.active + && d_ss(&a.right, m) < cmp::max(e, (0.1 * a.left.len() as f64) as i64) as i64 + && m.end > a.right.end + { + return Operation::ExtendArm { + i: j, + l_end: i + ps, + r_end: m.end, + }; } } - Operation::NewArm {i, m_start: m.start, m_end: m.end} + Operation::NewArm { + i, + m_start: m.start, + m_end: m.end, + } } - let mut arms : Vec = Vec::new(); + let mut arms: Vec = Vec::new(); let mut i = 0; let mut r = Vec::new(); let mut current_family_id = 1; - let step_size = settings.probe_size/2; + let step_size = settings.probe_size / 2; + if needle.len() < settings.min_duplication_length { + return Vec::new(); + } while i < needle.len() - settings.probe_size - step_size { i += step_size; progress.store(i, Ordering::Relaxed); - if needle[i] == b'N' { continue } - let matches: Vec = searcher.search(strand, sa, &needle[i .. i + settings.probe_size]) + if needle[i] == b'N' { + continue; + } + let matches: Vec = searcher + .search(strand, sa, &needle[i..i + settings.probe_size]) .into_iter() .filter(|m| m.start != i) - .filter(|m| if !settings.reverse { m.start > i + needle_offset } - else { m.start >= needle_offset + needle.len() - i }) + .filter(|m| { + if !settings.reverse { + m.start > i + needle_offset + } else { + m.start >= needle_offset + needle.len() - i + } + }) .collect(); - if matches.len() > settings.max_cardinality {continue} + if matches.len() > settings.max_cardinality { + continue; + } // Reset dirty bits of arms arms.iter_mut().for_each(|arm| arm.dirty = false); @@ -102,62 +135,81 @@ pub fn search_duplications( let todo = matches .par_iter() .with_min_len(8) - .map(|m| try_extend_arms(&arms, m, i64::from(settings.max_gap_size), i, settings.probe_size) ) + .map(|m| { + try_extend_arms( + &arms, + m, + i64::from(settings.max_gap_size), + i, + settings.probe_size, + ) + }) .collect::>(); - todo.iter() - .for_each(|op| { - if let Operation::ExtendArm {i, l_end, r_end} = op { - arms[*i].left.end = *l_end; - arms[*i].right.end = *r_end; - arms[*i].dirty = true; - arms[*i].gap = 0; - } - }); - - todo.iter() - .for_each(|op| { - if let Operation::NewArm {i, m_start, m_end} = op { - arms.push(Arm{ - left: Segment{start: *i, end: *i + settings.probe_size, tag: 0}, - right: Segment{start: *m_start, end: *m_end, tag: 0}, - family_id: format!("{}-{}", id, current_family_id), - active: true, dirty: false, - gap: 0 - }) - } - }); + todo.iter().for_each(|op| { + if let Operation::ExtendArm { i, l_end, r_end } = op { + arms[*i].left.end = *l_end; + arms[*i].right.end = *r_end; + arms[*i].dirty = true; + arms[*i].gap = 0; + } + }); + + todo.iter().for_each(|op| { + if let Operation::NewArm { i, m_start, m_end } = op { + arms.push(Arm { + left: Segment { + start: *i, + end: *i + settings.probe_size, + tag: 0, + }, + right: Segment { + start: *m_start, + end: *m_end, + tag: 0, + }, + family_id: format!("{}-{}", id, current_family_id), + active: true, + dirty: false, + gap: 0, + }) + } + }); // Update the gaps of non-dirty arms - arms.iter_mut() - .filter(|a| !a.dirty) - .for_each(|a|{ - a.gap += step_size; - if a.gap as u32 >= settings.max_gap_size { a.active = false } - }); + arms.iter_mut().filter(|a| !a.dirty).for_each(|a| { + a.gap += step_size; + if a.gap as u32 >= settings.max_gap_size { + a.active = false + } + }); if arms.len() > 200 { arms.retain(|a| { - a.active || a.left.len() >= settings.min_duplication_length || a.right.len() >= settings.min_duplication_length + a.active + || a.left.len() >= settings.min_duplication_length + || a.right.len() >= settings.min_duplication_length }); } // Check if there are still extending arms if !arms.is_empty() && arms.iter().all(|a| !a.active) { - let family: ProtoSDsFamily = arms.iter() + let family: ProtoSDsFamily = arms + .iter() .filter(|a| a.right.len() >= settings.min_duplication_length) - .map(|a| { - ProtoSD { - left: a.left.start, - right: a.right.start, - left_length: a.left.len(), - right_length: a.right.len(), - identity: 0., - reversed: false, - complemented: false, - }}) + .map(|a| ProtoSD { + left: a.left.start, + right: a.right.start, + left_length: a.left.len(), + right_length: a.right.len(), + identity: 0., + reversed: false, + complemented: false, + }) .collect(); - if !family.is_empty() { r.push(family); } + if !family.is_empty() { + r.push(family); + } arms.clear(); current_family_id += 1; @@ -167,14 +219,13 @@ pub fn search_duplications( r } - - fn d_ss(a: &Segment, m: &Segment) -> i64 { - if (m.start >= a.start && m.start <= a.end) - || (m.end >= a.start && m.end <= a.end) { - 0 - } else { - cmp::min((a.start as i64 - m.end as i64).abs(), - (a.end as i64 - m.start as i64).abs()) - } + if (m.start >= a.start && m.start <= a.end) || (m.end >= a.start && m.end <= a.end) { + 0 + } else { + cmp::min( + (a.start as i64 - m.end as i64).abs(), + (a.end as i64 - m.start as i64).abs(), + ) + } } diff --git a/src/bin/asgart-cat.rs b/src/bin/asgart-cat.rs index 3ccab48..f5669a3 100644 --- a/src/bin/asgart-cat.rs +++ b/src/bin/asgart-cat.rs @@ -9,7 +9,6 @@ use asgart::log::LevelFilter; use asgart::exporters::Exporter; use asgart::exporters; use asgart::logger::Logger; -use asgart::rayon::prelude::*; use asgart::errors::*; use asgart::structs::*; diff --git a/src/bin/asgart-concat.rs b/src/bin/asgart-concat.rs new file mode 100644 index 0000000..3c1308b --- /dev/null +++ b/src/bin/asgart-concat.rs @@ -0,0 +1,82 @@ +#[macro_use] +extern crate log; +#[macro_use] +extern crate clap; +extern crate asgart; + +use std::fs::File; + +use asgart::exporters; +use asgart::exporters::Exporter; +use asgart::log::LevelFilter; +use asgart::logger::Logger; +use clap::{App, AppSettings, Arg}; + +use asgart::errors::*; + +fn main() { + Logger::init(LevelFilter::Info).expect("Unable to initialize logger"); + let x = run(); + if let Err(ref e) = x { + error!("{}", e); + for e in e.iter().skip(1) { + println!("{}", e); + } + if let Some(backtrace) = e.backtrace() { + println!("Backtrace: {:?}", backtrace); + } + std::process::exit(1); + } +} + +fn run() -> Result<()> { + let args = App::new("ASGART concatenate") + .setting(AppSettings::ColoredHelp) + .setting(AppSettings::ColorAuto) + .setting(AppSettings::VersionlessSubcommands) + .setting(AppSettings::UnifiedHelpMessage) + .version(crate_version!()) + .author(crate_authors!()) + .about("ASGART concatenate convert multiple input JSON files into a single one. Its main intended use is to merge together multiple files resulting from complementary runs on the same dataset, e.g. direct and palindromic duplications searches.") + .arg(Arg::with_name("INPUT") + .help("Set the input file(s) to use") + .required(true) + .takes_value(true) + .min_values(1)) + .arg(Arg::with_name("format") + .short("f") + .long("format") + .help("Set the desired output format") + .possible_values(&["json", "gff2", "gff3"]) + .default_value("json")) + .arg(Arg::with_name("OUTPUT") + .short("o") + .long("out") + .help("Write the result to this file. If unspecifed, write to STDOUT.") + .takes_value(true) + .number_of_values(1)) + .get_matches(); + + let inputs = values_t!(args, "INPUT", String).unwrap(); + let format = value_t!(args, "format", String).unwrap_or("json".to_string()); + let mut out: Box = if args.is_present("OUTPUT") { + let out_filename = + asgart::utils::make_out_filename(args.value_of("OUTPUT"), "out", &format); + Box::new(File::create(out_filename).expect("")) + } else { + Box::new(std::io::stdout()) + }; + let exporter = match &format[..] { + "json" => Box::new(exporters::JSONExporter) as Box, + "gff2" => Box::new(exporters::GFF2Exporter) as Box, + "gff3" => Box::new(exporters::GFF3Exporter) as Box, + format @ _ => { + warn!("Unknown output format `{}`: using json instead", format); + Box::new(exporters::JSONExporter) as Box + } + }; + + let results = asgart::structs::RunResult::from_files(&inputs)?; + exporter.save(&results, &mut out)?; + Ok(()) +} diff --git a/src/bin/asgart-plot.rs b/src/bin/asgart-plot.rs index 99c22f5..7c74d28 100644 --- a/src/bin/asgart-plot.rs +++ b/src/bin/asgart-plot.rs @@ -18,7 +18,6 @@ use std::collections::HashMap; use clap::{App, AppSettings}; use colored::Colorize; -use asgart::structs::*; use asgart::plot::*; use asgart::plot::chord_plot::ChordPlotter; use asgart::plot::flat_plot::FlatPlotter; @@ -29,131 +28,151 @@ use asgart::errors::*; use asgart::logger::Logger; use asgart::log::LevelFilter; +use asgart::structs::*; -fn filter_families_in_features(result: &mut RunResult, features_families: &[Vec], threshold: usize) { +fn filter_families_in_features( + result: &mut RunResult, + features_families: &[Vec], + threshold: usize, +) { fn _overlap((xstart, xlen): (usize, usize), (ystart, ylen): (usize, usize)) -> bool { let xend = xstart + xlen; let yend = ystart + ylen; - (xstart >= ystart && xstart <= yend) || - (ystart >= xstart && ystart <= xend) + (xstart >= ystart && xstart <= yend) || (ystart >= xstart && ystart <= xend) } - result.families = result.families + result.families = result + .families .clone() .into_iter() - .filter( - |family| - family.iter() - .any(|sd| { - features_families.iter() - .any(|feature_family| { - feature_family.iter() - .any(|feature| { - for position in &feature.positions{ - let (start, length) = match *position { - FeaturePosition::Relative { ref chr, start, length} => { - let chr = result.strand.find_chr(&chr).expect(&format!("Unable to find fragment `{}`", chr)); - (chr.position + start, length) - } - FeaturePosition::Absolute { start, length } => { - (start, length) - } - }; - if _overlap(sd.left_part(), (start - threshold, length + 2*threshold)) - || _overlap(sd.right_part(), (start - threshold, length + 2*threshold)) - { - return true; - } - } - false - }) - }) + .filter(|family| { + family.iter().any(|sd| { + features_families.iter().any(|feature_family| { + feature_family.iter().any(|feature| { + for position in &feature.positions { + let (start, length) = match *position { + FeaturePosition::Relative { + ref chr, + start, + length, + } => { + let chr = result + .strand + .find_chr(&chr) + .expect(&format!("Unable to find fragment `{}`", chr)); + (chr.position + start, length) + } + FeaturePosition::Absolute { start, length } => (start, length), + }; + if _overlap(sd.left_part(), (start - threshold, length + 2 * threshold)) + || _overlap( + sd.right_part(), + (start - threshold, length + 2 * threshold), + ) + { + return true; + } + } + false + }) }) - ) + }) + }) .collect(); } - -fn filter_duplicons_in_features(result: &mut RunResult, features_families: &[Vec], threshold: usize) { +fn filter_duplicons_in_features( + result: &mut RunResult, + features_families: &[Vec], + threshold: usize, +) { fn _overlap((xstart, xlen): (usize, usize), (ystart, ylen): (usize, usize)) -> bool { let xend = xstart + xlen; let yend = ystart + ylen; - (xstart >= ystart && xstart <= yend) || - (ystart >= xstart && ystart <= xend) + (xstart >= ystart && xstart <= yend) || (ystart >= xstart && ystart <= xend) } let mut families = result.families.clone(); - families - .iter_mut() - .for_each( - |family| - family - .retain(|sd| { - features_families.iter() - .any(|feature_family| { - feature_family.iter() - .any(|feature| { - for position in &feature.positions{ - let (start, length) = match *position { - FeaturePosition::Relative { ref chr, start, length} => { - let chr = result.strand.find_chr(&chr).expect(&format!("Unable to find fragment `{}`", chr)); - (chr.position + start, length) - } - FeaturePosition::Absolute { start, length } => { - (start, length) - } - }; - if _overlap(sd.left_part(), (start - threshold, length + 2*threshold)) - || _overlap(sd.right_part(), (start - threshold, length + 2*threshold)) - { - return true; - } - } - false - }) - }) + families.iter_mut().for_each(|family| { + family.retain(|sd| { + features_families.iter().any(|feature_family| { + feature_family.iter().any(|feature| { + for position in &feature.positions { + let (start, length) = match *position { + FeaturePosition::Relative { + ref chr, + start, + length, + } => { + let chr = result + .strand + .find_chr(&chr) + .expect(&format!("Unable to find fragment `{}`", chr)); + (chr.position + start, length) + } + FeaturePosition::Absolute { start, length } => (start, length), + }; + if _overlap(sd.left_part(), (start - threshold, length + 2 * threshold)) + || _overlap( + sd.right_part(), + (start - threshold, length + 2 * threshold), + ) + { + return true; + } + } + false }) - ); + }) + }) + }); result.families = families; } -fn filter_features_in_sds(result: &mut RunResult, features_families: &mut Vec>, threshold: usize) { +fn filter_features_in_sds( + result: &mut RunResult, + features_families: &mut Vec>, + threshold: usize, +) { fn _overlap((xstart, xlen): (usize, usize), (ystart, ylen): (usize, usize)) -> bool { let xend = xstart + xlen; let yend = ystart + ylen; - (xstart >= ystart && xstart <= yend) || - (ystart >= xstart && ystart <= xend) + (xstart >= ystart && xstart <= yend) || (ystart >= xstart && ystart <= xend) } - features_families - .iter_mut() - .for_each(|ref mut family| - family - .retain(|feature| { - feature.positions - .iter() - .any(|p| { - let (start, length) = match *p { - FeaturePosition::Relative { ref chr, start, length} => { - let chr = result.strand.find_chr(&chr).expect(&format!("Unable to find fragment `{}`", chr)); - (chr.position + start, length) - } - FeaturePosition::Absolute { start, length } => { (start, length) } - }; - - result.families.iter().any( - |family| family.iter().any( - |ref sd| - _overlap(sd.left_part(), (start - threshold, length + 2*threshold)) - || _overlap(sd.right_part(), (start - threshold, length + 2*threshold)) - ) - ) - }) - }) - ); + features_families.iter_mut().for_each(|ref mut family| { + family.retain(|feature| { + feature.positions.iter().any(|p| { + let (start, length) = match *p { + FeaturePosition::Relative { + ref chr, + start, + length, + } => { + let chr = result + .strand + .find_chr(&chr) + .expect(&format!("Unable to find fragment `{}`", chr)); + (chr.position + start, length) + } + FeaturePosition::Absolute { start, length } => (start, length), + }; + + result.families.iter().any(|family| { + family.iter().any(|ref sd| { + _overlap(sd.left_part(), (start - threshold, length + 2 * threshold)) + || _overlap( + sd.right_part(), + (start - threshold, length + 2 * threshold), + ) + }) + }) + }) + }) + }); } fn read_feature_file(r: &RunResult, file: &str) -> Result> { @@ -161,8 +180,8 @@ fn read_feature_file(r: &RunResult, file: &str) -> Result> { let extension: &str = path.extension().unwrap().to_str().unwrap(); match extension { - "gff3" => {read_gff3_feature_file(r, file)} - _ => {read_custom_feature_file(r, file)} + "gff3" => read_gff3_feature_file(r, file), + _ => read_custom_feature_file(r, file), } } @@ -170,7 +189,8 @@ fn read_gff3_feature_file(_r: &RunResult, file: &str) -> Result> { let f = File::open(file).chain_err(|| format!("Unable to open {}", file))?; let f = BufReader::new(f); - let r = f.lines() + let r = f + .lines() .map(|l| l.unwrap()) .filter(|l| !l.is_empty() && !l.starts_with('#')) .fold(Vec::new(), |mut ax, l| { @@ -179,11 +199,12 @@ fn read_gff3_feature_file(_r: &RunResult, file: &str) -> Result> { let end = l[4].parse::().unwrap(); let name = if l[8].contains("Name=") { - l[8] - .split(';') - .find(|cx| cx.contains("Name")).unwrap() + l[8].split(';') + .find(|cx| cx.contains("Name")) + .unwrap() .split('=') - .nth(1).unwrap() // unwrap is safe because we check for "Name=" + .nth(1) + .unwrap() // unwrap is safe because we check for "Name=" .to_string() } else { l[8].to_owned() @@ -191,12 +212,12 @@ fn read_gff3_feature_file(_r: &RunResult, file: &str) -> Result> { let feature = Feature { name: name, - positions: vec![FeaturePosition::Relative{ + positions: vec![FeaturePosition::Relative { chr: l[0].to_string(), - start: start, + start: start, length: end - start, }], - } ; + }; ax.push(feature); ax }); @@ -210,41 +231,66 @@ fn read_custom_feature_file(r: &RunResult, file: &str) -> Result> { let mut d = HashMap::new(); let re = Regex::new(r"(.*)\+(\d+)").unwrap(); - for (i, line) in f. - lines() + for (i, line) in f + .lines() .map(|l| l.unwrap()) .filter(|l| !l.is_empty()) .filter(|l| !l.starts_with('#')) - .enumerate() { - let v: Vec<&str> = line.split(';').collect(); - if v.len() != 3 { bail!("{}:L{} `{}`: incorrect format, expecting two members, found {}", file, i+1, line, v.len()); } - let name = v[0].to_owned(); - - let position = if re.is_match(v[1]) { - let chr_name = re.captures(v[1]).unwrap().get(1).unwrap().as_str(); - let position = re.captures(v[1]).unwrap().get(2).unwrap().as_str().to_owned().parse::().unwrap(); - // XXX Incorrect for non-endofeature runs - let chr = r.strand.find_chr(chr_name).expect(&format!("Unable to find fragment `{}`", chr_name)); - if chr.length < position { - bail!("{} greater than {} length ({})", position, chr.name, chr.length); - } - FeaturePosition::Relative { - chr: chr.name.to_owned(), - start: position, - length: v[2].parse::().unwrap() - } - } else { - FeaturePosition::Absolute { - start: v[1].parse::().unwrap(), - length: v[2].parse::().unwrap(), - } - }; - d.entry(name).or_insert_with(Vec::new).push(position); + .enumerate() + { + let v: Vec<&str> = line.split(';').collect(); + if v.len() != 3 { + bail!( + "{}:L{} `{}`: incorrect format, expecting two members, found {}", + file, + i + 1, + line, + v.len() + ); } + let name = v[0].to_owned(); + + let position = if re.is_match(v[1]) { + let chr_name = re.captures(v[1]).unwrap().get(1).unwrap().as_str(); + let position = re + .captures(v[1]) + .unwrap() + .get(2) + .unwrap() + .as_str() + .to_owned() + .parse::() + .unwrap(); + // XXX Incorrect for non-endofeature runs + let chr = r + .strand + .find_chr(chr_name) + .expect(&format!("Unable to find fragment `{}`", chr_name)); + if chr.length < position { + bail!( + "{} greater than {} length ({})", + position, + chr.name, + chr.length + ); + } + FeaturePosition::Relative { + chr: chr.name.to_owned(), + start: position, + length: v[2].parse::().unwrap(), + } + } else { + FeaturePosition::Absolute { + start: v[1].parse::().unwrap(), + length: v[2].parse::().unwrap(), + } + }; + d.entry(name).or_insert_with(Vec::new).push(position); + } let mut features = Vec::new(); for (name, positions) in &d { - features.push(Feature{ + features.push(Feature { name: name.to_owned(), positions: positions.clone(), }) @@ -273,19 +319,20 @@ fn run() -> Result<()> { let json_files = values_t!(args, "FILE", String).unwrap(); let mut result = RunResult::from_files(&json_files)?; - let out_file = asgart::utils::make_out_filename(args.value_of("out"), &json_files.join("-"), ""); + let out_file = + asgart::utils::make_out_filename(args.value_of("out"), &json_files.join("-"), ""); let features_tracks: Result> = match args.values_of("features") { - Some(x) => { x - .map(|feature_track| read_feature_file(&result, feature_track)) - .collect() - } - None => Ok(Vec::new()) + Some(x) => x + .map(|feature_track| read_feature_file(&result, feature_track)) + .collect(), + None => Ok(Vec::new()), }; - if let Err(e) = features_tracks {return Err(e);} + if let Err(e) = features_tracks { + return Err(e); + } let mut features_tracks = features_tracks.unwrap(); - if args.is_present("no-direct") {result.remove_direct();} if args.is_present("no-reversed") {result.remove_reversed();} if args.is_present("no-uncomplemented") {result.remove_uncomplemented();} @@ -307,40 +354,45 @@ fn run() -> Result<()> { let min_length = value_t!(args, "min_length", usize).unwrap(); result.families.iter_mut().for_each(|family| family.retain(|sd| std::cmp::max(sd.left_length, sd.right_length) >= min_length)); let min_identity = value_t!(args, "min_identity", f32).unwrap(); - result.families.iter_mut().for_each(|family| family.retain(|sd| sd.identity >= min_identity)); + result + .families + .iter_mut() + .for_each(|family| family.retain(|sd| sd.identity >= min_identity)); if args.is_present("filter_families") { filter_families_in_features( - &mut result, &features_tracks, - value_t!(args, "filter_families", usize).unwrap() + &mut result, + &features_tracks, + value_t!(args, "filter_families", usize).unwrap(), ); } if args.is_present("filter_duplicons") { filter_duplicons_in_features( - &mut result, &features_tracks, - value_t!(args, "filter_duplicons", usize).unwrap() + &mut result, + &features_tracks, + value_t!(args, "filter_duplicons", usize).unwrap(), ); } if args.is_present("filter_features") { filter_features_in_sds( - &mut result, &mut features_tracks, - value_t!(args, "filter_features", usize).unwrap() + &mut result, + &mut features_tracks, + value_t!(args, "filter_features", usize).unwrap(), ); } let settings = Settings { - out_file: out_file.to_str().unwrap().to_owned(), + out_file: out_file.to_str().unwrap().to_owned(), - size: 200.0, - min_thickness: value_t!(args, "min_thickness", f64).unwrap(), - color1: "#ff5b00".to_owned(), - color2: "#00b2ae".to_owned(), + size: 200.0, + min_thickness: value_t!(args, "min_thickness", f64).unwrap(), + color1: "#ff5b00".to_owned(), + color2: "#00b2ae".to_owned(), - feature_tracks: features_tracks, + feature_tracks: features_tracks, }; - let colorizer = match args.value_of("colorize") { Some("by-type") => Box::new(TypeColorizer::new((1.0, 0.36, 0.0), (0.0, 0.70, 0.68))) as Box, Some("by-position") => Box::new(PositionColorizer::new(&result)) as Box, @@ -363,7 +415,9 @@ fn run() -> Result<()> { fn main() { if let Err(ref e) = run() { println!("{} {}", "Error: ".red(), e); - for e in e.iter().skip(1) { println!("{}", e); } + for e in e.iter().skip(1) { + println!("{}", e); + } if let Some(backtrace) = e.backtrace() { println!("Backtrace: {:?}", backtrace); } diff --git a/src/bin/asgart.rs b/src/bin/asgart.rs index 34fad04..a12f9f8 100644 --- a/src/bin/asgart.rs +++ b/src/bin/asgart.rs @@ -1,42 +1,36 @@ -#[macro_use] pub extern crate clap; -#[macro_use] extern crate log; -extern crate threadpool; -extern crate indicatif; -extern crate console; -extern crate num_cpus; -extern crate bio; +#[macro_use] +extern crate clap; +#[macro_use] +extern crate log; extern crate asgart; -pub extern crate rayon; -extern crate separator; -use separator::Separatable; -use indicatif::{ProgressBar, ProgressStyle, HumanDuration}; -use console::style; -use bio::io::fasta; use clap::{App, AppSettings}; -use log::LevelFilter; -use rayon::prelude::*; -use std::path; -use std::thread; -use std::time::Duration; +use asgart::console::style; +use asgart::indicatif::{HumanDuration, ProgressBar, ProgressStyle}; +use asgart::rayon::prelude::*; +use asgart::separator::Separatable; + use std::cmp; +use std::path; +use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::mpsc; +use std::sync::mpsc::TryRecvError; use std::sync::Arc; +use std::thread; +use std::time::Duration; use std::time::Instant; -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::mpsc::TryRecvError; -use asgart::structs::*; -use asgart::logger::Logger; +use asgart::automaton; +use asgart::divsufsort::{divsufsort64, SuffixArray}; use asgart::errors::*; -use asgart::exporters::Exporter; use asgart::exporters; -use asgart::automaton; +use asgart::exporters::Exporter; +use asgart::log::LevelFilter; +use asgart::logger::Logger; use asgart::searcher; +use asgart::structs::*; use asgart::utils; -use asgart::divsufsort::{SuffixArray, divsufsort64}; - trait Step { fn name(&self) -> &str; @@ -45,29 +39,31 @@ trait Step { struct ReOrder; impl Step for ReOrder { - fn name(&self) -> &str { "Re-ordering" } + fn name(&self) -> &str { + "Re-ordering" + } fn run(&self, mut input: Vec, _strand: &Strand) -> Vec { + input.par_iter_mut().for_each(|family| { + family.iter_mut().for_each(|sd| { + if sd.left > sd.right { + let tmp = sd.left; + sd.left = sd.right; + sd.right = tmp + } + }) + }); input - .par_iter_mut() - .for_each(|family| { - family - .iter_mut() - .for_each(|sd| - if sd.left > sd.right { - let tmp = sd.left; - sd.left = sd.right; - sd.right = tmp - } - )}); - input -} + } } struct Sort; impl Step for Sort { - fn name(&self) -> &str { "Sorting" } + fn name(&self) -> &str { + "Sorting" + } fn run(&self, mut input: Vec, _strand: &Strand) -> Vec { - input.iter_mut() + input + .iter_mut() .for_each(|family| family.sort_by(|a, b| (a.left).cmp(&b.left))); input } @@ -75,31 +71,34 @@ impl Step for Sort { struct ReduceOverlap; impl Step for ReduceOverlap { - fn name(&self) -> &str { "Reducing overlap" } + fn name(&self) -> &str { + "Reducing overlap" + } fn run(&self, mut input: Vec, _strand: &Strand) -> Vec { input - .iter_mut() - .map(|family| reduce_overlap(&family)) - .collect() + .iter_mut() + .map(|family| reduce_overlap(&family)) + .collect() } } struct ComputeScore; impl Step for ComputeScore { - fn name(&self) -> &str { "Computing Levenshtein distance" } + fn name(&self) -> &str { + "Computing Levenshtein distance" + } fn run(&self, mut input: Vec, strand: &Strand) -> Vec { - input - .par_iter_mut() - .for_each(|family| - family - .iter_mut() - .for_each(|ref mut sd| sd.identity = sd.levenshtein(&strand.data) as f32)); + input.par_iter_mut().for_each(|family| { + family + .iter_mut() + .for_each(|ref mut sd| sd.identity = sd.levenshtein(&strand.data) as f32) + }); input } } struct SearchDuplications<'a> { - chunks_to_process: &'a[(usize, usize)], + chunks_to_process: &'a [(usize, usize)], trim: Option<(usize, usize)>, settings: RunSettings, } @@ -108,14 +107,18 @@ impl SearchDuplications<'_> { chunks_to_process: &[(usize, usize)], trim: Option<(usize, usize)>, settings: RunSettings, - ) -> SearchDuplications { SearchDuplications { - chunks_to_process: chunks_to_process, - trim: trim, - settings: settings.clone(), - }} + ) -> SearchDuplications { + SearchDuplications { + chunks_to_process: chunks_to_process, + trim: trim, + settings: settings.clone(), + } + } } impl<'a> Step for SearchDuplications<'a> { - fn name(&self) -> &str { "Looking for proto-duplications" } + fn name(&self) -> &str { + "Looking for proto-duplications" + } fn run(&self, _input: Vec, strand: &Strand) -> Vec { // // Build the suffix array @@ -123,7 +126,7 @@ impl<'a> Step for SearchDuplications<'a> { debug!("Building suffix array"); let sa_build_time = Instant::now(); let shared_suffix_array = if let Some((start, end)) = self.trim { - let mut sub_strand = strand.data[start .. end].to_vec(); + let mut sub_strand = strand.data[start..end].to_vec(); sub_strand.push(b'$'); let mut suffix_array = r_divsufsort(&sub_strand); suffix_array.iter_mut().for_each(|x| *x += start as i64); @@ -131,81 +134,112 @@ impl<'a> Step for SearchDuplications<'a> { } else { Arc::new(r_divsufsort(&strand.data)) }; - let shared_searcher = Arc::new(searcher::Searcher::new(&strand.data, &Arc::clone(&shared_suffix_array), 0)); + let shared_searcher = Arc::new(searcher::Searcher::new( + &strand.data, + &Arc::clone(&shared_suffix_array), + 0, + )); debug!("Done in {}", HumanDuration(sa_build_time.elapsed())); - // // Set up th progress bar // let (tx_monitor, rx_monitor) = mpsc::channel(); - let progresses = Arc::new((0..self.chunks_to_process.len()).map(|_| Arc::new(AtomicUsize::new(0))).collect::>()); + let progresses = Arc::new( + (0..self.chunks_to_process.len()) + .map(|_| Arc::new(AtomicUsize::new(0))) + .collect::>(), + ); let total = self.chunks_to_process.iter().fold(0, |ax, c| ax + c.1); let monitor_thread = { let progresses = Arc::clone(&progresses); thread::spawn(move || { let pb = ProgressBar::new(100); - pb.set_style(ProgressStyle::default_bar().template("{spinner:.blue} [{elapsed}] {bar:50} {pos}% (~{eta} remaining)")); + pb.set_style( + ProgressStyle::default_bar() + .template("{spinner:.blue} [{elapsed}] {bar:50} {pos}% (~{eta} remaining)"), + ); loop { thread::sleep(Duration::from_millis(500)); match rx_monitor.try_recv() { Err(TryRecvError::Empty) => { - pb.set_position((progresses.iter().map(|x| x.load(Ordering::Relaxed)) - .fold(0, |ax, x| ax + x) as f64/total as f64 * 100.0) as u64); + pb.set_position( + (progresses + .iter() + .map(|x| x.load(Ordering::Relaxed)) + .fold(0, |ax, x| ax + x) + as f64 + / total as f64 + * 100.0) as u64, + ); + } + _ => { + pb.finish_and_clear(); + break; } - _ => { pb.finish_and_clear(); break; } } } - })}; - + }) + }; // // And do the job // - let results = self.chunks_to_process + let results = self + .chunks_to_process .par_iter() .enumerate() .map(|(id, chunk)| { let mut _needle; let needle = if !self.settings.reverse && !self.settings.complement { - &strand.data[chunk.0 .. chunk.0 + chunk.1] + &strand.data[chunk.0..chunk.0 + chunk.1] } else { - _needle = strand.data[chunk.0 .. chunk.0 + chunk.1].to_vec(); - if self.settings.complement { _needle = utils::complemented(&_needle); } - if self.settings.reverse { _needle.reverse(); } + _needle = strand.data[chunk.0..chunk.0 + chunk.1].to_vec(); + if self.settings.complement { + _needle = utils::complemented(&_needle); + } + if self.settings.reverse { + _needle.reverse(); + } &_needle }; let mut proto_sds_families = automaton::search_duplications( - id, needle, chunk.0, - &strand.data, &shared_suffix_array.clone(), &shared_searcher.clone(), - &progresses[id], self.settings.clone()); - proto_sds_families - .iter_mut() - .for_each(|proto_family| - proto_family - .iter_mut() - .for_each(|proto_sd| - if !self.settings.reverse { proto_sd.left += chunk.0 } - else { proto_sd.left = chunk.0 + chunk.1 - proto_sd.left - proto_sd.left_length } - ) - ); + id, + needle, + chunk.0, + &strand.data, + &shared_suffix_array.clone(), + &shared_searcher.clone(), + &progresses[id], + self.settings.clone(), + ); + proto_sds_families.iter_mut().for_each(|proto_family| { + proto_family.iter_mut().for_each(|proto_sd| { + if !self.settings.reverse { + proto_sd.left += chunk.0 + } else { + proto_sd.left = chunk.0 + chunk.1 - proto_sd.left - proto_sd.left_length + } + }) + }); proto_sds_families }) .collect::>(); - let result = - results - .iter() - .fold(Vec::new(), |mut a, b| { - a.extend(b.iter() - .map(|family| - family - .into_iter() - .map(|sd| ProtoSD {reversed: self.settings.reverse, complemented: self.settings.complement, .. *sd}) - .collect::())); - a - }); + let result = results.iter().fold(Vec::new(), |mut a, b| { + a.extend(b.iter().map(|family| { + family + .into_iter() + .map(|sd| ProtoSD { + reversed: self.settings.reverse, + complemented: self.settings.complement, + ..*sd + }) + .collect::() + })); + a + }); let _ = tx_monitor.send(()); monitor_thread.join().unwrap(); @@ -214,9 +248,9 @@ impl<'a> Step for SearchDuplications<'a> { } type PreparedData = ( - Option<(usize, usize)>, // Trim - Vec<(usize, usize)>, // The areas that are not filled with Ns - Strand, // DNA strand to process + Option<(usize, usize)>, // Trim + Vec<(usize, usize)>, // The areas that are not filled with Ns + Strand, // DNA strand to process ); struct Strand { @@ -228,10 +262,61 @@ struct Strand { fn prepare_data( strands_files: &[String], skip_masked: bool, - trim: Option<(usize, usize)> + trim: Option<(usize, usize)>, ) -> Result { + fn read_fasta(filename: &str, skip_masked: bool) -> Result<(Vec, Vec)> { + let mut map = Vec::new(); + let mut r = Vec::new(); + + let reader = bio::io::fasta::Reader::from_file(filename) + .chain_err(|| format!("Unable to open `{}`", filename))?; + let mut counter = 0; + + for record in reader.records() { + let record = record.chain_err(|| { + format!( + "Unable to read {:?}: not a FASTA file", + path::Path::new(filename).file_name().unwrap() + ) + })?; + + let name = record.id().to_owned(); + let mut seq = record.seq().to_vec(); + if !skip_masked { + seq = seq.to_ascii_uppercase(); + } + for c in &mut seq { + if ALPHABET_MASKED.contains(c) && skip_masked { + *c = b'N' + } else if !(ALPHABET).contains(c) { + trace!("Undefined base `{}` replaced by `N`", *c as char); + *c = b'N' + } + } + + map.push(Start { + name: name, + position: counter, + length: seq.len(), + }); + counter += seq.len(); + r.append(&mut seq); + } + + Ok((map, r)) + } + + // Given a DNA fragment, returns a list of segments without too many N's in them + // Coordinates are relative to the fragment. fn find_chunks_to_process(strand: &[u8]) -> Vec<(usize, usize)> { - fn count_n(strand: &[u8], start: usize) -> usize {strand.iter().skip(start).take_while(|x| **x == b'n' || **x == b'N').count()} + fn count_n(strand: &[u8], start: usize) -> usize { + strand + .iter() + .skip(start) + .take_while(|x| **x == b'n' || **x == b'N') + .count() + } + let threshold = 5000; let mut start = 0; let mut count = 0; @@ -243,7 +328,10 @@ fn prepare_data( b'n' | b'N' => { let n_count = count_n(&strand, i); if n_count > threshold { - if count > 0 { chunks.push((start, count)); count = 0; }; + if count > 0 { + chunks.push((start, count)); + count = 0; + }; start = i + n_count; } else { count += n_count; @@ -261,11 +349,15 @@ fn prepare_data( } } } - if count != 0 { chunks.push((start, count)) }; - if chunks.is_empty() { chunks.push((0, strand.len())) }; + if count != 0 { + chunks.push((start, count)) + }; + if chunks.is_empty() { + chunks.push((0, strand.len())) + }; chunks - } + } // // Read and map the FASTA files to process @@ -276,36 +368,61 @@ fn prepare_data( let mut chunks_to_process = Vec::new(); for file_name in strands_files { - let (map, new_strand) = read_fasta(file_name, skip_masked).chain_err(|| format!("Unable to parse `{}`", file_name))?; - maps.extend(map.into_iter().map(|start| Start { position: start.position + offset, .. start })); - chunks_to_process.extend(find_chunks_to_process(&new_strand).into_iter().map(|(start, length)| (start + offset, length))); + let (map, new_strand) = read_fasta(file_name, skip_masked) + .chain_err(|| format!("Unable to parse `{}`", file_name))?; + + // We want to add each fragment separately to ensure that chunks are cutting between fragments + for chr in map.iter() { + chunks_to_process.extend( + find_chunks_to_process(&new_strand[chr.position..chr.position + chr.length]) + .into_iter() + .map(|(start, length)| (chr.position + offset + start, length)), + ); + } + maps.extend(map.into_iter().map(|start| Start { + position: start.position + offset, + ..start + })); + offset = offset + new_strand.len(); strand.extend(new_strand); } - info!("Parsed {} file{} containing a total of {} fragments", - strands_files.len(), if strands_files.len() > 1 {"s"} else {""}, maps.len()); - maps.iter().for_each(|s| - debug!("{:>20}: {:>15} --> {:>15} {:>15} bp", - s.name, - s.position.separated_string(), (s.position + s.length).separated_string(), - s.length.separated_string()) + info!( + "Parsed {} file{} containing a total of {} fragments", + strands_files.len(), + if strands_files.len() > 1 { "s" } else { "" }, + maps.len() ); + maps.iter().for_each(|s| { + debug!( + "{:>20}: {:>15} --> {:>15} {:>15} bp", + s.name, + s.position.separated_string(), + (s.position + s.length).separated_string(), + s.length.separated_string() + ) + }); let chunks_length = chunks_to_process.iter().fold(0, |ax, c| ax + c.1); - info!("Processing {} chunks totalling {}bp, skipping {}bp out of {} ({}%)", - chunks_to_process.len().separated_string(), - chunks_length.separated_string(), - (strand.len() - chunks_length).separated_string(), - strand.len().separated_string(), - (((strand.len() as f64 - (chunks_length as f64))*100.0/strand.len() as f64) as i64).separated_string() + info!( + "Processing {} chunks totalling {}bp, skipping {}bp out of {} ({}%)", + chunks_to_process.len().separated_string(), + chunks_length.separated_string(), + (strand.len() - chunks_length).separated_string(), + strand.len().separated_string(), + (((strand.len() as f64 - (chunks_length as f64)) * 100.0 / strand.len() as f64) as i64) + .separated_string() ); chunks_to_process.iter().for_each(|c| { - trace!("{:>12} -> {:>12} {:>11} bp", c.0.separated_string(), (c.0 + c.1).separated_string(), c.1.separated_string()); + trace!( + "{:>12} -> {:>12} {:>11} bp", + c.0.separated_string(), + (c.0 + c.1).separated_string(), + c.1.separated_string() + ); }); - //let chunks_to_process = find_chunks_to_process(&strand); strand.push(b'$'); // For the SA construction - Ok(( trim.and_then(|(shift, _stop)| { // @@ -313,61 +430,41 @@ fn prepare_data( // let mut stop = _stop; if stop >= strand.len() { - warn!("Trimming: {} greater than total length ({}bp)", stop, strand.len()); + warn!( + "Trimming: {} greater than total length ({}bp)", + stop, + strand.len() + ); warn!("Using {} instead of {}", strand.len() - 1, stop); stop = strand.len() - 1; } if stop <= shift { - warn!("Trimming: {} greater than {}, skipping trimming", shift, stop); + warn!( + "Trimming: {} greater than {}, skipping trimming", + shift, stop + ); None } else if shift >= strand.len() { - warn!("Trimming: {} greater than total length ({}bp), skipping trimming", shift, strand.len()); + warn!( + "Trimming: {} greater than total length ({}bp), skipping trimming", + shift, + strand.len() + ); None } else { Some((shift, stop)) } }), chunks_to_process, - Strand{file_names: strands_files.join(", "), data: strand, map: maps}, + Strand { + file_names: strands_files.join(", "), + data: strand, + map: maps, + }, )) } -fn read_fasta(filename: &str, skip_masked: bool) -> Result<(Vec, Vec)> { - let mut map = Vec::new(); - let mut r = Vec::new(); - - let reader = fasta::Reader::from_file(filename).chain_err(|| format!("Unable to open `{}`", filename))?; - let mut counter = 0; - - for record in reader.records() { - let record = record.chain_err(|| format!("Unable to read {:?}: not a FASTA file", path::Path::new(filename).file_name().unwrap()))?; - - let name = record.id().to_owned(); - let mut seq = record.seq().to_vec(); - if !skip_masked {seq = seq.to_ascii_uppercase();} - for c in &mut seq { - if ALPHABET_MASKED.contains(c) && skip_masked { - *c = b'N' - } else if !(ALPHABET).contains(c) { - trace!("Undefined base `{}` replaced by `N`", std::char::from_u32(u32::from(*c)).unwrap()); - *c = b'N' - } - } - - map.push(Start { - name: name, - position: counter, - length: seq.len(), - }); - counter += seq.len(); - r.append(&mut seq); - } - - - Ok((map, r)) -} - pub fn r_divsufsort(dna: &[u8]) -> SuffixArray { let mut sa = Vec::with_capacity(dna.len()); sa.resize(dna.len(), 0); @@ -377,7 +474,6 @@ pub fn r_divsufsort(dna: &[u8]) -> SuffixArray { sa } - // Returns true if x ⊂ y fn subsegment((xstart, xlen): (usize, usize), (ystart, ylen): (usize, usize)) -> bool { let xend = xstart + xlen; @@ -390,8 +486,8 @@ fn overlap((xstart, xlen): (usize, usize), (ystart, ylen): (usize, usize)) -> bo let xend = xstart + xlen; let yend = ystart + ylen; - (xstart >= ystart && xstart <= yend && xend >= yend) || - (ystart >= xstart && ystart <= xend && yend >= xend) + (xstart >= ystart && xstart <= yend && xend >= yend) + || (ystart >= xstart && ystart <= xend && yend >= xend) } fn merge(x: &ProtoSD, y: &ProtoSD) -> ProtoSD { @@ -401,7 +497,6 @@ fn merge(x: &ProtoSD, y: &ProtoSD) -> ProtoSD { let new_right = cmp::min(x.right, y.right); let rsize = cmp::max(x.right + x.left_length, y.right + y.right_length) - new_right; - ProtoSD { left: new_left, right: new_right, @@ -419,30 +514,32 @@ fn reduce_overlap(result: &[ProtoSD]) -> Vec { 'to_insert: for x in result.iter() { for y in &mut news { // x ⊂ y - if subsegment(x.left_part(), y.left_part()) && - subsegment(x.right_part(), y.right_part()) { - continue 'to_insert; - } + if subsegment(x.left_part(), y.left_part()) + && subsegment(x.right_part(), y.right_part()) + { + continue 'to_insert; + } // x ⊃ y - if subsegment(y.left_part(), x.left_part()) && - subsegment(y.right_part(), x.right_part()) { - y.left = x.left; - y.right = x.right; - y.left_length = x.left_length; - y.right_length = x.right_length; - continue 'to_insert; - } + if subsegment(y.left_part(), x.left_part()) + && subsegment(y.right_part(), x.right_part()) + { + y.left = x.left; + y.right = x.right; + y.left_length = x.left_length; + y.right_length = x.right_length; + continue 'to_insert; + } - if overlap(x.left_part(), y.left_part()) && - overlap(x.right_part(), y.right_part()) { - let z = merge(x, y); - y.left = z.left; - y.right = z.right; - y.left_length = z.left_length; - y.right_length = z.right_length; - continue 'to_insert; - } + if overlap(x.left_part(), y.left_part()) && overlap(x.right_part(), y.right_part()) + { + let z = merge(x, y); + y.left = z.left; + y.right = z.right; + y.left_length = z.left_length; + y.right_length = z.right_length; + continue 'to_insert; + } } news.push(x.clone()); } @@ -460,26 +557,25 @@ fn reduce_overlap(result: &[ProtoSD]) -> Vec { news } - fn run() -> Result<()> { // Those settings are only used to handily parse arguments struct Settings { - strands_files: Vec, - kmer_size: usize, - gap_size: u32, + strands_files: Vec, + kmer_size: usize, + gap_size: u32, min_duplication_length: usize, - max_cardinality: usize, - skip_masked: bool, - - reverse: bool, - complement: bool, - trim: Vec, - - prefix: String, - out: String, - out_format: String, - compute_score: bool, - threads_count: usize, + max_cardinality: usize, + skip_masked: bool, + + reverse: bool, + complement: bool, + trim: Vec, + + prefix: String, + out: String, + out_format: String, + compute_score: bool, + threads_count: usize, } let yaml = load_yaml!("asgart.yaml"); @@ -493,35 +589,49 @@ fn run() -> Result<()> { .get_matches(); let settings = Settings { - strands_files: args.values_of("strand").unwrap().map(|s| s.to_owned()).collect(), - kmer_size: value_t_or_exit!(args, "probe_size", usize), - gap_size: value_t_or_exit!(args, "max_gap", u32), + strands_files: args + .values_of("strand") + .unwrap() + .map(|s| s.to_owned()) + .collect(), + kmer_size: value_t_or_exit!(args, "probe_size", usize), + gap_size: value_t_or_exit!(args, "max_gap", u32), min_duplication_length: value_t!(args, "min_length", usize).unwrap(), - max_cardinality: value_t!(args, "max_cardinality", usize).unwrap(), - skip_masked: args.is_present("skip_masked"), - - reverse: args.is_present("reverse"), - complement: args.is_present("complement"), - trim: values_t!(args, "trim", usize).unwrap_or_else(|_| Vec::new()), - - prefix: args.value_of("prefix").unwrap().to_owned(), - out: args.value_of("out").unwrap_or("").to_owned(), - out_format: args.value_of("out_format").unwrap().to_owned(), - compute_score: args.is_present("compute_score"), - threads_count: value_t!(args, "threads", usize).unwrap_or_else(|_| num_cpus::get()), + max_cardinality: value_t!(args, "max_cardinality", usize).unwrap(), + skip_masked: args.is_present("skip_masked"), + + reverse: args.is_present("reverse"), + complement: args.is_present("complement"), + trim: values_t!(args, "trim", usize).unwrap_or_else(|_| Vec::new()), + + prefix: args.value_of("prefix").unwrap().to_owned(), + out: args.value_of("out").unwrap_or("").to_owned(), + out_format: args.value_of("out_format").unwrap().to_owned(), + compute_score: args.is_present("compute_score"), + threads_count: value_t!(args, "threads", usize).unwrap_or_else(|_| num_cpus::get()), }; Logger::init(match args.occurrences_of("verbose") { - 0 => { LevelFilter::Info } - 1 => { LevelFilter::Debug } - 2 => { LevelFilter::Trace } - _ => { LevelFilter::Trace } - }).chain_err(|| "Unable to initialize logger")?; - - let radix = (settings.strands_files - .iter() - .map(|n| path::Path::new(&n).file_stem().unwrap().to_str().unwrap().to_string()) - .collect::>()).join("-"); + 0 => LevelFilter::Info, + 1 => LevelFilter::Debug, + 2 => LevelFilter::Trace, + _ => LevelFilter::Trace, + }) + .chain_err(|| "Unable to initialize logger")?; + + let radix = (settings + .strands_files + .iter() + .map(|n| { + path::Path::new(&n) + .file_stem() + .unwrap() + .to_str() + .unwrap() + .to_string() + }) + .collect::>()) + .join("-"); info!("Processing {}", &settings.strands_files.join(", ")); debug!("K-mers size {}", settings.kmer_size); @@ -529,14 +639,19 @@ fn run() -> Result<()> { debug!("Reversed duplications {}", settings.reverse); debug!("Complemented duplications {}", settings.complement); debug!("Skipping soft-masked {}", settings.skip_masked); - debug!("Min. length {}", settings.min_duplication_length); + debug!( + "Min. length {}", + settings.min_duplication_length + ); debug!("Max. cardinality {}", settings.max_cardinality); debug!("Threads count {}", settings.threads_count); if !settings.trim.is_empty() { - debug!("Trimming {} → {}", settings.trim[0], settings.trim[1]); + debug!( + "Trimming {} → {}", + settings.trim[0], settings.trim[1] + ); } - rayon::ThreadPoolBuilder::new() .num_threads(settings.threads_count) .build_global() @@ -545,18 +660,18 @@ fn run() -> Result<()> { let result = search_duplications( &settings.strands_files, RunSettings { - probe_size: settings.kmer_size, - max_gap_size: settings.gap_size + settings.kmer_size as u32, + probe_size: settings.kmer_size, + max_gap_size: settings.gap_size + settings.kmer_size as u32, min_duplication_length: settings.min_duplication_length, - max_cardinality: settings.max_cardinality, + max_cardinality: settings.max_cardinality, - reverse: settings.reverse, - complement: settings.complement, - skip_masked: settings.skip_masked, + reverse: settings.reverse, + complement: settings.complement, + skip_masked: settings.skip_masked, - compute_score: settings.compute_score, - threads_count: settings.threads_count, - trim: if !settings.trim.is_empty() { + compute_score: settings.compute_score, + threads_count: settings.threads_count, + trim: if !settings.trim.is_empty() { Some((settings.trim[0], settings.trim[1])) } else { None @@ -564,114 +679,144 @@ fn run() -> Result<()> { }, )?; - let out_radix = if settings.out.is_empty() { - format!("{}{}{}{}{}{}.{}", - &settings.prefix, - radix, - if settings.reverse || settings.complement {"_"} else {""}, - if settings.reverse {"R"} else {""}, - if settings.complement {"C"} else {""}, - &(if !settings.trim.is_empty() { format!("_{}-{}", settings.trim[0], settings.trim[1]) } else {"".into()}), - &settings.out_format) + format!( + "{}{}{}{}{}{}.{}", + &settings.prefix, + radix, + if settings.reverse || settings.complement { + "_" + } else { + "" + }, + if settings.reverse { "R" } else { "" }, + if settings.complement { "C" } else { "" }, + &(if !settings.trim.is_empty() { + format!("_{}-{}", settings.trim[0], settings.trim[1]) + } else { + "".into() + }), + &settings.out_format + ) } else { settings.out }; - let out_filename = asgart::utils::make_out_filename( - Some(&out_radix), - "", - &settings.out_format).to_str().unwrap().to_owned(); - let mut out = std::fs::File::create(&out_filename).chain_err(|| format!("Unable to create `{}`", out_filename))?; + let out_filename = asgart::utils::make_out_filename(Some(&out_radix), "", &settings.out_format) + .to_str() + .unwrap() + .to_owned(); + let mut out = std::fs::File::create(&out_filename) + .chain_err(|| format!("Unable to create `{}`", out_filename))?; let exporter = match &settings.out_format[..] { - "json" => { Box::new(exporters::JSONExporter) as Box } - "gff2" => { Box::new(exporters::GFF2Exporter) as Box } - "gff3" => { Box::new(exporters::GFF3Exporter) as Box } + "json" => Box::new(exporters::JSONExporter) as Box, + "gff2" => Box::new(exporters::GFF2Exporter) as Box, + "gff3" => Box::new(exporters::GFF3Exporter) as Box, format @ _ => { warn!("Unknown output format `{}`: using json instead", format); Box::new(exporters::JSONExporter) as Box } }; exporter.save(&result, &mut out)?; - info!("{}", style(format!("Result written to {}", &out_filename)).bold()); + info!( + "{}", + style(format!("Result written to {}", &out_filename)).bold() + ); Ok(()) } - -fn search_duplications( - strands_files: &[String], - settings: RunSettings, -) -> Result { - +fn search_duplications(strands_files: &[String], settings: RunSettings) -> Result { let total = Instant::now(); info!("Preprocessing data"); - let (trim, to_process, mut strand) = prepare_data(strands_files, settings.skip_masked, settings.trim)?; + let (trim, to_process, mut strand) = + prepare_data(strands_files, settings.skip_masked, settings.trim)?; - let mut steps : Vec> = Vec::new(); + let mut steps: Vec> = Vec::new(); steps.push(Box::new(SearchDuplications::new( &to_process, trim, settings, ))); - if settings.compute_score { steps.push(Box::new(ComputeScore{})); } - steps.push(Box::new(ReOrder{})); - steps.push(Box::new(ReduceOverlap{})); - steps.push(Box::new(Sort{})); + if settings.compute_score { + steps.push(Box::new(ComputeScore {})); + } + steps.push(Box::new(ReOrder {})); + steps.push(Box::new(ReduceOverlap {})); + steps.push(Box::new(Sort {})); let mut result = Vec::new(); for (i, step) in steps.iter().enumerate() { - info!("{} {}...", style(format!("[{}/{}]", i + 1, steps.len())).blue().bold(), step.name()); + info!( + "{} {}...", + style(format!("[{}/{}]", i + 1, steps.len())).blue().bold(), + step.name() + ); result = step.run(result, &mut strand); } - info!("{}", - style(format!("{:?} processed in {}.", - strands_files.join(", "), - HumanDuration(total.elapsed()))).green().bold() + info!( + "{}", + style(format!( + "{:?} processed in {}.", + strands_files.join(", "), + HumanDuration(total.elapsed()) + )) + .green() + .bold() ); let strand = StrandResult { - name: strand.file_names.clone(), + name: strand.file_names.clone(), length: strand.map.iter().fold(0, |ax, chr| ax + chr.length), - map: strand.map.clone(), + map: strand.map.clone(), }; Ok(RunResult { strand: strand.clone(), settings: settings, - families: result.iter() - .map(|family| - family.iter() - .map(|sd| - { - SD { - chr_left: strand.find_chr_by_pos(sd.left) - .and_then(|c| Some(c.name.clone())).unwrap_or("unknown".to_string()), - chr_right: strand.find_chr_by_pos(sd.right) - .and_then(|c| Some(c.name.clone())).unwrap_or("unknown".to_string()), - - global_left_position: sd.left, - global_right_position: sd.right, - - chr_left_position: sd.left - strand.find_chr_by_pos(sd.left) - .and_then(|c| Some(c.position)).unwrap_or(0), - chr_right_position: sd.right - strand.find_chr_by_pos(sd.right) - .and_then(|c| Some(c.position)).unwrap_or(0), - - left_length: sd.left_length, - right_length: sd.right_length, - - identity: sd.identity, - reversed: sd.reversed, - complemented: sd.complemented, - }}) - .collect::>() - ).collect(), + families: result + .iter() + .map(|family| { + family + .iter() + .map(|sd| SD { + chr_left: strand + .find_chr_by_pos(sd.left) + .and_then(|c| Some(c.name.clone())) + .unwrap_or("unknown".to_string()), + chr_right: strand + .find_chr_by_pos(sd.right) + .and_then(|c| Some(c.name.clone())) + .unwrap_or("unknown".to_string()), + + global_left_position: sd.left, + global_right_position: sd.right, + + chr_left_position: sd.left + - strand + .find_chr_by_pos(sd.left) + .and_then(|c| Some(c.position)) + .unwrap_or(0), + chr_right_position: sd.right + - strand + .find_chr_by_pos(sd.right) + .and_then(|c| Some(c.position)) + .unwrap_or(0), + + left_length: sd.left_length, + right_length: sd.right_length, + + identity: sd.identity, + reversed: sd.reversed, + complemented: sd.complemented, + }) + .collect::>() + }) + .collect(), }) } - fn main() { if let Err(ref e) = run() { error!("{}", e); diff --git a/src/divsufsort.rs b/src/divsufsort.rs index a300ed4..a221bbc 100755 --- a/src/divsufsort.rs +++ b/src/divsufsort.rs @@ -3,57 +3,60 @@ #![allow(non_camel_case_types, non_upper_case_globals)] type sauchar_t = u8; -type saint_t = i32; -type saidx_t = i32; +type saint_t = i32; +type saidx_t = i32; type saidx64_t = i64; #[link(name = "divsufsort64", kind = "static")] extern "C" { pub fn divsufsort64(T: *const sauchar_t, SA: *mut saidx64_t, n: saidx64_t) -> saint_t; pub fn divsufsort64_version() -> *const ::std::os::raw::c_char; - pub fn sa_search64(T: *const sauchar_t, - Tsize: saidx64_t, - P: *const sauchar_t, - Psize: saidx64_t, - SA: *const saidx64_t, - SAsize: saidx64_t, - left: *mut saidx64_t) - -> saidx64_t; + pub fn sa_search64( + T: *const sauchar_t, + Tsize: saidx64_t, + P: *const sauchar_t, + Psize: saidx64_t, + SA: *const saidx64_t, + SAsize: saidx64_t, + left: *mut saidx64_t, + ) -> saidx64_t; - pub fn sa_searchb64(T: *const sauchar_t, - Tsize: saidx64_t, - P: *const sauchar_t, - Psize: saidx64_t, - SA: *const saidx64_t, - SAsize: saidx64_t, - left: *mut saidx64_t, - init_left: saidx64_t, - init_right: saidx64_t) - -> saidx64_t; + pub fn sa_searchb64( + T: *const sauchar_t, + Tsize: saidx64_t, + P: *const sauchar_t, + Psize: saidx64_t, + SA: *const saidx64_t, + SAsize: saidx64_t, + left: *mut saidx64_t, + init_left: saidx64_t, + init_right: saidx64_t, + ) -> saidx64_t; } extern "C" { - pub fn divsufsort(T: *const sauchar_t, SA: *mut saidx_t, n: saidx_t) - -> saint_t; + pub fn divsufsort(T: *const sauchar_t, SA: *mut saidx_t, n: saidx_t) -> saint_t; - pub fn sa_search(T: *const sauchar_t, - Tsize: saidx_t, - P: *const sauchar_t, - Psize: saidx_t, - SA: *const saidx_t, - SAsize: saidx_t, - left: *mut saidx_t) - -> saidx_t; + pub fn sa_search( + T: *const sauchar_t, + Tsize: saidx_t, + P: *const sauchar_t, + Psize: saidx_t, + SA: *const saidx_t, + SAsize: saidx_t, + left: *mut saidx_t, + ) -> saidx_t; - pub fn sa_searchb(T: *const sauchar_t, - Tsize: saidx_t, - P: *const sauchar_t, - Psize: saidx_t, - SA: *const saidx_t, - SAsize: saidx_t, - left: *mut saidx_t, - init_left: saidx_t, - init_right: saidx_t) - -> saidx_t; + pub fn sa_searchb( + T: *const sauchar_t, + Tsize: saidx_t, + P: *const sauchar_t, + Psize: saidx_t, + SA: *const saidx_t, + SAsize: saidx_t, + left: *mut saidx_t, + init_left: saidx_t, + init_right: saidx_t, + ) -> saidx_t; } pub type SAIdx = saidx64_t; diff --git a/src/exporters.rs b/src/exporters.rs index ce4b0b6..52e4bff 100644 --- a/src/exporters.rs +++ b/src/exporters.rs @@ -1,7 +1,7 @@ +use errors::*; use serde_json; -use structs::*; use std::io::Write; -use ::errors::*; +use structs::*; pub trait Exporter { fn save(&self, result: &RunResult, out: &mut dyn Write) -> Result<()>; @@ -10,22 +10,27 @@ pub trait Exporter { pub struct JSONExporter; impl Exporter for JSONExporter { fn save(&self, result: &RunResult, out: &mut dyn Write) -> Result<()> { - let _ = writeln!(out, - "{}", - serde_json::to_string_pretty(&result).chain_err(|| "Unable to serialize result into JSON")? - ).chain_err(|| "Unable to write results"); + let _ = writeln!( + out, + "{}", + serde_json::to_string_pretty(&result) + .chain_err(|| "Unable to serialize result into JSON")? + ) + .chain_err(|| "Unable to write results"); Ok(()) } } - pub struct GFF2Exporter; impl Exporter for GFF2Exporter { fn save(&self, result: &RunResult, out: &mut dyn Write) -> Result<()> { - writeln!(out, "track name=Duplications\tuseScore=1\tdescription=\"ASGART - {dataset}\"", - dataset = result.strand.name, - ).chain_err(|| "Unable to write results")?; + writeln!( + out, + "track name=Duplications\tuseScore=1\tdescription=\"ASGART - {dataset}\"", + dataset = result.strand.name, + ) + .chain_err(|| "Unable to write results")?; for (i, family) in result.families.iter().enumerate() { for (j, sd) in family.iter().enumerate() { writeln!(out, @@ -49,12 +54,10 @@ impl Exporter for GFF2Exporter { writeln!(out).chain_err(|| "Unable to write results")?; } - Ok(()) -} + Ok(()) + } } - - pub struct GFF3Exporter; impl Exporter for GFF3Exporter { fn save(&self, result: &RunResult, out: &mut dyn Write) -> Result<()> { @@ -64,9 +67,10 @@ impl Exporter for GFF3Exporter { out, "##sequence-region {name} {start} {end}", name = &chr.name, - start = chr.position + 1, + start = chr.position + 1, end = chr.position + chr.length + 1, - ).chain_err(|| "Unable to write results")?; + ) + .chain_err(|| "Unable to write results")?; } for (i, family) in result.families.iter().enumerate() { @@ -94,6 +98,6 @@ impl Exporter for GFF3Exporter { writeln!(out).chain_err(|| "Unable to write results")?; } -Ok(()) + Ok(()) } } diff --git a/src/lib.rs b/src/lib.rs index 9891d32..3d7ba06 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,20 +1,31 @@ -#[macro_use] extern crate serde_derive; -extern crate superslice; -extern crate serde; -pub extern crate serde_json; -pub extern crate separator; +pub extern crate log; +#[macro_use] +pub extern crate serde_derive; +pub extern crate bio; +pub extern crate colored; +pub extern crate console; +pub extern crate indicatif; +pub extern crate num_cpus; +pub extern crate rand; pub extern crate rayon; -#[macro_use] extern crate error_chain; -#[macro_use] pub extern crate log; +pub extern crate regex; +pub extern crate separator; +pub extern crate serde; +pub extern crate serde_json; +pub extern crate superslice; +pub extern crate threadpool; +#[macro_use] +pub extern crate error_chain; pub mod structs; +#[macro_use] pub mod logger; -pub mod utils; -pub mod divsufsort; -pub mod searcher; pub mod automaton; -pub mod plot; +pub mod divsufsort; pub mod exporters; +pub mod plot; +pub mod searcher; +pub mod utils; pub mod errors { - error_chain!{} + error_chain! {} } diff --git a/src/logger.rs b/src/logger.rs index 0fd26bc..6b4a433 100644 --- a/src/logger.rs +++ b/src/logger.rs @@ -1,11 +1,7 @@ -extern crate colored; -extern crate log; +use colored::*; +use log::{Level, LevelFilter, Metadata, Record, SetLoggerError}; -use self::log::{Record, Level, Metadata, SetLoggerError, LevelFilter}; -use self::colored::*; - -pub struct Logger { -} +pub struct Logger {} impl log::Log for Logger { fn enabled(&self, metadata: &Metadata) -> bool { @@ -14,43 +10,33 @@ impl log::Log for Logger { fn log(&self, record: &Record) { if atty::is(atty::Stream::Stdout) { - println!("{}", - match record.level() { - Level::Error => { - format!("{} {}", "×".red(), record.args().to_string().red().bold()) - } - Level::Warn => { - format!("{} {}", "⚠".yellow(), record.args().to_string().yellow()) - } - Level::Info => { - format!("{} {}", "▷".cyan(), record.args()) - } - Level::Debug => { - format!(" {}", record.args()) - } - Level::Trace => { - format!("{} {}", "❖".blue(), record.args().to_string().blue()) - } - }); + println!( + "{}", + match record.level() { + Level::Error => { + format!("{} {}", "×".red(), record.args().to_string().red().bold()) + } + Level::Warn => { + format!("{} {}", "⚠".yellow(), record.args().to_string().yellow()) + } + Level::Info => format!("{} {}", "▷".cyan(), record.args()), + Level::Debug => format!(" {}", record.args()), + Level::Trace => { + format!("{} {}", "❖".blue(), record.args().to_string().blue()) + } + } + ); } else { - println!("{}", - match record.level() { - Level::Error => { - format!("{} {}", " X ", record.args().to_string()) - } - Level::Warn => { - format!("{} {}", "/!\\", record.args().to_string()) - } - Level::Info => { - format!("{} {}", "-> ", record.args()) - } - Level::Debug => { - format!(" {}", record.args()) - } - Level::Trace => { - format!("{} {}", " . ", record.args().to_string()) - } - }); + println!( + "{}", + match record.level() { + Level::Error => format!("{} {}", " X ", record.args().to_string()), + Level::Warn => format!("{} {}", "/!\\", record.args().to_string()), + Level::Info => format!("{} {}", "-> ", record.args()), + Level::Debug => format!(" {}", record.args()), + Level::Trace => format!("{} {}", " . ", record.args().to_string()), + } + ); } } @@ -59,6 +45,6 @@ impl log::Log for Logger { impl Logger { pub fn init(level: LevelFilter) -> Result<(), SetLoggerError> { - log::set_boxed_logger(Box::new(Logger{})).map(|()| log::set_max_level(level)) + log::set_boxed_logger(Box::new(Logger {})).map(|()| log::set_max_level(level)) } } diff --git a/src/plot/chord_plot.rs b/src/plot/chord_plot.rs index 629a940..9237914 100644 --- a/src/plot/chord_plot.rs +++ b/src/plot/chord_plot.rs @@ -1,23 +1,24 @@ -use std::io::prelude::*; -use std::fs::File; +extern crate rand; + +use separator::Separatable; use std::f64::consts::PI; +use std::fs::File; +use std::io::prelude::*; + use plot::*; use plot::colorizers::Colorizer; -use separator::Separatable; const R: f64 = 200.0; const RING_WIDTH: f64 = 5.0; const RING_MARGIN: f64 = 10.0; -const OUT_CEILING: f64 = R/2.0; +const OUT_CEILING: f64 = R / 2.0; const INTER_RING_SPACING: f64 = 0.002; -const TOTAL_WIDTH: f64 = 2.5*(R + RING_MARGIN + RING_WIDTH + OUT_CEILING); - -const CX: f64 = TOTAL_WIDTH/2.0; -const CY: f64 = TOTAL_WIDTH/2.0; - +const TOTAL_WIDTH: f64 = 2.5 * (R + RING_MARGIN + RING_WIDTH + OUT_CEILING); +const CX: f64 = TOTAL_WIDTH / 2.0; +const CY: f64 = TOTAL_WIDTH / 2.0; pub struct ChordPlotter { result: RunResult, @@ -42,7 +43,10 @@ impl Plotter for ChordPlotter { let out_filename = format!("{}.svg", &self.settings.out_file); File::create(&out_filename) .and_then(|mut f| f.write_all(self.plot_chord().as_bytes())) - .and_then(|_| { log::info!("Chord plot written to `{}`", &out_filename); Ok(()) }) + .and_then(|_| { + log::info!("Chord plot written to `{}`", &out_filename); + Ok(()) + }) .chain_err(|| format!("Unable to write in `{}`", &out_filename))?; Ok(()) @@ -62,15 +66,15 @@ impl ChordPlotter { let (start_x, start_y) = self.cartesian(t1, radius); let (end_x, end_y) = self.cartesian(t2, radius); - let large_flag = if t2 - t1 > PI/2.0 { 1 } else { 0 }; + let large_flag = if t2 - t1 > PI / 2.0 { 1 } else { 0 }; let sweep_flag = if t2 - t1 > 0.0 { 0 } else { 1 }; - format!("M {} {} A {} {} {} {} {} {} {}", - start_x, start_y, - radius, radius, 0, large_flag, sweep_flag, end_x, end_y) + format!( + "M {} {} A {} {} {} {} {} {} {}", + start_x, start_y, radius, radius, 0, large_flag, sweep_flag, end_x, end_y + ) } - fn plot_chord(&self) -> String { let mut svg = String::new(); svg += &format!("\n\n", 0, 0); @@ -78,25 +82,33 @@ impl ChordPlotter { for chr in &self.result.strand.map { let t1 = self.angle(chr.position as f64) - INTER_RING_SPACING; let t2 = self.angle(chr.position as f64 + chr.length as f64) + INTER_RING_SPACING; - let tt = t1 + (t2-t1)/2.0; - + let tt = t1 + (t2 - t1) / 2.0; // Main chromosome bar - svg += &format!("\n", - self.arc(R + RING_WIDTH, t1, t2), - self.colorizer.color_fragment(&chr.name) + svg += &format!( + "\n", + self.arc(R + RING_WIDTH, t1, t2), + self.colorizer.color_fragment(&chr.name) ); if self.result.strand.map.len() > 1 { // Secondary chromosome bar - svg += &format!("\n", - self.arc(R + RING_WIDTH + OUT_CEILING * 0.7, t1, t2), - self.colorizer.color_fragment(&chr.name) + svg += &format!( + "\n", + self.arc(R + RING_WIDTH + OUT_CEILING * 0.7, t1, t2), + self.colorizer.color_fragment(&chr.name) ); } // Chromosomes labels let r = R + RING_WIDTH + RING_MARGIN; - let (x, y) = self.cartesian(tt, r + if self.result.strand.map.len() > 1 { 65. } else { 20. }); + let (x, y) = self.cartesian( + tt, + r + if self.result.strand.map.len() > 1 { + 65. + } else { + 20. + }, + ); svg += &format!("\n{}\n\n", x, y, -tt/(2.0*PI)*360.0 + 90.0, x, y, @@ -105,15 +117,18 @@ impl ChordPlotter { for family in &self.result.families { for sd in family { - let (left, right) = (sd.global_left_position as i64, sd.global_right_position as i64); + let (left, right) = ( + sd.global_left_position as i64, + sd.global_right_position as i64, + ); let t11 = self.angle(left as f64); let t12 = self.angle(left as f64 + sd.left_length as f64); - let t1 = t11 + (t12 - t11)/2.0; + let t1 = t11 + (t12 - t11) / 2.0; let t21 = self.angle(right as f64); let t22 = self.angle(right as f64 + sd.right_length as f64); - let t2 = t21 + (t22 - t21)/2.0; + let t2 = t21 + (t22 - t21) / 2.0; let mut width = R * (2.0*(1.0 - (t12-t11).cos())).sqrt(); // Cf. Al-Kashi if width <= self.settings.min_thickness {width = self.settings.min_thickness}; @@ -125,16 +140,15 @@ impl ChordPlotter { let (x2, y2) = self.cartesian(t2, R); let (cx, cy) = (CX, CY); ((x1, y1), (x2, y2), (cx, cy)) - } else { - let tt = t1 + (t2-t1)/2.0; - let rin = R + RING_WIDTH + RING_MARGIN; - let rout = rin + OUT_CEILING; - let (x1, y1) = self.cartesian(t1, rin); - let (cx, cy) = self.cartesian(tt, rout); - let (x2, y2) = self.cartesian(t2, rin); - ((x1, y1), (x2, y2), (cx, cy)) - }; - + } else { + let tt = t1 + (t2 - t1) / 2.0; + let rin = R + RING_WIDTH + RING_MARGIN; + let rout = rin + OUT_CEILING; + let (x1, y1) = self.cartesian(t1, rin); + let (cx, cy) = self.cartesian(tt, rout); + let (x2, y2) = self.cartesian(t2, rin); + ((x1, y1), (x2, y2), (cx, cy)) + }; let path = format!("M {},{} Q {},{} {} {}", x1, y1, cx, cy, x2, y2); svg += &format!(r#" @@ -159,33 +173,42 @@ impl ChordPlotter { } } - for features_family in &self.settings.feature_tracks { - let color = format!("#{:2X}{:2X}{:2X}", rand::random::(), rand::random::(), rand::random::()); + let color = format!( + "#{:2X}{:2X}{:2X}", + rand::random::(), + rand::random::(), + rand::random::() + ); for feature in features_family.iter() { for position in &feature.positions { let (start, end) = match *position { - FeaturePosition::Relative { ref chr, start, length} => { - let chr = self.result.strand.find_chr(&chr).unwrap_or_else(|| panic!("Unable to find fragment `{}`", chr)); + FeaturePosition::Relative { + ref chr, + start, + length, + } => { + let chr = self + .result + .strand + .find_chr(&chr) + .unwrap_or_else(|| panic!("Unable to find fragment `{}`", chr)); (chr.position + start, chr.position + start + length) } - FeaturePosition::Absolute { start, length } => { (start, start + length) } + FeaturePosition::Absolute { start, length } => (start, start + length), }; let t1 = self.angle(start as f64); let t2 = self.angle(end as f64); - let t0 = t1 + (t2 - t1)/2.0; + let t0 = t1 + (t2 - t1) / 2.0; let (x0, y0) = self.cartesian(t0 - 0.02, R - 5.0); let (x1, y1) = self.cartesian(t1, R); let (x2, y2) = self.cartesian(t2, R); let (x3, y3) = self.cartesian(t0 + 0.02, R - 5.0); let font_size = 4.0; - svg += &format!("\n", - x0, y0, - x1, y1, - x2, y2, - x3, y3, - color + svg += &format!( + "\n", + x0, y0, x1, y1, x2, y2, x3, y3, color ); svg += &format!("{}", x3 + font_size, y3 + font_size, font_size, @@ -196,7 +219,8 @@ impl ChordPlotter { } svg += ""; - format!(" ", - TOTAL_WIDTH, TOTAL_WIDTH, - 2.0*self.settings.min_thickness, - svg - ) + TOTAL_WIDTH, + TOTAL_WIDTH, + 2.0 * self.settings.min_thickness, + svg + ) } } diff --git a/src/plot/circos_plot.rs b/src/plot/circos_plot.rs index 5953aaa..6362230 100644 --- a/src/plot/circos_plot.rs +++ b/src/plot/circos_plot.rs @@ -2,8 +2,8 @@ use plot::*; use plot::colorizers::Colorizer; use ::utils::slugify; -use std::io::prelude::*; use std::fs::File; +use std::io::prelude::*; pub struct CircosPlotter { result: RunResult, @@ -22,25 +22,42 @@ impl Plotter for CircosPlotter { let prefix = self.settings.out_file.clone(); let karyotype_filename = &format!("{}.karyotype", &prefix); - let links_filename = &format!("{}.links", &prefix); - let config_filename = &format!("{}.conf", &prefix); + let links_filename = &format!("{}.links", &prefix); + let config_filename = &format!("{}.conf", &prefix); File::create(karyotype_filename) .and_then(|mut f| f.write_all(self.plot_karyotype().as_bytes())) - .and_then(|_| { log::info!("Karyotype written to `{}`", &karyotype_filename); Ok(()) }) + .and_then(|_| { + log::info!("Karyotype written to `{}`", &karyotype_filename); + Ok(()) + }) .chain_err(|| format!("Unable to write in `{}`", &karyotype_filename))?; File::create(links_filename) .and_then(|mut f| f.write_all(self.plot_links().as_bytes())) - .and_then(|_| { log::info!("Links written to `{}`", &links_filename); Ok(()) }) + .and_then(|_| { + log::info!("Links written to `{}`", &links_filename); + Ok(()) + }) .chain_err(|| format!("Unable to write in `{}`", &links_filename))?; File::create(config_filename) - .and_then(|mut f| f.write_all(self.plot_config(karyotype_filename, links_filename).as_bytes())) - .and_then(|_| { log::info!("Config written to `{}`", &config_filename); Ok(()) }) + .and_then(|mut f| { + f.write_all( + self.plot_config(karyotype_filename, links_filename) + .as_bytes(), + ) + }) + .and_then(|_| { + log::info!("Config written to `{}`", &config_filename); + Ok(()) + }) .chain_err(|| format!("Unable to write in `{}`", &config_filename))?; - log::info!("\nYou can now edit `{}` and/or run `circos {}` to generate the final plot.", config_filename, config_filename); + println!( + "\nYou can now edit `{}` and/or run `circos {}` to generate the final plot.", + config_filename, config_filename + ); Ok(()) } } @@ -48,16 +65,19 @@ impl Plotter for CircosPlotter { impl CircosPlotter { fn plot_karyotype(&self) -> String { fn encode_chromosome(chr: &Start) -> String { - format!("chr - {id} {label} {start} {end} {color}", - id = slugify(&chr.name), - label = slugify(&chr.name), - start = 0, - end = chr.length, - color = "grey" + format!( + "chr - {id} {label} {start} {end} {color}", + id = slugify(&chr.name), + label = slugify(&chr.name), + start = 0, + end = chr.length, + color = "grey" ) } - self.result.strand.map + self.result + .strand + .map .iter() .map(encode_chromosome) .collect::>() @@ -171,5 +191,4 @@ format = %d }), ) } - } diff --git a/src/plot/flat_plot.rs b/src/plot/flat_plot.rs index ae09631..4a5320c 100644 --- a/src/plot/flat_plot.rs +++ b/src/plot/flat_plot.rs @@ -2,8 +2,8 @@ extern crate rand; use plot::*; use plot::colorizers::Colorizer; -use std::io::prelude::*; use std::fs::File; +use std::io::prelude::*; use separator::Separatable; const CHR_WIDTH: f64 = 4.0; @@ -36,7 +36,10 @@ impl Plotter for FlatPlotter { let out_filename = format!("{}.svg", &self.settings.out_file); File::create(&out_filename) .and_then(|mut f| f.write_all(self.plot_flat().as_bytes())) - .and_then(|_| { println!("Flat plot written to `{}`", &out_filename); Ok(()) }) + .and_then(|_| { + println!("Flat plot written to `{}`", &out_filename); + Ok(()) + }) .chain_err(|| format!("Unable to write in `{}`", &out_filename))?; Ok(()) @@ -115,26 +118,44 @@ impl FlatPlotter { for feature in features_family.iter() { for position in &feature.positions { let (start, end) = match *position { - FeaturePosition::Relative { ref chr, start, length} => { - let chr = self.result.strand.find_chr(&chr).unwrap_or_else(|| panic!("Unable to find fragment `{}`", chr)); + FeaturePosition::Relative { + ref chr, + start, + length, + } => { + let chr = self + .result + .strand + .find_chr(&chr) + .unwrap_or_else(|| panic!("Unable to find fragment `{}`", chr)); (chr.position + start, chr.position + start + length) } - FeaturePosition::Absolute { start, length } => { (start, start + length) } + FeaturePosition::Absolute { start, length } => (start, start + length), }; - let color = format!("#{:2X}{:2X}{:2X}", rand::random::(), rand::random::(), rand::random::()); - let x0 = start as f64/self.max_length * self.width; - let x1 = end as f64/self.max_length * self.width; + let color = format!( + "#{:2X}{:2X}{:2X}", + rand::random::(), + rand::random::(), + rand::random::() + ); + let x0 = start as f64 / self.max_length * self.width; + let x1 = end as f64 / self.max_length * self.width; let x2 = x1 + 2.0; let x3 = x0 - 2.0; let font_size = 8.0; - svg += &format!("\n", - x0, self.height, - x1, self.height, - x2, self.height + 10.0, - x3, self.height + 10.0, - color + svg += &format!( + "\n", + x0, + self.height, + x1, + self.height, + x2, + self.height + 10.0, + x3, + self.height + 10.0, + color ); svg += &format!("{}", @@ -144,7 +165,6 @@ impl FlatPlotter { } } - for family in &self.result.families { for sd in family { let left1 = (sd.global_left_position as f64)/self.max_length * self.width; @@ -154,7 +174,8 @@ impl FlatPlotter { let color = self.colorizer.color(sd); - svg += &format!(r#" + svg += &format!( + r#" {} "#, - left1, CHR_WIDTH, - if left2 - left1 < self.settings.min_thickness { left1 + self.settings.min_thickness} else { left2 }, CHR_WIDTH, - if right2 - right1 < self.settings.min_thickness { right1 + self.settings.min_thickness} else { right2 }, self.height - CHR_WIDTH, - right1, self.height - CHR_WIDTH, - color, - color, - &format!("{}: {} → {} ({}bp)\n{}: {} → {} ({}bp)", - &sd.chr_left, - sd.chr_left_position.separated_string(), - (sd.chr_left_position+sd.left_length).separated_string(), - sd.left_length.separated_string(), - - &sd.chr_right, - sd.chr_right_position.separated_string(), - (sd.chr_right_position+sd.right_length).separated_string(), - sd.right_length.separated_string() - ) + left1, CHR_WIDTH, + if left2 - left1 < self.settings.min_thickness { left1 + self.settings.min_thickness} else { left2 }, CHR_WIDTH, + if right2 - right1 < self.settings.min_thickness { right1 + self.settings.min_thickness} else { right2 }, self.height - CHR_WIDTH, + right1, self.height - CHR_WIDTH, + color, + color, + &format!("{}: {} → {} ({}bp)\n{}: {} → {} ({}bp)", + &sd.chr_left, + sd.chr_left_position.separated_string(), + (sd.chr_left_position+sd.left_length).separated_string(), + sd.left_length.separated_string(), + + &sd.chr_right, + sd.chr_right_position.separated_string(), + (sd.chr_right_position+sd.right_length).separated_string(), + sd.right_length.separated_string() + ) ); } } diff --git a/src/plot/genome_plot.rs b/src/plot/genome_plot.rs index ec05b28..bc8c529 100644 --- a/src/plot/genome_plot.rs +++ b/src/plot/genome_plot.rs @@ -46,7 +46,6 @@ impl GenomePlotter { // height_factor + 50px top + 100px bot let height = height_factor + 50.0 + 100.0; - // 1. Draw the chromosomes for (i, chr) in self.result.strand.map.iter().enumerate() { // Chromosome bar @@ -83,7 +82,6 @@ impl GenomePlotter { 0.5, ); - // Label svg += &format!("{}\n", chr_spacing + i as f64 * chr_spacing - 10.0, diff --git a/src/plot/mod.rs b/src/plot/mod.rs index 08f70de..b5309eb 100644 --- a/src/plot/mod.rs +++ b/src/plot/mod.rs @@ -2,15 +2,15 @@ extern crate regex; extern crate palette; extern crate rand; -use ::errors::*; -use ::structs::*; +use errors::*; +use structs::*; pub mod chord_plot; pub mod flat_plot; // pub mod eye_plot; -pub mod genome_plot; pub mod circos_plot; pub mod colorizers; +pub mod genome_plot; pub struct Settings { pub out_file: String, @@ -25,8 +25,15 @@ pub struct Settings { #[derive(Debug, Clone)] pub enum FeaturePosition { - Relative { chr: String, start: usize, length: usize }, - Absolute { start: usize, length: usize } + Relative { + chr: String, + start: usize, + length: usize, + }, + Absolute { + start: usize, + length: usize, + }, } #[derive(Debug, Clone)] diff --git a/src/searcher.rs b/src/searcher.rs index 8f726ec..c569d29 100644 --- a/src/searcher.rs +++ b/src/searcher.rs @@ -1,10 +1,10 @@ -use std::mem; use std::collections::HashMap; +use std::mem; use superslice::Ext; -use structs::ALPHABET; use automaton::Segment; +use structs::ALPHABET; use divsufsort::*; @@ -98,7 +98,10 @@ impl Searcher { } pub fn new(dna: &[u8], sa: &[SAIdx], offset: usize) -> Searcher { - let mut s = Searcher { cache: HashMap::new(), offset }; + let mut s = Searcher { + cache: HashMap::new(), + offset, + }; unsafe { for a in &ALPHABET { @@ -113,15 +116,17 @@ impl Searcher { let index = Searcher::indexize(&p); let (start, count): (usize, usize) = { let mut out = 0; - let count = sa_searchb64(dna.as_ptr(), - dna.len() as i64, - p.as_ptr(), - p.len() as i64, - sa.as_ptr(), - sa.len() as i64, - &mut out, - 0, - sa.len() as i64); + let count = sa_searchb64( + dna.as_ptr(), + dna.len() as i64, + p.as_ptr(), + p.len() as i64, + sa.as_ptr(), + sa.len() as i64, + &mut out, + 0, + sa.len() as i64, + ); (out as usize, count as usize) }; s.cache.insert(index, (start, start + count)); @@ -138,7 +143,6 @@ impl Searcher { s } - pub fn search(&self, dna: &[u8], sa: &[SAIdx], pattern: &[u8]) -> Vec { #[inline] fn stringcmp(a: &[u8], b: &[u8]) -> std::cmp::Ordering { @@ -150,27 +154,29 @@ impl Searcher { let index = Searcher::indexize(pattern); if !self.cache.contains_key(&index) { - panic!("Unable to find {} ({})", - index, - String::from_utf8(pattern[0..CACHE_LEN].to_vec()).unwrap()); + panic!( + "Unable to find {} ({})", + index, + String::from_utf8(pattern[0..CACHE_LEN].to_vec()).unwrap() + ); } let (lstart, rstart) = self.cache[&index]; - let range = &sa[lstart..rstart] - .equal_range_by(|x| if *x as usize + pattern.len() > dna.len() { + let range = &sa[lstart..rstart].equal_range_by(|x| { + if *x as usize + pattern.len() > dna.len() { std::cmp::Ordering::Less } else { - stringcmp(&dna[*x as usize .. *x as usize + pattern.len()], &pattern) - }); + stringcmp(&dna[*x as usize..*x as usize + pattern.len()], &pattern) + } + }); - sa[lstart + range.start .. lstart + range.end] + sa[lstart + range.start..lstart + range.end] .iter() - .map(|start| - Segment { - tag: 0, - start: self.offset + *start as usize, - end: self.offset + *start as usize + pattern.len(), - }) + .map(|start| Segment { + tag: 0, + start: self.offset + *start as usize, + end: self.offset + *start as usize + pattern.len(), + }) .collect() } } diff --git a/src/structs.rs b/src/structs.rs index 16d14ae..aadabd2 100644 --- a/src/structs.rs +++ b/src/structs.rs @@ -1,40 +1,35 @@ -use std::io::prelude::*; +use errors::*; use std::fs::File; -use ::errors::*; +use std::io::Read; use ::rayon::prelude::*; pub const COLLAPSED_NAME: &str = "ASGART_COLLAPSED"; - -pub const ALPHABET: [u8; 5] = [ - b'A', b'T', b'G', b'C', b'N' -]; -pub const ALPHABET_MASKED: [u8; 5] = [ - b'a', b't', b'g', b'c', b'n' -]; +pub const ALPHABET: [u8; 5] = [b'A', b'T', b'G', b'C', b'N']; +pub const ALPHABET_MASKED: [u8; 5] = [b'a', b't', b'g', b'c', b'n']; #[derive(Serialize, Deserialize, Clone, Copy, Debug)] pub struct RunSettings { - pub probe_size: usize, - pub max_gap_size: u32, + pub probe_size: usize, + pub max_gap_size: u32, pub min_duplication_length: usize, - pub max_cardinality: usize, - pub trim: Option<(usize, usize)>, + pub max_cardinality: usize, + pub trim: Option<(usize, usize)>, #[serde(skip_serializing)] #[serde(default)] - pub reverse: bool, + pub reverse: bool, #[serde(skip_serializing)] #[serde(default)] - pub complement: bool, - pub skip_masked: bool, + pub complement: bool, + pub skip_masked: bool, #[serde(skip_serializing)] #[serde(default)] - pub threads_count: usize, + pub threads_count: usize, #[serde(skip_serializing)] #[serde(default)] - pub compute_score: bool, + pub compute_score: bool, } #[derive(Serialize, Deserialize, Clone, Debug)] @@ -55,7 +50,7 @@ impl StrandResult { self.map.iter().any(|chr| chr.name == name) } - pub fn find_chr(&self, name: &str) -> Option<&Start>{ + pub fn find_chr(&self, name: &str) -> Option<&Start> { self.map.iter().find(|chr| chr.name == name) } @@ -64,11 +59,12 @@ impl StrandResult { } pub fn find_chr_by_pos(&self, pos: usize) -> Option<&Start> { - self.map.iter().find(|&chr| pos >= chr.position && pos < chr.position + chr.length) + self.map + .iter() + .find(|&chr| pos >= chr.position && pos < chr.position + chr.length) } } - #[derive(Serialize, Deserialize, Debug)] pub struct RunResult { pub strand: StrandResult, @@ -91,16 +87,21 @@ impl RunResult { for result in &results { if result.strand.name != results[0].strand.name { - bail!(format!("Trying to combine ASGART files from different sources: `{}` and `{}`", - result.strand.name, results[0].strand.name, + bail!(format!( + "Trying to combine ASGART files from different sources: `{}` and `{}`", + result.strand.name, results[0].strand.name, )); } } let r = RunResult { - settings: results[0].settings.clone(), - strand: results[0].strand.clone(), - families: results.iter().flat_map(|ref r| r.families.iter()).cloned().collect::>(), + settings: results[0].settings, + strand: results[0].strand.clone(), + families: results + .iter() + .flat_map(|ref r| r.families.iter()) + .cloned() + .collect::>(), }; Ok(r) @@ -241,7 +242,6 @@ impl RunResult { } } - #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ProtoSD { pub left: usize, @@ -265,11 +265,11 @@ impl ProtoSD { pub fn levenshtein(&self, strand: &[u8]) -> f64 { // TODO take into account R/C duplications - let left_arm = &strand[self.left ..= self.left + self.left_length]; - let right_arm = &strand[self.right ..= self.right + self.right_length]; + let left_arm = &strand[self.left..=self.left + self.left_length]; + let right_arm = &strand[self.right..=self.right + self.right_length]; let dist = f64::from(bio::alignment::distance::levenshtein(left_arm, right_arm)); - 100.0 * (1.0 - dist/(std::cmp::max(self.left_length, self.right_length) as f64)) + 100.0 * (1.0 - dist / (std::cmp::max(self.left_length, self.right_length) as f64)) } } pub type ProtoSDsFamily = Vec; diff --git a/src/utils.rs b/src/utils.rs index 82d68e3..e414ff6 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -12,32 +12,40 @@ pub fn complement_nucleotide(n: u8) -> u8 { b'c' => b'g', b'n' => b'n', - _ => b'N' + _ => b'N', } } pub fn complemented(text: &[u8]) -> Vec { - text.iter().map(|x| complement_nucleotide(*x)).collect::>() + text.iter() + .map(|x| complement_nucleotide(*x)) + .collect::>() } pub fn slugify(x: &str) -> String { - x .trim() + x.trim() .replace(" ", "_") .replace(":", "_") .replace("|", "_") } -pub fn make_out_filename(filename: Option<&str>, default: &str, extension: &str) -> std::path::PathBuf { +pub fn make_out_filename( + filename: Option<&str>, + default: &str, + extension: &str, +) -> std::path::PathBuf { filename .map(|f| { let mut path = std::path::PathBuf::from(f); - if path.is_dir() { path.push(default); } + if path.is_dir() { + path.push(default); + } path.set_extension(extension); path }) .unwrap_or_else(|| { - let mut new = std::path::PathBuf::from(default); - new.set_extension(extension); - new + let mut new = std::path::PathBuf::from(default); + new.set_extension(extension); + new }) }