diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml index b56e9089..b7d58a06 100644 --- a/.github/workflows/draft-pdf.yml +++ b/.github/workflows/draft-pdf.yml @@ -4,7 +4,7 @@ name: paper formatting jobs: immem: runs-on: ubuntu-latest - name: Paper Draft + name: IMMEM Draft steps: - name: Checkout uses: actions/checkout@v2 @@ -26,23 +26,27 @@ jobs: # PDF. Note, this should be the same directory as the input # paper.md path: paper/paper.pdf -# paper: -# runs-on: ubuntu-latest -# name: Paper Draft -# steps: -# - name: Checkout -# uses: actions/checkout@v2 -# - name: Build draft PDF -# uses: openjournals/openjournals-draft-action@master -# with: -# journal: joss -# # This should be the path to the paper within your repo. -# paper-path: paper/paper.md -# - name: Upload -# uses: actions/upload-artifact@v1 -# with: -# name: paper -# # This is the output path where Pandoc will write the compiled -# # PDF. Note, this should be the same directory as the input -# # paper.md -# path: paper/paper.pdf + joss: + runs-on: ubuntu-latest + name: JOSS Draft + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Build draft PDF + uses: openjournals/openjournals-draft-action@master + with: + journal: joss + # This should be the path to the paper within your repo. + paper-path: paper/paper.md + - name: inspect directory + run: | + tree + find . -type f -name '*paper*' + - name: Upload + uses: actions/upload-artifact@v1 + with: + name: paper-abstract + # This is the output path where Pandoc will write the compiled + # PDF. Note, this should be the same directory as the input + # paper.md + path: paper/paper.pdf diff --git a/.gitignore b/.gitignore index 0eb24cde..2df9f341 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,3 @@ Cargo.lock **/*.rs.bk tests/hyperfine -paper/ diff --git a/Cargo.toml b/Cargo.toml index 926006d8..c74827dd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fasten" -version = "0.6.0" +version = "0.7.1" authors = ["Lee Katz "] #license-file = "LICENSE" license = "MIT" @@ -96,12 +96,11 @@ name = "fasten_normalize" path = "src/bin/fasten_normalize.rs" [dependencies] -regex = "0.2.10" +regex = "1.10" getopts = "0.2.21" -statistical = "0.1.1" +statistical = "1.0" multiqueue = "0.3.2" -rand = "0.4" +rand = "0.8" fastq = "0.6" threadpool = "1.8.1" bam = "0.1.4" - diff --git a/README.md b/README.md index 31eacddf..ad54eca6 100644 --- a/README.md +++ b/README.md @@ -56,8 +56,7 @@ This documentation was built with `cargo docs --no-deps` ## Other documentation -* Some workflows are shown in the [one-liners](./docs/one-liners.md) page. -* Some wrapper scripts are noted in the [scripts](./docs/scripts.md) page. +* Some wrapper scripts are noted in the [scripts](./scripts.md) page. 
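Aside on the draft-pdf.yml workflow above: contributors who want to check the JOSS draft without waiting for CI can usually render it locally with the same tooling that openjournals-draft-action wraps. This is a minimal sketch only; the openjournals/inara Docker image name and its flags are an assumption based on JOSS's published tooling, not something taken from this diff.

#!/bin/bash
# Sketch: render paper/paper.md to paper/paper.pdf locally,
# mirroring the artifact path uploaded by the workflow above.
# Assumes Docker and the openjournals/inara image; not part of this repo.
set -eu

docker run --rm \
    -v "$PWD":/data \
    -u "$(id -u):$(id -g)" \
    openjournals/inara \
    -o pdf paper/paper.md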
## Fasten script descriptions
diff --git a/paper/benchmarks.png b/paper/benchmarks.png
index 7250470a..9cae508e 100755
Binary files a/paper/benchmarks.png and b/paper/benchmarks.png differ
diff --git a/paper/paper.bib b/paper/paper.bib
index e69de29b..77cbbd51 100644
--- a/paper/paper.bib
+++ b/paper/paper.bib
@@ -0,0 +1,41 @@
+@software{Peter_hyperfine_2023,
+  author = {Peter, David},
+  license = {MIT},
+  month = mar,
+  title = {{hyperfine}},
+  url = {https://github.com/sharkdp/hyperfine},
+  version = {1.16.1},
+  year = {2023}
+}
+
+@article{seqkit,
+  doi = {10.1371/journal.pone.0163962},
+  author = {Shen, Wei AND Le, Shuai AND Li, Yan AND Hu, Fuquan},
+  journal = {PLOS ONE},
+  publisher = {Public Library of Science},
+  title = {SeqKit: A Cross-Platform and Ultrafast Toolkit for FASTA/Q File Manipulation},
+  year = {2016},
+  month = {10},
+  volume = {11},
+  url = {https://doi.org/10.1371/journal.pone.0163962},
+  pages = {1-10},
+  abstract = {FASTA and FASTQ are basic and ubiquitous formats for storing nucleotide and protein sequences. Common manipulations of FASTA/Q file include converting, searching, filtering, deduplication, splitting, shuffling, and sampling. Existing tools only implement some of these manipulations, and not particularly efficiently, and some are only available for certain operating systems. Furthermore, the complicated installation process of required packages and running environments can render these programs less user friendly. This paper describes a cross-platform ultrafast comprehensive toolkit for FASTA/Q processing. SeqKit provides executable binary files for all major operating systems, including Windows, Linux, and Mac OSX, and can be directly used without any dependencies or pre-configurations. SeqKit demonstrates competitive performance in execution time and memory usage compared to similar tools. The efficiency and usability of SeqKit enable researchers to rapidly accomplish common FASTA/Q file manipulations. SeqKit is open source and available on Github at https://github.com/shenwei356/seqkit.},
+  number = {10},
+
+}
+
+@software{seqtk,
+  author = {Li, Heng},
+  license = {MIT},
+  title = {{seqtk}},
+  url = {https://github.com/lh3/seqtk},
+  year = {2023}
+}
+
+@software{fastx,
+  author = {Gordon, Assaf},
+  license = {AGPL},
+  title = {{fastx toolkit}},
+  url = {http://hannonlab.cshl.edu/fastx_toolkit/index.html},
+  year = {2014}
+}
diff --git a/paper/paper.md b/paper/paper.md
new file mode 100644
index 00000000..7b3d5a08
--- /dev/null
+++ b/paper/paper.md
@@ -0,0 +1,66 @@
+---
+title: 'Fasten with Pipes'
+tags:
+  - command line
+  - fastq manipulation
+  - interleaved fastq
+authors:
+  - name: Lee S. Katz
+    affiliation: "1, 2"
+    orcid: 0000-0002-2533-9161
+  - name: Henk C. den Bakker
+    orcid: 0000-0002-4086-1580
+    affiliation: 1
+affiliations:
+  - name: Enteric Diseases Laboratory Branch (EDLB), Centers for Disease Control and Prevention, Atlanta, GA, USA
+    index: 1
+  - name: Center for Food Safety, University of Georgia, Griffin, GA, USA
+    index: 2
+bibliography: paper.bib
+---
+
+## Background
+
+There are still many gaps in basic command-line bioinformatics for standard file formats.
+Bioinformaticians have been able to use many tools to manipulate sequence data files in the fastq format, such as `seqkit` [@seqkit], `seqtk` [@seqtk], or FASTX-Toolkit [@fastx].
+However, these tools accept paired-end (PE) sequence data only when it is split into multiple files per sample.
+Additionally, these tools do not always allow for Unix-style pipe file control.
+Sometimes they require explicit input/output options instead of using `stdin` and `stdout`.
+However, some bioinformaticians prefer to combine PE data from a single sample into one file using the interleaved fastq file format, but this format is not always well supported in mainstream tools.
+Here, we provide Fasten to the community to address these needs.
+
+## Materials
+
+We leveraged the Cargo packaging system in Rust to create a basic framework for interleaved fastq file manipulation.
+Each executable reads from `stdin`, prints reads to `stdout`, and performs only one function at a time.
+The core executables perform these fundamental functions: 1) converting to and from interleaved format, 2) converting to and from other sequence file formats, 3) ‘straightening’ fastq files to a more standard 4-line-per-entry format.
+
+There are 20 executables, including but not limited to read metric generation, read cleaning, kmer counting, read validation, and regular expressions for interleaved fastq files.
+
+We have also taken advantage of Rust to make comprehensive and standardized documentation.
+Continuous integration was implemented in GitHub Actions for unit testing, containerizing, and benchmarking.
+Benchmarking was performed against other mainstream packages with `hyperfine`, using 20 replicates and 2 burn-in runs [@Peter_hyperfine_2023].
+
+## Results
+
+Documentation, the container, and code are available on GitHub. Benchmarking results are graphed in \autoref{fig:benchmarks}.
+
+![Benchmarks comparing Fasten with other analogous tools. From left to right, then top to bottom: trimming with a minimum quality score; converting fastq to fasta; interleaving R1 and R2 reads; kmer counting; normalizing read depth using kmer coverage; searching for a sequence in a fastq file; downsampling reads; sorting fastq entries by either sequence or ID; and converting nonstandard fastq files to a format whose entries are four lines each, then selecting the first 100.\label{fig:benchmarks}](benchmarks.png)
+
+## Conclusions
+
+Fasten is a powerful manipulation suite for interleaved fastq files, written in Rust.
+We benchmarked Fasten across several categories.
+It has clear strengths, as shown in \autoref{fig:benchmarks}, although it does not occupy the fastest position in every case.
+Its major strengths include its competitive speeds,
+Unix-style pipes,
+paired-end handling,
+and the advantages afforded by the Rust language, including documentation and stability.
+
+Fasten offers a comprehensive manual, continuous integration, and integration into the command line with Unix pipes.
+It is well poised to be a crucial module for daily work on the command line.
+
+## Acknowledgements
+
+Thank you, John Phan, for creating the Docker container.
+
+## References
diff --git a/src/bin/fasten_clean.rs b/src/bin/fasten_clean.rs
index 6416b128..7d1db3d2 100644
--- a/src/bin/fasten_clean.rs
+++ b/src/bin/fasten_clean.rs
@@ -33,107 +33,18 @@
 extern crate getopts;
 extern crate fasten;
-extern crate multiqueue;
-extern crate threadpool;
 
 use std::fs::File;
 use std::io::BufReader;
-use std::io::BufRead;
-
-use threadpool::ThreadPool;
-use std::sync::mpsc::channel;
 
 use fasten::fasten_base_options;
 use fasten::fasten_base_options_matches;
+use fasten::io::fastq;
 //use fasten::logmsg;
 
-#[test]
-// Basically, loop through different trim cutoff scores
-// and expect a length that correlates due to the waxing/waning
-// quality scores.
-fn test_clean_entry() { - // // This is an easier-to-look-at entry due to the qual scores - let entry:Vec = vec![ - String::from("@read0"), - String::from("AAAGTGCTCTTAACTTGTCCCGCTCCACATCAGCGCGACATCAATCGACATTAAACCGAGTATCTTGTCAGCCTGGGGTGACGA"), - String::from("+"), - // To create this qual line: - // perl -e 'for $int (33..33+40){$chr=chr($int); print $chr;}' && \ - // perl -e 'for $int (33..33+40){$chr=chr($int); print $chr;}' | rev && \ - // echo - String::from("!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIIHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#\"!") - ]; - let min_length = 1; - let min_avg_qual = 1.0; - let lines_per_read = 4; - let (tx, rx):(std::sync::mpsc::Sender,std::sync::mpsc::Receiver) = channel(); - - - for min_trim_qual in 1..42 { - let tx_trash = tx.clone(); // cannot for the life of me remember why I required this clone - - // perform the clean - clean_entry(entry.clone(), min_length, min_avg_qual, min_trim_qual, lines_per_read, tx.clone(), tx_trash); - } - - drop(tx); - - let expected:Vec = (2..82).step_by(2).rev().collect(); - let mut observed:Vec = vec![]; - - for trimmed_entry in rx.iter(){ - let mut lines = trimmed_entry.lines(); - lines.next(); // id line - lines.next(); // seq line - lines.next(); // + line - let qual_line = lines.next().expect("Fourth line as qual line in entry"); - let length = qual_line.len() as u8; - observed.push(length); - } - - assert_eq!(observed, expected); - -} - -#[test] -fn test_clean_entry_basic() { - // this is the four-line fastq entry - let entry:Vec = vec![ - String::from("@read0"), - String::from("AAAGTGCTCTTAACTTGTCCCGCTCCACATCAGCGCGACATCAATCGACATTAAACCGAGTATCTTGTCAGCCTGGGGTGACGATGCGTCCCATTAAAGT"), - String::from("+"), - String::from("8AB*2D>C1'02C+=I@IEFHC7&-E5',I?E*33E/@3#68B%\"!B-/2%(G=*@D052IA!('7-*$+A6>.$89,-CG71=AGAE3&&#=2B.+I,std::sync::mpsc::Receiver) = channel(); - let tx_trash = tx.clone(); // cannot for the life of me remember why I required this clone - - // perform the clean - clean_entry(entry, min_length, min_avg_qual, min_trim_qual, lines_per_read, tx.clone(), tx_trash); - - drop(tx); - - let cleaned_res = rx.iter().next(); - let cleaned_read= cleaned_res.unwrap(); - - let expected_35_trim = "@read0 -GCTCTTAACTTGTCCCGCTCCACATCAGCGCGACATCAATCGACATTAAACCGAGTATCTTGTCAGCCTGGGGTGACGATGCGTCCCATTAAAGT -+ -D>C1'02C+=I@IEFHC7&-E5',I?E*33E/@3#68B%\"!B-/2%(G=*@D052IA!('7-*$+A6>.$89,-CG71=AGAE3&&#=2B.+I,std::sync::mpsc::Receiver) = channel(); - let (tx_trash, _rx_trash):(std::sync::mpsc::Sender,std::sync::mpsc::Receiver) = channel(); - let pool = ThreadPool::new(num_cpus); - // Read the file and send seqs to threads let my_file = File::open("/dev/stdin").expect("Could not open file"); let my_buffer=BufReader::new(my_file); - - let mut buffer_iter = my_buffer.lines(); - let num_lines_per_buffer = num_reads_per_buffer * lines_per_read as usize; - while let Some(line_res) = buffer_iter.next(){ - // get 4 or 8 lines of the fastq into the vector - let mut lines:Vec = Vec::with_capacity(num_lines_per_buffer); - - let line = line_res.expect("ERROR reading the first line of the next fastq entry"); - lines.push(line); - for _ in 0..(num_lines_per_buffer-1) { - if let Some(line_res) = buffer_iter.next() { - let line = line_res.expect("ERROR reading the first line of the next fastq entry"); - lines.push(line); - //} else { - // panic!("Expected another line in this fastq entry but got {} out of {}", i+1, lines_per_read); - } + let fastq_reader = fastq::FastqReader::new(my_buffer); + let fastq_iter = fastq_reader.into_iter(); + + let mut 
trimmed_ids :Vec = vec![]; + let mut trimmed_seqs :Vec = vec![]; + let mut trimmed_quals:Vec = vec![]; + for seq in fastq_iter { + // trim + let (seq_trimmed, qual_trimmed) = trim(&seq.seq, &seq.qual, min_trim_qual); + trimmed_ids.push(seq.id); + trimmed_seqs.push(seq_trimmed); + trimmed_quals.push(qual_trimmed); + + if paired_end && trimmed_seqs.len() < 2 { + continue; } - // Pass the 4 or 8 lines to a thread - let tx2 = tx.clone(); - let tx2_trash = tx_trash.clone(); - pool.execute(move|| { - clean_entry(lines, min_length, min_avg_qual, min_trim_qual, lines_per_read, tx2, tx2_trash); - }); - } - - pool.join(); - drop(tx); // signal the end of the transmitting to the channel - - for entry in rx.iter(){ - println!("{}", entry); - } -} - -/// Cleans a SE or PE read -fn clean_entry(lines:Vec, min_length:usize, min_avg_qual:f32, min_trim_qual:u8, lines_per_read:u8, tx:std::sync::mpsc::Sender, _tx_trash:std::sync::mpsc::Sender) { - let short_blank_string:String = String::with_capacity(100); - let long_blank_string:String = String::with_capacity(10000); - - let mut id1 :String = short_blank_string.clone(); - let mut id2 :String = short_blank_string.clone(); - let mut seq1 :String = long_blank_string.clone(); - let mut seq2 :String = long_blank_string.clone(); - let mut qual1 :String = long_blank_string.clone(); - let mut qual2 :String = long_blank_string.clone(); - - let mut i = 0; - for line in lines { - - //let line = wrapped_line.expect("ERROR: could not read line"); - match i % lines_per_read { - // read ID - 0=>{ - // On the zeroth line, set the first ID... - id1 = line; - // ...but then reset all other fields - id2 = short_blank_string.clone(); - seq1 = long_blank_string.clone(); - seq2 = long_blank_string.clone(); - qual1 = long_blank_string.clone(); - qual2 = long_blank_string.clone(); - } - 4=>{ - id2 = line; - } - - // Sequence - 1=>{ - seq1 = line; - } - 5=>{ - seq2 = line; - } - - // Qual line. If we've gotten here, then we can also trim/filter/print - // First qual line - 3=>{ - qual1 = line; - - } - // Second qual line - 7=>{ - qual2 = line; - - } - 2=>{} // + line - 6=>{} // + line - _=>{ - panic!("Internal error: somehow there are more than 8 lines per entry. The last line was line {} and contents were {}", i, line); + // ensure that we pass min qual and length for all reads + let mut passed = true; + for q_str in &trimmed_quals { + if avg_quality(&q_str) < min_avg_qual { + passed = false; + break; + } + if q_str.len() < min_length { + passed = false; + break; + } } - - } - - // moment of truth: see trim the reads and then see - // if they pass the filter. - if (lines_per_read==4 && !qual1.is_empty()) - ||(lines_per_read==8 && !qual2.is_empty()) { - - // Trim - let (seq1_trimmed, qual1_trimmed) = - trim(&seq1,&qual1,min_trim_qual); - - // If this is single end, go ahead and filter/print - if lines_per_read==4 { - if seq1_trimmed.len() >= min_length && avg_quality(&qual1_trimmed) >= min_avg_qual { - tx.send(format!("{}\n{}\n+\n{}", - id1,seq1_trimmed,qual1_trimmed, - )) - .unwrap(); + if passed { + for i in 0..trimmed_seqs.len() { + println!("{}\n{}\n+\n{}", + &trimmed_ids[i], + &trimmed_seqs[i], + &trimmed_quals[i], + ); } - } - - else if lines_per_read==8 { - - // trim second read - let (seq2_trimmed, qual2_trimmed) = - trim(&seq2,&qual2,min_trim_qual); - - - // Since we are at the second qual line, this is PE and we can - // go ahead with filter/print and not check for the PE param. 
- - if seq2_trimmed.len() >= min_length && seq2_trimmed.len() >= min_length - && avg_quality(&qual1_trimmed) >= min_avg_qual - && avg_quality(&qual2_trimmed) >= min_avg_qual { - - tx.send(format!("{}\n{}\n+\n{}\n{}\n{}\n+\n{}", - id1,seq1_trimmed,qual1_trimmed, - id2,seq2_trimmed,qual2_trimmed - )) - .unwrap(); - } - } - else { - panic!("I do not understand {} lines per fastq entry", lines_per_read); } + // Before we end the loop, we still need to clear our sequence + // buffer. + trimmed_ids.clear(); + trimmed_seqs.clear(); + trimmed_quals.clear(); } - - i += 1; - } + } /// Determine average quality of a qual cigar string, diff --git a/src/bin/fasten_mutate.rs b/src/bin/fasten_mutate.rs index 2fcf43d7..8d6c9474 100644 --- a/src/bin/fasten_mutate.rs +++ b/src/bin/fasten_mutate.rs @@ -30,7 +30,8 @@ use std::fs::File; use std::io::BufReader; use std::io::BufRead; -use rand::distributions::{IndependentSample, Range}; +use rand::Rng; +use rand::seq::SliceRandom; use fasten::fasten_base_options; use fasten::fasten_base_options_matches; @@ -70,7 +71,6 @@ fn main(){ //let nts = vec!['a', 'c', 'g', 't']; // Make this one time outside the loop to keep overhead low - //let mut rng = rand::thread_rng(); let my_file = File::open("/dev/stdin").expect("Could not open file"); let my_buffer=BufReader::new(my_file); @@ -100,14 +100,11 @@ fn mutate(seq: &str, nts: &Vec, num_snps: u8, mark:bool) -> String { if mark { sequence.make_ascii_lowercase(); } - let length = sequence.len(); - let between = Range::new(0, length); - let nt_range= Range::new(0, nts.len()); let mut rng = rand::thread_rng(); for _ in 0..num_snps { - let pos = between.ind_sample(&mut rng); - let nt = nts[nt_range.ind_sample(&mut rng)]; - sequence[pos] = nt as u8; + let pos = rng.gen_range(0..sequence.len()); + let nt = nts.choose(&mut rng).unwrap(); + sequence[pos] = *nt as u8; } return String::from_utf8_lossy(&sequence).to_string(); } diff --git a/src/bin/fasten_normalize.rs b/src/bin/fasten_normalize.rs index d4de243d..20dcc5cf 100644 --- a/src/bin/fasten_normalize.rs +++ b/src/bin/fasten_normalize.rs @@ -48,7 +48,7 @@ use std::io::BufRead; use std::io::stdin; use std::io::Stdin; use std::cmp::min; -use rand::Rng; +use rand::prelude::*; use fasten::fasten_base_options; use fasten::fasten_base_options_matches; @@ -111,7 +111,7 @@ fn normalize_coverage (stdin:Stdin, target_depth:u32, paired_end:bool) { //println!("target depth:{} count:{} num reads:{} = {}", target_depth, count, f.len(), num_reads_to_keep); // shuffle the reads in place - rng.shuffle(&mut f); + f.shuffle(&mut rng); // take the top X reads let reads_to_keep :Vec<&str> = f.splice(0..num_reads_to_keep, vec![]).collect(); diff --git a/src/bin/fasten_randomize.rs b/src/bin/fasten_randomize.rs index 2c014870..677de8c1 100644 --- a/src/bin/fasten_randomize.rs +++ b/src/bin/fasten_randomize.rs @@ -40,7 +40,7 @@ use std::fs::File; use std::io::BufReader; use std::io::BufRead; -use rand::{Rng,thread_rng}; +use rand::seq::SliceRandom; use fasten::fasten_base_options; use fasten::fasten_base_options_matches; @@ -83,8 +83,9 @@ fn print_reads_from_stdin(lines_per_read :u32) -> () { } // choose random reads - let mut rng = thread_rng(); - rng.shuffle(&mut seqs); + let mut rng = rand::thread_rng(); + //rng.shuffle(&mut seqs); + seqs.shuffle(&mut rng); for seq in seqs { println!("{}",seq); } diff --git a/src/bin/fasten_regex.rs b/src/bin/fasten_regex.rs index 8ba461f5..bec64554 100644 --- a/src/bin/fasten_regex.rs +++ b/src/bin/fasten_regex.rs @@ -146,26 +146,14 @@ fn main(){ // Print if 
it's a match
         let should_print:bool = match the_field.as_str(){
             "SEQ" => {
-                if regex.is_match(&all_seq) {
-                    true
-                } else {
-                    false
-                }
-            },
+                regex.is_match(&all_seq)
+            },
             "ID" => {
-                if regex.is_match(&all_id) {
-                    true
-                } else {
-                    false
-                }
+                regex.is_match(&all_id)
             },
             "QUAL" => {
-                if regex.is_match(&all_qual) {
-                    true
-                } else {
-                    false
-                }
-            }
+                regex.is_match(&all_qual)
+            },
             _ => {
                 panic!("{} is not a valid key to match on", &the_field);
             }
diff --git a/src/bin/fasten_sample.rs b/src/bin/fasten_sample.rs
index f0eb592f..319bb74b 100644
--- a/src/bin/fasten_sample.rs
+++ b/src/bin/fasten_sample.rs
@@ -29,8 +29,7 @@
 use std::fs::File;
 use std::io::BufReader;
 use std::io::BufRead;
-use rand::thread_rng;
-use rand::Rng;
+use rand::prelude::*;
 
 use fasten::fasten_base_options;
 use fasten::fasten_base_options_matches;
@@ -61,12 +60,11 @@ fn main(){
         }
     };
 
-    let mut rng = thread_rng();
-
     let my_file = File::open("/dev/stdin").expect("Could not open file");
     let my_buffer=BufReader::new(my_file);
     let mut line_counter =0;
     let mut entry = String::new();
+    let mut randoms :Vec<f32> = vec![];
     for line in my_buffer.lines() {
         // unwrap the line here and shadow-set the variable.
         let line=line.expect("ERROR: did not get a line");
@@ -76,8 +74,12 @@
         // Action if we have a full entry when mod 0
         if line_counter % lines_per_read == 0 {
+            if randoms.len() < 1 {
+                randoms = rand_32();
+            }
+            let r:f32 = randoms.pop().unwrap();
             // Should we print?
-            if rng.gen_range(0.0,1.0) < frequency {
+            if r < frequency {
                 print!("{}",entry);
             }
             // reset the entry string
@@ -86,3 +88,14 @@
     }
 }
+/// Generate a set of random floats
+fn rand_32() -> Vec<f32> {
+    let mut rng = rand::thread_rng();
+
+    let floats :Vec<f32> = (0..10000)
+        .map(|_| rng.gen::<f32>())
+        .collect();
+
+    return floats;
+}
+
diff --git a/src/bin/fasten_trim.rs b/src/bin/fasten_trim.rs
index 82d80242..d52d440e 100644
--- a/src/bin/fasten_trim.rs
+++ b/src/bin/fasten_trim.rs
@@ -41,14 +41,10 @@
 use std::fs::File;
 use std::io::BufReader;
 use std::cmp::min;
 
-use threadpool::ThreadPool;
-use std::sync::mpsc::channel;
-
 use fasten::fasten_base_options;
 use fasten::fasten_base_options_matches;
 use fasten::logmsg;
 use fasten::io::fastq;
-use fasten::io::seq::Cleanable;
 use fasten::io::seq::Seq;
 
 fn main(){
@@ -56,14 +52,10 @@
     // script-specific options
     opts.optopt("f","first-base","The first base to keep (default: 0)","INT");
-    opts.optopt("l","last-base","The last base to keep. If negative, counts from the right. (default: 0)","INT");
+    opts.optopt("l","last-base","The last base to keep (default: 0)","INT");
 
     let matches = fasten_base_options_matches("Blunt-end trims using 0-based coordinates", opts);
 
-    let (tx, rx):(std::sync::mpsc::Sender<String>,std::sync::mpsc::Receiver<String>) = channel();
-
-    //let paired_end:bool = matches.opt_present("paired-end");
-
     let first_base:usize ={
         if matches.opt_present("first-base") {
             matches.opt_str("first-base")
@@ -86,7 +78,7 @@
         }
     };
 
-    let num_cpus:usize = {
+    let _num_cpus:usize = {
         if matches.opt_present("numcpus") {
             /*
             matches.opt_str("numcpus")
@@ -101,63 +93,21 @@
         }
     };
 
-    /*
-     * Set up multithreading. Each thread will get 100k
- */ - let pool = ThreadPool::with_name("worker".into(), num_cpus); - // Read from stdin let my_file = File::open("/dev/stdin").expect("Could not open file"); let my_buffer=BufReader::new(my_file); let fastq_reader = fastq::FastqReader::new(my_buffer); - let mut fastq_iter = fastq_reader.into_iter(); - while let Some(seq) = fastq_iter.next() { - let mut seqs:Vec = Vec::with_capacity(10000); - seqs.push(seq); - - // Get an odd number to push onto the vector - // so that it is eventually an even number - // and we sidestep any paired end nuances. - for _ in 0..9999 { // 9999 + 1 => 10k seqs - // if the iterator returns nothing, then use - // a blank sequence. In the worker thread, - // it will check for a blank sequence and if - // it is blank, it will skip it. - let next_seq = fastq_iter.next() - .or(Some(Seq::blank())).unwrap(); - seqs.push( - next_seq - //.expect("Tried to get the second sequence in a pair but ran into an error") - ); - } + let fastq_iter = fastq_reader.into_iter(); + for seq in fastq_iter { - // Send this single end or paired end to the queue - let tx2 = tx.clone(); - pool.execute(move|| { - trim_worker(seqs, first_base, last_base, tx2); - }); + let trimmed:String = trim_worker(seq, first_base, last_base); + println!("{}", trimmed); } - - pool.join(); - drop(tx); // disconnects the channel - - let receiver = rx.iter(); - for entry in receiver { - println!("{}",entry); - } - } /// Trim a set of fastq entries and send it to a channel -fn trim_worker(seqs:Vec, first_base:usize, last_base:usize, tx:std::sync::mpsc::Sender ){ - - let blank_seq = Seq::blank(); +fn trim_worker(seq:Seq, first_base:usize, last_base:usize ) -> String { - for seq in seqs{ - if seq.id == blank_seq.id && seq.seq == blank_seq.seq && seq.qual == blank_seq.qual { - continue; - } // The last position is either the last_base parameter // or the last position in the string, whichever is less. 
        let last_base_tmp = match last_base {
@@ -175,42 +125,7 @@ fn trim_worker(seqs:Vec<Seq>, first_base:usize, last_base:usize, tx:std::sync::mpsc::Sender<String> ){
         let quality = &seq.qual[first_base..last_base_tmp];
 
         let trimmed = format!("{}\n{}\n+\n{}", seq.id, sequence, quality);
-        match tx.send(trimmed){
-            Ok(_seq_obj) => {},
-            Err(_error) => {}
-        };
-    }
+    return trimmed;
 }
 
-/*
-fn trim_worker_old(sub_lines_buffer:&mut Vec<String>, first_base:usize, last_base:usize, tx:std::sync::mpsc::Sender<String> ){
-    let this_thread = thread::current();
-    let _tid = this_thread.id(); // for debugging
-
-    sub_lines_buffer.reverse();
-
-    while sub_lines_buffer.len() > 0 {
-        //let mut entry_splice = &sub_lines_buffer.splice(0..4, vec![]);
-        //let entry = vec![entry_splice];
-        let id   = sub_lines_buffer.pop().unwrap();
-        let mut seq  = sub_lines_buffer.pop().unwrap();
-        let plus = sub_lines_buffer.pop().unwrap();
-        let mut qual = sub_lines_buffer.pop().unwrap();
-
-        let last_base_tmp = min(seq.len(), last_base);
-        seq  = String::from(seq);
-        qual = String::from(qual);
-
-        let entry = format!("{}\n{}\n{}\n{}",
-            id, &seq[first_base..last_base_tmp], plus, &qual[first_base..last_base_tmp]
-        );
-
-        tx.send(entry).unwrap();
-
-    }
-    //eprintln!("{:?} finished {}", &_tid, &num_lines);
-}
-
-*/
-
diff --git a/tests/benchmark_clean.sh b/tests/benchmark_clean.sh
new file mode 100644
index 00000000..d5b169ba
--- /dev/null
+++ b/tests/benchmark_clean.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+set -e
+set -u
+
+thisDir=$(dirname $0);
+source "$thisDir/lib/benchmark.sh";
+export PATH=$thisDir/../target/release:$PATH
+
+# clean reads with a minimum length and trim-quality cutoff
+hyperfine --export-json=$reportsDir/clean.json --warmup 2 --shell $SHELL --runs $num_runs \
+    -n "fasten_clean" "cat $large_R1 | fasten_clean --min-length 0 --min-trim-quality 5" \
+    -n "seqtk trimfq" "cat $large_R1 | seqtk trimfq -q 0.1"
+
+plot_whisker.py --title "Clean (reps=$num_runs)" --output $reportsDir/clean.json.png $reportsDir/clean.json
+
+# -n "friends_monica.pl" "friends_monica.pl -i $large_R1 -o /dev/stdout" \
diff --git a/tests/benchmark_convert.sh b/tests/benchmark_convert.sh
new file mode 100644
index 00000000..783ff2ae
--- /dev/null
+++ b/tests/benchmark_convert.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+set -e
+set -u
+
+thisDir=$(dirname $0);
+source "$thisDir/lib/benchmark.sh";
+export PATH=$thisDir/../target/release:$PATH
+
+ls -lh $large_sorted
+
+# Convert fastq=>fasta.
+# Set the width to really long on fq2fa to match the output exactly
+hyperfine --export-json=$reportsDir/convertToFasta.json --warmup 2 --shell $SHELL --runs $num_runs \
+    -n "Fasten convert" "zcat $large_sorted | fasten_convert -i fastq -o fasta " \
+    -n "Seqkit fq2fa -w 0" "zcat $large_sorted | seqkit fq2fa -w 0 " \
+    -n "Seqtk seq -A" "zcat $large_sorted | seqtk seq -A - " \
+    -n "fastx convert" "zcat $large_sorted | fastq_to_fasta -i - -o - -Q33" \
+    -n "seqfu cat" "zcat $large_sorted | seqfu cat - --fasta"
+
+plot_whisker.py --title "Converting fastq to fasta (reps=$num_runs)" --output $reportsDir/convertToFasta.json.png $reportsDir/convertToFasta.json
+
diff --git a/tests/benchmark_head.sh b/tests/benchmark_head.sh
new file mode 100644
index 00000000..4bd33f9d
--- /dev/null
+++ b/tests/benchmark_head.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+set -e
+set -u
+
+thisDir=$(dirname $0);
+source "$thisDir/lib/benchmark.sh";
+export PATH=$thisDir/../target/release:$PATH
+
+# get first 100 reads for any fastq file
+hyperfine --export-json=$reportsDir/straighten.json --warmup 2 --shell $SHELL --runs $num_runs \
+    -n "fasten_straighten" "cat $large_R1 | fasten_straighten | head -n 400" \
+    -n "seqkit head" "cat $large_R1 | seqkit head --number 100" \
+    -n "seqtk seq straighten" "seqtk seq -l0 $large_R1 | head -n 400" \
+    -n "seqfu head" "cat $large_R1 | seqfu head --num 100"
+
+plot_whisker.py --title "Get first 100 reads from fastq (reps=$num_runs)" --output $reportsDir/straighten.json.png $reportsDir/straighten.json
+
diff --git a/tests/benchmark_interleave.sh b/tests/benchmark_interleave.sh
new file mode 100644
index 00000000..e1d9987b
--- /dev/null
+++ b/tests/benchmark_interleave.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+set -e
+set -u
+
+thisDir=$(dirname $0);
+source "$thisDir/lib/benchmark.sh";
+export PATH=$thisDir/../target/release:$PATH
+
+hyperfine --export-json=$reportsDir/interleave.json --warmup 2 --shell $SHELL --runs $num_runs \
+    -n "fasten shuffle" "cat $large_R1 $large_R2 | fasten_shuffle" \
+    -n "seqfu interleave" "seqfu ilv -1 $large_R1 -2 $large_R2" \
+    -n "fasten deshuffle" "zcat $large_interleaved | fasten_shuffle -d" \
+    -n "seqfu deinterleave" "zcat $large_interleaved | seqfu dei - de-interleaved"
+
+plot_whisker.py --title "interleave (reps=$num_runs)" --output $reportsDir/interleave.json.png $reportsDir/interleave.json
+
diff --git a/tests/benchmark_kmer.sh b/tests/benchmark_kmer.sh
new file mode 100644
index 00000000..6198220f
--- /dev/null
+++ b/tests/benchmark_kmer.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+set -e
+set -u
+
+thisDir=$(dirname $0);
+source "$thisDir/lib/benchmark.sh";
+export PATH=$thisDir/../target/release:$PATH
+
+which jellyfish
+
+trap ' { rm jf.jf; } ' EXIT
+
+# count kmers at a range of kmer lengths
+hyperfine --export-json=$reportsDir/kmer.json --warmup 2 --shell $SHELL --runs $num_runs \
+    -n "fasten_kmer k=11" "cat $large_R1 | fasten_kmer --kmer-length 11" \
+    -n "jellyfish k=11" "jellyfish count -m 11 -s 1000000 -o jf.jf $large_R1 && jellyfish dump --column --tab jf.jf" \
+    -n "fasten_kmer k=25" "cat $large_R1 | fasten_kmer --kmer-length 25" \
+    -n "jellyfish k=25" "jellyfish count -m 25 -s 1000000 -o jf.jf $large_R1 && jellyfish dump --column --tab jf.jf" \
+    -n "fasten_kmer k=31" "cat $large_R1 | fasten_kmer --kmer-length 31" \
+    -n "jellyfish k=31" "jellyfish count -m 31 -s 1000000 -o jf.jf $large_R1 && jellyfish dump --column --tab jf.jf" \
+    -n "fasten_kmer k=51" "cat $large_R1 | fasten_kmer --kmer-length 51" \
+    -n "jellyfish k=51" "jellyfish count -m
51 -s 1000000 -o jf.jf $large_R1 && jellyfish dump --column --tab jf.jf" + + +plot_whisker.py --title "Kmer counting (reps=$num_runs)" --output $reportsDir/kmer.json.png $reportsDir/kmer.json + diff --git a/tests/benchmark_normalize.sh b/tests/benchmark_normalize.sh new file mode 100644 index 00000000..f81d041c --- /dev/null +++ b/tests/benchmark_normalize.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -e +set -u + +thisDir=$(dirname $0); +source "$thisDir/lib/benchmark.sh"; +export PATH=$thisDir/../target/release:$PATH + +tmpfile=$(mktemp --dry-run --suffix=.fastq) +trap " { rm $tmpfile; } " EXIT + +hyperfine --export-json=$reportsDir/normalize.json --warmup 2 --shell $SHELL --runs $num_runs \ + -n "fasten_normalize" "cat $large_R1 | fasten_kmer -k 5 --remember-reads | fasten_normalize --target-depth 50 > $tmpfile" \ + -n "BBNorm" "bbnorm.sh in=$large_R1 out=$tmpfile target=50 min=1 threads=1 k=5" + +plot_whisker.py --title "Normalize kmer depth (reps=$num_runs)" --output $reportsDir/normalize.json.png $reportsDir/normalize.json + diff --git a/tests/benchmark_progress-meter.sh b/tests/benchmark_progress-meter.sh new file mode 100644 index 00000000..03cf003d --- /dev/null +++ b/tests/benchmark_progress-meter.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -e +set -u + +thisDir=$(dirname $0); +source "$thisDir/lib/benchmark.sh"; +export PATH=$thisDir/../target/release:$PATH + +# Fasten_progress time trial +hyperfine --export-json=$reportsDir/progress.json --warmup 2 --shell $SHELL --runs $num_runs \ + -n "Basic fasten_shuffle" "cat $large_R1 $large_R2 | fasten_shuffle > /dev/null" \ + -n "Shuffle with progress bar before" "cat $large_R1 $large_R2 | fasten_progress --print 2>/dev/null | fasten_shuffle > /dev/null" \ + -n "Shuffle with progress bar after" "cat $large_R1 $large_R2 | fasten_shuffle | fasten_progress 2>/dev/null > /dev/null" + +plot_whisker.py --title "Interleave reads with or without a progress meter (reps=$num_runs)" --labels without,before,after --output $reportsDir/progress.json.png $reportsDir/progress.json + diff --git a/tests/benchmark_regex.sh b/tests/benchmark_regex.sh new file mode 100644 index 00000000..2ad44f40 --- /dev/null +++ b/tests/benchmark_regex.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -e +set -u + +thisDir=$(dirname $0); +source "$thisDir/lib/benchmark.sh"; +export PATH=$thisDir/../target/release:$PATH + +# Grep for seq +pattern="CCCC" +pattern2="ATG" +hyperfine --export-json=$reportsDir/regex.json --warmup 2 --shell $SHELL --runs $num_runs \ + -n "fasten_regex $pattern" "cat $large_R1 | fasten_regex --regex '$pattern' --which SEQ" \ + -n "fasten_regex $pattern2" "cat $large_R1 | fasten_regex --regex '$pattern2' --which SEQ" \ + -n "seqkit grep $pattern" "cat $large_R1 | seqkit grep --by-seq --pattern '$pattern'" \ + -n "seqkit grep $pattern2" "cat $large_R1 | seqkit grep --by-seq --pattern '$pattern2'" \ + -n "seqfu grep $pattern" "seqfu grep -r '$pattern' $large_R1" \ + -n "seqfu grep $pattern2" "seqfu grep -r '$pattern2' $large_R1" \ + +plot_whisker.py --title "Regex on a sequence (reps=$num_runs)" --output $reportsDir/regex.json.png $reportsDir/regex.json + diff --git a/tests/benchmark_sample.sh b/tests/benchmark_sample.sh new file mode 100644 index 00000000..944c680f --- /dev/null +++ b/tests/benchmark_sample.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -e +set -u + +thisDir=$(dirname $0); +source "$thisDir/lib/benchmark.sh"; +export PATH=$thisDir/../target/release:$PATH + +hyperfine --export-json=$reportsDir/sample.json --warmup 2 --shell $SHELL --runs $num_runs \ + -n 
"Fasten sample" "cat $large_R1 | fasten_sample --frequency 0.1" \ + -n "seqkit sample" "cat $large_R1 | seqkit sample --proportion 0.1" \ + -n "Seqtk sample" "seqtk seq -f 0.1 $large_R1"; + +plot_whisker.py --title "subsample reads (reps=$num_runs)" --labels "fasten sample,seqkit sample,seqtk sample" --output $reportsDir/sample.json.png $reportsDir/sample.json + diff --git a/tests/benchmark_sort.sh b/tests/benchmark_sort.sh new file mode 100644 index 00000000..7602a114 --- /dev/null +++ b/tests/benchmark_sort.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +set -e +set -u + +thisDir=$(dirname $0); +source "$thisDir/lib/benchmark.sh"; +export PATH=$thisDir/../target/release:$PATH + +# sorting +hyperfine --export-json=$reportsDir/sort.json --warmup 2 --shell $SHELL --runs $num_runs \ + -n "Fasten sort id" "cat $large_R1 | fasten_sort --sort-by id > /dev/null" \ + -n "Fasten sort seq" "cat $large_R1 | fasten_sort --sort-by seq > /dev/null" \ + -n "Seqkit sort id" "cat $large_R1 | seqkit sort > /dev/null" \ + -n "Seqkit sort seq" "cat $large_R1 | seqkit sort --by-seq > /dev/null" \ + -n "unix pipe sort id" "cat $large_R1 | paste - - - - | sort -k 1 | tr '\t' '\n' > /dev/null" \ + -n "unix pipe sort seq" "cat $large_R1 | paste - - - - | sort -k 2 | tr '\t' '\n' > /dev/null" + +plot_whisker.py --title "Sort sequences (reps=$num_runs)" --output $reportsDir/sort.json.png $reportsDir/sort.json + diff --git a/tests/benchmark_trim.sh b/tests/benchmark_trim.sh new file mode 100644 index 00000000..a905fe2d --- /dev/null +++ b/tests/benchmark_trim.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -e +set -u + +thisDir=$(dirname $0); +source "$thisDir/lib/benchmark.sh"; +export PATH=$thisDir/../target/release:$PATH + +# get first 100 reads for any fastq file +hyperfine --export-json=$reportsDir/trim.json --warmup 2 --shell $SHELL --runs $num_runs \ + -n "fasten_trim" "cat $large_R1 | fasten_trim -f 5 -l 10" \ + -n "seqtk trimfq" "cat $large_R1 | seqtk trimfq -b 5 -e 90" \ + -n "seqfu cat" "cat $large_R1 | seqfu cat --trim-front 5 --trim-tail 90" + +plot_whisker.py --title "Trim (reps=$num_runs)" --output $reportsDir/trim.json.png $reportsDir/trim.json + diff --git a/tests/deprecated/10_benchmark.sh b/tests/deprecated/10_benchmark.sh new file mode 100644 index 00000000..32169e11 --- /dev/null +++ b/tests/deprecated/10_benchmark.sh @@ -0,0 +1,107 @@ +#!/bin/bash + +set -e + +thisDir=$(dirname $0); +export PATH=$thisDir/../target/release:$PATH + +# Hyperfine parameters +# Locally, just run a handful of times per test +# but in the cloud, boost it to ten +num_runs=20 +if [[ ! -z "$CI" ]]; then + num_runs=100 +fi +# output directory for markdown files +reportsDir="$thisDir/hyperfine" +mkdir -pv $reportsDir + +thisDir=$(dirname $(realpath $0)) + +R1="$thisDir/../testdata/R1.fastq" +R2="$thisDir/../testdata/R2.fastq" + +large_R1="$thisDir/../testdata/R1.large.fastq" +large_R2="$thisDir/../testdata/R2.large.fastq" + +large_interleaved="$thisDir/../testdata/shuffled.large.fastq.gz" +large_sorted="$thisDir/../testdata/shuffled.sorted.fastq.gz" + +# Just make it simple locally but a large number online +multiplier=1000 +if [[ ! 
-z "$CI" ]]; then + multiplier=100000; +fi +R1_content=`cat $R1`; +R2_content=`cat $R2`; +for i in `seq 1 $multiplier`; do + echo "$R1_content" +done > $large_R1 +for i in `seq 1 $multiplier`; do + echo "$R2_content" +done > $large_R2 + +# Version information +seqtk 2>&1 | grep -i version | sed 's/^/seqtk /' +seqkit version | grep -m 1 v +fasten_clean --version + +# Save this file even though it's not used in the first benchmark +cat $R1 $R2 | fasten_shuffle | gzip -c9 > $large_interleaved +ls -lhS $large_interleaved $R1 $R2 + +# Fasten_progress time trial +hyperfine --export-json=$reportsDir/progress.json --warmup 2 --shell $SHELL --runs $num_runs \ + -n "Basic fasten_shuffle" "cat $large_R1 $large_R2 | fasten_shuffle > /dev/null" \ + -n "Shuffle with progress bar before" "cat $large_R1 $large_R2 | fasten_progress --print 2>/dev/null | fasten_shuffle > /dev/null" \ + -n "Shuffle with progress bar after" "cat $large_R1 $large_R2 | fasten_shuffle | fasten_progress 2>/dev/null > /dev/null" + +plot_whisker.py --title "Interleave reads with or without a progress meter (reps=$num_runs)" --labels without,before,after --output $reportsDir/progress.json.png $reportsDir/progress.json + +hyperfine --export-json=$reportsDir/sample.json --warmup 2 --shell $SHELL --runs $num_runs \ + -n "Fasten sample" "cat $large_R1 | fasten_sample --frequency 0.1" \ + -n "Seqtk sample" "seqtk seq -f 0.1 $large_R1"; + +plot_whisker.py --title "subsample reads" --labels "fasten sample, seqtk sample (reps=$num_runs)" --output $reportsDir/sample.json.png $reportsDir/sample.json + +# Convert fastq=>fasta. +# Set the width to really long on fq2fa to match the output exactly +hyperfine --export-json=$reportsDir/convertToFasta.json --warmup 2 --shell $SHELL --runs $num_runs \ + -n "Fasten convert" "zcat $large_sorted | fasten_convert -i fastq -o fasta " \ + -n "Seqkit fq2fa -w 0" "zcat $large_sorted | seqkit fq2fa " + #-n "Seqtk seq -A" "seqtk seq -A $large_sorted" + +plot_whisker.py --title "Converting fastq to fasta (reps=$num_runs)" --output $reportsDir/convertToFasta.json.png $reportsDir/convertToFasta.json + +# get first 100 reads for any fastq file +hyperfine --export-json=$reportsDir/straighten.json --warmup 2 --shell $SHELL --runs $num_runs \ + -n "fasten_straighten" "cat $large_R1 | fasten_straighten | head -n 400" \ + -n "seqkit head" "cat $large_R1 | seqkit head --number 100" \ + -n "seqtk seq straighten" "seqtk seq -l0 $large_R1 | head -n 400" + +plot_whisker.py --title "Get first 100 reads from fastq (reps=$num_runs)" --output $reportsDir/straighten.json.png $reportsDir/straighten.json + +# Grep for seq +pattern="CCCC" +pattern2="ATG" +hyperfine --export-json=$reportsDir/regex.json --warmup 2 --shell $SHELL --runs $num_runs \ + -n "fasten_regex $pattern" "cat $large_R1 | fasten_regex --regex '$pattern' --which SEQ > /dev/null" \ + -n "fasten_regex $pattern2" "cat $large_R1 | fasten_regex --regex '$pattern2' --which SEQ > /dev/null" \ + -n "seqkit grep $pattern" "cat $large_R1 | seqkit grep --by-seq --pattern '$pattern' > /dev/null" \ + -n "seqkit grep $pattern2" "cat $large_R1 | seqkit grep --by-seq --pattern '$pattern2' > /dev/null" + +plot_whisker.py --title "Regex on a sequence (reps=$num_runs)" --output $reportsDir/regex.json.png $reportsDir/regex.json + +# Fasten sort file size +zcat $large_interleaved | fasten_sort --sort-by GC --paired-end | gzip -c > $large_sorted +ls -lh $large_interleaved $large_sorted + +# sorting +hyperfine --export-json=$reportsDir/sort.json --warmup 2 --shell $SHELL --runs 
$num_runs \
+    -n "Fasten sort id" "zcat $large_R1 | fasten_sort --sort-by id > /dev/null" \
+    -n "Fasten sort seq" "zcat $large_R1 | fasten_sort --sort-by seq > /dev/null" \
+    -n "Seqkit sort id" "zcat $large_R1 | seqkit sort > /dev/null" \
+    -n "Seqkit sort seq" "zcat $large_R1 | seqkit sort --by-seq > /dev/null"
+
+plot_whisker.py --title "Sort sequences (reps=$num_runs)" --output $reportsDir/sort.json.png $reportsDir/sort.json
+
diff --git a/tests/lib/benchmark.sh b/tests/lib/benchmark.sh
new file mode 100644
index 00000000..ca593510
--- /dev/null
+++ b/tests/lib/benchmark.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+set -e
+set -u
+
+# Hyperfine parameters
+# Locally, just run a handful of times per test,
+# but in CI, boost the count
+num_runs=20
+# How many times to multiply the four-read test files to make large ones
+multiplier=1000
+if [[ ! -z ${CI+x} ]]; then
+    num_runs=1000
+    multiplier=10000;
+fi
+
+if [[ -z ${thisDir+x} ]]; then
+    echo "ERROR: \$thisDir needs to be set up in a shell script, and then this library should be sourced."
+    exit 1
+fi
+
+# output directory for markdown files
+reportsDir="$thisDir/hyperfine"
+mkdir -pv $reportsDir
+
+R1="$thisDir/../testdata/R1.fastq"
+R2="$thisDir/../testdata/R2.fastq"
+
+large_R1="$thisDir/../testdata/R1.large.fastq"
+large_R2="$thisDir/../testdata/R2.large.fastq"
+
+large_interleaved="$thisDir/../testdata/shuffled.large.fastq.gz"
+large_sorted="$thisDir/../testdata/shuffled.sorted.fastq.gz"
+
+R1_content=`cat $R1`;
+R2_content=`cat $R2`;
+for i in `seq 1 $multiplier`; do
+    echo "$R1_content"
+done | seqkit rename > $large_R1
+for i in `seq 1 $multiplier`; do
+    echo "$R2_content"
+done | seqkit rename > $large_R2
+
+# Save this file even though it's not used in the first benchmark
+cat $R1 $R2 | fasten_shuffle | gzip -c9 > $large_interleaved
+#ls -lhS $large_interleaved $R1 $R2
+
+# Fasten sort file size
+zcat $large_interleaved | fasten_sort --sort-by GC --paired-end | gzip -c > $large_sorted
+#ls -lh $large_interleaved $large_sorted
+
+which bbnorm.sh
+which fasten_clean
+
+# Version information
+seqtk 2>&1 | grep -i version | sed 's/^/seqtk /'
+seqkit version | grep -m 1 v
+fasten_clean --version
+fastq_to_fasta -h | grep "Part of FASTX"
+bbnorm.sh version 2>&1 | grep 'BBMap version'
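The paper and benchmark scripts above center on Fasten's design of one small executable per task, each reading fastq on `stdin` and writing to `stdout`. As a minimal sketch of how those pieces compose into a Unix pipeline — using only tools and flags that appear elsewhere in this diff, with placeholder input and output file names — a paired-end preprocessing run might look like:

#!/bin/bash
# Sketch only: R1.fastq/R2.fastq and the output name are placeholders;
# the flags mirror the benchmark scripts above rather than additional
# documented options.
set -eu

# Interleave R1/R2 into one stream, quality-trim each read,
# then keep roughly 10% of the entries.
cat R1.fastq R2.fastq \
    | fasten_shuffle \
    | fasten_clean --min-length 0 --min-trim-quality 5 \
    | fasten_sample --frequency 0.1 \
    > subsampled.interleaved.fastq

Because every stage is stdin-to-stdout, any step can be swapped out, or monitored with fasten_progress as in tests/benchmark_progress-meter.sh, without touching the rest of the pipeline.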