diff --git a/Cargo.toml b/Cargo.toml index a077e66..9760e3c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,8 @@ [package] name = "fastleng" -version = "0.1.1" -authors = ["holtjma "] -edition = "2018" +version = "0.2.0" +authors = ["holtjma "] +edition = "2021" license = "MIT OR Apache-2.0" description = "fastleng - read length statistics tool" homepage = "https://github.com/HudsonAlpha/rust-fastleng" @@ -18,6 +18,7 @@ env_logger = "0.9.0" exitcode = "1.1.2" log = "0.4.14" needletail = "0.4.1" +rust-htslib = { version = "0.39.5", default-features = false, features = ["static"] } serde = { version = "1.0.129", features = ["derive"] } serde_json = "1.0.66" diff --git a/README.md b/README.md index ffc1bf2..c1d0a94 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![Build status](https://github.com/HudsonAlpha/rust-fastleng/actions/workflows/quickstart-ci.yml/badge.svg)](https://github.com/HudsonAlpha/rust-fastleng/actions) # rust-fastleng -`fastleng` is a tool created specifically for gathering sequence length information from a FASTQ or FASTA file. +`fastleng` is a tool created specifically for gathering sequence length information from a FASTQ, FASTA, or unaligned BAM file. ### Why another FASTX stat tool? While there are numerous tools that will generate summary statistics for FASTX files, I was not able to find one that computed all the desired length metrics for _both_ FASTQ and FASTA. @@ -45,20 +45,22 @@ fastleng {data.fq.gz} > {output.json} ### Example output ``` { - "total_bases": 1358218298, - "total_sequences": 100000, - "mean_length": 13582.18298, - "median_length": 13664.0, - "n50": 13775, - "n75": 13027, - "n90": 12543 + "total_bases": 21750112406, + "total_sequences": 1305936, + "mean_length": 16654.807284583625, + "median_length": 16600.0, + "n10": 18849, + "n25": 17833, + "n50": 16739, + "n75": 15842, + "n90": 15209 } ``` 1. `total_bases` - the total number of basepairs across all sequences in the input file 2. 
`total_sequences` - the total number of sequences (i.e. strings) contained in the input file 3. `mean_length` - the average length of the counted sequences 4. `median_length` - the median length of the counted sequences -5. `n50`, `n75`, `n90` - the [N-score](https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics) of the sequences for 50, 75, and 90 respectively +5. `n10`, `n25`, `n50`, `n75`, `n90` - the [N-score](https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics) of the sequences for 10, 25, 50, 75, and 90 respectively; these values should be monotonically non-increasing in that order ### Options to consider 1. `-h` - see full list of options and exit @@ -89,4 +91,4 @@ at your option. ## Contribution Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be -dual licensed as above, without any additional terms or conditions. \ No newline at end of file +dual licensed as above, without any additional terms or conditions. diff --git a/src/bam_loader.rs b/src/bam_loader.rs new file mode 100644 index 0000000..875cbda --- /dev/null +++ b/src/bam_loader.rs @@ -0,0 +1,225 @@ + +use log::{info, warn}; +use rust_htslib::{bam, bam::Read}; +use std::collections::BTreeMap; + +/// This is the main function for gathering all sequence lengths for a BAM/SAM file into a BTreeMap. +/// # Arguments +/// * `filename` - the filename to read sequences from +/// # Examples +/// ``` +/// use std::collections::BTreeMap; +/// use fastleng::bam_loader::gather_bam_stats; +/// let filename = "./test_data/single_string.sam"; +/// let counts: BTreeMap = gather_bam_stats(&filename).unwrap(); +/// ``` +pub fn gather_bam_stats(filename: &str) -> Result, Box> { + gather_bam_stats_with_seed(filename, None) +} + +/// This will gather sequence lengths from a filename and add them to a provided BTreeMap (`initial_counts`).
+/// # Arguments +/// * `filename` - the filename to read sequences from +/// * `initial_counts` - if provided, this will use that BTreeMap as the initial counts, otherwise it will create an empty one +/// # Examples +/// ``` +/// use std::collections::BTreeMap; +/// use fastleng::bam_loader::gather_bam_stats_with_seed; +/// let filename = "./test_data/single_string.sam"; +/// let initial_counts: BTreeMap = BTreeMap::new(); +/// let counts: BTreeMap = gather_bam_stats_with_seed(&filename, Some(initial_counts)).unwrap(); +/// ``` +pub fn gather_bam_stats_with_seed(filename: &str, initial_counts: Option>) -> Result, Box> { + //create an empty stats file (or use initial counts) and ready the reader + let mut hash_stats: BTreeMap = match initial_counts { + Some(ic) => ic, + None => BTreeMap::new() + }; + let mut reader = bam::Reader::from_path(filename)?; + + //go through all the records + let mut warning_triggered = false; + let mut count: usize = 0; + info!("Loading file \"{}\"...", filename); + for read_entry in reader.records() { + //all we care about is the sequence length + let record = read_entry?; + let seq_len: usize = record.seq_len(); + + if !warning_triggered && !record.is_unmapped() { + // user gave us an aligned file, spit out a one-time warning + warn!("Detected aligned reads, this is not properly handled: {filename}"); + warning_triggered = true; + } + + //insert 0 if absent; then increment + let len_count: &mut u64 = hash_stats.entry(seq_len).or_insert(0); + *len_count += 1; + + count += 1; + if count % 1000000 == 0 { + info!("Processed {} sequences", count); + } + } + info!("Finished loading file with {} sequences.", count); + + //return the full count list now + Ok(hash_stats) +} + +#[cfg(test)] +mod tests { + use super::*; + + // allows us to test a bunch at once + use crate::fastx_loader::gather_multifastx_stats; + + /// This one is a single sequence "A" + fn stats_basic_bam() -> BTreeMap { + let mut results: BTreeMap = BTreeMap::new(); +
results.insert(1, 1); + results + } + + /// one of each length from 1-5 + fn stats_basic_bam2() -> BTreeMap { + let mut results: BTreeMap = BTreeMap::new(); + for l in 1..6 { + results.insert(l, 1); + } + results + } + + /// mix of a few lengths from 1-4 + fn stats_basic_bam3() -> BTreeMap { + let mut results: BTreeMap = BTreeMap::new(); + results.insert(1, 3); + results.insert(2, 2); + results.insert(3, 1); + results.insert(4, 2); + results + } + + /// some longer strings + fn stats_basic_bam4() -> BTreeMap { + let mut results: BTreeMap = BTreeMap::new(); + results.insert(50, 2); + results.insert(100, 2); + results.insert(150, 2); + results.insert(1000, 1); + results + } + + #[test] + fn test_basic_sam() { + //build some inputs + let filename = "./test_data/single_string.sam"; + + //get the expected outputs + let expected = stats_basic_bam(); + + //now do it for real + let hash_stats = gather_bam_stats(&filename).unwrap(); + assert_eq!(hash_stats, expected); + } + + #[test] + fn test_basic_sam2() { + //build some inputs + let filename = "./test_data/five_strings.sam"; + + //get the expected outputs + let expected = stats_basic_bam2(); + + //now do it for real + let hash_stats = gather_bam_stats(&filename).unwrap(); + assert_eq!(hash_stats, expected); + } + + #[test] + fn test_basic_sam3() { + //build some inputs + let filename = "./test_data/small_strings.sam"; + + //get the expected outputs + let expected = stats_basic_bam3(); + + //now do it for real + let hash_stats = gather_bam_stats(&filename).unwrap(); + assert_eq!(hash_stats, expected); + } + + #[test] + fn test_basic_sam4() { + //build some inputs + let filename = "./test_data/long_strings.sam"; + + //get the expected outputs + let expected = stats_basic_bam4(); + + //now do it for real + let hash_stats = gather_bam_stats(&filename).unwrap(); + assert_eq!(hash_stats, expected); + } + + #[test] + fn test_basic_bam4() { + //build some inputs + let filename = "./test_data/long_strings.bam"; + + //get the 
expected outputs + let expected = stats_basic_bam4(); + + //now do it for real + let hash_stats = gather_bam_stats(&filename).unwrap(); + assert_eq!(hash_stats, expected); + } + + #[test] + #[should_panic] + fn test_error_handling() { + let filename = "./test_data/panic_file.fa"; + let _hash_stats = gather_bam_stats(&filename).unwrap(); + } + + #[test] + fn test_multifastx() { + let filenames = [ + "./test_data/single_string.sam", + "./test_data/five_strings.sam", + "./test_data/small_strings.sam", + "./test_data/long_strings.bam" + ]; + + //get the expected outputs + let expected_list = [ + stats_basic_bam(), + stats_basic_bam2(), + stats_basic_bam3(), + stats_basic_bam4() + ]; + + //sum the expected outputs + let mut expected: BTreeMap = BTreeMap::new(); + for results in expected_list.iter() { + for (key, value) in results.iter() { + let len_count: &mut u64 = expected.entry(*key).or_insert(0); + *len_count += value; + } + } + + //now do it for real + let hash_stats = gather_multifastx_stats(&filenames).unwrap(); + assert_eq!(hash_stats, expected); + } + + #[test] + #[should_panic] + fn test_multifastx_error_handling() { + let filenames = [ + "./test_data/single_string.bam", + "./test_data/panic_file.fa" + ]; + let _hash_stats = gather_multifastx_stats(&filenames).unwrap(); + } +} \ No newline at end of file diff --git a/src/fastx_loader.rs b/src/fastx_loader.rs index 4d1da87..09911fd 100644 --- a/src/fastx_loader.rs +++ b/src/fastx_loader.rs @@ -1,11 +1,10 @@ -extern crate log; -extern crate needletail; - use log::{error, info}; use needletail::parse_fastx_file; use std::collections::BTreeMap; +use crate::bam_loader::gather_bam_stats_with_seed; + /// This is the main function for gathering all sequence lengths for a fastx file into a BTreeMap. 
/// # Arguments /// * `filename` - the filename to read sequences from @@ -38,7 +37,7 @@ pub fn gather_fastx_stats_with_seed(filename: &str, initial_counts: Option ic, None => BTreeMap::new() }; - let mut reader = parse_fastx_file(&filename)?; + let mut reader = parse_fastx_file(filename)?; //go through all the records let mut count: usize = 0; @@ -84,14 +83,26 @@ pub fn gather_multifastx_stats + std::fmt::Debug>(filenames: &[T]) */ let mut hash_stats: BTreeMap = BTreeMap::new(); for filename in filenames.iter() { - hash_stats = match gather_fastx_stats_with_seed(filename.as_ref(), Some(hash_stats)) { - Ok(result) => result, - Err(e) => { - error!("Error while parsing FASTX file: {:?}", filename); - error!("Error: {:?}", e); - return Err(e); - } - }; + if filename.as_ref().ends_with(".bam") || filename.as_ref().ends_with(".sam") { + hash_stats = match gather_bam_stats_with_seed(filename.as_ref(), Some(hash_stats)) { + Ok(result) => result, + Err(e) => { + error!("Error while parsing BAM file: {:?}", filename); + error!("Error: {:?}", e); + return Err(e); + } + }; + } + else { + hash_stats = match gather_fastx_stats_with_seed(filename.as_ref(), Some(hash_stats)) { + Ok(result) => result, + Err(e) => { + error!("Error while parsing FASTX file: {:?}", filename); + error!("Error: {:?}", e); + return Err(e); + } + }; + } } Ok(hash_stats) } @@ -226,6 +237,37 @@ mod tests { assert_eq!(hash_stats, expected); } + #[test] + fn test_multimixed() { + let filenames = [ + "./test_data/single_string.fa", + "./test_data/five_strings.sam", + "./test_data/small_strings.fa", + "./test_data/long_strings.bam" + ]; + + //get the expected outputs + let expected_list = [ + stats_basic_fasta(), + stats_basic_fasta2(), + stats_basic_fasta3(), + stats_basic_fasta4() + ]; + + //sum the expected outputs + let mut expected: BTreeMap = BTreeMap::new(); + for results in expected_list.iter() { + for (key, value) in results.iter() { + let len_count: &mut u64 = expected.entry(*key).or_insert(0); + 
*len_count += value; + } + } + + //now do it for real + let hash_stats = gather_multifastx_stats(&filenames).unwrap(); + assert_eq!(hash_stats, expected); + } + #[test] #[should_panic] fn test_multifastx_error_handling() { diff --git a/src/length_stats.rs b/src/length_stats.rs index 766d9c4..9a8646b 100644 --- a/src/length_stats.rs +++ b/src/length_stats.rs @@ -112,6 +112,10 @@ pub struct LengthStats { pub mean_length: f64, /// The median length of the sequences pub median_length: f64, + /// N10 - 10% of bases are in sequences of length greater than this value + pub n10: usize, + /// N25 - 25% of bases are in sequences of length greater than this value + pub n25: usize, /// N50 - 50% of bases are in sequences of length greater than this value pub n50: usize, /// N75 - 75% of bases are in sequences of length greater than this value @@ -139,6 +143,8 @@ pub fn compute_length_stats(length_counts: &BTreeMap) -> LengthStats //first get all the totals let (total_bases, total_seqs): (u64, u64) = compute_total_counts(length_counts); let median_length: f64 = compute_median_length(length_counts, total_seqs); + let n10: usize = compute_n_score(length_counts, total_bases, 10); + let n25: usize = compute_n_score(length_counts, total_bases, 25); let n50: usize = compute_n_score(length_counts, total_bases, 50); let n75: usize = compute_n_score(length_counts, total_bases, 75); let n90: usize = compute_n_score(length_counts, total_bases, 90); @@ -149,6 +155,8 @@ pub fn compute_length_stats(length_counts: &BTreeMap) -> LengthStats total_sequences: total_seqs, mean_length: (total_bases as f64) / (total_seqs as f64), median_length, + n10, + n25, n50, n75, n90 @@ -308,6 +316,8 @@ mod tests { total_sequences: 100, mean_length: 10.0, median_length: 10.0, + n10: 10, + n25: 10, n50: 10, n75: 10, n90: 10 diff --git a/src/lib.rs b/src/lib.rs index 6cefed4..0963872 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,7 +16,8 @@ let length_counts: BTreeMap = gather_fastx_stats(&filename).unwrap() 
let length_metrics: LengthStats = compute_length_stats(&length_counts); ``` */ - +/// Contains the logic for loading length information from an unaligned BAM/SAM file +pub mod bam_loader; /// Contains the logic for loading length information from a fastx file pub mod fastx_loader; /// Contains the logic for calculating the summary statistics from the counts diff --git a/src/main.rs b/src/main.rs index 7c8d604..4caf8e9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -20,8 +20,8 @@ fn main() { let matches = App::new("fastleng") .version(VERSION.unwrap_or("?")) - .author("J. Matthew Holt ") - .about("fastleng - a sequence length statistics generator for fastx files") + .author("J. Matthew Holt ") + .about("fastleng - a sequence length statistics generator for fastx/uBAM files") .arg( Arg::with_name("out_json") .short("o") @@ -38,7 +38,7 @@ fn main() { ) .arg( Arg::with_name("FASTX") - .help("The FASTQ/A file(s) to gather stats on, gzip accepted") + .help("The FASTQ/A or uBAM file(s) to gather stats on, gzip accepted") .required(true) .multiple(true) .index(1) @@ -60,7 +60,7 @@ fn main() { match File::open(fastx_fn) { Ok(_) => {} Err(e) => { - error!("Failed to open FASTX file: {:?}", fastx_fn); + error!("Failed to open file: {:?}", fastx_fn); error!("Error: {:?}", e); std::process::exit(exitcode::NOINPUT); } @@ -93,7 +93,7 @@ fn main() { let length_counts: BTreeMap = match gather_multifastx_stats(&fastx_fns) { Ok(result) => result, Err(e) => { - error!("Error while parsing FASTX files: {:?}", fastx_fns); + error!("Error while parsing input files: {:?}", fastx_fns); error!("Error: {:?}", e); std::process::exit(exitcode::IOERR); } @@ -108,7 +108,7 @@ fn main() { //this is what we should put in the file if out_fn == "stdout" { let pretty_json: String = serde_json::to_string_pretty(&length_metrics).unwrap(); - println!("{}", pretty_json); + println!("{pretty_json}"); } else { info!("Saving results to file: {:?}", out_fn); diff --git a/test_data/five_strings.sam 
b/test_data/five_strings.sam new file mode 100644 index 0000000..904b272 --- /dev/null +++ b/test_data/five_strings.sam @@ -0,0 +1,6 @@ +@HD VN:1.5 SO:unknown pb:3.0.1 +m64109_200805_204709/1/ccs 4 * 0 255 * * 0 0 A ~ +m64109_200805_204709/2/ccs 4 * 0 255 * * 0 0 AA ~~ +m64109_200805_204709/3/ccs 4 * 0 255 * * 0 0 AAA ~~~ +m64109_200805_204709/4/ccs 4 * 0 255 * * 0 0 AAAA ~~~~ +m64109_200805_204709/5/ccs 4 * 0 255 * * 0 0 AAAAA ~~~~~ \ No newline at end of file diff --git a/test_data/long_strings.bam b/test_data/long_strings.bam new file mode 100644 index 0000000..53e2cc2 Binary files /dev/null and b/test_data/long_strings.bam differ diff --git a/test_data/long_strings.sam b/test_data/long_strings.sam new file mode 100644 index 0000000..5843ffd --- /dev/null +++ b/test_data/long_strings.sam @@ -0,0 +1,8 @@ +@HD VN:1.5 SO:unknown pb:3.0.1 +m64109_200805_204709/1/ccs 4 * 0 255 * * 0 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +m64109_200805_204709/2/ccs 4 * 0 255 * * 0 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +m64109_200805_204709/3/ccs 4 * 0 255 * * 0 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +m64109_200805_204709/4/ccs 4 * 0 255 * * 0 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +m64109_200805_204709/5/ccs 4 * 0 255 * * 0 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
+m64109_200805_204709/6/ccs 4 * 0 255 * * 0 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +m64109_200805_204709/7/ccs 4 * 0 255 * * 0 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ \ No newline at end of file diff --git a/test_data/single_string.sam b/test_data/single_string.sam new file mode 100644 index 0000000..da315c5 --- /dev/null +++ b/test_data/single_string.sam @@ -0,0 +1,2 @@ +@HD VN:1.5 SO:unknown pb:3.0.1 +m64109_200805_204709/1/ccs 4 * 0 255 * * 0 0 A ~ \ No newline at end of file diff --git a/test_data/small_strings.sam b/test_data/small_strings.sam new file mode 100644 index 0000000..bbbfc3f --- /dev/null +++ b/test_data/small_strings.sam @@ -0,0 +1,9 @@ +@HD VN:1.5 SO:unknown pb:3.0.1 +m64109_200805_204709/1/ccs 4 * 0 255 * * 0 0 AAAA ~~~~ +m64109_200805_204709/2/ccs 4 * 0 255 * * 0 0 AA ~~ +m64109_200805_204709/3/ccs 4 * 0 255 * * 0 0 A ~ +m64109_200805_204709/4/ccs 4 * 0 255 * * 0 0 AA ~~ +m64109_200805_204709/5/ccs 4 * 0 255 * * 0 0 A ~ +m64109_200805_204709/6/ccs 4 * 0 255 * * 0 0 AAA ~~~ +m64109_200805_204709/7/ccs 4 * 0 255 * * 0 0 AAAA ~~~~ +m64109_200805_204709/8/ccs 4 * 0 255 * * 0 0 A ~ \ No newline at end of file