Skip to content

Commit

Permalink
Merge pull request #6 from HudsonAlpha/ubam-integration
Browse files Browse the repository at this point in the history
Ubam integration
  • Loading branch information
holtjma authored Apr 20, 2023
2 parents 5123a5e + 9c525b4 commit 4be9085
Show file tree
Hide file tree
Showing 12 changed files with 338 additions and 32 deletions.
7 changes: 4 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
[package]
name = "fastleng"
version = "0.1.1"
authors = ["holtjma <[email protected]>"]
edition = "2018"
version = "0.2.0"
authors = ["holtjma <[email protected]>"]
edition = "2021"
license = "MIT OR Apache-2.0"
description = "fastleng - read length statistics tool"
homepage = "https://github.com/HudsonAlpha/rust-fastleng"
Expand All @@ -18,6 +18,7 @@ env_logger = "0.9.0"
exitcode = "1.1.2"
log = "0.4.14"
needletail = "0.4.1"
rust-htslib = { version = "0.39.5", default-features = false, features = ["static"] }
serde = { version = "1.0.129", features = ["derive"] }
serde_json = "1.0.66"

Expand Down
22 changes: 12 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
[![Build status](https://github.com/HudsonAlpha/rust-fastleng/actions/workflows/quickstart-ci.yml/badge.svg)](https://github.com/HudsonAlpha/rust-fastleng/actions)

# rust-fastleng
`fastleng` is a tool created specifically for gathering sequence length information from a FASTQ or FASTA file.
`fastleng` is a tool created specifically for gathering sequence length information from a FASTQ, FASTA, or unaligned BAM file.

### Why another FASTX stat tool?
While there are numerous tools that will generate summary statistics for FASTX files, I was not able to find one that computed all the desired length metrics for _both_ FASTQ and FASTA.
Expand Down Expand Up @@ -45,20 +45,22 @@ fastleng {data.fq.gz} > {output.json}
### Example output
```
{
"total_bases": 1358218298,
"total_sequences": 100000,
"mean_length": 13582.18298,
"median_length": 13664.0,
"n50": 13775,
"n75": 13027,
"n90": 12543
"total_bases": 21750112406,
"total_sequences": 1305936,
"mean_length": 16654.807284583625,
"median_length": 16600.0,
"n10": 18849,
"n25": 17833,
"n50": 16739,
"n75": 15842,
"n90": 15209
}
```
1. `total_bases` - the total number of basepairs across all sequences in the input file
2. `total_sequences` - the total number of sequences (i.e. strings) contained in the input file
3. `mean_length` - the average length of the counted sequences
4. `median_length` - the median length of the counted sequences
5. `n50`, `n75`, `n90` - the [N-score](https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics) of the sequences for 50, 75, and 90 respectively
5. `n10`, `n25`, `n50`, `n75`, `n90` - the [N-score](https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics) of the sequences for 10, 25, 50, 75, and 90 respectively; these should be monotonically decreasing, respectively

### Options to consider
1. `-h` - see full list of options and exit
Expand Down Expand Up @@ -89,4 +91,4 @@ at your option.
## Contribution
Unless you explicitly state otherwise, any contribution intentionally submitted
for inclusion in the work by you, as defined in the Apache-2.0 license, shall be
dual licensed as above, without any additional terms or conditions.
dual licensed as above, without any additional terms or conditions.
225 changes: 225 additions & 0 deletions src/bam_loader.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@

use log::{info, warn};
use rust_htslib::{bam, bam::Read};
use std::collections::BTreeMap;

/// This is the main function for gathering all sequence lengths for a fastx file into a BTreeMap.
/// # Arguments
/// * `filename` - the filename to read sequences from
/// # Examples
/// ```
/// use std::collections::BTreeMap;
/// use fastleng::bam_loader::gather_bam_stats;
/// let filename = "./test_data/single_string.sam";
/// let counts: BTreeMap<usize, u64> = gather_bam_stats(&filename).unwrap();
/// ```
pub fn gather_bam_stats(filename: &str) -> Result<BTreeMap<usize, u64>, Box<dyn std::error::Error>> {
gather_bam_stats_with_seed(filename, None)
}

/// This will gather sequence lengths from a filename and add them to a provided BTreeMap (`initial_counts`).
/// # Arguments
/// * `filename` - the filename to read sequences from
/// * `initial_counts` - if provided, this will use that BTreeMap as the inital counts, otherwise it will create an empty one
/// # Examples
/// ```
/// use std::collections::BTreeMap;
/// use fastleng::bam_loader::gather_bam_stats_with_seed;
/// let filename = "./test_data/single_string.sam";
/// let initial_counts: BTreeMap<usize, u64> = BTreeMap::new();
/// let counts: BTreeMap<usize, u64> = gather_bam_stats_with_seed(&filename, Some(initial_counts)).unwrap();
/// ```
pub fn gather_bam_stats_with_seed(filename: &str, initial_counts: Option<BTreeMap<usize, u64>>) -> Result<BTreeMap<usize, u64>, Box<dyn std::error::Error>> {
//create an empty stats file (or use initial counts) and ready the reader
let mut hash_stats: BTreeMap<usize, u64> = match initial_counts {
Some(ic) => ic,
None => BTreeMap::new()
};
let mut reader = bam::Reader::from_path(filename)?;

//go through all the records
let mut warning_triggered = false;
let mut count: usize = 0;
info!("Loading file \"{}\"...", filename);
for read_entry in reader.records() {
//all we care about is the sequence length
let record = read_entry?;
let seq_len: usize = record.seq_len();

if !warning_triggered && !record.is_unmapped() {
// user gave us an aligned file, spit out a one-time warning
warn!("Detected aligned reads, this is not properly handled: {filename}");
warning_triggered = true;
}

//insert 0 if absent; then increment
let len_count: &mut u64 = hash_stats.entry(seq_len).or_insert(0);
*len_count += 1;

count += 1;
if count % 1000000 == 0 {
info!("Processed {} sequences", count);
}
}
info!("Finished loading file with {} sequences.", count);

//return the full count list now
Ok(hash_stats)
}

#[cfg(test)]
mod tests {
use super::*;

// allows us to test a bunch at once
use crate::fastx_loader::gather_multifastx_stats;

/// This one is a single sequence "A"
fn stats_basic_bam() -> BTreeMap<usize, u64> {
let mut results: BTreeMap<usize, u64> = BTreeMap::new();
results.insert(1, 1);
results
}

/// one of each length from 1-5
fn stats_basic_bam2() -> BTreeMap<usize, u64> {
let mut results: BTreeMap<usize, u64> = BTreeMap::new();
for l in 1..6 {
results.insert(l, 1);
}
results
}

/// mix of a few lengths from 1-4
fn stats_basic_bam3() -> BTreeMap<usize, u64> {
let mut results: BTreeMap<usize, u64> = BTreeMap::new();
results.insert(1, 3);
results.insert(2, 2);
results.insert(3, 1);
results.insert(4, 2);
results
}

/// some longer strings
fn stats_basic_bam4() -> BTreeMap<usize, u64> {
let mut results: BTreeMap<usize, u64> = BTreeMap::new();
results.insert(50, 2);
results.insert(100, 2);
results.insert(150, 2);
results.insert(1000, 1);
results
}

#[test]
fn test_basic_sam() {
//build some inputs
let filename = "./test_data/single_string.sam";

//get the expected outputs
let expected = stats_basic_bam();

//now do it for real
let hash_stats = gather_bam_stats(&filename).unwrap();
assert_eq!(hash_stats, expected);
}

#[test]
fn test_basic_sam2() {
//build some inputs
let filename = "./test_data/five_strings.sam";

//get the expected outputs
let expected = stats_basic_bam2();

//now do it for real
let hash_stats = gather_bam_stats(&filename).unwrap();
assert_eq!(hash_stats, expected);
}

#[test]
fn test_basic_sam3() {
//build some inputs
let filename = "./test_data/small_strings.sam";

//get the expected outputs
let expected = stats_basic_bam3();

//now do it for real
let hash_stats = gather_bam_stats(&filename).unwrap();
assert_eq!(hash_stats, expected);
}

#[test]
fn test_basic_sam4() {
//build some inputs
let filename = "./test_data/long_strings.sam";

//get the expected outputs
let expected = stats_basic_bam4();

//now do it for real
let hash_stats = gather_bam_stats(&filename).unwrap();
assert_eq!(hash_stats, expected);
}

#[test]
fn test_basic_bam4() {
//build some inputs
let filename = "./test_data/long_strings.bam";

//get the expected outputs
let expected = stats_basic_bam4();

//now do it for real
let hash_stats = gather_bam_stats(&filename).unwrap();
assert_eq!(hash_stats, expected);
}

#[test]
#[should_panic]
fn test_error_handling() {
let filename = "./test_data/panic_file.fa";
let _hash_stats = gather_bam_stats(&filename).unwrap();
}

#[test]
fn test_multifastx() {
let filenames = [
"./test_data/single_string.sam",
"./test_data/five_strings.sam",
"./test_data/small_strings.sam",
"./test_data/long_strings.bam"
];

//get the expected outputs
let expected_list = [
stats_basic_bam(),
stats_basic_bam2(),
stats_basic_bam3(),
stats_basic_bam4()
];

//sum the expected outputs
let mut expected: BTreeMap<usize, u64> = BTreeMap::new();
for results in expected_list.iter() {
for (key, value) in results.iter() {
let len_count: &mut u64 = expected.entry(*key).or_insert(0);
*len_count += value;
}
}

//now do it for real
let hash_stats = gather_multifastx_stats(&filenames).unwrap();
assert_eq!(hash_stats, expected);
}

#[test]
#[should_panic]
fn test_multifastx_error_handling() {
let filenames = [
"./test_data/single_string.bam",
"./test_data/panic_file.fa"
];
let _hash_stats = gather_multifastx_stats(&filenames).unwrap();
}
}
66 changes: 54 additions & 12 deletions src/fastx_loader.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@

extern crate log;
extern crate needletail;

use log::{error, info};
use needletail::parse_fastx_file;
use std::collections::BTreeMap;

use crate::bam_loader::gather_bam_stats_with_seed;

/// This is the main function for gathering all sequence lengths for a fastx file into a BTreeMap.
/// # Arguments
/// * `filename` - the filename to read sequences from
Expand Down Expand Up @@ -38,7 +37,7 @@ pub fn gather_fastx_stats_with_seed(filename: &str, initial_counts: Option<BTree
Some(ic) => ic,
None => BTreeMap::new()
};
let mut reader = parse_fastx_file(&filename)?;
let mut reader = parse_fastx_file(filename)?;

//go through all the records
let mut count: usize = 0;
Expand Down Expand Up @@ -84,14 +83,26 @@ pub fn gather_multifastx_stats<T: AsRef<str> + std::fmt::Debug>(filenames: &[T])
*/
let mut hash_stats: BTreeMap<usize, u64> = BTreeMap::new();
for filename in filenames.iter() {
hash_stats = match gather_fastx_stats_with_seed(filename.as_ref(), Some(hash_stats)) {
Ok(result) => result,
Err(e) => {
error!("Error while parsing FASTX file: {:?}", filename);
error!("Error: {:?}", e);
return Err(e);
}
};
if filename.as_ref().ends_with(".bam") || filename.as_ref().ends_with(".sam") {
hash_stats = match gather_bam_stats_with_seed(filename.as_ref(), Some(hash_stats)) {
Ok(result) => result,
Err(e) => {
error!("Error while parsing BAM file: {:?}", filename);
error!("Error: {:?}", e);
return Err(e);
}
};
}
else {
hash_stats = match gather_fastx_stats_with_seed(filename.as_ref(), Some(hash_stats)) {
Ok(result) => result,
Err(e) => {
error!("Error while parsing FASTX file: {:?}", filename);
error!("Error: {:?}", e);
return Err(e);
}
};
}
}
Ok(hash_stats)
}
Expand Down Expand Up @@ -226,6 +237,37 @@ mod tests {
assert_eq!(hash_stats, expected);
}

#[test]
fn test_multimixed() {
let filenames = [
"./test_data/single_string.fa",
"./test_data/five_strings.sam",
"./test_data/small_strings.fa",
"./test_data/long_strings.bam"
];

//get the expected outputs
let expected_list = [
stats_basic_fasta(),
stats_basic_fasta2(),
stats_basic_fasta3(),
stats_basic_fasta4()
];

//sum the expected outputs
let mut expected: BTreeMap<usize, u64> = BTreeMap::new();
for results in expected_list.iter() {
for (key, value) in results.iter() {
let len_count: &mut u64 = expected.entry(*key).or_insert(0);
*len_count += value;
}
}

//now do it for real
let hash_stats = gather_multifastx_stats(&filenames).unwrap();
assert_eq!(hash_stats, expected);
}

#[test]
#[should_panic]
fn test_multifastx_error_handling() {
Expand Down
Loading

0 comments on commit 4be9085

Please sign in to comment.