-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #6 from HudsonAlpha/ubam-integration
Ubam integration
- Loading branch information
Showing
12 changed files
with
338 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,8 @@ | ||
[package] | ||
name = "fastleng" | ||
version = "0.1.1" | ||
authors = ["holtjma <[email protected]>"] | ||
edition = "2018" | ||
version = "0.2.0" | ||
authors = ["holtjma <[email protected]>"] | ||
edition = "2021" | ||
license = "MIT OR Apache-2.0" | ||
description = "fastleng - read length statistics tool" | ||
homepage = "https://github.com/HudsonAlpha/rust-fastleng" | ||
|
@@ -18,6 +18,7 @@ env_logger = "0.9.0" | |
exitcode = "1.1.2" | ||
log = "0.4.14" | ||
needletail = "0.4.1" | ||
rust-htslib = { version = "0.39.5", default-features = false, features = ["static"] } | ||
serde = { version = "1.0.129", features = ["derive"] } | ||
serde_json = "1.0.66" | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,225 @@ | ||
|
||
use log::{info, warn}; | ||
use rust_htslib::{bam, bam::Read}; | ||
use std::collections::BTreeMap; | ||
|
||
/// This is the main function for gathering all sequence lengths for a fastx file into a BTreeMap. | ||
/// # Arguments | ||
/// * `filename` - the filename to read sequences from | ||
/// # Examples | ||
/// ``` | ||
/// use std::collections::BTreeMap; | ||
/// use fastleng::bam_loader::gather_bam_stats; | ||
/// let filename = "./test_data/single_string.sam"; | ||
/// let counts: BTreeMap<usize, u64> = gather_bam_stats(&filename).unwrap(); | ||
/// ``` | ||
pub fn gather_bam_stats(filename: &str) -> Result<BTreeMap<usize, u64>, Box<dyn std::error::Error>> { | ||
gather_bam_stats_with_seed(filename, None) | ||
} | ||
|
||
/// This will gather sequence lengths from a filename and add them to a provided BTreeMap (`initial_counts`). | ||
/// # Arguments | ||
/// * `filename` - the filename to read sequences from | ||
/// * `initial_counts` - if provided, this will use that BTreeMap as the inital counts, otherwise it will create an empty one | ||
/// # Examples | ||
/// ``` | ||
/// use std::collections::BTreeMap; | ||
/// use fastleng::bam_loader::gather_bam_stats_with_seed; | ||
/// let filename = "./test_data/single_string.sam"; | ||
/// let initial_counts: BTreeMap<usize, u64> = BTreeMap::new(); | ||
/// let counts: BTreeMap<usize, u64> = gather_bam_stats_with_seed(&filename, Some(initial_counts)).unwrap(); | ||
/// ``` | ||
pub fn gather_bam_stats_with_seed(filename: &str, initial_counts: Option<BTreeMap<usize, u64>>) -> Result<BTreeMap<usize, u64>, Box<dyn std::error::Error>> { | ||
//create an empty stats file (or use initial counts) and ready the reader | ||
let mut hash_stats: BTreeMap<usize, u64> = match initial_counts { | ||
Some(ic) => ic, | ||
None => BTreeMap::new() | ||
}; | ||
let mut reader = bam::Reader::from_path(filename)?; | ||
|
||
//go through all the records | ||
let mut warning_triggered = false; | ||
let mut count: usize = 0; | ||
info!("Loading file \"{}\"...", filename); | ||
for read_entry in reader.records() { | ||
//all we care about is the sequence length | ||
let record = read_entry?; | ||
let seq_len: usize = record.seq_len(); | ||
|
||
if !warning_triggered && !record.is_unmapped() { | ||
// user gave us an aligned file, spit out a one-time warning | ||
warn!("Detected aligned reads, this is not properly handled: {filename}"); | ||
warning_triggered = true; | ||
} | ||
|
||
//insert 0 if absent; then increment | ||
let len_count: &mut u64 = hash_stats.entry(seq_len).or_insert(0); | ||
*len_count += 1; | ||
|
||
count += 1; | ||
if count % 1000000 == 0 { | ||
info!("Processed {} sequences", count); | ||
} | ||
} | ||
info!("Finished loading file with {} sequences.", count); | ||
|
||
//return the full count list now | ||
Ok(hash_stats) | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
use super::*; | ||
|
||
// allows us to test a bunch at once | ||
use crate::fastx_loader::gather_multifastx_stats; | ||
|
||
/// This one is a single sequence "A" | ||
fn stats_basic_bam() -> BTreeMap<usize, u64> { | ||
let mut results: BTreeMap<usize, u64> = BTreeMap::new(); | ||
results.insert(1, 1); | ||
results | ||
} | ||
|
||
/// one of each length from 1-5 | ||
fn stats_basic_bam2() -> BTreeMap<usize, u64> { | ||
let mut results: BTreeMap<usize, u64> = BTreeMap::new(); | ||
for l in 1..6 { | ||
results.insert(l, 1); | ||
} | ||
results | ||
} | ||
|
||
/// mix of a few lengths from 1-4 | ||
fn stats_basic_bam3() -> BTreeMap<usize, u64> { | ||
let mut results: BTreeMap<usize, u64> = BTreeMap::new(); | ||
results.insert(1, 3); | ||
results.insert(2, 2); | ||
results.insert(3, 1); | ||
results.insert(4, 2); | ||
results | ||
} | ||
|
||
/// some longer strings | ||
fn stats_basic_bam4() -> BTreeMap<usize, u64> { | ||
let mut results: BTreeMap<usize, u64> = BTreeMap::new(); | ||
results.insert(50, 2); | ||
results.insert(100, 2); | ||
results.insert(150, 2); | ||
results.insert(1000, 1); | ||
results | ||
} | ||
|
||
#[test] | ||
fn test_basic_sam() { | ||
//build some inputs | ||
let filename = "./test_data/single_string.sam"; | ||
|
||
//get the expected outputs | ||
let expected = stats_basic_bam(); | ||
|
||
//now do it for real | ||
let hash_stats = gather_bam_stats(&filename).unwrap(); | ||
assert_eq!(hash_stats, expected); | ||
} | ||
|
||
#[test] | ||
fn test_basic_sam2() { | ||
//build some inputs | ||
let filename = "./test_data/five_strings.sam"; | ||
|
||
//get the expected outputs | ||
let expected = stats_basic_bam2(); | ||
|
||
//now do it for real | ||
let hash_stats = gather_bam_stats(&filename).unwrap(); | ||
assert_eq!(hash_stats, expected); | ||
} | ||
|
||
#[test] | ||
fn test_basic_sam3() { | ||
//build some inputs | ||
let filename = "./test_data/small_strings.sam"; | ||
|
||
//get the expected outputs | ||
let expected = stats_basic_bam3(); | ||
|
||
//now do it for real | ||
let hash_stats = gather_bam_stats(&filename).unwrap(); | ||
assert_eq!(hash_stats, expected); | ||
} | ||
|
||
#[test] | ||
fn test_basic_sam4() { | ||
//build some inputs | ||
let filename = "./test_data/long_strings.sam"; | ||
|
||
//get the expected outputs | ||
let expected = stats_basic_bam4(); | ||
|
||
//now do it for real | ||
let hash_stats = gather_bam_stats(&filename).unwrap(); | ||
assert_eq!(hash_stats, expected); | ||
} | ||
|
||
#[test] | ||
fn test_basic_bam4() { | ||
//build some inputs | ||
let filename = "./test_data/long_strings.bam"; | ||
|
||
//get the expected outputs | ||
let expected = stats_basic_bam4(); | ||
|
||
//now do it for real | ||
let hash_stats = gather_bam_stats(&filename).unwrap(); | ||
assert_eq!(hash_stats, expected); | ||
} | ||
|
||
#[test] | ||
#[should_panic] | ||
fn test_error_handling() { | ||
let filename = "./test_data/panic_file.fa"; | ||
let _hash_stats = gather_bam_stats(&filename).unwrap(); | ||
} | ||
|
||
#[test] | ||
fn test_multifastx() { | ||
let filenames = [ | ||
"./test_data/single_string.sam", | ||
"./test_data/five_strings.sam", | ||
"./test_data/small_strings.sam", | ||
"./test_data/long_strings.bam" | ||
]; | ||
|
||
//get the expected outputs | ||
let expected_list = [ | ||
stats_basic_bam(), | ||
stats_basic_bam2(), | ||
stats_basic_bam3(), | ||
stats_basic_bam4() | ||
]; | ||
|
||
//sum the expected outputs | ||
let mut expected: BTreeMap<usize, u64> = BTreeMap::new(); | ||
for results in expected_list.iter() { | ||
for (key, value) in results.iter() { | ||
let len_count: &mut u64 = expected.entry(*key).or_insert(0); | ||
*len_count += value; | ||
} | ||
} | ||
|
||
//now do it for real | ||
let hash_stats = gather_multifastx_stats(&filenames).unwrap(); | ||
assert_eq!(hash_stats, expected); | ||
} | ||
|
||
#[test] | ||
#[should_panic] | ||
fn test_multifastx_error_handling() { | ||
let filenames = [ | ||
"./test_data/single_string.bam", | ||
"./test_data/panic_file.fa" | ||
]; | ||
let _hash_stats = gather_multifastx_stats(&filenames).unwrap(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.