Skip to content

Commit

Permalink
Merge pull request #26 from Rust-Wellcome/splitsize
Browse files Browse the repository at this point in the history
Splitsize
  • Loading branch information
DLBPointon authored Jun 6, 2024
2 parents 54c1b5f + 8889479 commit a1a7e7e
Show file tree
Hide file tree
Showing 7 changed files with 238 additions and 53 deletions.
5 changes: 5 additions & 0 deletions src/exclude_seq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@ pub mod exclude_seq_mod {
fasta: &'a str,
out_file: &str,
) -> std::result::Result<&'a str, Box<dyn Error>> {
// Open and read fasta
let reader: Result<fasta::Reader<Box<dyn BufRead>>, std::io::Error> =
fasta::reader::Builder.build_from_path(fasta);

// Create new file
let file = fs::OpenOptions::new()
.create(true)
.append(true)
Expand All @@ -19,6 +22,8 @@ pub mod exclude_seq_mod {

match reader {
Ok(fasta) => {
// on Ok reading append record to new fasta if
// not in user given list of headers
let mut binding = fasta;
for result in binding.records() {
let record = result?;
Expand Down
33 changes: 31 additions & 2 deletions src/generics.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use noodles::fasta;
use noodles::fasta::record::Definition;
use std::error::Error;
use std::fs::{self, File, OpenOptions};
use std::{collections::HashMap, fmt, io::BufRead, result, str};

#[derive(Debug, Clone)]
Expand All @@ -19,6 +20,7 @@ pub fn validate_fasta(
) -> result::Result<HashMap<std::string::String, usize>, Box<dyn Error>> {
// Validate the fasta by reading through it and ensuring there are
// valid record formats throughout the file
// Return a HashMap of header and length
let reader: Result<fasta::Reader<Box<dyn BufRead>>, std::io::Error> =
fasta::reader::Builder.build_from_path(path);
let mut fasta_map = HashMap::new();
Expand All @@ -44,6 +46,7 @@ pub fn only_keys<K, V>(map: HashMap<K, V>) -> impl Iterator<Item = K> {
}

fn get_gene_symbol(header: String) -> Result<String, Box<dyn std::error::Error>> {
// Take a string and return first segment of it
let header_list: Vec<&str> = header.split(' ').collect();
let record_header = header_list[0];
Ok(record_header[1..].to_owned())
Expand Down Expand Up @@ -71,13 +74,39 @@ fn get_gene_symbol(header: String) -> Result<String, Box<dyn std::error::Error>>
}

/// Reduce a FASTA record definition to a sanitised header string.
///
/// The definition is rendered to text with `to_string()` and handed to
/// `get_gene_symbol`; on success that result is returned unchanged.
///
/// This indirection is more elaborate than it needs to be for historical
/// reasons. It is kept in case those reasons come back.
///
/// An error from `get_gene_symbol` is not propagated: a diagnostic string
/// containing the error is returned in place of the header.
pub fn sanitise_header(old_header: &Definition) -> String {
    // Only the current diagnostic is kept; the older "Regex ..." message
    // referred to an implementation that no longer exists.
    get_gene_symbol(old_header.to_string())
        .unwrap_or_else(|e| format!("Split didn't work: {}", e))
}

/// Write `fasta_record` to the file `outdir/file_name`.
///
/// `outdir` is created (including missing parents) when it does not exist.
/// The target file is created if absent and truncated if it already exists,
/// so after a successful call it contains exactly the given records.
///
/// # Errors
///
/// Returns the underlying `std::io::Error` if the directory cannot be
/// created, the file cannot be opened, or a record cannot be written or
/// flushed. Nothing in here panics on I/O failure, so callers must check the
/// returned `Result` rather than discard it.
pub fn write_fasta(
    outdir: &String,
    file_name: String,
    fasta_record: Vec<noodles::fasta::Record>,
) -> std::io::Result<()> {
    fs::create_dir_all(outdir)?;
    let file_path = format!("{}/{}", outdir, file_name);

    // Single open: create-or-truncate, then write. This replaces the previous
    // "create, ignore the result, re-open in append mode" sequence.
    let file: File = OpenOptions::new()
        .write(true)
        .create(true)
        .truncate(true)
        .open(&file_path)?;

    // Buffer the output so each record does not hit the file directly.
    let mut buffered = std::io::BufWriter::new(file);
    {
        // The writer only borrows the buffer so it can be flushed explicitly
        // once the writer has gone out of scope.
        let mut writer = fasta::Writer::new(&mut buffered);
        for record in &fasta_record {
            writer.write_record(record)?;
        }
    }

    // Flush explicitly: a flush performed on drop would swallow any error.
    std::io::Write::flush(&mut buffered)
}
16 changes: 14 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,23 @@ fn main() -> Result<(), Error> {
)
.arg(
Arg::new("mem-size")
.short('s')
.short('m')
.required(true)
.value_parser(clap::value_parser!(u16))
.value_parser(clap::value_parser!(usize))
.help("Size in MB that a fasta file is to be chunked into")
)
.arg(
Arg::new("data_type")
.short('d')
.value_parser(clap::builder::PossibleValuesParser::new(split_options))
.help("The data type of the input data")
)
.arg(
Arg::new("sanitise")
.short('s')
.value_parser(clap::value_parser!(bool))
.help("Do we need to sanitise the headers of the input fasta")
)
.arg(
Arg::new("output-directory")
.short('o')
Expand Down
14 changes: 11 additions & 3 deletions src/map_headers.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
pub mod mapping_headers {

use clap::ArgMatches;
use colored::Colorize;
use std::error::Error;
Expand Down Expand Up @@ -51,6 +50,7 @@ pub mod mapping_headers {
std::vec::IntoIter<std::string::String>,
>,
) {
// Save the header mapping to file
let f: File = File::create(output).expect("Unable to create file");
let mut f: BufWriter<File> = BufWriter::new(f);
for map_pair in mapped {
Expand All @@ -69,6 +69,9 @@ pub mod mapping_headers {
std::vec::IntoIter<std::string::String>,
>,
) {
// Swap out the old with the new
// skip all else.
// This could be re-written now that I know more about noodles
let file_reader: File = File::open(input).expect("CAN'T OPEN FILE");
let buff_reader: BufReader<File> = BufReader::new(file_reader);
let mut new_fasta: File = File::create(output).unwrap();
Expand All @@ -95,6 +98,8 @@ pub mod mapping_headers {
pub fn map_fasta_head(
arguments: std::option::Option<&ArgMatches>,
) -> Result<(), Box<dyn Error>> {
// Generate a mapped.txt with the old and new headers
// Generate a mapped.fasta with the new headers
let file: &String = arguments.unwrap().get_one::<String>("fasta-file").unwrap();
let replacer: &String = arguments
.unwrap()
Expand All @@ -110,19 +115,22 @@ pub mod mapping_headers {

match validate_fasta(file) {
Ok(names) => {
// Vec of scaffold names from validate_fasta
// return only the headers, not the lengths
let new_names = Vec::from_iter(only_keys(names));

// Generate a Zip of the old and new names
let new_map: Zip<std::vec::IntoIter<String>, std::vec::IntoIter<String>> =
create_mapping(new_names, replacer);

// Save the mapping to file
let map_to_save: Zip<std::vec::IntoIter<String>, std::vec::IntoIter<String>> =
new_map.clone();
let output_file = format!("{}mapped-heads.tsv", output);

save_mapping(&output_file, map_to_save);

// Generate a new fasta with the mapped headers
let new_fasta: String = format!("{output}mapped.fasta");

create_mapped_fasta(file, &new_fasta, new_map);

println!(
Expand Down
45 changes: 14 additions & 31 deletions src/split_by_count.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,15 @@
pub mod split_by_count_mod {
use crate::generics::sanitise_header;
use crate::generics::{sanitise_header, write_fasta};
use clap::ArgMatches;
use compare::{natural, Compare};
use noodles::fasta::{self, Record};
use std::cmp::Ordering;
use std::fs::OpenOptions;
use std::{
fs::{create_dir_all, File},
io::BufReader,
path::Path,
};
use std::{fs::File, io::BufReader, path::Path};

#[allow(clippy::needless_return)]
fn fix_head(records: Record, sanitise: bool) -> Record {
// Take a Record and sanitise the header
// recombine into a new Record
if sanitise {
let header = sanitise_header(records.definition());
let definition = fasta::record::Definition::new(header, None);
Expand All @@ -23,21 +20,6 @@ pub mod split_by_count_mod {
};
}

/// Write every record in `fasta_record` to the file at path `outdir`.
///
/// The path is echoed to stdout first. The file is created (or truncated)
/// and then re-opened in append mode for writing.
///
/// Panics with "creation failed" if the file cannot be opened for
/// appending, and panics if any record fails to write.
fn write_fasta(outdir: &String, fasta_record: &Vec<Record>) {
    println!("{}", outdir);

    // Result intentionally unchecked here; a failure surfaces on the open below.
    let _created = File::create(outdir);

    let mut append_options = OpenOptions::new();
    append_options.append(true);
    let handle = append_options.open(outdir).expect("creation failed");

    let mut fasta_writer = fasta::Writer::new(handle);
    fasta_record
        .iter()
        .for_each(|record| fasta_writer.write_record(record).unwrap());
}

pub fn split_file_by_count(arguments: std::option::Option<&ArgMatches>) {
let sanitise: &bool = arguments.unwrap().get_one::<bool>("sanitise").unwrap();
let fasta_file = arguments.unwrap().get_one::<String>("fasta-file").unwrap();
Expand All @@ -54,23 +36,26 @@ pub mod split_by_count_mod {
.unwrap();

let new_outpath = format!("{}/{}/{}/", outpath, actual_name, data_type);
create_dir_all(new_outpath.clone()).unwrap();
let fasta_count = arguments.unwrap().get_one::<u16>("count").unwrap();
println!(
"Fasta file for processing: {:?}\nNumber of records per file: {:?}",
fasta_file, fasta_count
);

// Header counter
let mut counter: u16 = 0;
let mut file_counter: u16 = 1;

// Remove the file suffix from the file name
let file_name: Vec<&str> = actual_name.split('.').collect();

// Open the fasta file
let mut reader = File::open(fasta_file)
.map(BufReader::new)
.map(fasta::Reader::new)
.unwrap();

// Create a Record List
let mut record_list: Vec<Record> = Vec::new();
for result in reader.records() {
let record = result.unwrap();
Expand All @@ -82,30 +67,28 @@ pub mod split_by_count_mod {
let cmp = natural();
let compared = cmp.compare(&counter, fasta_count);
if compared == Ordering::Equal {
let full_outpath = format!(
"{}{}_f{}_c{}-a{}.fa",
new_outpath,
let file_name = format!(
"{}_f{}_c{}-a{}.fa",
file_name[0],
file_counter,
&fasta_count,
&record_list.len()
);

write_fasta(&full_outpath, &record_list);
let _ = write_fasta(&new_outpath, file_name, record_list);
file_counter += 1;
counter = 0;
record_list = Vec::new();
}
}

let full_outpath = format!(
"{}{}_f{}_c{}-a{}.fa",
new_outpath,
let file_name = format!(
"{}_f{}_c{}-a{}.fa",
file_name[0],
file_counter,
&fasta_count,
&record_list.len()
);
write_fasta(&full_outpath, &record_list);
let _ = write_fasta(&new_outpath, file_name, record_list);
}
}
Loading

0 comments on commit a1a7e7e

Please sign in to comment.