diff --git a/Cargo.lock b/Cargo.lock index 6249461..7630133 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -236,6 +236,12 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "compare" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "120133d4db2ec47efe2e26502ee984747630c67f51974fca0b6c1340cf2368d3" + [[package]] name = "const_format" version = "0.2.32" @@ -405,10 +411,11 @@ dependencies = [ [[package]] name = "fasta_manipulation" -version = "0.1.2" +version = "0.1.3" dependencies = [ "clap", "colored", + "compare", "csv", "io", "noodles", diff --git a/Cargo.toml b/Cargo.toml index ad77b8f..4c55605 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fasta_manipulation" -version = "0.1.2" +version = "0.1.3" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -8,6 +8,7 @@ edition = "2021" [dependencies] clap = { version = "4.4.4", features = ["cargo"] } colored = "2.0.4" +compare = "0.1.0" csv = "1.3.0" io = "0.0.2" noodles = { version = "0.52.0", features = ["fasta", "cram", "csi", "core"] } diff --git a/README.md b/README.md index ae3e739..79c7001 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,9 @@ Currently, this program has the following arguments: This command will generate a directory of files made up of a user given number of sequences from the input fasta. This is useful when generating geneset data for TreeVal use or sub-setting data in a non-random manner. The count will be the upper limit, as there will be a left over number of records. - `splitbycount --fasta-file ${PATH TO FASTA} --output-directory ${OUTPUT LOCATION} --count {NUMBER OF FASTA RECORDS PER FILE}` + This will generate files in `{outdir}/{fasta-file.prefix}/{data_type}/{input_file_prefix}_f{file_count}_c{requested_chunk_count}-a{actual_chunk_count}.fa` + + `splitbycount --fasta-file ${PATH TO FASTA} --output-directory ${OUTPUT LOCATION} --count {NUMBER OF FASTA RECORDS PER FILE} --data_type ['pep','cdna', 'cds', 'rna', 'other']` - split_by_size (NOT YET WRITTEN) @@ -59,5 +61,10 @@ Currently, this program has the following arguments: - GC percentage per scaffold + counts - GC percentage whole genome + - N50 and N90 + - L50 + - GAP count and length (summary with average length) + + `profile -f input.fasta -o outdir` If there are other options that would be useful to any other teams, leave a message or issue. diff --git a/src/generics.rs b/src/generics.rs index 2811f50..f10b880 100644 --- a/src/generics.rs +++ b/src/generics.rs @@ -1,4 +1,5 @@ use noodles::fasta; +use noodles::fasta::record::Definition; use std::error::Error; use std::{collections::HashMap, fmt, io::BufRead, result, str}; @@ -41,3 +42,42 @@ pub fn only_keys(map: HashMap) -> impl Iterator { // Take a HashMap and return a Key only Vec map.into_iter().map(|(k, _v)| k) } + +fn get_gene_symbol(header: String) -> Result> { + let header_list: Vec<&str> = header.split(' ').collect(); + let record_header = header_list[0]; + Ok(record_header[1..].to_owned()) + // let re = Regex::new(r"gene=([A-Z]\w+)").unwrap(); + + // let first_run = re.captures(&header).ok_or("None")?; + + // if first_run[0] == "None".to_owned() { + // let re = Regex::new(r"symbol:(\S+)").unwrap(); + // let second_run = re.captures(&header).ok_or("None")?; + // if second_run[0] == "None".to_owned() { + // let re = Regex::new(r"(\(\S+\)) gene").unwrap(); + // let third_run = re.captures(&header).ok_or("None")?; + // if third_run[0] == "None".to_owned() { + // Ok("NOCAPTUREDRESULT".to_string()) + // } else { + // Ok(third_run[0].to_string()) + // } + // } else { + // Ok(second_run[0].to_string()) + // } + // } else { + // Ok(first_run[0].to_string()) + // } +} + +pub fn sanitise_header(old_header: &Definition) -> String { + let x = get_gene_symbol(old_header.to_string()); + + // Yeah i dont know either... + match x { + Ok(c) => c, + Err(e) => { + format!("Regex isnt good enough to capture header id: {}", e) + } + } +} diff --git a/src/main.rs b/src/main.rs index aedc212..9e947f9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -30,6 +30,7 @@ mod exclude_seq; use crate::exclude_seq::exclude_seq_mod::filter_fasta; fn main() -> Result<(), Error> { + let split_options = ["pep", "cds", "cdna", "rna", "other"]; let match_result = command!() .about("A program for fasta manipulation and yaml validation ~ Used in TreeVal project") .subcommand( @@ -45,14 +46,11 @@ fn main() -> Result<(), Error> { .short('v') .value_parser(clap::value_parser!(bool)) .default_value("false") - .required(false) .help("Print explainers as to why validation fails, if it does fail") ) .arg( Arg::new("output") .short('o') - .aliases(["out"]) - .required(false) .default_value("./") .help("Output the log to file") ) @@ -63,23 +61,30 @@ fn main() -> Result<(), Error> { .arg( Arg::new("fasta-file") .short('f') - .aliases(["fasta"]) .required(true) .help("A path to a valid fasta file.") ) .arg( Arg::new("output-directory") .short('o') - .aliases(["out"]) - .required(false) .default_value("./") - .help("The output directory that files will be placed in") + .help("The output directory that files will be placed in | outfile will be formatted like {input_file_prefix}_f{file_count}_c{requested_chunk_count}-a{actual_chunk_count}.fa") + ) + .arg( + Arg::new("data_type") + .short('d') + .value_parser(clap::builder::PossibleValuesParser::new(split_options)) + .help("The data type of the input data") + ) + .arg( + Arg::new("sanitise") + .short('s') + .value_parser(clap::value_parser!(bool)) + .help("Do we need to sanitise the headers of the input fasta") ) .arg( Arg::new("count") .short('c') - .long("file-count") - .aliases(["count"]) .value_parser(clap::value_parser!(u16)) .help("How many sequences per file") ) @@ -90,14 +95,12 @@ fn main() -> Result<(), Error> { .arg( Arg::new("fasta-file") .short('f') - .aliases(["fasta"]) .required(true) .help("A path to a valid fasta file.") ) .arg( Arg::new("mem-size") .short('s') - .long("mem-size") .required(true) .value_parser(clap::value_parser!(u16)) .help("Size in MB that a fasta file is to be chunked into") @@ -105,35 +108,45 @@ fn main() -> Result<(), Error> { .arg( Arg::new("output-directory") .short('o') - .aliases(["out"]) - .required(false) .default_value("./") .help("The output directory that files will be placed in") ) ) + .subcommand( + Command::new("geneset_csvs") + .about("Subcommand to generate csv files that condense geneset directories generated by splitbycount/splitbysize. Mainly for use in TreeVal") + .arg( + Arg::new("geneset_dir") + .short('d') + .required(true) + .help("The path to the top level directory of your geneset directory.") + ) + .arg( + Arg::new("specifiy_clade") + .short('c') + .required(true) + .default_value("ALL") + .help("Specify the clade folder to refresh") + ) + ) .subcommand( Command::new("mapheaders") .about("Subcommand for stripping out headers and replacing with a standardised automatic or user-given string, this also returns a dict of old:new headers") .arg( Arg::new("fasta-file") .short('f') - .aliases(["fasta"]) .required(true) .help("A path to a valid fasta file.") ) .arg( Arg::new("output-directory") .short('o') - .aliases(["out"]) - .required(false) .default_value("./") .help("The output directory which will contain the mapped-heads.txt as well as the *mapped.fasta") ) .arg( Arg::new("replace-with") .short('r') - .aliases(["replacement"]) - .required(false) .default_value("FMMH") .help("The new header format, appended with a numerical value. Without being set the new header will default to 'FMMH_{numberical}'") ) @@ -144,22 +157,18 @@ fn main() -> Result<(), Error> { .arg( Arg::new("fasta-file") .short('f') - .aliases(["fasta"]) .required(true) .help("A path to a valid fasta file.") ) .arg( Arg::new("output-directory") .short('o') - .aliases(["out"]) - .required(false) .default_value("./new") .help("The output directory which will contain the mapped-heads.txt as well as the *mapped.fasta") ) .arg( Arg::new("map-file") .short('m') - .aliases(["mapped-header-file"]) .required(true) .help("The original mapped header field, a TSV of old-header, new-header") ) @@ -170,15 +179,13 @@ fn main() -> Result<(), Error> { .arg( Arg::new("fasta-file") .short('f') - .aliases(["fsata"]) .required(true) .help("The input fasta file for profiling") ) .arg( Arg::new("output-dir") .short('o') - .aliases(["outdir"]) - .default_value("FASTMAN-out") + .default_value("FasMan-out") .help("The input fasta file for profiling") ) ) @@ -188,21 +195,18 @@ fn main() -> Result<(), Error> { .arg( Arg::new("fasta") .short('f') - .aliases(["fasta"]) .required(true) .help("The input fasta file for re-organising") ) .arg( Arg::new("tpf") .short('t') - .aliases(["tpf file"]) .required(true) .help("The TPF file used to re-organise the input fasta") ) .arg( Arg::new("sort") .short('s') - .required(false) .value_parser(clap::value_parser!(bool)) .default_value("false") .help("Size sort the output or leave as order in AGP") @@ -210,14 +214,11 @@ fn main() -> Result<(), Error> { .arg( Arg::new("output") .short('o') - .aliases(["out"]) - .required(false) .default_value("new.fasta") .help("The output name of the new fasta file") ) .arg( Arg::new("n_length") - .aliases(["n_len"]) .value_parser(clap::value_parser!(usize)) .default_value("200") .help("Length that the N (gap) string should be.") @@ -229,7 +230,6 @@ fn main() -> Result<(), Error> { .arg( Arg::new("fasta-file") .short('f') - .aliases(["fsata"]) .required(true) .help("The input fasta file for profiling") ) @@ -237,8 +237,6 @@ fn main() -> Result<(), Error> { Arg::new("random") .short('r') .value_parser(clap::value_parser!(bool)) - .default_value("false") - .aliases(["random"]) .help("Random subset of input file. Default skims the first X given percent") ) .arg( @@ -246,7 +244,6 @@ fn main() -> Result<(), Error> { .short('p') .value_parser(clap::value_parser!(u16)) .default_value("50") - .aliases(["proportion"]) .help("Percentage of the original file entries that should be retained") ) ) @@ -262,14 +259,12 @@ fn main() -> Result<(), Error> { .arg( Arg::new("output") .short('o') - .required(false) .default_value("FiilteredFasta.fa") .help("The outfile naming") ) .arg( Arg::new("filter_list") .short('l') - .required(false) .help("A string comma-separated list of sequence names to exclude from the final fasta") ) ) @@ -279,30 +274,24 @@ fn main() -> Result<(), Error> { .arg( Arg::new("fasta-1") .short('p') - .aliases(["primary-fasta"]) .required(true) .help("The input fasta file for re-organising") ) .arg( Arg::new("fasta-2") .short('s') - .aliases(["secondary-fasta"]) .required(true) .help("The second input fasta file") ) .arg( Arg::new("naming") .short('s') - .aliases(["naming"]) - .required(false) .default_value("PRI/HAP") .help("A '/' separated list with an item per file, these are the namings of the new scaffolds in the merged output") ) .arg( Arg::new("output") .short('o') - .aliases(["output"]) - .required(false) .default_value("merged") .help("Output file prefix") ) @@ -310,17 +299,13 @@ fn main() -> Result<(), Error> { .get_matches(); println! { - "{}\n{}\n{}", + "{}\n{}\n{}\nRUNNING SUBCOMMAND: |\n-- {}\nRUNNING ON: |\n-- {}", "WELCOME TO Fasta Manipulator".bold(), "This has been made to help prep data for use in the Treeval and curationpretext pipelines".bold(), - "ONLY THE yamlvalidator IS SPECIFIC TO TREEVAL, THE OTHER COMMANDS CAN BE USED FOR ANY OTHER PURPOSE YOU WANT".purple() - }; - - println!( - "RUNNING : {:?} : SUBCOMMAND\nRUNNING ON: {:?}", + "ONLY THE yamlvalidator IS SPECIFIC TO TREEVAL, THE OTHER COMMANDS CAN BE USED FOR ANY OTHER PURPOSE YOU WANT".purple(), match_result.subcommand_name().unwrap(), env::consts::OS - ); + }; match match_result.subcommand_name() { Some("splitbysize") => split_file_by_size(match_result.subcommand_matches("splitbysize")), diff --git a/src/split_by_count.rs b/src/split_by_count.rs index 495f5af..1396f00 100644 --- a/src/split_by_count.rs +++ b/src/split_by_count.rs @@ -1,41 +1,111 @@ pub mod split_by_count_mod { + use crate::generics::sanitise_header; use clap::ArgMatches; + use compare::{natural, Compare}; + use noodles::fasta::{self, Record}; + use std::cmp::Ordering; + use std::fs::OpenOptions; use std::{ - fs::File, - io::{BufRead, BufReader}, + fs::{create_dir_all, File}, + io::BufReader, + path::Path, }; + #[allow(clippy::needless_return)] + fn fix_head(records: Record, sanitise: bool) -> Record { + if sanitise { + let header = sanitise_header(records.definition()); + let definition = fasta::record::Definition::new(header, None); + let seq = records.sequence().to_owned(); + return fasta::Record::new(definition, seq); + } else { + return records.to_owned(); + }; + } + + fn write_fasta(outdir: &String, fasta_record: &Vec) { + println!("{}", outdir); + + let _data_file = File::create(outdir); + let file = OpenOptions::new() + .append(true) + .open(outdir) + .expect("creation failed"); + + let mut writer = fasta::Writer::new(file); + for i in fasta_record { + writer.write_record(i).unwrap(); + } + } + pub fn split_file_by_count(arguments: std::option::Option<&ArgMatches>) { + let sanitise: &bool = arguments.unwrap().get_one::("sanitise").unwrap(); let fasta_file = arguments.unwrap().get_one::("fasta-file").unwrap(); + let path_obj = Path::new(fasta_file); + let grab_name = path_obj.file_name().unwrap(); + let actual_list: Vec<&str> = grab_name.to_str().unwrap().split('.').collect(); + let actual_name = actual_list[0]; + + let data_type = arguments.unwrap().get_one::("data_type").unwrap(); + + let outpath = arguments + .unwrap() + .get_one::("output-directory") + .unwrap(); + + let new_outpath = format!("{}/{}/{}/", outpath, actual_name, data_type); + create_dir_all(new_outpath.clone()).unwrap(); let fasta_count = arguments.unwrap().get_one::("count").unwrap(); - println!("Fasta file for processing: {:?}", fasta_file); - println!("{:?}", &fasta_count); println!( - "Number of sequence-header pairs per file: {:?}", - fasta_count + "Fasta file for processing: {:?}\nNumber of records per file: {:?}", + fasta_file, fasta_count ); - let chunk_val = *fasta_count; - let mut counter = 0; - let mut global_counter = 0; - - let input = File::open(fasta_file).expect("CANT OPEN FASTA"); - let buffered = BufReader::new(input); - - for line in buffered.lines() { - if counter != chunk_val { - if line.expect("NO LINES IN FASTA").starts_with('>') { - println!("header"); - } else { - println!("Sequence"); - counter += 1; - global_counter += 1; - } - } else { + let mut counter: u16 = 0; + let mut file_counter: u16 = 1; + + let file_name: Vec<&str> = actual_name.split('.').collect(); + + let mut reader = File::open(fasta_file) + .map(BufReader::new) + .map(fasta::Reader::new) + .unwrap(); + + let mut record_list: Vec = Vec::new(); + for result in reader.records() { + let record = result.unwrap(); + counter += 1; + + let final_rec = fix_head(record, *sanitise); + record_list.push(final_rec); + + let cmp = natural(); + let compared = cmp.compare(&counter, fasta_count); + if compared == Ordering::Equal { + let full_outpath = format!( + "{}{}_f{}_c{}-a{}.fa", + new_outpath, + file_name[0], + file_counter, + &fasta_count, + &record_list.len() + ); + + write_fasta(&full_outpath, &record_list); + file_counter += 1; counter = 0; - println!("CHUNK"); + record_list = Vec::new(); } } - println!("Total number of pairs: {:?}", global_counter); + + let full_outpath = format!( + "{}{}_f{}_c{}-a{}.fa", + new_outpath, + file_name[0], + file_counter, + &fasta_count, + &record_list.len() + ); + write_fasta(&full_outpath, &record_list); } }