From f5b676bc87f35aad3bb5b6e72f41c700343643cb Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Wed, 24 Jul 2024 11:14:40 +0100 Subject: [PATCH 1/6] Updates --- Cargo.lock | 18 +++++++++--------- src/yaml_validator.rs | 33 +++++++++++++++++---------------- test_data/yaml/test.yaml | 2 +- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1ecdf4d..4f6eccc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -177,9 +177,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.5" +version = "1.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "324c74f2155653c90b04f25b2a47a8a631360cb908f92a772695f430c7e31052" +checksum = "2aba8f4e9906c7ce3c73463f62a7f0c65183ada1a2d47e397cc8810827f9694f" [[package]] name = "cfg-if" @@ -333,7 +333,7 @@ dependencies = [ "proc-macro2", "quote", "rustc_version", - "syn 2.0.71", + "syn 2.0.72", ] [[package]] @@ -364,7 +364,7 @@ checksum = "a1ab991c1362ac86c61ab6f556cff143daa22e5a15e4e189df818b2fd19fe65b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.72", ] [[package]] @@ -598,7 +598,7 @@ dependencies = [ "gprimitives", "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.72", ] [[package]] @@ -1123,7 +1123,7 @@ checksum = "e0cd7e117be63d3c3678776753929474f3b04a43a080c744d6b0ae2a8c28e222" dependencies = [ "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.72", ] [[package]] @@ -1189,9 +1189,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.71" +version = "2.0.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b146dcf730474b4bcd16c311627b31ede9ab149045db4d6088b3becaea046462" +checksum = "dc4b9b9bf2add8093d3f2c0204471e951b2285580335de42f9d2534f3ae7a8af" dependencies = [ "proc-macro2", "quote", @@ -1478,5 +1478,5 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.72", ] diff --git a/src/yaml_validator.rs b/src/yaml_validator.rs index 29b0d75..04735b1 100644 --- a/src/yaml_validator.rs +++ b/src/yaml_validator.rs @@ -1,11 +1,11 @@ pub mod yaml_validator_mod { use clap::ArgMatches; - use colored::{ColoredString, Colorize}; + use colored::Colorize; use csv::ReaderBuilder; use noodles::{cram, fasta}; use serde::{Deserialize, Serialize}; - use std::fmt::format; use std::fs::{self, File}; + use std::marker::PhantomData; use std::path::PathBuf; use walkdir::WalkDir; @@ -28,7 +28,7 @@ pub mod yaml_validator_mod { } #[derive(Debug, Serialize, Deserialize)] - struct YamlResults { + struct YamlResults<'a> { ReferenceResults: String, CramResults: CRAMtags, AlignerResults: String, @@ -38,9 +38,10 @@ pub mod yaml_validator_mod { KmerProfileResults: String, GenesetResults: Vec, SyntenicResults: Vec, + phantom: PhantomData<&'a String>, } - impl<'a> std::fmt::Display for YamlResults { + impl<'a> std::fmt::Display for YamlResults<'a> { // Pretty Printing YamlResults fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { write!( @@ -60,7 +61,7 @@ pub mod yaml_validator_mod { } } - impl YamlResults { + impl<'a> YamlResults<'a> { fn is_cram_valid(&self) -> String { // this should add a field to the cramresults struct if &self.CramResults.header_read_groups.len() >= &1 { @@ -100,16 +101,18 @@ pub mod yaml_validator_mod { failures } - fn check_secondaries(&self, secondary_list: Vec<&Vec>) -> Vec { - let mut failures: Vec = Vec::new(); - let fails = for i in secondary_list { - let fails: Vec<&String> = i + fn check_secondaries(&'a self, secondary_list: Vec<&'a Vec>) -> Vec<&String> { + let mut failures: Vec<&String> = Vec::new(); + for i in secondary_list { + let collection = i .into_iter() .filter(|j| j.contains("FAIL") || j.contains("NO")) - .collect(); - }; + .collect::>(); - for i in fails {} + for i in collection { + failures.push(i) + } + } failures } @@ -138,9 +141,6 @@ pub mod yaml_validator_mod { let failed_primary_count = &failed_primaries.len(); let failed_secondary_count = &failed_secondary.len(); - println!("{:?}", &failed_primaries); - println!("{:?}", &failed_secondary); - if &failed_primaries.len() >= &1 { println!( "Primary Values Failed: {}\nSecondary Values Failed: {}\nPrimary Values that failed:\n{:?}\nSecondary Values that failed (These are not essential for TreeVal):\n{:?}\n", @@ -212,6 +212,7 @@ pub mod yaml_validator_mod { KmerProfileResults: self.validate_kmer_prof(), GenesetResults: self.validate_genesets(), SyntenicResults: self.validate_synteny(), + phantom: PhantomData, }; results } @@ -572,7 +573,7 @@ pub mod yaml_validator_mod { let file = arguments.unwrap().get_one::("yaml").unwrap(); let output: &bool = arguments.unwrap().get_one::("output").unwrap(); - let output_file = if output.to_owned() { + let _output_file = if output.to_owned() { "./yamlresults.txt".to_string() } else { "".to_string() diff --git a/test_data/yaml/test.yaml b/test_data/yaml/test.yaml index ec51cca..0a6de9e 100644 --- a/test_data/yaml/test.yaml +++ b/test_data/yaml/test.yaml @@ -9,7 +9,7 @@ reference_file: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/assembly/ map_order: unsorted assem_reads: read_type: hifi - read_data: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/genomic_data/pacbio/ + read_data: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/genomic_data/pacbio2/ supplementary_data: path hic_data: hic_cram: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/genomic_data/hic-arima/ From 9a6ca8737f4d660440d8e338a9ac06dc7db25414 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 6 Aug 2024 16:45:24 +0100 Subject: [PATCH 2/6] Updates to complete validateyaml --- src/yaml_validator.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/yaml_validator.rs b/src/yaml_validator.rs index 04735b1..8c4f818 100644 --- a/src/yaml_validator.rs +++ b/src/yaml_validator.rs @@ -71,10 +71,12 @@ pub mod yaml_validator_mod { } } + #[allow(dead_code)] fn to_stdout(&self) { println!("{}", &self) } + #[allow(dead_code)] fn to_file(&self, output_location: String) -> Result<(), std::io::Error> { let string_data = format!("YamlResults:\n\tReference: {:#?}\n\tCram: {:#?}\n\tAligner: {:#?}\n\tLongread: {:#?}\n\tBusco: {:#?}\n\tTelomere: {:#?}\n\tKmerProfile: {:#?}\n\tGenesetPaths: {:#?}\n\tSyntenicPaths: {:#?}\n\t{:#?}", &self.ReferenceResults, From 5559182be4ae841318cdc0dfcc7c12c903ea30e3 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Wed, 14 Aug 2024 11:05:52 +0100 Subject: [PATCH 3/6] Updates --- src/main.rs | 18 ++++++++++++++++-- src/yaml_validator.rs | 32 +++++++++++++++++++++++--------- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/src/main.rs b/src/main.rs index 37f5350..1fb723a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -42,8 +42,22 @@ fn main() -> Result<(), Error> { .help("Path to the TreeVal yaml file generated by the user") ) .arg( - Arg::new("output") - .short('o') + Arg::new("output_to_file") + .short('f') + .value_parser(clap::builder::BoolishValueParser::new()) + .default_value(std::ffi::OsStr::new("true")) + .help("Output the log to file") + ) + .arg( + Arg::new("output_to_stdout") + .short('s') + .value_parser(clap::builder::BoolishValueParser::new()) + .default_value(std::ffi::OsStr::new("true")) + .help("Output the log to file") + ) + .arg( + Arg::new("output_to_pipeline") + .short('p') .value_parser(clap::builder::BoolishValueParser::new()) .default_value(std::ffi::OsStr::new("true")) .help("Output the log to file") diff --git a/src/yaml_validator.rs b/src/yaml_validator.rs index 04735b1..10a12c2 100644 --- a/src/yaml_validator.rs +++ b/src/yaml_validator.rs @@ -571,9 +571,20 @@ pub mod yaml_validator_mod { /// Validate the yaml file required for the TreeVal pipeline pub fn validate_yaml(arguments: std::option::Option<&ArgMatches>) { let file = arguments.unwrap().get_one::("yaml").unwrap(); - let output: &bool = arguments.unwrap().get_one::("output").unwrap(); - - let _output_file = if output.to_owned() { + let output_to_file: &bool = arguments + .unwrap() + .get_one::("output_to_file") + .unwrap(); + let output_to_stdout: &bool = arguments + .unwrap() + .get_one::("output_to_stdout") + .unwrap(); + let output_to_pipeline: &bool = arguments + .unwrap() + .get_one::("output_to_pipeline") + .unwrap(); + + let output_file = if output_to_file.to_owned() { "./yamlresults.txt".to_string() } else { "".to_string() @@ -586,12 +597,15 @@ pub mod yaml_validator_mod { serde_yaml::from_reader(input).expect("Unable to read from file"); let results = contents.into_results(); - //results.to_stdout(); - - // results - // .to_file(output_file) - // .expect("Can't create final report"); - results.to_check() + if output_to_stdout == &true { + results.to_stdout(); + } else if output_to_file == &true { + results + .to_file(output_file) + .expect("Can't create final report"); + } else if output_to_pipeline == &true { + results.to_check() + } } } From cbb202180c544aabae3f56551e4bc41df973fa2d Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Wed, 14 Aug 2024 11:38:59 +0100 Subject: [PATCH 4/6] Updates --- src/yaml_validator.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/yaml_validator.rs b/src/yaml_validator.rs index 10a12c2..cf0d912 100644 --- a/src/yaml_validator.rs +++ b/src/yaml_validator.rs @@ -600,11 +600,15 @@ pub mod yaml_validator_mod { if output_to_stdout == &true { results.to_stdout(); - } else if output_to_file == &true { + } + + if output_to_file == &true { results .to_file(output_file) .expect("Can't create final report"); - } else if output_to_pipeline == &true { + } + + if output_to_pipeline == &true { results.to_check() } } From 7a8f93d0a9df40486f96c4b7f2752fd047d37e47 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 15 Aug 2024 16:30:53 +0100 Subject: [PATCH 5/6] Update fasman --- src/yaml_validator.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/yaml_validator.rs b/src/yaml_validator.rs index cf0d912..05b9a9c 100644 --- a/src/yaml_validator.rs +++ b/src/yaml_validator.rs @@ -28,6 +28,7 @@ pub mod yaml_validator_mod { } #[derive(Debug, Serialize, Deserialize)] + // https://doc.rust-lang.org/std/marker/struct.PhantomData.html struct YamlResults<'a> { ReferenceResults: String, CramResults: CRAMtags, From db320aa8a0e470708efb1207f2cc6ea83f97fce4 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 15 Aug 2024 20:32:48 +0100 Subject: [PATCH 6/6] addition of generate_csv - TreeVal specific function --- src/generate_csv.rs | 126 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 src/generate_csv.rs diff --git a/src/generate_csv.rs b/src/generate_csv.rs new file mode 100644 index 0000000..34e27bb --- /dev/null +++ b/src/generate_csv.rs @@ -0,0 +1,126 @@ +/// Generate CSV generates a csv file which describes a specific data directory /User/...../geneset_alignment_data/insect/ApisMeliffera/ApisMeliffera.AMel1_1/{pep,cdna,rna,cds}/files.fa +/// This is for data tracking for TreeVal +/// This may be replaced or enhanced with a function to send this to a Google Sheets so the team has an easier way of tracking it all. +pub mod gencsv_mod { + use crate::generics::get_folder_list; + use clap::ArgMatches; + use csv::Writer; + use std::collections::HashMap; + use std::error::Error; + use std::{fs, path::Path, path::PathBuf}; + use walkdir::WalkDir; + + fn get_file_list(root: &str) -> Vec { + WalkDir::new(root) + .into_iter() + .filter_map(|e| e.ok()) + .filter(|e| e.file_type().is_file()) + .map(|e| e.into_path()) + .collect() + } + + // Function to convert list to dictionary + fn list_2_dict(file_list: &Vec) -> (HashMap>, String) { + let mut file_dict = HashMap::new(); + let mut org = String::new(); + for path in file_list { + let path_str = path.to_str().unwrap(); + let path_list: Vec<&str> = path_str.split('/').collect(); + let file_name = path_list[path_list.len() - 1]; + if file_name.to_lowercase() != "readme.txt" && file_name.to_lowercase() != "readme" { + file_dict.insert( + file_name.to_string(), + vec![ + path_list[path_list.len() - 3].to_string(), + path_list[path_list.len() - 2].to_string(), + path_str.to_string(), + ], + ); + org = path_list[path_list.len() - 3].to_string(); + } + } + (file_dict, org) + } + + fn save_data( + dict_of_data: HashMap>, + save_loc: &str, + org_accession: &str, + ) -> Result<(), Box> { + let save_dir = format!("{}/csv_data", save_loc); + + let save_path = format!("{}/csv_data/{}-data.csv", save_loc, org_accession); + let save_path = Path::new(&save_path); + + // Ensure the save directory exists + if !Path::new(&save_dir).exists() { + fs::create_dir_all(&save_dir).unwrap(); + } + + if save_path.exists() { + fs::remove_file(save_path).unwrap(); + } + + println!( + "Generating CSV for:\t{}\nSave Path:\t\t{}", + org_accession, + save_path.display() + ); + + println!("{}", save_dir); + + let mut wtr = Writer::from_path(save_path)?; + wtr.write_record(&["org", "type", "data_file"])?; + for (_key, value) in dict_of_data { + wtr.write_record(&value)?; + } + wtr.flush()?; + Ok(()) + } + + pub fn gencsv(arguments: std::option::Option<&ArgMatches>) { + let geneset_folder: &String = arguments.unwrap().get_one::("geneset_dir").unwrap(); + + let clade_folder = get_folder_list(&geneset_folder); + + for clade in clade_folder { + let save_clade = clade.clone(); + let org_folder = get_folder_list(&clade.into_os_string().into_string().unwrap()); + + // Filter out the folders ending with csv_data as these are output folders + let new_org_folder: Vec<&PathBuf> = org_folder + .iter() + .filter(|x| !x.ends_with("csv_data")) + .collect(); + + for org in new_org_folder { + let mut master_list = Vec::new(); + + let accession_folder = get_folder_list( + &::clone(&org) + .into_os_string() + .into_string() + .unwrap(), + ); + + for accession in accession_folder { + let data_list = get_folder_list(accession.to_str().unwrap()); + for data in data_list { + master_list.push(get_file_list(data.to_str().unwrap())); + } + + let file_dict: HashMap>; + let orgs: String; + (file_dict, orgs) = + list_2_dict(&master_list.iter().flatten().cloned().collect()); + let save_loc = format!( + "{}/{}", + geneset_folder, + save_clade.file_name().unwrap().to_str().unwrap() + ); + let _ = save_data(file_dict, &save_loc, &orgs); + } + } + } + } +}