Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dp24 validateyaml #43

Merged
merged 8 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

126 changes: 126 additions & 0 deletions src/generate_csv.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/// Generates a CSV file describing a specific data directory, e.g. /User/...../geneset_alignment_data/insect/ApisMeliffera/ApisMeliffera.AMel1_1/{pep,cdna,rna,cds}/files.fa
/// The CSV is used for data tracking for TreeVal.
/// This may be replaced or enhanced with a function that sends the data to Google Sheets, so the team has an easier way of tracking it all.
pub mod gencsv_mod {
use crate::generics::get_folder_list;
use clap::ArgMatches;
use csv::Writer;
use std::collections::HashMap;
use std::error::Error;
use std::{fs, path::Path, path::PathBuf};
use walkdir::WalkDir;

/// Recursively collect the path of every regular file found beneath `root`.
///
/// Directory entries that cannot be read are skipped rather than reported,
/// and anything that is not a regular file (e.g. a directory) is left out.
fn get_file_list(root: &str) -> Vec<PathBuf> {
    let mut found_files = Vec::new();
    for walked in WalkDir::new(root) {
        // Unreadable entries are ignored on purpose: this is a best-effort scan.
        if let Ok(entry) = walked {
            if entry.file_type().is_file() {
                found_files.push(entry.into_path());
            }
        }
    }
    found_files
}

/// Convert a list of data files into CSV rows keyed by file name.
///
/// Every path is expected to end in `<accession>/<data_type>/<file_name>`.
/// For each such path a row `[accession, data_type, full_path]` is stored
/// under `file_name`; a later file with the same name replaces an earlier one.
/// Files called `readme` or `readme.txt` (any ASCII case) are ignored.
///
/// Paths that are not valid UTF-8, or that have fewer than three
/// `/`-separated components, cannot be described and are skipped with a
/// warning on stderr instead of panicking.
///
/// Returns the row map together with the accession of the last row that was
/// built (an empty string when no row was built).
// `&Vec<PathBuf>` is kept instead of `&[PathBuf]` because the caller relies on
// this parameter type to infer the target of a `collect()` call.
#[allow(clippy::ptr_arg)]
fn list_2_dict(file_list: &Vec<PathBuf>) -> (HashMap<String, Vec<String>>, String) {
    let mut file_dict = HashMap::new();
    let mut org = String::new();

    for path in file_list {
        let path_str = match path.to_str() {
            Some(text) => text,
            None => {
                eprintln!("Skipping non UTF-8 path: {}", path.display());
                continue;
            }
        };

        // Read the components from the end: file name, data type, accession.
        let mut parts = path_str.rsplit('/');
        let file_name = parts.next().unwrap_or("");

        if file_name.eq_ignore_ascii_case("readme.txt") || file_name.eq_ignore_ascii_case("readme")
        {
            continue;
        }

        let (data_type, accession) = match (parts.next(), parts.next()) {
            (Some(data_type), Some(accession)) => (data_type, accession),
            _ => {
                eprintln!(
                    "Skipping path without <accession>/<data_type>/<file>: {}",
                    path_str
                );
                continue;
            }
        };

        file_dict.insert(
            file_name.to_string(),
            vec![
                accession.to_string(),
                data_type.to_string(),
                path_str.to_string(),
            ],
        );
        org = accession.to_string();
    }

    (file_dict, org)
}

/// Write the collected rows to `<save_loc>/csv_data/<org_accession>-data.csv`.
///
/// The `csv_data` directory is created when missing and any CSV already at
/// the target path is replaced. The file starts with the header
/// `org,type,data_file`, followed by one record per value of `dict_of_data`
/// (the map keys are not written). Records are sorted so that the output is
/// identical between runs regardless of hash-map iteration order.
///
/// # Errors
///
/// Returns an error when the directory cannot be created, the previous file
/// cannot be removed, or the CSV cannot be created, written or flushed.
fn save_data(
    dict_of_data: HashMap<String, Vec<String>>,
    save_loc: &str,
    org_accession: &str,
) -> Result<(), Box<dyn Error>> {
    let save_dir = format!("{}/csv_data", save_loc);

    let save_path = format!("{}/csv_data/{}-data.csv", save_loc, org_accession);
    let save_path = Path::new(&save_path);

    // `create_dir_all` succeeds when the directory already exists, so no
    // separate existence check is needed.
    fs::create_dir_all(&save_dir)?;

    // Drop a stale report from a previous run before writing the new one.
    if save_path.exists() {
        fs::remove_file(save_path)?;
    }

    println!(
        "Generating CSV for:\t{}\nSave Path:\t\t{}",
        org_accession,
        save_path.display()
    );

    println!("{}", save_dir);

    // Sort the records: hash-map iteration order changes from run to run.
    let mut rows: Vec<Vec<String>> = dict_of_data.into_values().collect();
    rows.sort();

    let mut wtr = Writer::from_path(save_path)?;
    wtr.write_record(&["org", "type", "data_file"])?;
    for row in &rows {
        wtr.write_record(row)?;
    }
    wtr.flush()?;
    Ok(())
}

pub fn gencsv(arguments: std::option::Option<&ArgMatches>) {
let geneset_folder: &String = arguments.unwrap().get_one::<String>("geneset_dir").unwrap();

let clade_folder = get_folder_list(&geneset_folder);

for clade in clade_folder {
let save_clade = clade.clone();
let org_folder = get_folder_list(&clade.into_os_string().into_string().unwrap());

// Filter out the folders ending with csv_data as these are output folders
let new_org_folder: Vec<&PathBuf> = org_folder
.iter()
.filter(|x| !x.ends_with("csv_data"))
.collect();

for org in new_org_folder {
let mut master_list = Vec::new();

let accession_folder = get_folder_list(
&<PathBuf as Clone>::clone(&org)
.into_os_string()
.into_string()
.unwrap(),
);

for accession in accession_folder {
let data_list = get_folder_list(accession.to_str().unwrap());
for data in data_list {
master_list.push(get_file_list(data.to_str().unwrap()));
}

let file_dict: HashMap<String, Vec<String>>;
let orgs: String;
(file_dict, orgs) =
list_2_dict(&master_list.iter().flatten().cloned().collect());
let save_loc = format!(
"{}/{}",
geneset_folder,
save_clade.file_name().unwrap().to_str().unwrap()
);
let _ = save_data(file_dict, &save_loc, &orgs);
}
}
}
}
}
18 changes: 16 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,22 @@ fn main() -> Result<(), Error> {
.help("Path to the TreeVal yaml file generated by the user")
)
.arg(
Arg::new("output")
.short('o')
Arg::new("output_to_file")
.short('f')
.value_parser(clap::builder::BoolishValueParser::new())
.default_value(std::ffi::OsStr::new("true"))
.help("Output the log to file")
)
.arg(
Arg::new("output_to_stdout")
.short('s')
.value_parser(clap::builder::BoolishValueParser::new())
.default_value(std::ffi::OsStr::new("true"))
.help("Output the log to file")
)
.arg(
Arg::new("output_to_pipeline")
.short('p')
.value_parser(clap::builder::BoolishValueParser::new())
.default_value(std::ffi::OsStr::new("true"))
.help("Output the log to file")
Expand Down
87 changes: 53 additions & 34 deletions src/yaml_validator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ pub mod yaml_validator_mod {
use noodles::{cram, fasta};
use serde::{Deserialize, Serialize};
use std::fs::{self, File};
use std::marker::PhantomData;
use std::path::PathBuf;
use walkdir::WalkDir;

Expand All @@ -27,7 +28,8 @@ pub mod yaml_validator_mod {
}

#[derive(Debug, Serialize, Deserialize)]
struct YamlResults {
// https://doc.rust-lang.org/std/marker/struct.PhantomData.html
struct YamlResults<'a> {
ReferenceResults: String,
CramResults: CRAMtags,
AlignerResults: String,
Expand All @@ -37,9 +39,10 @@ pub mod yaml_validator_mod {
KmerProfileResults: String,
GenesetResults: Vec<String>,
SyntenicResults: Vec<String>,
phantom: PhantomData<&'a String>,
}

impl std::fmt::Display for YamlResults {
impl<'a> std::fmt::Display for YamlResults<'a> {
// Pretty Printing YamlResults
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
write!(
Expand All @@ -59,7 +62,7 @@ pub mod yaml_validator_mod {
}
}

impl YamlResults {
impl<'a> YamlResults<'a> {
fn is_cram_valid(&self) -> String {
// this should add a field to the cramresults struct
if !self.CramResults.header_read_groups.is_empty() {
Expand Down Expand Up @@ -101,18 +104,18 @@ pub mod yaml_validator_mod {
failures
}

#[allow(unused_variables)]
fn check_secondaries(&self, secondary_list: Vec<&Vec<String>>) -> Vec<String> {
let failures: Vec<String> = Vec::new();
// TODO: Complete this
// let fails = for i in secondary_list {
// let fails: Vec<&String> = i
// .into_iter()
// .filter(|j| j.contains("FAIL") || j.contains("NO"))
// .collect();
// };
fn check_secondaries(&'a self, secondary_list: Vec<&'a Vec<String>>) -> Vec<&String> {
let mut failures: Vec<&String> = Vec::new();
for i in secondary_list {
let collection = i
.into_iter()
.filter(|j| j.contains("FAIL") || j.contains("NO"))
.collect::<Vec<&String>>();

// for i in fails {}
for i in collection {
failures.push(i)
}
}

failures
}
Expand Down Expand Up @@ -141,10 +144,7 @@ pub mod yaml_validator_mod {
let failed_primary_count = &failed_primaries.len();
let failed_secondary_count = &failed_secondary.len();

println!("{:?}", &failed_primaries);
println!("{:?}", &failed_secondary);

if !failed_primaries.is_empty() {
if &failed_primaries.len() >= &1 {
println!(
"Primary Values Failed: {}\nSecondary Values Failed: {}\nPrimary Values that failed:\n{:?}\nSecondary Values that failed (These are not essential for TreeVal):\n{:?}\n",
failed_primary_count, failed_secondary_count,
Expand Down Expand Up @@ -204,8 +204,8 @@ pub mod yaml_validator_mod {
/// Struct functions
impl TreeValYaml {
/// Pour the results into a results struct
fn into_results(self) -> YamlResults {
YamlResults {
fn into_results(self) -> YamlResults<'static> {
let results = YamlResults {
ReferenceResults: self.validate_fasta(),
CramResults: self.hic_data.validate_cram().1,
AlignerResults: self.hic_data.validate_aligner(),
Expand All @@ -215,7 +215,9 @@ pub mod yaml_validator_mod {
KmerProfileResults: self.validate_kmer_prof(),
GenesetResults: self.validate_genesets(),
SyntenicResults: self.validate_synteny(),
}
phantom: PhantomData,
};
results
}

/// Validate that the input fasta is infact a fasta format and count records.
Expand Down Expand Up @@ -574,14 +576,24 @@ pub mod yaml_validator_mod {
/// Validate the yaml file required for the TreeVal pipeline
pub fn validate_yaml(arguments: std::option::Option<&ArgMatches>) {
let file = arguments.unwrap().get_one::<String>("yaml").unwrap();
let output: &bool = arguments.unwrap().get_one::<bool>("output").unwrap();

// TODO: Complete this
// let output_file = if output.to_owned() {
// "./yamlresults.txt".to_string()
// } else {
// "".to_string()
// };
let output_to_file: &bool = arguments
.unwrap()
.get_one::<bool>("output_to_file")
.unwrap();
let output_to_stdout: &bool = arguments
.unwrap()
.get_one::<bool>("output_to_stdout")
.unwrap();
let output_to_pipeline: &bool = arguments
.unwrap()
.get_one::<bool>("output_to_pipeline")
.unwrap();

let output_file = if output_to_file.to_owned() {
"./yamlresults.txt".to_string()
} else {
"".to_string()
};

println! {"Validating Yaml: {}", file.purple()};

Expand All @@ -590,12 +602,19 @@ pub mod yaml_validator_mod {
serde_yaml::from_reader(input).expect("Unable to read from file");

let results = contents.into_results();
//results.to_stdout();

// results
// .to_file(output_file)
// .expect("Can't create final report");
if output_to_stdout == &true {
results.to_stdout();
}

results.to_check()
if output_to_file == &true {
results
.to_file(output_file)
.expect("Can't create final report");
}

if output_to_pipeline == &true {
results.to_check()
}
}
}
2 changes: 1 addition & 1 deletion test_data/yaml/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ reference_file: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/assembly/
map_order: unsorted
assem_reads:
read_type: hifi
read_data: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/genomic_data/pacbio/
read_data: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/genomic_data/pacbio2/
supplementary_data: path
hic_data:
hic_cram: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/genomic_data/hic-arima/
Expand Down
Loading