From 74660d466c06edd6704ec39f3c385201d5bf6659 Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Sun, 21 Jan 2024 20:27:29 -0500 Subject: [PATCH] work on pretokenization --- genimtools/src/main.rs | 4 +- genimtools/src/tools/cli.rs | 75 ++++++++++++++++++++++++++++++++++++- genimtools/src/tools/mod.rs | 5 ++- 3 files changed, 79 insertions(+), 5 deletions(-) diff --git a/genimtools/src/main.rs b/genimtools/src/main.rs index f447bf4b..d06a5f52 100644 --- a/genimtools/src/main.rs +++ b/genimtools/src/main.rs @@ -36,8 +36,8 @@ fn main() { Some((tokenizers::consts::TOKENIZE_CMD, matches)) => { tokenizers::cli::handlers::tokenize_bed_file(matches); } - Some((tools::consts::DATA_DIR_STAT_CMD, matches)) => { - tools::cli::handlers::data_dir_stat_handler(matches); + Some((tools::consts::TOOLS_CMD, matches)) => { + tools::cli::handlers::tools_handler(matches); } _ => unreachable!("Subcommand not found"), diff --git a/genimtools/src/tools/cli.rs b/genimtools/src/tools/cli.rs index 85084c97..d374dcfc 100644 --- a/genimtools/src/tools/cli.rs +++ b/genimtools/src/tools/cli.rs @@ -1,7 +1,7 @@ use super::*; use clap::{arg, ArgMatches, Command}; -pub fn make_tools_cli() -> Command { +fn make_data_dir_stat_cli() -> Command { Command::new(consts::DATA_DIR_STAT_CMD) .author("Nathan LeRoy") .about("Collect data statistics on all bed files in a directory.") @@ -13,10 +13,45 @@ pub fn make_tools_cli() -> Command { .arg(arg!( "Path to the data directory.").required(true)) } +fn make_pre_tokenization_cli() -> Command { + Command::new(consts::PRE_TOKENIZATION_CMD) + .about("Pre-tokenize a bed file or folder of bed files into a specific universe.") + .arg( + arg!(--universe "Path to the output folder or file.") + .required(true), + ) + // positional path + .arg(arg!( "Path to the data directory.").required(true)) +} + +pub fn make_tools_cli() -> Command { + Command::new(consts::TOOLS_CMD) + .author("Nathan LeRoy") + .about("Tools for working with genomic data.") + .subcommand(make_data_dir_stat_cli()) + .subcommand(make_pre_tokenization_cli()) +} + pub mod handlers { + use std::path::{Path, PathBuf}; + + use crate::{tokenizers::{self, Tokenizer}, common::models::RegionSet}; + use super::*; + pub fn tools_handler(matches: &ArgMatches) { + match matches.subcommand() { + Some((consts::DATA_DIR_STAT_CMD, matches)) => { + data_dir_stat_handler(matches); + }, + Some((consts::PRE_TOKENIZATION_CMD, matches)) => { + pre_tokenization_handler(matches); + } + _ => unreachable!("Subcommand not found"), + } + } + pub fn data_dir_stat_handler(matches: &ArgMatches) { let path = matches .get_one::("path") @@ -24,10 +59,46 @@ pub mod handlers { let out = matches .get_one::("out") - .unwrap_or(&consts::DEFAULT_OUTPUT.to_string()) + .unwrap_or(&consts::DEFAULT_DATA_DIR_STAT_OUTPUT.to_string()) .to_owned(); // core logic/algorithm here data_dir_stat(path, out.as_str()); } + + pub fn pre_tokenization_handler(matches: &ArgMatches) { + let path = matches + .get_one::("path") + .expect("Path to either a data file or a directory with data is required"); + + let universe = matches + .get_one::("universe") + .expect("Path to the universe file is required"); + + // check if the path is a file or a directory + let path_to_data = Path::new(&path); + let universe = Path::new(&universe); + + if path_to_data.is_file() { + let file_name = path_to_data.file_stem(); + match file_name { + Some(file_name) => { + let new_file = format!("{}.{}", file_name.to_str().unwrap(), consts::PRE_TOKENIZATION_EXT); + let new_file = Path::new(&new_file); + let tokenizer = tokenizers::TreeTokenizer::from(universe); + let data = RegionSet::try_from(path_to_data); + match data { + Ok(data) => { + let result = tokenizer.tokenize_region_set(&data).expect("Data couldn't be tokenized."); + let _ = result.to_gtok_file(&PathBuf::from(new_file)); + }, + Err(e) => panic!("There was an error readig the data file: {}", e) + } + }, + None => panic!("There was an issue extracting the name of the file.") + } + + } + + } } \ No newline at end of file diff --git a/genimtools/src/tools/mod.rs b/genimtools/src/tools/mod.rs index e9278504..1b880e98 100644 --- a/genimtools/src/tools/mod.rs +++ b/genimtools/src/tools/mod.rs @@ -7,8 +7,11 @@ use walkdir::WalkDir; pub mod cli; pub mod consts { + pub const TOOLS_CMD: &str = "tools"; pub const DATA_DIR_STAT_CMD: &str = "dir-stat"; - pub const DEFAULT_OUTPUT: &str = "output.tsv"; + pub const PRE_TOKENIZATION_CMD: &str = "pretokenize"; + pub const DEFAULT_DATA_DIR_STAT_OUTPUT: &str = "output.tsv"; + pub const PRE_TOKENIZATION_EXT: &str = "gtok"; } ///