diff --git a/Cargo.toml b/Cargo.toml index c0807f6..2ccc4ab 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,10 @@ [package] name = "tataki" +authors = ["Tazro Ohta (tazro.ohta@chiba-u.jp)"] version = "0.2.0" edition = "2021" -license = "Apache-2.0" +repository = "https://github.com/sapporo-wes/tataki" +license = "apache-2.0" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/README.md b/README.md index d20e0d9..9ba64d9 100644 --- a/README.md +++ b/README.md @@ -1,44 +1,148 @@ # Tataki -**This repository is currently under development.** +Tataki is a command-line tool designed primarily for detecting file formats in the bio-science field with the following features: -Tataki is a command line tool for detecting life science data types. - -Currently supports the following file types. - -- bam -- fasta -- fastq -- fastq.gz -- bed +- Supports various **file formats mainly used in bio-science** + - bam + - bcf + - bed + - cram + - fasta + - fastq + - gff3 + - gtf + - sam + - vcf + - will be added in the future +- Allows for the invocation of a [**CWL document**](https://www.commonwl.org/) and enables users to define their own complex criteria for detection. +- Can target both local files and remote URLs +- Compatible with [EDAM ontology](https://edamontology.org/page) ## Installation -A single binary is available (supports Linux only): +A single binary is available for Linux x86_64. ```shell -curl -fsSL -O https://github.com/suecharo/tataki/releases/download/0.1.0/tataki +curl -fsSL -O https://github.com/sapporo-wes/tataki/releases/latest/download/tataki chmod +x ./tataki -./tataki -h +./tataki -V ``` Or, you could clone the repository, then run `cargo build`. -## Example +## Usage + +Specify the paths of the files as arguments to `tataki`. Both local file path and remote URL are supported. + +```shell +tataki ... +``` + +For more details: + +```shell +$ tataki --help +Usage: tataki [OPTIONS] [FILE|URL]... 
+ +Arguments: + [FILE|URL]... Path to the file + +Options: + -o, --output Path to the output file [default: stdout] + -f [default: csv] [possible values: yaml, tsv, csv, json] + --cache-dir Specify the directory in which to create a temporary directory. If this option is not provided, a temporary directory will be created in the default system temporary directory (/tmp) + -c, --conf Specify the tataki configuration file. If this option is not provided, the default configuration will be used. The option `--dry-run` shows the default configuration file + --dry-run Output the configuration file in yaml format and exit the program. If `--conf` option is not provided, the default configuration file will be shown + -v, --verbose Sets the level of verbosity + -q, --quiet Suppress all log messages + -h, --help Print help + -V, --version Print version -```txt -$ tataki bed12.bed -bed12.bed: 12 column BED file -$ tataki fastq01.fq.gz -fastq01.fq.gz: gzip compressed fastq file +Version: 0.2.0 ``` -## Todo +### Determining Formats in Your Preferred Order + +Using the `-c|--conf=` option allows you to change the order or set the file formats to use for determination. + +The configuration file is in YAML format. Please refer to the default configuration file for the schema. + +```yaml +order: + - bam + - bcf + - bed + - cram + - fasta + - fastq + - gff3 + - gtf + - sam + - vcf +``` + +### Executing a CWL Document with External Extension Mode + +Tataki can also be used to execute a CWL document with external extension mode. This is useful when determining file formats that are not supported in pre-built mode or when you want to perform complex detections. + +This mode is dependent on Docker, so please ensure that 'docker' is in your PATH. -- add support for more file types, such as .sam, .vcf, .gtf, etc. -- add support for EDAM ontology. -- implement fast mode with which the tool could perform well on larger files. 
+Here are the steps to execute a CWL document with external extension mode. + +1. Prepare a CWL document +2. Specify the CWL document in the configuration file +3. Execute `tataki`. + +#### Preparation of CWL Document + +The CWL document must be prepared in advance. The following is an example of a CWL document that executes `samtools head`. + +`edam_id` and `label` are the two required fields for the CWL document. Both must be prefixed with `tataki`, the namespace declared in the `$namespaces` section of the document. + +```cwl +cwlVersion: v1.2 +class: CommandLineTool + +requirements: + DockerRequirement: + dockerPull: quay.io/biocontainers/samtools:1.18--h50ea8bc_1 + InlineJavascriptRequirement: {} + +baseCommand: [samtools, head] + +successCodes: [0, 139] + +inputs: + input_file: + type: File + inputBinding: + position: 1 + +outputs: {} + +$namespaces: + tataki: https://github.com/sapporo-wes/tataki + +tataki:edam_id: http://edamontology.org/format_2573 +tataki:label: SAM +``` + +#### Configuration File + +Insert a path to the CWL document in [the configuration file](#determining-formats-in-your-preferred-order). The example below executes the CWL document first, followed by SAM and BAM format detection. + +```yaml +order: + - ./path/to/cwl_document.cwl + - sam + - bam +``` ## License -[Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0). See the [LICENSE](https://github.com/suecharo/tataki/blob/main/LICENSE). +The contents of this deposit are basically licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0). See the [LICENSE](https://github.com/sapporo-wes/tataki/blob/main/LICENSE). +However, the following files are licensed under Creative Commons Attribution Share Alike 4.0 International (https://creativecommons.org/licenses/by-sa/4.0/). 
+ +- ./src/EDAM_1.25.id_label.csv + - Source: + - Removed the lines not related to 'format' and the columns other than 'Preferred Label' and 'Class ID' diff --git a/cwl/sam_head.cwl b/cwl/sam_head.cwl new file mode 100644 index 0000000..7922fe8 --- /dev/null +++ b/cwl/sam_head.cwl @@ -0,0 +1,25 @@ +cwlVersion: v1.2 +class: CommandLineTool + +requirements: + DockerRequirement: + dockerPull: quay.io/biocontainers/samtools:1.18--h50ea8bc_1 + InlineJavascriptRequirement: {} + +baseCommand: [samtools, head] + +successCodes: [0, 139] + +inputs: + input_file: + type: File + inputBinding: + position: 1 + +outputs: {} + +$namespaces: + tataki: https://github.com/sapporo-wes/tataki + +tataki:edam_id: http://edamontology.org/format_2573 +tataki:label: SAM \ No newline at end of file diff --git a/src/args.rs b/src/args.rs index b945747..a456ec5 100644 --- a/src/args.rs +++ b/src/args.rs @@ -6,8 +6,7 @@ pub enum OutputFormat { Yaml, Tsv, Csv, - // Output only Edam label. - Edam, + Json, } #[derive(Parser, Debug)] @@ -15,34 +14,32 @@ pub enum OutputFormat { name = env!("CARGO_PKG_NAME"), about = env!("CARGO_PKG_DESCRIPTION"), version = env!("CARGO_PKG_VERSION"), - after_help = "", + after_help = concat!("Version: ", env!("CARGO_PKG_VERSION")), arg_required_else_help = true, )] pub struct Args { /// Path to the file - #[clap(name = "FILE", required_unless_present = "dry_run")] - // pub input: Option, + #[clap(name = "FILE|URL", required_unless_present = "dry_run")] pub input: Vec, /// Path to the output file [default: stdout] #[clap(short, long, value_name = "FILE")] pub output: Option, - #[clap(short = 'f', value_enum, default_value = "edam",conflicts_with_all = ["yaml"])] + #[clap(short = 'f', value_enum, default_value = "csv",conflicts_with_all = ["yaml"])] output_format: OutputFormat, #[clap(long, hide = true)] yaml: bool, - // TODO これの実装がまだ。 /// Specify the directory in which to create a temporary directory. 
If this option is not provided, a temporary directory will be created in the default system temporary directory (/tmp). #[clap(long, value_name = "DIR")] pub cache_dir: Option, - #[clap(long)] - pub full_fetch: bool, - + // TODO + // #[clap(long, hide = true)] + // pub full_fetch: bool, /// Specify the tataki configuration file. If this option is not provided, the default configuration will be used. /// The option `--dry-run` shows the default configuration file. #[clap(short, long, value_name = "FILE")] diff --git a/src/edam.rs b/src/edam.rs index 658fd8f..b8aaac5 100644 --- a/src/edam.rs +++ b/src/edam.rs @@ -1,11 +1,7 @@ use anyhow::Result; use bimap::BiMap; use lazy_static::lazy_static; -use log::warn; -use serde::{Deserialize, Serialize}; -use std::{collections::HashMap, f32::consts::E}; - -use crate::OutputFormat; +use serde::Deserialize; lazy_static! { #[derive(Debug)] @@ -15,9 +11,7 @@ lazy_static! { #[derive(Debug)] // A struct to validate user specified EDAM information. pub struct EdamMap { - // TODO これはBiMapに変えたのであとで消す - // Map of EDAM ID and Edam struct instance whose id is the key. - // label_to_edam: HashMap, + // A bimap of EDAM ID and EDAM label. bimap_id_label: BiMap, } @@ -28,10 +22,9 @@ impl EdamMap { .has_headers(true) .from_reader(&edam_str[..]); - let mut edam_map: HashMap = HashMap::new(); let mut bimap = BiMap::new(); for result in rdr.deserialize::() { - // resultがErrの時はpanicする + // panic if this fails to read EDAM table. match result { Ok(record) => { // edam_map.insert(record.label.clone(), record.clone()); @@ -54,7 +47,7 @@ impl EdamMap { } // check if the given pair of id and label exists in the EDAM table. 
- pub fn check_id_and_label(&self, id: &str, label: &str) -> Result { + pub fn correspondence_check_id_and_label(&self, id: &str, label: &str) -> Result { let label_from_bimap = self.bimap_id_label.get_by_left(id); match label_from_bimap { diff --git a/src/ext_tools.rs b/src/ext_tools.rs index b488b46..ff71d4b 100644 --- a/src/ext_tools.rs +++ b/src/ext_tools.rs @@ -1,8 +1,7 @@ use anyhow::{anyhow, bail, Context, Result}; -use log::{debug, info, warn}; +use log::{debug, info}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; -use std::io::BufRead; use std::io::Write; use std::path::{Path, PathBuf}; use tempfile::{Builder, NamedTempFile, TempDir}; @@ -11,8 +10,8 @@ use crate::edam; use crate::run::ModuleResult; const CWL_INSPECTOR_DOCKER_IMAGE: &str = "ghcr.io/tom-tan/cwl-inspector:v0.1.1"; -const LABEL_KEY: &str = "LABEL"; -const EDAM_ID_KEY: &str = "EDAM_ID"; +const LABEL_KEY: &str = "label"; +const EDAM_ID_KEY: &str = "edam_id"; pub fn invoke( cwl_file_path: &Path, @@ -36,8 +35,8 @@ pub fn invoke( .with_context(|| format!("Failed to canonicalize {}", cwl_file_path.display()))?; // get the EDAM_ID and LABEL from the comment lines in the CWL file. - let mut cwl_edam_info = get_id_and_label_from_cwl_file(&cwl_file_path)?; - validate_and_correct_id_and_label(&mut cwl_edam_info, &cwl_file_path)?; + let mut cwl_metadatas = get_metadata_fields_from_cwl_file(&cwl_file_path)?; + validate_id_and_label(&mut cwl_metadatas, &cwl_file_path)?; // create a docker commandline from the CWL file using cwl-inspector. 
let inspector_process = std::process::Command::new("docker") @@ -115,6 +114,7 @@ pub fn invoke( cwl_docker_args_before_v.join(" "), cwl_docker_args_after_v.join(" ") ); + let cwl_docker_process = std::process::Command::new(cwl_docker_commandname) .args(cwl_docker_args_before_v) .args(cwl_docker_args_after_v) @@ -123,8 +123,8 @@ pub fn invoke( .output()?; let mut module_result = ModuleResult::with_result( - cwl_edam_info.get(LABEL_KEY).map(|s| s.to_string()), - cwl_edam_info.get(EDAM_ID_KEY).map(|s| s.to_string()), + cwl_metadatas.get(LABEL_KEY).map(|s| s.to_string()), + cwl_metadatas.get(EDAM_ID_KEY).map(|s| s.to_string()), ); module_result.set_is_ok(cwl_docker_process.status.success()); @@ -136,30 +136,65 @@ pub fn invoke( Ok(module_result) } -fn get_id_and_label_from_cwl_file(cwl_file_path: &Path) -> Result> { - // # EDAM_ID=format_2573 - // # LABEL=sam - // このようになっているファイルから、EDAM_IDとLABELを取得する。 +fn docker_path() -> Result { + let process = std::process::Command::new("which") + .arg("docker") + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .output()?; + + if process.status.success() { + let path = String::from_utf8(process.stdout)?; + Ok(PathBuf::from(path.trim())) + } else { + bail!("Please make sure that the docker command is present in your PATH"); + } +} + +#[derive(Deserialize, Debug)] +struct CwlMetadata { + #[serde(flatten)] + entries: HashMap, + #[serde(rename = "$namespaces")] + namespaces: HashMap, +} + +fn get_metadata_fields_from_cwl_file(cwl_file_path: &Path) -> Result> { + // Extract the EDAM_ID and LABEL from metadata in the CWL file. 
ex: + // $namespaces: + // tataki: https://tataki.io/ + // tataki:edam_id: http://edamontology.org/format_2573 + // tataki:label: sam let file = std::fs::File::open(cwl_file_path)?; let reader = std::io::BufReader::new(file); + let cwl_metadata: CwlMetadata = serde_yaml::from_reader(reader) + .with_context(|| format!("Failed to parse the CWL file: {}", cwl_file_path.display()))?; - let mut parameters = HashMap::new(); - for line_result in reader.lines() { - let line = line_result?; - if line.starts_with('#') { - if let Some((key, value)) = parse_parameter_in_cwl_comment_line(&line)? { - let key = key.to_uppercase(); - if key == EDAM_ID_KEY || key == LABEL_KEY { - parameters.insert(key, value); - } - } + let mut extracted_fields: HashMap = HashMap::new(); + let (prefix, _) = cwl_metadata.namespaces.iter().next().ok_or_else(|| { + anyhow!( + "The CWL file does not have the $namespaces field: {}", + cwl_file_path.display() + ) + })?; + if prefix != "tataki" { + bail!( + "The CWL file does not have the 'tataki' namespace: {}", + cwl_file_path.display() + ); + } + for (key, value) in cwl_metadata.entries.iter() { + if let Some(stripped_key) = key.strip_prefix(&format!("{}:", prefix)) { + let value = serde_yaml::to_string(value)?; + let value = value.trim_end(); + extracted_fields.insert(stripped_key.to_string(), value.to_owned()); } } - Ok(parameters) + Ok(extracted_fields) } -fn validate_and_correct_id_and_label( +fn validate_id_and_label( parameters: &mut HashMap, cwl_file_path: &Path, ) -> Result<()> { @@ -167,90 +202,27 @@ fn validate_and_correct_id_and_label( if parameters.contains_key(EDAM_ID_KEY) && parameters.contains_key(LABEL_KEY) { let id = parameters.get(EDAM_ID_KEY).unwrap(); let label = parameters.get(LABEL_KEY).unwrap(); - if edam::EDAM_MAP.check_id_and_label(id, label)? { - warn!( - "The specified pair of EDAM_ID and label in the CWL file does not exist in the EDAM table. 
Please check the ID and label: EDAM_ID={}, LABEL={}, CWL file={}", - id, - label, - cwl_file_path.display() - ); - parameters.remove(LABEL_KEY); - } - } - // if only LABEL is present, get the ID from the label if possible. - else if parameters.contains_key(LABEL_KEY) { - let label = parameters.get(LABEL_KEY).unwrap(); - let id = edam::EDAM_MAP.get_id_from_label(label); - if let Some(id) = id { - debug!( - "The EDAM ID to the specified label is found: EDAM_ID={}, LABEL={}, CWL file={}", - id, - label, - cwl_file_path.display() - ); - parameters.insert(EDAM_ID_KEY.to_string(), id); - } else { + if !edam::EDAM_MAP.correspondence_check_id_and_label(id, label)? { info!( - "The specified label is not found in EDAM table. Assuming it is custom operation name...: LABEL={}, CWL file={}", + "The specified edam_id and label do not correspond with each other. Assuming it is a custom label...: edam_id={}, label={}, CWL file={}", + id, label, cwl_file_path.display() ); } } - // if only ID is present, respect it and do nothing - else if parameters.contains_key(EDAM_ID_KEY) { - // do nothing - } // if both EDAM_ID and LABEL are not present, return error. else { bail!( - "Neither EDAM_ID nor LABEL is present in the CWL file: {}", + "The CWL file is missing required fields under the 'tataki' namespace. Please ensure that both 'tataki.{}' and 'tataki.{}' fields are included in the file.: CWL file={}", + EDAM_ID_KEY, + LABEL_KEY, cwl_file_path.display() ); } Ok(()) } -fn parse_parameter_in_cwl_comment_line(line: &str) -> Result> { - let line = line.trim_start_matches('#').trim(); - let parts: Vec<&str> = line.split('=').map(|part| part.trim()).collect(); - if parts.len() == 2 { - let key = parts[0].to_string(); - let mut value = parts[1].to_string(); - - // if the value is quoted, remove the quotes. 
- if (value.starts_with('\"') && value.ends_with('\"') && value.len() > 1) - || (value.starts_with('\'') && value.ends_with('\'') && value.len() > 1) - { - value = value.trim_matches(|c| c == '\"' || c == '\'').to_string(); - } - - Ok(Some((key, value))) - } else { - warn!( - "Failed to parse a parameter in a CWL comment line: {}", - line - ); - Ok(None) - } -} - -fn docker_path() -> Result { - let process = std::process::Command::new("which") - .arg("docker") - .stdout(std::process::Stdio::piped()) - .stderr(std::process::Stdio::piped()) - .output()?; - - if process.status.success() { - // processの結果をPathBufに変換する - let path = String::from_utf8(process.stdout)?; - Ok(PathBuf::from(path.trim())) - } else { - bail!("Please make sure that the docker command is present in your PATH"); - } -} - #[derive(Serialize, Deserialize, Debug)] struct InputFile { class: String, diff --git a/src/fetch.rs b/src/fetch.rs index ec481b1..cb0aa5b 100644 --- a/src/fetch.rs +++ b/src/fetch.rs @@ -1,10 +1,9 @@ -use anyhow::{bail, ensure, Result}; +use anyhow::{ensure, Result}; use chrono::Local; -use reqwest; -use std::path::{Path, PathBuf}; +use std::path::PathBuf; use std::time; use std::{fs::File, io::Write}; -use tempfile::{Builder, NamedTempFile, TempDir}; +use tempfile::TempDir; use url::Url; pub fn create_temporary_dir(cache_dir: &Option) -> Result { @@ -23,7 +22,7 @@ pub fn create_temporary_dir(cache_dir: &Option) -> Result { } } -pub fn download_from_url(url: Url, temp_dir: &TempDir) -> Result { +pub fn download_from_url(url: &Url, temp_dir: &TempDir) -> Result { // timeout in 60 * 60 seconds let client = reqwest::blocking::Client::builder() .timeout(time::Duration::from_secs(3600)) diff --git a/src/main.rs b/src/main.rs index 98518aa..36a825a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,17 +6,13 @@ mod logger; mod parser; mod run; -use anyhow::{anyhow, Result}; +use anyhow::{Context, Result}; use clap::Parser; -use log::{debug, error, info, warn}; -use serde::{Deserialize, 
Serialize}; -use std::collections::HashMap; +use log::{debug, info}; use std::fs::File; use std::io::BufReader; -use std::process::exit; use crate::args::OutputFormat; -use crate::edam::EDAM_MAP; use crate::run::Config; fn main() -> Result<()> { @@ -32,7 +28,12 @@ fn main() -> Result<()> { Some(path) => { let config_file = File::open(path)?; let reader = BufReader::new(config_file); - serde_yaml::from_reader(reader)? + serde_yaml::from_reader(reader).with_context(|| { + format!( + "Failed to parse the config file: {}", + path.to_str().unwrap(), + ) + })? } }; @@ -40,8 +41,8 @@ fn main() -> Result<()> { run::dry_run(config)?; } else { info!("tataki started"); - debug!("args: {:?}", args); - debug!("output format: {:?}", args.get_output_format()); + debug!("Args: {:?}", args); + debug!("Output format: {:?}", args.get_output_format()); run::run(config, args)?; } diff --git a/src/parser.rs b/src/parser.rs index 3cd9ed5..3a9f160 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -2,6 +2,7 @@ mod bam; mod bcf; mod bed; mod cram; +mod empty; mod fasta; mod fastq; mod gff3; @@ -16,12 +17,12 @@ use std::path::Path; use crate::run::ModuleResult; pub trait Parser { - /// Determine whether the given file is the format that this parser can parse. - /// If the given file is the format that this parser can parse, return `Ok(ModuleResult)`. - /// Otherwise, return `Err(anyhow::Error)` and the error message should provide the reason why this parser cannot parse the given file. - /// To create `ModuleResult`, use `ModuleResult::with_result()` which takes `is_ok`, `label`, `id`, and `error_message` as arguments. - /// `is_ok`: true if the given file is the format that this parser can parse, false otherwise. - /// `label`: + /// Determine if the provided file is in a format that this parser can interpret. + /// If the parser can successfully interpret the file, return `Ok(ModuleResult)`. 
+ /// If it does not, return `Err(anyhow::Error)`, including an error message that specifies the reasons why the parser cannot process the file. + /// To construct `ModuleResult`, utilize `ModuleResult::with_result()` which requires `label` and `id` as parameters. + /// `id`: EDAM Class ID + /// `label`: EDAM Preferred Label fn determine(&self, input_path: &Path) -> Result; } @@ -32,6 +33,7 @@ pub fn from_str_to_parser(module_name: &str) -> Result> { "bcf" => Ok(Box::new(bcf::Bcf)), "bed" => Ok(Box::new(bed::Bed)), "cram" => Ok(Box::new(cram::Cram)), + "empty" => Ok(Box::new(empty::Empty)), "fasta" => Ok(Box::new(fasta::Fasta)), "fastq" => Ok(Box::new(fastq::Fastq)), "gff3" => Ok(Box::new(gff3::Gff3)), @@ -43,8 +45,7 @@ pub fn from_str_to_parser(module_name: &str) -> Result> { } } -// Result<()>ではなく、Resultを返すようにしているのは、 -// determine()自体の成功可否をModuleResult.is_ok、他の処理の成功可否をOk/Errで表現できるようにするため +// Return the result of determine() using Ok(ModuleResult), and return errors in other parts using Err. pub fn invoke(module_name: &str, target_file_path: &Path) -> Result { info!("Invoking parser {}", module_name); diff --git a/src/parser/empty.rs b/src/parser/empty.rs new file mode 100644 index 0000000..a93aacb --- /dev/null +++ b/src/parser/empty.rs @@ -0,0 +1,22 @@ +use std::fs; +use std::path::Path; + +use crate::parser::Parser; +use crate::run::ModuleResult; + +pub struct Empty; + +impl Parser for Empty { + // check if the file is empty or not. 
+ fn determine(&self, input_path: &Path) -> anyhow::Result { + let metadata = fs::metadata(input_path)?; + if metadata.len() == 0 { + Ok(ModuleResult::with_result( + Some("plain text format (unformatted)".to_string()), + Some("http://edamontology.org/format_1964".to_string()), + )) + } else { + anyhow::bail!("The file is not empty"); + } + } +} diff --git a/src/run.rs b/src/run.rs index 36b9a07..dae84f8 100644 --- a/src/run.rs +++ b/src/run.rs @@ -1,7 +1,6 @@ -use anyhow::{anyhow, bail, Result}; -use log::{debug, error, info, warn}; -use serde::ser::SerializeMap; -use serde::{Deserialize, Serialize, Serializer}; +use anyhow::{anyhow, bail, Context, Result}; +use log::{debug, error, info}; +use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::path::{Path, PathBuf}; use tempfile::{NamedTempFile, TempDir}; @@ -15,36 +14,15 @@ use crate::OutputFormat; // Struct to store the result of Parser invocation and ExtTools invocation. #[derive(Debug)] -// TODO あとでこのpub外す pub struct ModuleResult { target_file_path: PathBuf, is_ok: bool, label: Option, id: Option, error_message: Option, - is_edam: bool, -} - -// TODO こいついらなくなったかも。 -impl Serialize for ModuleResult { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - let mut state = serializer.serialize_map(Some(2))?; - - // Someの中身がNoneの場合は、serialize_entryしない。 - - state.serialize_entry( - &self.target_file_path, - &HashMap::from([("id", &self.id), ("label", &self.label)]), - )?; - state.end() - } } impl ModuleResult { - // TODO こいつで書き直す pub fn with_result(label: Option, id: Option) -> Self { Self { target_file_path: PathBuf::new(), @@ -52,7 +30,6 @@ impl ModuleResult { label, id, error_message: None, - is_edam: true, } } @@ -67,22 +44,76 @@ impl ModuleResult { pub fn set_target_file_path(&mut self, target_file_path: PathBuf) { self.target_file_path = target_file_path; } + + pub fn create_module_results_string( + module_results: &[ModuleResult], + format: OutputFormat, + ) -> 
Result { + fn csv_serialize(module_results: &[ModuleResult], delimiter: u8) -> Result { + let mut data = Vec::new(); + { + let mut writer = csv::WriterBuilder::new() + .delimiter(delimiter) + .from_writer(&mut data); + + writer.write_record(["File Path", "Edam ID", "Label"])?; + + for module_result in module_results.iter() { + let target_file_path = &module_result.target_file_path; + writer.serialize(( + target_file_path.to_str().with_context(|| { + format!( + "Failed to convert the file path to a string: {}", + target_file_path.display() + ) + })?, + &module_result.id, + &module_result.label, + ))?; + } + } + + let data_str = String::from_utf8_lossy(&data); + Ok(data_str.into_owned()) + } + + match format { + OutputFormat::Yaml => { + let mut serialized_map = HashMap::new(); + for module_result in module_results { + let target_file_path = &module_result.target_file_path; + serialized_map.insert( + target_file_path.clone(), + HashMap::from([("id", &module_result.id), ("label", &module_result.label)]), + ); + } + + let yaml_str = serde_yaml::to_string(&serialized_map)?; + Ok(yaml_str) + } + OutputFormat::Tsv => csv_serialize(module_results, b'\t'), + OutputFormat::Csv => csv_serialize(module_results, b','), + OutputFormat::Json => { + let mut serialized_map = HashMap::new(); + for module_result in module_results { + let target_file_path = &module_result.target_file_path; + serialized_map.insert( + target_file_path.clone(), + HashMap::from([("id", &module_result.id), ("label", &module_result.label)]), + ); + } + + let json_str = serde_json::to_string(&serialized_map)?; + Ok(json_str) + } + } + } } // Struct to deserialize the contents of the conf file. #[derive(Debug, Serialize, Deserialize)] pub struct Config { - order: Vec, -} - -// Enum to represent the operation and to deserialize the contents of the conf file. 
-#[derive(Debug, Serialize, Deserialize)] -#[serde(untagged)] -enum Operation { - // module name - Default(String), - - Custom(HashMap), + order: Vec, } pub fn run(config: Config, args: Args) -> Result<()> { @@ -91,12 +122,23 @@ pub fn run(config: Config, args: Args) -> Result<()> { let mut module_results: Vec = Vec::new(); - for input in args.input { + // insert "empty" module at the beginning of the module order, so that the empty module is always invoked first. + let mut config = config; + config.order.insert(0, "empty".to_string()); + + for input in &args.input { + info!("Processing input: {}", input); + // Prepare input file path from url or local file path. // Download the file and store it in the specified cache directory if input is url. // let target_file_path = match input.as_ref().and_then(|input| Url::parse(input).ok()) { - let target_file_path = match Url::parse(&input).ok() { - Some(url) => crate::fetch::download_from_url(url, &temp_dir)?, + let target_file_path = match Url::parse(input).ok() { + Some(url) => { + info!("Downloading from {}", url); + let path = crate::fetch::download_from_url(&url, &temp_dir)?; + info!("Downloaded to {}", path.display()); + path + } None => { let path = PathBuf::from(input); if !path.exists() { @@ -115,22 +157,31 @@ pub fn run(config: Config, args: Args) -> Result<()> { module_results.push(module_result); } - let mut serialized_map = HashMap::new(); - for module_result in &module_results { - let target_file_path = &module_result.target_file_path; - serialized_map.insert( - target_file_path.clone(), - HashMap::from([("id", &module_result.id), ("label", &module_result.label)]), + // if args.cache_dir is Some, keep the temporary directory. + // Otherwise, delete the temporary directory. 
+ if args.cache_dir.is_some() { + info!( + "Keeping temporary directory: {}", + temp_dir.into_path().display() + ); + } else { + info!( + "Deleting temporary directory: {}", + temp_dir.path().display() ); + temp_dir.close()?; } - println!("----"); - let yaml_str = serde_yaml::to_string(&serialized_map)?; - println!("test_output1:\n\n{}", yaml_str); + let result_str = + ModuleResult::create_module_results_string(&module_results, args.get_output_format())?; - // TODO この出力方法だと、yamlが配列になっちゃう。消す? - let module_results_str = serde_yaml::to_string(&module_results)?; - println!("test_output2:\n\n{}", module_results_str); + // if args.output is Some, write the result to the specified file. Otherwise, write the result to stdout. + if let Some(output_path) = args.output { + info!("Writing the result to {}", output_path.display()); + std::fs::write(output_path, result_str)?; + } else { + println!("{}", result_str); + } Ok(()) } @@ -138,9 +189,9 @@ pub fn run(config: Config, args: Args) -> Result<()> { fn run_modules( target_file_path: PathBuf, config: &Config, - // cache_dir: &Option, temp_dir: &TempDir, ) -> Result { + // create an input file for CWL modules if there is any CWL module in the config file. let cwl_input_file_path: Option = if cwl_module_exists(config)? 
{ Some(ext_tools::make_cwl_input_file( target_file_path.clone(), @@ -150,68 +201,57 @@ fn run_modules( None }; - for item in &config.order { - let (operation_name, module) = match item { - Operation::Default(module) => (None, module), - Operation::Custom(custom) => { - let (operation_name, module) = custom - .iter() - .next() - .ok_or_else(|| anyhow!("Invalid custom operation specified."))?; - (Some(operation_name), module) - } - }; - - let module_path = Path::new(&module); - let module_extension = module_path - .extension() - .and_then(std::ffi::OsStr::to_str) - .unwrap_or(""); - - let mut module_result = match module_extension { - "" => parser::invoke(module, &target_file_path)?, - "cwl" => ext_tools::invoke( - module_path, - &target_file_path, - cwl_input_file_path.as_ref().unwrap(), - )?, - _ => anyhow::bail!( - "An unsupported file extension was specified for the module value in the conf file" - ), - }; - - module_result.set_target_file_path(target_file_path.clone()); - - if module_result.is_ok { - info!("Detected!! {}", module); - - if let Some(operation_name) = operation_name { - module_result.label = Some(operation_name.clone()); - module_result.id = None; - module_result.is_edam = false; + let module_result = config + .order + .iter() + .find_map(|module| { + let module_path = Path::new(&module); + let module_extension = module_path + .extension() + .and_then(std::ffi::OsStr::to_str) + .unwrap_or(""); + + let result = match module_extension { + "" => parser::invoke(module, &target_file_path), + "cwl" => ext_tools::invoke( + module_path, + &target_file_path, + cwl_input_file_path.as_ref().unwrap(), + ), + _ => Err(anyhow!( + "An unsupported file extension '.{}' was specified for the module value in the conf file. Only .cwl is supported for external extension mode.", + module_extension + )), + }; + + match result { + Ok(mut module_result) => { + if module_result.is_ok { + info!("Detected!! 
{}", module); + module_result.set_target_file_path(target_file_path.clone()); + Some(module_result) + } else { + debug!( + "Module \"{}\" failed. Reason:\n{}", + module, + module_result.error_message.unwrap_or("".to_string()) + ); + None + } + }, + Err(e) => { + error!("An error occurred while trying to invoke the \'{}\' module. Reason:\n{}", module, e); + None + }, } - - // TODO : for debug. delete later - // println!("\nend {:?}", &module_result); - return Ok(module_result); - } else { - debug!( - "Module \"{}\" failed. Reason:\n{}", - module, - module_result.error_message.unwrap_or("".to_string()) - ); - } - } - - // Found that no module can handle the input file, so return ModuleResult with is_ok=false. - return Ok(ModuleResult { - target_file_path, - is_ok: false, - label: None, - id: None, - error_message: None, - is_edam: false, - }); + }) + .unwrap_or_else(|| { + let mut none_result = ModuleResult::with_result(None, None); + none_result.set_target_file_path(target_file_path.clone()); + none_result + }); + + Ok(module_result) } pub fn dry_run(config: Config) -> Result<()> { @@ -223,19 +263,7 @@ pub fn dry_run(config: Config) -> Result<()> { } fn cwl_module_exists(config: &Config) -> Result { - for item in &config.order { - // TODO これ重複する作業なので、Operationにimplしてまとめたい - let (_, module) = match item { - Operation::Default(module) => (None, module), - Operation::Custom(custom) => { - let (operation_name, module) = custom - .iter() - .next() - .ok_or_else(|| anyhow!("Invalid custom operation specified."))?; - (Some(operation_name), module) - } - }; - + for module in &config.order { let module_path = Path::new(&module); let module_extension = module_path .extension() diff --git a/src/tataki.conf b/src/tataki.conf index 612cca0..7d30383 100644 --- a/src/tataki.conf +++ b/src/tataki.conf @@ -1,17 +1,12 @@ order: - # - ./tests/cwl_idea/sam_command.cwl - - this_is_sam_head_check: ./tests/cwl_idea/sam_command.cwl - # - this_is_fastqc_check: fastqc_check.cwl - - fastq - - 
fasta - bam - bcf - bed - cram + - fasta + - fastq - gff3 - gtf - sam - vcf - # - fastqc_check.cwl - # - this_is_bam: bam diff --git a/tests/cwl_idea/empty b/tests/cwl_idea/empty new file mode 100644 index 0000000..e69de29 diff --git a/tests/cwl_idea/toy.fa b/tests/cwl_idea/toy.fa index afe990a..e51af7c 100644 --- a/tests/cwl_idea/toy.fa +++ b/tests/cwl_idea/toy.fa @@ -1,4 +1,5 @@ >ref AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT + >ref2 aggttttataaaacaattaagtctacagagcaactacgcg