From a5047e096273bae647350226ce2c73d0cc640d97 Mon Sep 17 00:00:00 2001
From: Nathan LeRoy
Date: Mon, 22 Jan 2024 10:43:14 -0500
Subject: [PATCH] more io operations

---
 genimtools/src/ailist/mod.rs                  |  8 ++--
 .../src/common/models/tokenized_regionset.rs  | 12 ++----
 genimtools/src/io/mod.rs                      | 39 +++++++++++++++++++
 genimtools/src/lib.rs                         |  3 +-
 genimtools/src/tools/cli.rs                   |  4 +-
 5 files changed, 51 insertions(+), 15 deletions(-)
 create mode 100644 genimtools/src/io/mod.rs

diff --git a/genimtools/src/ailist/mod.rs b/genimtools/src/ailist/mod.rs
index 026ca0a3..7bc22e22 100644
--- a/genimtools/src/ailist/mod.rs
+++ b/genimtools/src/ailist/mod.rs
@@ -37,7 +37,7 @@ impl AIList {
 
             *intervals = results.3;
 
-            if intervals.len() == 0 {
+            if intervals.is_empty() {
                 break;
             } else {
                 header_list.push(starts.len());
@@ -53,7 +53,7 @@ impl AIList {
     }
 
     fn decompose(
-        intervals: &mut Vec<Interval>,
+        intervals: &mut [Interval],
         minimum_coverage_length: usize,
     ) -> (Vec<u32>, Vec<u32>, Vec<u32>, Vec<Interval>) {
         // look at the next minL*2 intervals
@@ -119,7 +119,7 @@ impl AIList {
             })
         }
 
-        return results_list;
+        results_list
     }
 
     pub fn query(&self, interval: &Interval) -> Vec<Interval> {
@@ -142,7 +142,7 @@ impl AIList {
             &self.max_ends[self.header_list[i]..],
         ));
 
-        return results_list;
+        results_list
     }
 
     pub fn print(&self) {
diff --git a/genimtools/src/common/models/tokenized_regionset.rs b/genimtools/src/common/models/tokenized_regionset.rs
index 5f97b97e..9246ca12 100644
--- a/genimtools/src/common/models/tokenized_regionset.rs
+++ b/genimtools/src/common/models/tokenized_regionset.rs
@@ -7,6 +7,7 @@ use crate::common::consts::{PAD_CHR, PAD_END, PAD_START};
 use crate::common::models::region::Region;
 use crate::common::models::tokenized_region::TokenizedRegion;
 use crate::common::models::universe::Universe;
+use crate::io::write_tokens_to_gtok;
 
 pub struct TokenizedRegionSet<'a> {
     pub regions: Vec<Region>,
@@ -72,14 +73,9 @@ impl<'a> TokenizedRegionSet<'a> {
     /// Write a TokenizedRegionSet to a .gtok file
     /// * `path` - A PathBuf to write the .gtok file to
     ///
-    pub fn to_gtok_file(&self, path: &PathBuf) -> Result<(), Box<dyn Error>> {
-        let mut file = File::create(path)?;
-        for region in self.regions.iter() {
-            let id = self.universe.convert_region_to_id(region);
-            let line = format!("{}\n", id);
-            file.write_all(line.as_bytes())?;
-        }
-
+    pub fn to_gtok_file(&self, path: &str) -> Result<(), Box<dyn Error>> {
+        let tokens = self.to_region_ids();
+        write_tokens_to_gtok(path, &tokens)?;
         Ok(())
     }
 
diff --git a/genimtools/src/io/mod.rs b/genimtools/src/io/mod.rs
new file mode 100644
index 00000000..ea5afb5e
--- /dev/null
+++ b/genimtools/src/io/mod.rs
@@ -0,0 +1,39 @@
+use std::fs::File;
+use std::io::{Write, Read, BufReader, BufWriter};
+
+///
+/// Writes a vector of tokens to a file in the `.gtok` format.
+/// # Arguments
+/// - filename: the file to save the tokens to
+/// - tokens: tokens to save
+///
+pub fn write_tokens_to_gtok(filename: &str, tokens: &[u32]) -> std::io::Result<()> {
+    let file = File::create(filename)?;
+    let mut writer = BufWriter::new(file);
+
+    for &token in tokens {
+        writer.write_all(&token.to_le_bytes())?;
+    }
+
+    Ok(())
+}
+
+///
+/// Read in a vector of tokens from a file in the `.gtok` format.
+/// # Arguments
+/// - filename: filename to read the tokens from
+///
+/// # Returns
+/// - vector of tokens in u32 format
+pub fn read_tokens_from_gtok(filename: &str) -> std::io::Result<Vec<u32>> {
+    let file = File::open(filename)?;
+    let mut reader = BufReader::new(file);
+    let mut tokens = Vec::new();
+    let mut buffer = [0; 4];
+
+    while let Ok(()) = reader.read_exact(&mut buffer) {
+        tokens.push(u32::from_le_bytes(buffer));
+    }
+
+    Ok(tokens)
+}
\ No newline at end of file
diff --git a/genimtools/src/lib.rs b/genimtools/src/lib.rs
index 9129528e..71157319 100644
--- a/genimtools/src/lib.rs
+++ b/genimtools/src/lib.rs
@@ -9,4 +9,5 @@ pub mod common;
 pub mod tokenizers;
 pub mod uniwig;
 pub mod vocab;
-pub mod tools;
\ No newline at end of file
+pub mod tools;
+pub mod io;
\ No newline at end of file
diff --git a/genimtools/src/tools/cli.rs b/genimtools/src/tools/cli.rs
index d374dcfc..c8b8cb6d 100644
--- a/genimtools/src/tools/cli.rs
+++ b/genimtools/src/tools/cli.rs
@@ -34,7 +34,7 @@ pub fn make_tools_cli() -> Command {
 
 pub mod handlers {
 
-    use std::path::{Path, PathBuf};
+    use std::path::Path;
 
     use crate::{tokenizers::{self, Tokenizer}, common::models::RegionSet};
 
@@ -90,7 +90,7 @@ pub mod handlers {
         match data {
             Ok(data) => {
                 let result = tokenizer.tokenize_region_set(&data).expect("Data couldn't be tokenized.");
-                let _ = result.to_gtok_file(&PathBuf::from(new_file));
+                let _ = result.to_gtok_file(new_file.to_str().unwrap());
             },
             Err(e) => panic!("There was an error readig the data file: {}", e)
         }
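
Reviewer note (not part of the patch): a minimal round-trip sketch of the two new `.gtok` helpers. It assumes the crate root is importable as `genimtools` (so the module added here is reachable as `genimtools::io`), a writable working directory, and an illustrative file name "example.gtok".

use genimtools::io::{read_tokens_from_gtok, write_tokens_to_gtok};

fn main() -> std::io::Result<()> {
    // Hypothetical token ids, chosen only for illustration.
    let tokens: Vec<u32> = vec![42, 101, 999];

    // Each token is written as 4 little-endian bytes, so the resulting
    // file should be tokens.len() * 4 bytes long.
    write_tokens_to_gtok("example.gtok", &tokens)?;

    // Reading stops at end-of-file: read_exact errors on the first
    // incomplete 4-byte chunk, which ends the while-let loop.
    let restored = read_tokens_from_gtok("example.gtok")?;
    assert_eq!(tokens, restored);

    Ok(())
}

Note that the new format is fixed-width binary rather than the one-decimal-id-per-line text that the old `to_gtok_file` wrote, so `.gtok` files produced before this patch will not round-trip through `read_tokens_from_gtok`.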