From c5b524b0f60678895c3c54b123dd8a3bb7277038 Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Mon, 22 Jan 2024 11:33:56 -0500 Subject: [PATCH] gtok file format --- bindings/Cargo.toml | 2 +- bindings/genimtools/utils/__init__.py | 1 + bindings/genimtools/utils/__init__.pyi | 11 +++++++++++ bindings/src/lib.rs | 5 +++++ bindings/src/models/region_set.rs | 1 + bindings/src/utils/mod.rs | 21 +++++++++++++++++++++ genimtools/Cargo.toml | 2 +- genimtools/docs/changelog.md | 4 ++++ 8 files changed, 45 insertions(+), 2 deletions(-) create mode 100644 bindings/genimtools/utils/__init__.py create mode 100644 bindings/genimtools/utils/__init__.pyi create mode 100644 bindings/src/utils/mod.rs diff --git a/bindings/Cargo.toml b/bindings/Cargo.toml index a06317b8..f04e3b76 100644 --- a/bindings/Cargo.toml +++ b/bindings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "genimtools-py" -version = "0.0.8" +version = "0.0.9" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/bindings/genimtools/utils/__init__.py b/bindings/genimtools/utils/__init__.py new file mode 100644 index 00000000..7b569c59 --- /dev/null +++ b/bindings/genimtools/utils/__init__.py @@ -0,0 +1 @@ +from .genimtools.utils import * \ No newline at end of file diff --git a/bindings/genimtools/utils/__init__.pyi b/bindings/genimtools/utils/__init__.pyi new file mode 100644 index 00000000..476431e0 --- /dev/null +++ b/bindings/genimtools/utils/__init__.pyi @@ -0,0 +1,11 @@ +from typing import List + +def write_tokens_to_gtok(filename: str, tokens: List[int]) -> None: + """ + Write a list of tokens to a gtok file. + """ + +def read_tokens_from_gtok(filename: str) -> List[int]: + """ + Read a list of tokens from a gtok file. + """ \ No newline at end of file diff --git a/bindings/src/lib.rs b/bindings/src/lib.rs index 25320d90..e958c002 100644 --- a/bindings/src/lib.rs +++ b/bindings/src/lib.rs @@ -6,6 +6,7 @@ mod consts; mod models; mod tokenizers; mod vocab; +mod utils; pub const VERSION: &str = env!("CARGO_PKG_VERSION"); @@ -14,9 +15,12 @@ fn genimtools(py: Python, m: &PyModule) -> PyResult<()> { let vocab_module = pyo3::wrap_pymodule!(vocab::vocab); let tokenize_module = pyo3::wrap_pymodule!(tokenizers::tokenizers); let ailist_module = pyo3::wrap_pymodule!(ailist::ailist); + let utils_module = pyo3::wrap_pymodule!(utils::utils); + m.add_wrapped(vocab_module)?; m.add_wrapped(tokenize_module)?; m.add_wrapped(ailist_module)?; + m.add_wrapped(utils_module)?; let sys = PyModule::import(py, "sys")?; let sys_modules: &PyDict = sys.getattr("modules")?.downcast()?; @@ -25,6 +29,7 @@ fn genimtools(py: Python, m: &PyModule) -> PyResult<()> { sys_modules.set_item("genimtools.vocab", m.getattr("vocab")?)?; sys_modules.set_item("genimtools.tokenizers", m.getattr("tokenizers")?)?; sys_modules.set_item("genimtools.ailist", m.getattr("ailist")?)?; + sys_modules.set_item("genimtools.utils", m.getattr("utils")?)?; // add constants m.add("PAD_CHR", consts::PAD_CHR)?; diff --git a/bindings/src/models/region_set.rs b/bindings/src/models/region_set.rs index 48ed2b96..e0c457f6 100644 --- a/bindings/src/models/region_set.rs +++ b/bindings/src/models/region_set.rs @@ -5,6 +5,7 @@ use genimtools::common::consts::{PAD_CHR, PAD_END, PAD_START}; use crate::models::{PyRegion, PyTokenizedRegion}; + #[pyclass(name = "TokenizedRegionSet")] #[derive(Clone, Debug)] pub struct PyTokenizedRegionSet { diff --git a/bindings/src/utils/mod.rs b/bindings/src/utils/mod.rs new file mode 100644 index 00000000..7bdd4980 --- /dev/null +++ b/bindings/src/utils/mod.rs @@ -0,0 +1,21 @@ +use pyo3::prelude::*; + + +#[pyfunction] +pub fn write_tokens_to_gtok(filename: &str, tokens: Vec) -> PyResult<()> { + genimtools::io::write_tokens_to_gtok(filename, &tokens)?; + Ok(()) +} + +#[pyfunction] +pub fn read_tokens_from_gtok(filename: &str) -> PyResult> { + let tokens = genimtools::io::read_tokens_from_gtok(filename)?; + Ok(tokens) +} + +#[pymodule] +pub fn utils(_py: Python, m: &PyModule) -> PyResult<()> { + m.add_wrapped(wrap_pyfunction!(write_tokens_to_gtok))?; + m.add_wrapped(wrap_pyfunction!(read_tokens_from_gtok))?; + Ok(()) +} diff --git a/genimtools/Cargo.toml b/genimtools/Cargo.toml index 89622d93..ac00a593 100644 --- a/genimtools/Cargo.toml +++ b/genimtools/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "genimtools" -version = "0.0.8" +version = "0.0.9" edition = "2021" description = "Performance-critical tools to manipulate, analyze, and process genomic interval data. Primarily focused on building tools for geniml - our genomic machine learning python package." license = "MIT" diff --git a/genimtools/docs/changelog.md b/genimtools/docs/changelog.md index 0ae99ba3..68283681 100644 --- a/genimtools/docs/changelog.md +++ b/genimtools/docs/changelog.md @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.0.9] +- start working on the concept of a `.gtok` file-format to store tokenized regions +- - added basic readers and writers for this format + ## [0.0.8] - add a new `ids_as_strs` getter to the `TokenizedRegionSet` struct so that we can get the ids as strings quickly, this is meant mostly for interface with geniml.