Skip to content

Commit

Permalink
gtok file format
Browse files Browse the repository at this point in the history
  • Loading branch information
nleroy917 committed Jan 22, 2024
1 parent a5047e0 commit c5b524b
Show file tree
Hide file tree
Showing 8 changed files with 45 additions and 2 deletions.
2 changes: 1 addition & 1 deletion bindings/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "genimtools-py"
version = "0.0.8"
version = "0.0.9"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand Down
1 change: 1 addition & 0 deletions bindings/genimtools/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .genimtools.utils import *
11 changes: 11 additions & 0 deletions bindings/genimtools/utils/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from typing import List

def write_tokens_to_gtok(filename: str, tokens: List[int]) -> None:
    """
    Write a list of tokens to a gtok file.

    :param filename: Path of the ``.gtok`` file to create/write.
    :param tokens: Token ids to serialize into the file.
    :return: None.
    """

def read_tokens_from_gtok(filename: str) -> List[int]:
    """
    Read a list of tokens from a gtok file.

    :param filename: Path of the ``.gtok`` file to read.
    :return: The token ids stored in the file.
    """
5 changes: 5 additions & 0 deletions bindings/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ mod consts;
mod models;
mod tokenizers;
mod vocab;
mod utils;

pub const VERSION: &str = env!("CARGO_PKG_VERSION");

Expand All @@ -14,9 +15,12 @@ fn genimtools(py: Python, m: &PyModule) -> PyResult<()> {
let vocab_module = pyo3::wrap_pymodule!(vocab::vocab);
let tokenize_module = pyo3::wrap_pymodule!(tokenizers::tokenizers);
let ailist_module = pyo3::wrap_pymodule!(ailist::ailist);
let utils_module = pyo3::wrap_pymodule!(utils::utils);

m.add_wrapped(vocab_module)?;
m.add_wrapped(tokenize_module)?;
m.add_wrapped(ailist_module)?;
m.add_wrapped(utils_module)?;

let sys = PyModule::import(py, "sys")?;
let sys_modules: &PyDict = sys.getattr("modules")?.downcast()?;
Expand All @@ -25,6 +29,7 @@ fn genimtools(py: Python, m: &PyModule) -> PyResult<()> {
sys_modules.set_item("genimtools.vocab", m.getattr("vocab")?)?;
sys_modules.set_item("genimtools.tokenizers", m.getattr("tokenizers")?)?;
sys_modules.set_item("genimtools.ailist", m.getattr("ailist")?)?;
sys_modules.set_item("genimtools.utils", m.getattr("utils")?)?;

// add constants
m.add("PAD_CHR", consts::PAD_CHR)?;
Expand Down
1 change: 1 addition & 0 deletions bindings/src/models/region_set.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use genimtools::common::consts::{PAD_CHR, PAD_END, PAD_START};

use crate::models::{PyRegion, PyTokenizedRegion};


#[pyclass(name = "TokenizedRegionSet")]
#[derive(Clone, Debug)]
pub struct PyTokenizedRegionSet {
Expand Down
21 changes: 21 additions & 0 deletions bindings/src/utils/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
use pyo3::prelude::*;


/// Serialize `tokens` to the `.gtok` file at `filename`.
///
/// Thin Python-facing wrapper around [`genimtools::io::write_tokens_to_gtok`];
/// any I/O error from the core crate is converted into a `PyErr` via `?`.
#[pyfunction]
pub fn write_tokens_to_gtok(filename: &str, tokens: Vec<u32>) -> PyResult<()> {
    Ok(genimtools::io::write_tokens_to_gtok(filename, &tokens)?)
}

/// Deserialize the tokens stored in the `.gtok` file at `filename`.
///
/// Thin Python-facing wrapper around [`genimtools::io::read_tokens_from_gtok`];
/// the core crate's error is converted into a `PyErr` via `?`.
#[pyfunction]
pub fn read_tokens_from_gtok(filename: &str) -> PyResult<Vec<u32>> {
    Ok(genimtools::io::read_tokens_from_gtok(filename)?)
}

/// Python submodule `utils`: exposes the gtok read/write helpers to Python.
///
/// Wired into the top-level `genimtools` module via `wrap_pymodule!` in lib.rs.
#[pymodule]
pub fn utils(_py: Python, m: &PyModule) -> PyResult<()> {
    // Register both gtok I/O functions on the module object.
    m.add_wrapped(wrap_pyfunction!(write_tokens_to_gtok))?;
    m.add_wrapped(wrap_pyfunction!(read_tokens_from_gtok))?;
    Ok(())
}
2 changes: 1 addition & 1 deletion genimtools/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "genimtools"
version = "0.0.8"
version = "0.0.9"
edition = "2021"
description = "Performance-critical tools to manipulate, analyze, and process genomic interval data. Primarily focused on building tools for geniml - our genomic machine learning python package."
license = "MIT"
Expand Down
4 changes: 4 additions & 0 deletions genimtools/docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.0.9]
- start working on the concept of a `.gtok` file-format to store tokenized regions
- added basic readers and writers for this format

## [0.0.8]
- add a new `ids_as_strs` getter to the `TokenizedRegionSet` struct so that we can get the ids as strings quickly, this is meant mostly for interface with geniml.

Expand Down

0 comments on commit c5b524b

Please sign in to comment.