diff --git a/bindings/Cargo.toml b/bindings/Cargo.toml index cac9c4a3..b5f5b3f6 100644 --- a/bindings/Cargo.toml +++ b/bindings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "genimtools-py" -version = "0.0.11" +version = "0.0.12" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/bindings/src/models/region.rs b/bindings/src/models/region.rs index 246a79d4..35b605f7 100644 --- a/bindings/src/models/region.rs +++ b/bindings/src/models/region.rs @@ -79,12 +79,18 @@ impl PyRegion { #[derive(Clone, Debug)] pub struct PyTokenizedRegion { pub id: u32, - pub universe: PyUniverse, + pub universe: Py, } impl From for PyRegion { fn from(value: PyTokenizedRegion) -> Self { - value.universe.convert_id_to_region(value.id).unwrap() + Python::with_gil(|py| { + value + .universe + .borrow(py) + .convert_id_to_region(value.id) + .unwrap() + }) } } @@ -92,21 +98,48 @@ impl From for PyRegion { impl PyTokenizedRegion { #[getter] pub fn chr(&self) -> Result { - Ok(self.universe.convert_id_to_region(self.id).unwrap().chr) + Python::with_gil(|py| { + Ok(self + .universe + .borrow(py) + .convert_id_to_region(self.id) + .unwrap() + .chr) + }) } #[getter] pub fn start(&self) -> Result { - Ok(self.universe.convert_id_to_region(self.id).unwrap().start) + Python::with_gil(|py| { + Ok(self + .universe + .borrow(py) + .convert_id_to_region(self.id) + .unwrap() + .start) + }) } #[getter] pub fn end(&self) -> Result { - Ok(self.universe.convert_id_to_region(self.id).unwrap().end) + Python::with_gil(|py| { + Ok(self + .universe + .borrow(py) + .convert_id_to_region(self.id) + .unwrap() + .end) + }) } pub fn to_region(&self) -> Result { - Ok(self.universe.convert_id_to_region(self.id).unwrap()) + Python::with_gil(|py| { + Ok(self + .universe + .borrow(py) + .convert_id_to_region(self.id) + .unwrap()) + }) } #[getter] pub fn id(&self) -> Result { diff --git a/bindings/src/models/region_set.rs b/bindings/src/models/region_set.rs index 49562ab1..c9741d2d 100644 --- a/bindings/src/models/region_set.rs +++ b/bindings/src/models/region_set.rs @@ -6,7 +6,7 @@ use numpy::ndarray::Array; use numpy::{IntoPyArray, PyArray1}; use anyhow::Result; -use genimtools::common::{models::TokenizedRegionSet, utils::extract_regions_from_bed_file}; +use genimtools::common::utils::extract_regions_from_bed_file; use crate::models::{PyRegion, PyTokenizedRegion, PyUniverse}; @@ -89,18 +89,8 @@ impl PyRegionSet { #[derive(Clone, Debug)] pub struct PyTokenizedRegionSet { pub ids: Vec, - pub universe: PyUniverse, - curr: usize, -} - -impl From> for PyTokenizedRegionSet { - fn from(value: TokenizedRegionSet) -> Self { - PyTokenizedRegionSet { - ids: value.ids, - universe: value.universe.to_owned().into(), - curr: 0, - } - } + pub universe: Py, + pub curr: usize, } #[pymethods] @@ -117,21 +107,25 @@ impl PyTokenizedRegionSet { } pub fn to_bit_vector(&self) -> Result> { - let mut bit_vector = vec![0; self.universe.id_to_region.len()]; + Python::with_gil(|py| { + let mut bit_vector = vec![0; self.universe.borrow(py).id_to_region.len()]; - for id in &self.ids { - bit_vector[*id as usize] = 1; - } + for id in &self.ids { + bit_vector[*id as usize] = 1; + } - Ok(bit_vector) + Ok(bit_vector) + }) } pub fn to_regions(&self) -> Result> { - Ok(self - .ids - .iter() - .map(|id| self.universe.id_to_region[&id].clone()) - .collect()) + Python::with_gil(|py| { + Ok(self + .ids + .iter() + .map(|id| self.universe.borrow(py).id_to_region[&id].clone()) + .collect()) + }) } pub fn to_ids(&self) -> Result> { @@ -164,17 +158,19 @@ impl PyTokenizedRegionSet { } pub fn __next__(&mut self) -> Option { - if self.curr < self.ids.len() { - let id = self.ids[self.curr]; - self.curr += 1; - - Some(PyTokenizedRegion { - universe: self.universe.to_owned(), - id, - }) - } else { - None - } + Python::with_gil(|py| { + if self.curr < self.ids.len() { + let id = self.ids[self.curr]; + self.curr += 1; + + Some(PyTokenizedRegion { + universe: self.universe.clone_ref(py), + id, + }) + } else { + None + } + }) } pub fn __getitem__(&self, indx: isize) -> Result { diff --git a/bindings/src/models/universe.rs b/bindings/src/models/universe.rs index 136894fd..362d8845 100644 --- a/bindings/src/models/universe.rs +++ b/bindings/src/models/universe.rs @@ -17,17 +17,18 @@ pub struct PyUniverse { impl From for PyUniverse { fn from(value: Universe) -> Self { - let regions = value.regions.iter().map(|r| r.to_owned().into()).collect(); + let regions = value.regions.into_iter().map(|r| r.into()).collect(); let region_to_id = value .region_to_id - .iter() - .map(|(k, v)| (k.to_owned().into(), *v)) + .into_iter() + .map(|(k, v)| (k.into(), v)) .collect(); + let id_to_region = value .id_to_region - .iter() - .map(|(k, v)| (*k, v.to_owned().into())) + .into_iter() + .map(|(k, v)| (k, v.into())) .collect(); PyUniverse { diff --git a/bindings/src/tokenizers/tree_tokenizer.rs b/bindings/src/tokenizers/tree_tokenizer.rs index e5a714a4..f4bb7eda 100644 --- a/bindings/src/tokenizers/tree_tokenizer.rs +++ b/bindings/src/tokenizers/tree_tokenizer.rs @@ -15,16 +15,24 @@ use crate::utils::extract_regions_from_py_any; #[pyclass(name = "TreeTokenizer")] pub struct PyTreeTokenizer { pub tokenizer: TreeTokenizer, + pub universe: Py, // this is a Py-wrapped version self.tokenizer.universe for performance reasons } #[pymethods] impl PyTreeTokenizer { #[new] pub fn new(path: String) -> Result { - let path = Path::new(&path); - let tokenizer = TreeTokenizer::try_from(path)?; + Python::with_gil(|py| { + let path = Path::new(&path); + let tokenizer = TreeTokenizer::try_from(path)?; + let py_universe: PyUniverse = tokenizer.universe.to_owned().into(); + let py_universe_bound = Py::new(py, py_universe)?; - Ok(PyTreeTokenizer { tokenizer }) + Ok(PyTreeTokenizer { + tokenizer, + universe: py_universe_bound, + }) + }) } #[getter] @@ -138,7 +146,16 @@ impl PyTreeTokenizer { // tokenize the RegionSet let tokenized = self.tokenizer.tokenize_region_set(&rs); - Ok(tokenized.into()) + Python::with_gil(|py| { + let py_tokenized_region_set = PyTokenizedRegionSet { + ids: tokenized.ids, + curr: 0, + universe: self.universe.clone_ref(py), + }; + + Ok(py_tokenized_region_set) + }) + } // encode returns a list of ids diff --git a/genimtools/Cargo.toml b/genimtools/Cargo.toml index 86e5bbcb..059957bb 100644 --- a/genimtools/Cargo.toml +++ b/genimtools/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "genimtools" -version = "0.0.11" +version = "0.0.12" edition = "2021" description = "Performance-critical tools to manipulate, analyze, and process genomic interval data. Primarily focused on building tools for geniml - our genomic machine learning python package." license = "MIT" diff --git a/genimtools/docs/changelog.md b/genimtools/docs/changelog.md index a7fd016f..cb66da01 100644 --- a/genimtools/docs/changelog.md +++ b/genimtools/docs/changelog.md @@ -4,6 +4,9 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.0.12] +- optimize creation of `PyRegionSet` to reduce expensive cloning of `Universe` structs. + ## [0.0.11] - redesigned API for the tokenizers to better emulate the huggingface tokenizers API. - implemented new traits for tokenizers to allow for more flexibility when creating new tokenizers. diff --git a/genimtools/src/common/models/universe.rs b/genimtools/src/common/models/universe.rs index 309a88ae..0987f8d6 100644 --- a/genimtools/src/common/models/universe.rs +++ b/genimtools/src/common/models/universe.rs @@ -82,4 +82,4 @@ impl TryFrom<&Path> for Universe { id_to_region, }) } -} +} \ No newline at end of file