Skip to content

Commit

Permalink
Merge pull request #22 from databio/performance
Browse files Browse the repository at this point in the history
Fix pything bindings tokenization performance issues
  • Loading branch information
nleroy917 authored May 28, 2024
2 parents 539e35f + 6cead90 commit dab4beb
Show file tree
Hide file tree
Showing 8 changed files with 102 additions and 52 deletions.
2 changes: 1 addition & 1 deletion bindings/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "genimtools-py"
version = "0.0.11"
version = "0.0.12"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand Down
45 changes: 39 additions & 6 deletions bindings/src/models/region.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,34 +79,67 @@ impl PyRegion {
#[derive(Clone, Debug)]
pub struct PyTokenizedRegion {
pub id: u32,
pub universe: PyUniverse,
pub universe: Py<PyUniverse>,
}

impl From<PyTokenizedRegion> for PyRegion {
fn from(value: PyTokenizedRegion) -> Self {
value.universe.convert_id_to_region(value.id).unwrap()
Python::with_gil(|py| {
value
.universe
.borrow(py)
.convert_id_to_region(value.id)
.unwrap()
})
}
}

#[pymethods]
impl PyTokenizedRegion {
#[getter]
pub fn chr(&self) -> Result<String> {
Ok(self.universe.convert_id_to_region(self.id).unwrap().chr)
Python::with_gil(|py| {
Ok(self
.universe
.borrow(py)
.convert_id_to_region(self.id)
.unwrap()
.chr)
})
}

#[getter]
pub fn start(&self) -> Result<u32> {
Ok(self.universe.convert_id_to_region(self.id).unwrap().start)
Python::with_gil(|py| {
Ok(self
.universe
.borrow(py)
.convert_id_to_region(self.id)
.unwrap()
.start)
})
}

#[getter]
pub fn end(&self) -> Result<u32> {
Ok(self.universe.convert_id_to_region(self.id).unwrap().end)
Python::with_gil(|py| {
Ok(self
.universe
.borrow(py)
.convert_id_to_region(self.id)
.unwrap()
.end)
})
}

pub fn to_region(&self) -> Result<PyRegion> {
Ok(self.universe.convert_id_to_region(self.id).unwrap())
Python::with_gil(|py| {
Ok(self
.universe
.borrow(py)
.convert_id_to_region(self.id)
.unwrap())
})
}
#[getter]
pub fn id(&self) -> Result<u32> {
Expand Down
64 changes: 30 additions & 34 deletions bindings/src/models/region_set.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use numpy::ndarray::Array;
use numpy::{IntoPyArray, PyArray1};

use anyhow::Result;
use genimtools::common::{models::TokenizedRegionSet, utils::extract_regions_from_bed_file};
use genimtools::common::utils::extract_regions_from_bed_file;

use crate::models::{PyRegion, PyTokenizedRegion, PyUniverse};

Expand Down Expand Up @@ -89,18 +89,8 @@ impl PyRegionSet {
#[derive(Clone, Debug)]
pub struct PyTokenizedRegionSet {
pub ids: Vec<u32>,
pub universe: PyUniverse,
curr: usize,
}

impl From<TokenizedRegionSet<'_>> for PyTokenizedRegionSet {
fn from(value: TokenizedRegionSet) -> Self {
PyTokenizedRegionSet {
ids: value.ids,
universe: value.universe.to_owned().into(),
curr: 0,
}
}
pub universe: Py<PyUniverse>,
pub curr: usize,
}

#[pymethods]
Expand All @@ -117,21 +107,25 @@ impl PyTokenizedRegionSet {
}

pub fn to_bit_vector(&self) -> Result<Vec<u8>> {
let mut bit_vector = vec![0; self.universe.id_to_region.len()];
Python::with_gil(|py| {
let mut bit_vector = vec![0; self.universe.borrow(py).id_to_region.len()];

for id in &self.ids {
bit_vector[*id as usize] = 1;
}
for id in &self.ids {
bit_vector[*id as usize] = 1;
}

Ok(bit_vector)
Ok(bit_vector)
})
}

pub fn to_regions(&self) -> Result<Vec<PyRegion>> {
Ok(self
.ids
.iter()
.map(|id| self.universe.id_to_region[&id].clone())
.collect())
Python::with_gil(|py| {
Ok(self
.ids
.iter()
.map(|id| self.universe.borrow(py).id_to_region[&id].clone())
.collect())
})
}

pub fn to_ids(&self) -> Result<Vec<u32>> {
Expand Down Expand Up @@ -164,17 +158,19 @@ impl PyTokenizedRegionSet {
}

pub fn __next__(&mut self) -> Option<PyTokenizedRegion> {
if self.curr < self.ids.len() {
let id = self.ids[self.curr];
self.curr += 1;

Some(PyTokenizedRegion {
universe: self.universe.to_owned(),
id,
})
} else {
None
}
Python::with_gil(|py| {
if self.curr < self.ids.len() {
let id = self.ids[self.curr];
self.curr += 1;

Some(PyTokenizedRegion {
universe: self.universe.clone_ref(py),
id,
})
} else {
None
}
})
}

pub fn __getitem__(&self, indx: isize) -> Result<PyTokenizedRegion> {
Expand Down
11 changes: 6 additions & 5 deletions bindings/src/models/universe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,18 @@ pub struct PyUniverse {

impl From<Universe> for PyUniverse {
fn from(value: Universe) -> Self {
let regions = value.regions.iter().map(|r| r.to_owned().into()).collect();
let regions = value.regions.into_iter().map(|r| r.into()).collect();

let region_to_id = value
.region_to_id
.iter()
.map(|(k, v)| (k.to_owned().into(), *v))
.into_iter()
.map(|(k, v)| (k.into(), v))
.collect();

let id_to_region = value
.id_to_region
.iter()
.map(|(k, v)| (*k, v.to_owned().into()))
.into_iter()
.map(|(k, v)| (k, v.into()))
.collect();

PyUniverse {
Expand Down
25 changes: 21 additions & 4 deletions bindings/src/tokenizers/tree_tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,24 @@ use crate::utils::extract_regions_from_py_any;
#[pyclass(name = "TreeTokenizer")]
pub struct PyTreeTokenizer {
pub tokenizer: TreeTokenizer,
pub universe: Py<PyUniverse>, // this is a Py-wrapped version self.tokenizer.universe for performance reasons
}

#[pymethods]
impl PyTreeTokenizer {
#[new]
pub fn new(path: String) -> Result<Self> {
let path = Path::new(&path);
let tokenizer = TreeTokenizer::try_from(path)?;
Python::with_gil(|py| {
let path = Path::new(&path);
let tokenizer = TreeTokenizer::try_from(path)?;
let py_universe: PyUniverse = tokenizer.universe.to_owned().into();
let py_universe_bound = Py::new(py, py_universe)?;

Ok(PyTreeTokenizer { tokenizer })
Ok(PyTreeTokenizer {
tokenizer,
universe: py_universe_bound,
})
})
}

#[getter]
Expand Down Expand Up @@ -138,7 +146,16 @@ impl PyTreeTokenizer {
// tokenize the RegionSet
let tokenized = self.tokenizer.tokenize_region_set(&rs);

Ok(tokenized.into())
Python::with_gil(|py| {
let py_tokenized_region_set = PyTokenizedRegionSet {
ids: tokenized.ids,
curr: 0,
universe: self.universe.clone_ref(py),
};

Ok(py_tokenized_region_set)
})

}

// encode returns a list of ids
Expand Down
2 changes: 1 addition & 1 deletion genimtools/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "genimtools"
version = "0.0.11"
version = "0.0.12"
edition = "2021"
description = "Performance-critical tools to manipulate, analyze, and process genomic interval data. Primarily focused on building tools for geniml - our genomic machine learning python package."
license = "MIT"
Expand Down
3 changes: 3 additions & 0 deletions genimtools/docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.0.12]
- optimize creation of `PyRegionSet` to reduce expensive cloning of `Universe` structs.

## [0.0.11]
- redesigned API for the tokenizers to better emulate the huggingface tokenizers API.
- implemented new traits for tokenizers to allow for more flexibility when creating new tokenizers.
Expand Down
2 changes: 1 addition & 1 deletion genimtools/src/common/models/universe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,4 +82,4 @@ impl TryFrom<&Path> for Universe {
id_to_region,
})
}
}
}

0 comments on commit dab4beb

Please sign in to comment.