From 6403559f361ecbcbafe7df9dbf6696254f470f7f Mon Sep 17 00:00:00 2001
From: aminediro
Date: Fri, 10 May 2024 23:43:48 +0200
Subject: [PATCH 01/21] softmax migration

---
 CONTRIBUTING.md                        | 22 ++++++++++++++++++++++
 Cargo.toml                             | 17 +++++++++++------
 crates/ratchet-core/Cargo.toml         |  9 +++++----
 crates/ratchet-core/src/ops/softmax.rs | 16 +++++-----------
 crates/ratchet-core/src/tensor.rs      | 22 ++++++++++++++++++++++
 requirements.txt                       |  2 +-
 6 files changed, 66 insertions(+), 22 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 7291737f..442692c1 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -131,6 +131,8 @@ If that looks like this, you are good to go 🎉
 
 ### Step 3: Run Tests
 
+#### PyO3 tests
+
 Finally, run the tests for the package using Cargo:
 
 ```sh
@@ -143,6 +145,26 @@ To run the `PyO3` tests, add the `pyo3` flag:
 cargo test --features pyo3
 ```
 
+#### `tch` tests
+
+`tch`-based tests are run behind the `testing` feature. You first need the PyTorch library (libtorch) v2.3.0 available on your system; follow the [official `tch` instructions](https://github.com/LaurentMazare/tch-rs/tree/main?tab=readme-ov-file) for more details. We'll use the libtorch library installed in the Python environment:
+
+```sh
+export LIBTORCH_USE_PYTORCH=1
+```
+
+You can now run the tests:
+
+```sh
+cargo test --features testing
+```
+
+**NOTE**: If you're having compilation issues on macOS, you can add the `libtorch` lib to your environment:
+
+```sh
+export DYLD_LIBRARY_PATH=$PWD/venv/lib/python3.10/site-packages/torch/lib:$DYLD_LIBRARY_PATH
+```
+
 ### Step 5: Run WASM Tests
 
 To run WASM tests (e.g., the whisper test) run:

diff --git a/Cargo.toml b/Cargo.toml
index 0d5f518f..967aee7a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,8 +5,8 @@ members = [
     "crates/ratchet-web",
     "crates/ratchet-loader",
     "crates/ratchet-models",
-    "crates/ratchet-nn",
-    "crates/ratchet-hub",
+    "crates/ratchet-nn",
+    "crates/ratchet-hub",
     "crates/ratchet-cli",
 ]
 resolver = "2"
@@ -18,7 +18,7 @@ debug-assertions = true
 [profile.release]
 panic = 'abort'
 lto = "fat"
-codegen-units = 1
+codegen-units = 1
 
 [profile.profiling]
 inherits = "release"
@@ -26,18 +26,22 @@ debug = 2
 [workspace.dependencies]
 wgpu = { version = "0.20", features = ["fragile-send-sync-non-atomic-wasm"] }
-bytemuck = { version = "1.14.0", features=["wasm_simd", "aarch64_simd", "extern_crate_alloc"] }
+bytemuck = { version = "1.14.0", features = [
+    "wasm_simd",
+    "aarch64_simd",
+    "extern_crate_alloc",
+] }
 num-traits = "0.2.17"
 half = { version = "2.3.1", features = ["num-traits", "bytemuck"] }
 derive-new = "0.6.0"
 log = "0.4.20"
 thiserror = "1.0.56"
 byteorder = "1.5.0"
-npyz = { version = "0.8.3"}
+npyz = { version = "0.8.3" }
 hf-hub = "0.3.2"
 serde = "1.0"
 anyhow = "1.0.79"
-tokenizers = "0.19.1"
+tokenizers = "0.19.1"
 
 js-sys = "0.3.64"
 wasm-bindgen = "0.2.91"
@@ -90,3 +94,4 @@ wasm-bindgen-futures = "0.4.41"
 web-sys = "0.3.64"
 web-time = "1.0.0"
 futures-intrusive = "0.5.0"
+tch = "0.16.0"

diff --git a/crates/ratchet-core/Cargo.toml b/crates/ratchet-core/Cargo.toml
index 1426c0c6..db3ab55d 100644
--- a/crates/ratchet-core/Cargo.toml
+++ b/crates/ratchet-core/Cargo.toml
@@ -9,7 +9,7 @@ default = ["rand", "testing"]
 gpu-profiling = ["dep:tabled", "dep:itertools"]
 rand = ["dep:rand", "dep:rand_distr"]
 plotting = ["dep:dot3", "dep:tempfile"]
-testing = ["dep:npyz", "dep:ndarray"]
+testing = ["dep:npyz", "dep:ndarray", "dep:tch"]
 pyo3 = ["dep:pyo3", "dep:numpy", "dep:regex"]
 
 [build-dependencies]
@@ -31,7 +31,7 @@ num-traits = { workspace = true }
 log = { workspace = true }
 thiserror = { workspace = true }
 serde = { workspace = true, features = ["derive"] }
-anyhow.workspace = true
+anyhow.workspace = true
 rustc-hash = { workspace = true }
 slotmap = { workspace = true }
 
@@ -55,12 +55,13 @@ tempfile = { workspace = true, optional = true }
 tabled = { workspace = true, optional = true }
 itertools = { workspace = true, optional = true }
 
-pyo3 = { workspace = true, features = ["auto-initialize"], optional = true }
+pyo3 = { workspace = true, features = ["auto-initialize"], optional = true }
 regex = { workspace = true, optional = true }
 numpy = { workspace = true, optional = true }
+tch = { workspace = true, optional = true }
 
 [target.'cfg(target_arch = "wasm32")'.dependencies]
-wasm-bindgen.workspace = true
+wasm-bindgen.workspace = true
 futures-intrusive.workspace = true
 async-trait = "0.1.77"

diff --git a/crates/ratchet-core/src/ops/softmax.rs b/crates/ratchet-core/src/ops/softmax.rs
index f481d8f4..1addc7ad 100644
--- a/crates/ratchet-core/src/ops/softmax.rs
+++ b/crates/ratchet-core/src/ops/softmax.rs
@@ -104,25 +104,19 @@ impl MetaOperation for Softmax {
     }
 }
 
-#[cfg(all(test, feature = "pyo3"))]
+#[cfg(all(test, feature = "testing"))]
 mod tests {
-    use test_strategy::{proptest, Arbitrary};
-
-    use crate::test_util::run_py_prg;
     use crate::{shape, Device, DeviceRequest, Tensor};
+    use tch;
+    use test_strategy::{proptest, Arbitrary};
 
     thread_local! {
         static GPU_DEVICE: Device = Device::request_device(DeviceRequest::GPU).unwrap();
     }
 
     fn ground_truth(a: &Tensor) -> anyhow::Result<Tensor> {
-        let prg = r#"
-import torch
-import torch.nn.functional as F
-def softmax(a):
-    return F.softmax(torch.from_numpy(a), dim=-1).numpy()
-"#;
-        run_py_prg(prg.to_string(), &[a], &[])
+        let t = a.to_tch::<f32>()?;
+        Tensor::try_from(&t.softmax(-1, Some(tch::kind::Kind::Float)))
     }
 
     fn run_softmax_trial(problem: SoftmaxProblem) {

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index c0ed6ecd..7bea37d0 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -5,6 +5,7 @@ use crate::{
     Storage, Strides, TensorDType, TensorId,
 };
 use derive_new::new;
+use ndarray::OwnedRepr;
 use parking_lot::{RwLock, RwLockReadGuard};
 use std::collections::HashSet;
 use std::io::{BufRead, Seek};
@@ -15,6 +16,7 @@ use std::sync::Arc;
 #[cfg(feature = "rand")]
 use {rand::prelude::*, rand_distr::StandardNormal};
 
+use ndarray::ArrayBase;
 #[cfg(feature = "testing")]
 use ndarray::{ArrayD, ArrayViewD, Dimension};
 
@@ -838,6 +840,17 @@ impl Tensor {
         ))
     }
 
+    #[cfg(feature = "testing")]
+    pub fn to_tch<T: TensorDType + tch::kind::Element>(&self) -> anyhow::Result<tch::Tensor> {
+        assert!(
+            self.device().is_cpu(),
+            "Cannot convert non-CPU tensor to tch tensor"
+        );
+        Ok(tch::Tensor::try_from(
+            &self.deep_clone().into_ndarray::<T>(),
+        )?)
+    }
+
     #[cfg(feature = "pyo3")]
     pub fn to_py<'s, 'p: 's, T: TensorDType + numpy::Element>(
         &'s self,
@@ -852,6 +865,15 @@ impl Tensor {
     }
 }
 
+#[cfg(feature = "testing")]
+impl TryFrom<&tch::Tensor> for Tensor {
+    type Error = anyhow::Error;
+    fn try_from(array: &tch::Tensor) -> anyhow::Result<Self> {
+        let base: ArrayBase<OwnedRepr<f32>, _> = array.try_into()?;
+        Ok(Self::from(base))
+    }
+}
+
 #[cfg(feature = "pyo3")]
 impl From<&PyArrayDyn> for Tensor {
     fn from(array: &PyArrayDyn) -> Self {

diff --git a/requirements.txt b/requirements.txt
index f431ed23..21c0a13e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 numpy==1.24.3
-torch==2.0.1
+torch==2.3.0
 requests==2.26.0
 mlx==0.9.0; sys_platform == 'darwin'
 git+https://github.com/FL33TW00D/whisper.git@feature/reference#egg=openai-whisper

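The `to_tch` / `TryFrom` pair above is the bridge every migrated test goes through: build the reference result in libtorch instead of spawning a Python interpreter. A minimal sketch of the round trip, outside any test harness — crate paths and the `testing` feature follow the diff, and this is an illustration rather than part of the patch:

```rust
// Sketch of the conversion round trip introduced in PATCH 01/21.
// Assumes ratchet-core built with the `testing` feature and libtorch available.
use ratchet_core::{shape, Device, Tensor};

fn main() -> anyhow::Result<()> {
    let a = Tensor::randn::<f32>(shape![4, 128], Device::CPU);
    let t = a.to_tch::<f32>()?;                       // ratchet (CPU) -> libtorch
    let soft = t.softmax(-1, Some(tch::Kind::Float)); // reference op runs in libtorch
    let ground = Tensor::try_from(&soft)?;            // libtorch -> ratchet
    // A real test then compares a GPU result against `ground`:
    // ground.all_close(&gpu_result, 1e-5, 1e-5)?;
    println!("{:?}", ground.dt());
    Ok(())
}
```
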
From 7047532d5adef65f8f39480e2d187a1a117f0912 Mon Sep 17 00:00:00 2001
From: aminediro
Date: Sat, 11 May 2024 00:44:19 +0200
Subject: [PATCH 02/21] support tch dtype conversion

---
 crates/ratchet-core/src/tensor.rs | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index 7bea37d0..b1ec9762 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -4,6 +4,7 @@ use crate::{
     InvariantError, LazyOp, MetaOperation, Operation, OperationError, RVec, RawCPUBuffer, Shape,
     Storage, Strides, TensorDType, TensorId,
 };
+use anyhow::bail;
 use derive_new::new;
 use ndarray::OwnedRepr;
 use parking_lot::{RwLock, RwLockReadGuard};
@@ -869,8 +870,18 @@ impl Tensor {
 impl TryFrom<&tch::Tensor> for Tensor {
     type Error = anyhow::Error;
     fn try_from(array: &tch::Tensor) -> anyhow::Result<Self> {
-        let base: ArrayBase<OwnedRepr<f32>, _> = array.try_into()?;
-        Ok(Self::from(base))
+        let kind = array.kind();
+        match kind {
+            tch::Kind::Float => {
+                let base: ArrayD<f32> = array.try_into()?;
+                Ok(Self::from(base))
+            }
+            tch::Kind::QInt8 => todo!(),
+            tch::Kind::Half => todo!(),
+            tch::Kind::BFloat16 => todo!(),
+            tch::Kind::Int => todo!(),
+            _ => bail!("unsupported tch dtype"),
+        }
     }
 }

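The `todo!()` arms mark kinds the tests don't exercise yet. Filling one in mirrors the `Float` arm; a hypothetical `Int` path, assuming tch's ndarray bridge is generic over element types the way it is for `f32` (unverified here, so treat it as a sketch):

```rust
// Hypothetical only -- not in the patch. Pulls a Kind::Int tensor through the
// same ndarray route the Float arm uses above.
use ndarray::ArrayD;

fn int_to_ndarray(array: &tch::Tensor) -> anyhow::Result<ArrayD<i32>> {
    anyhow::ensure!(array.kind() == tch::Kind::Int, "expected a Kind::Int tensor");
    Ok(array.try_into()?) // assumes ArrayD<i32>: TryFrom<&tch::Tensor>
}

fn main() -> anyhow::Result<()> {
    let t = tch::Tensor::from_slice(&[1i32, 2, 3]);
    assert_eq!(int_to_ndarray(&t)?.len(), 3);
    Ok(())
}
```
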
From 35af80e995abac6f2b6f578f71a72f4993f39163 Mon Sep 17 00:00:00 2001
From: aminediro
Date: Sat, 11 May 2024 10:39:20 +0200
Subject: [PATCH 03/21] added binary tests tch

---
 crates/ratchet-core/src/ops/binary.rs | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/crates/ratchet-core/src/ops/binary.rs b/crates/ratchet-core/src/ops/binary.rs
index 89193f37..0e1fd0a7 100644
--- a/crates/ratchet-core/src/ops/binary.rs
+++ b/crates/ratchet-core/src/ops/binary.rs
@@ -151,9 +151,9 @@ impl MetaOperation for Binary {
     }
 }
 
-#[cfg(all(test, feature = "pyo3"))]
+#[cfg(all(test, feature = "testing"))]
 mod tests {
-    use crate::{test_util::run_py_prg, BinaryOp, Device, DeviceRequest, Shape, Tensor};
+    use crate::{BinaryOp, Device, DeviceRequest, Shape, Tensor};
     use test_strategy::{proptest, Arbitrary};
 
     thread_local! {
@@ -168,16 +168,15 @@ mod tests {
     }
 
     fn ground_truth(a: &Tensor, b: &Tensor, op: &BinaryOp) -> anyhow::Result<Tensor> {
-        let kn = op.kernel_name();
-        let prg = format!(
-            r#"
-import torch
-def {}(a, b):
-    return torch.{}(torch.from_numpy(a), torch.from_numpy(b)).numpy()
-"#,
-            kn, kn
-        );
-        run_py_prg(prg.to_string(), &[a, b], &[])
+        let a = a.to_tch::<f32>()?;
+        let b = b.to_tch::<f32>()?;
+        let result = match op {
+            BinaryOp::Add => a.f_add(&b)?,
+            BinaryOp::Sub => a.f_sub(&b)?,
+            BinaryOp::Mul => a.f_mul(&b)?,
+            BinaryOp::Div => a.f_div(&b)?,
+        };
+        Tensor::try_from(&result)
     }
 
     fn run_binary_trial(prob: BinaryProblem) -> anyhow::Result<()> {

From 8aad89f75d6b7c365f1c454db76fd37e061ff76c Mon Sep 17 00:00:00 2001
From: aminediro
Date: Sat, 11 May 2024 11:54:22 +0200
Subject: [PATCH 04/21] cleanup imports

---
 crates/ratchet-core/src/tensor.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index b1ec9762..86add21b 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -6,7 +6,6 @@ use crate::{
 };
 use anyhow::bail;
 use derive_new::new;
-use ndarray::OwnedRepr;
 use parking_lot::{RwLock, RwLockReadGuard};
 use std::collections::HashSet;
 use std::io::{BufRead, Seek};
@@ -17,7 +16,6 @@ use std::sync::Arc;
 #[cfg(feature = "rand")]
 use {rand::prelude::*, rand_distr::StandardNormal};
 
-use ndarray::ArrayBase;
 #[cfg(feature = "testing")]
 use ndarray::{ArrayD, ArrayViewD, Dimension};
 

From 3bb4f50b49b093bdb2861afaa20d96032ee7e3b4 Mon Sep 17 00:00:00 2001
From: aminediro
Date: Sat, 11 May 2024 12:05:38 +0200
Subject: [PATCH 05/21] sgemm to tch tests

---
 crates/ratchet-core/src/ops/matmul.rs | 65 +++++++--------------------
 1 file changed, 16 insertions(+), 49 deletions(-)

diff --git a/crates/ratchet-core/src/ops/matmul.rs b/crates/ratchet-core/src/ops/matmul.rs
index 09a2c51f..51d7229a 100644
--- a/crates/ratchet-core/src/ops/matmul.rs
+++ b/crates/ratchet-core/src/ops/matmul.rs
@@ -594,12 +594,10 @@ impl MetaOperation for GEMM {
     }
 }
 
-#[cfg(all(test, feature = "pyo3"))]
+#[cfg(all(test, feature = "testing"))]
 mod tests {
     use test_strategy::{proptest, Arbitrary};
 
-    use crate::test_util::run_py_prg;
-
     use crate::{shape, Device, DeviceRequest, Quantization, Quantizer};
 
     use super::*;
@@ -612,53 +610,22 @@ mod tests {
         trans_rhs: bool,
         trans_out: bool,
     ) -> anyhow::Result<Tensor> {
-        let a_op = if trans_lhs {
-            "torch.permute(torch.from_numpy(a), [0, 2, 1])"
-        } else {
-            "torch.from_numpy(a)"
-        };
-
-        let b_op = if trans_rhs {
-            "torch.permute(torch.from_numpy(b), [0, 2, 1])"
-        } else {
-            "torch.from_numpy(b)"
-        };
-
-        let inner = if bias.is_some() {
-            format!(
-                "torch.add(torch.matmul({}, {}), torch.from_numpy(bias))",
-                a_op, b_op
-            )
-        } else {
-            format!("torch.matmul({}, {})", a_op, b_op)
-        };
-
-        let result_op = if trans_out {
-            format!(
-                "np.ascontiguousarray(torch.permute({}, [0, 2, 1]).numpy())",
-                inner
-            )
-        } else {
-            format!("{}.numpy()", inner)
-        };
-
-        let prg = format!(
-            r#"
-import torch
-import numpy as np
-def matmul(a, b{}):
-    return {}"#,
-            if bias.is_some() { ", bias" } else { "" },
-            result_op
-        );
-
-        let args = if let Some(bias) = bias {
-            vec![a, b, bias]
-        } else {
-            vec![a, b]
-        };
-
-        run_py_prg(prg.to_string(), &args, &[])
+        let a = a.to_tch::<f32>()?;
+        let b = b.to_tch::<f32>()?;
+        let a = if trans_lhs { a.permute([0, 2, 1]) } else { a };
+        let b = if trans_rhs { b.permute([0, 2, 1]) } else { b };
+
+        let result = match bias {
+            Some(bias) => {
+                let bias = bias.to_tch::<f32>()?;
+                a.matmul(&b).f_add(&bias)?
+            }
+            None => a.matmul(&b),
+        };
+        if trans_out {
+            return Tensor::try_from(&result.permute([0, 2, 1]).contiguous());
+        }
+        Tensor::try_from(&result)
     }
 
     #[derive(Arbitrary, Clone, Debug)]

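For reference, the rewritten ground truth computes, with $T(\cdot)$ a transpose of the last two (batched) axes applied only when the corresponding flag is set,

$$
\mathrm{out} = T_{\mathrm{out}}\!\left(T_{\mathrm{lhs}}(A)\,T_{\mathrm{rhs}}(B) + \mathrm{bias}\right),
$$

the same quantity the removed string-built PyTorch program evaluated; the trailing `.contiguous()` plays the role of the old `np.ascontiguousarray`.
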
From ef60b168dbfc87f00f1886112e5e1e8e7a4155fc Mon Sep 17 00:00:00 2001
From: aminediro
Date: Sat, 11 May 2024 12:32:48 +0200
Subject: [PATCH 06/21] norm to tch

---
 crates/ratchet-core/src/ops/norm/groupnorm.rs | 24 +++++-----
 crates/ratchet-core/src/ops/norm/mod.rs       | 55 +++++++++----------
 2 files changed, 35 insertions(+), 44 deletions(-)

diff --git a/crates/ratchet-core/src/ops/norm/groupnorm.rs b/crates/ratchet-core/src/ops/norm/groupnorm.rs
index df94e0ed..e74451c9 100644
--- a/crates/ratchet-core/src/ops/norm/groupnorm.rs
+++ b/crates/ratchet-core/src/ops/norm/groupnorm.rs
@@ -30,11 +30,10 @@ impl Operation for GroupNorm {
         Ok(self.norm.input.storage_view().clone())
     }
 }
-#[cfg(all(test, feature = "pyo3"))]
+#[cfg(all(test, feature = "testing"))]
 mod tests {
     use test_strategy::{proptest, Arbitrary};
 
-    use crate::test_util::run_py_prg;
     use crate::{rvec, shape, Device, DeviceRequest, Tensor};
 
     fn ground_truth(
@@ -43,20 +42,15 @@ mod tests {
         bias: Option<&Tensor>,
         num_groups: usize,
     ) -> anyhow::Result<Tensor> {
-        let prg = r#"
-import torch
-import torch.nn.functional as F
-
-def manual_group_norm(input, scale, bias, num_groups):
-    (input, scale, bias) = (torch.from_numpy(input), torch.from_numpy(scale), torch.from_numpy(bias))
-    return F.group_norm(input, num_groups, weight=scale, bias=bias).numpy()
-"#;
-
-        let inputs = match bias {
-            Some(bias) => rvec![input, scale, bias],
-            None => rvec![input, scale],
-        };
-        run_py_prg(prg.to_string(), &inputs, &[&num_groups])
+        let input = input.to_tch::<f32>()?;
+        let scale = scale.to_tch::<f32>()?;
+        let bias = match bias {
+            Some(b) => Some(b.to_tch::<f32>()?),
+            None => None,
+        };
+        let result =
+            input.f_group_norm(num_groups as i64, Some(&scale), bias.as_ref(), 1e-5, false)?;
+        Tensor::try_from(&result)
     }
 
     fn run_norm_trial(device: &Device, problem: GroupNormProblem) -> anyhow::Result<()> {

diff --git a/crates/ratchet-core/src/ops/norm/mod.rs b/crates/ratchet-core/src/ops/norm/mod.rs
index 7a2fe21f..7598cf41 100644
--- a/crates/ratchet-core/src/ops/norm/mod.rs
+++ b/crates/ratchet-core/src/ops/norm/mod.rs
@@ -180,11 +180,10 @@ impl MetaOperation for NormOp {
     }
 }
 
-#[cfg(all(test, feature = "pyo3"))]
+#[cfg(all(test, feature = "testing"))]
 mod tests {
     use test_strategy::{proptest, Arbitrary};
 
-    use crate::test_util::run_py_prg;
     use crate::{rvec, shape, Device, DeviceRequest, Tensor};
 
     fn ground_truth(
@@ -193,35 +192,33 @@ mod tests {
         scale: &Tensor,
         bias: Option<&Tensor>,
     ) -> anyhow::Result<Tensor> {
-        let ln_prg = r#"
-import torch
-import torch.nn.functional as F
-
-def layer_norm(input, scale, bias):
-    (input, scale, bias) = (torch.from_numpy(input), torch.from_numpy(scale), torch.from_numpy(bias))
-    return F.layer_norm(input, (input.shape[-1],), weight=scale, bias=bias).numpy()
-"#;
-
-        let rms_prg = r#"
-import torch
-def manual_rms_norm(input, scale):
-    (input, scale) = (torch.from_numpy(input), torch.from_numpy(scale))
-    variance = input.to(torch.float32).pow(2).mean(dim=-1, keepdim=True)
-    input = input * torch.rsqrt(variance + 1e-5)
-    return (scale * input).numpy()
-"#;
-
-        let prg = match var {
-            NormVariant::LayerNorm => ln_prg,
-            NormVariant::RMSNorm => rms_prg,
-        };
-
-        let inputs = match bias {
-            Some(bias) => rvec![input, scale, bias],
-            None => rvec![input, scale],
-        };
-
-        run_py_prg(prg.to_string(), &inputs, &[])
+        let input = input.to_tch::<f32>()?;
+        let scale = scale.to_tch::<f32>()?;
+        let bias = match bias {
+            Some(b) => Some(b.to_tch::<f32>()?),
+            None => None,
+        };
+        let result = match var {
+            NormVariant::LayerNorm => input.f_layer_norm(
+                [*input.size().last().unwrap()],
+                Some(&scale),
+                bias.as_ref(),
+                1e-5,
+                false,
+            )?,
+            NormVariant::RMSNorm => {
+                // (input, scale) = (torch.from_numpy(input), torch.from_numpy(scale))
+                // variance = input.to(torch.float32).pow(2).mean(dim=-1, keepdim=True)
+                // input = input * torch.rsqrt(variance + 1e-5)
+                // return (scale * input).numpy()
+                let variance = input
+                    .f_pow_tensor_scalar(2)?
+                    .mean_dim(-1, true, input.kind());
+                let input = input.multiply(&variance.f_add_scalar(1e-5)?.rsqrt());
+                scale.multiply(&input)
+            }
+        };
+        Tensor::try_from(&result)
     }
 
     fn run_norm_trial(device: &Device, problem: NormProblem) -> anyhow::Result<()> {

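The inlined RMSNorm branch computes, per row over the last axis and matching both the tch call chain and the Python it carries as a comment (with $\varepsilon = 10^{-5}$ and $\gamma$ the scale):

$$
\mathrm{RMSNorm}(x) = \gamma \odot \frac{x}{\sqrt{\operatorname{mean}(x^{2}) + \varepsilon}}
$$
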
From 70576c2e12ba5da628075389f64417524d169922 Mon Sep 17 00:00:00 2001
From: aminediro
Date: Sat, 11 May 2024 12:43:25 +0200
Subject: [PATCH 07/21] added unary ops

---
 crates/ratchet-core/src/ops/unary.rs | 58 ++++++++++++----------------
 1 file changed, 24 insertions(+), 34 deletions(-)

diff --git a/crates/ratchet-core/src/ops/unary.rs b/crates/ratchet-core/src/ops/unary.rs
index ac258206..bed6d8b0 100644
--- a/crates/ratchet-core/src/ops/unary.rs
+++ b/crates/ratchet-core/src/ops/unary.rs
@@ -156,12 +156,11 @@ impl MetaOperation for Unary {
     }
 }
 
-#[cfg(all(test, feature = "pyo3"))]
+#[cfg(all(test, feature = "testing"))]
 mod tests {
+    use crate::{shape, Device, DeviceRequest, Tensor, UnaryOp};
     use test_strategy::{proptest, Arbitrary};
 
-    use crate::{shape, test_util::run_py_prg, Device, DeviceRequest, Tensor, UnaryOp};
-
     #[derive(Arbitrary, Debug)]
     struct UnaryProblem {
         op: UnaryOp,
@@ -173,33 +172,28 @@ mod tests {
         N: usize,
     }
 
-    fn ground_truth(a: &Tensor, op: &UnaryOp, args: &str) -> anyhow::Result<Tensor> {
-        let kn = op.kernel_name();
-        let func_prg = format!(
-            r#"
-import torch
-import torch.nn.functional as F
-def {}(a):
-    return F.{}(torch.from_numpy(a), {}).numpy()
-"#,
-            kn, kn, args,
-        );
-
-        let imp_prg = format!(
-            r#"
-import torch
-def {}(a):
-    return torch.{}(torch.from_numpy(a), {}).numpy()
-"#,
-            kn, kn, args,
-        );
-
-        let prg = match op {
-            UnaryOp::Gelu | UnaryOp::Silu | UnaryOp::Sigmoid => func_prg,
-            _ => imp_prg,
-        };
-
-        run_py_prg(prg.to_string(), &[a], &[])
+    fn ground_truth(a: &Tensor, op: &UnaryOp) -> anyhow::Result<Tensor> {
+        let a = a.to_tch::<f32>()?;
+        let result = match op {
+            UnaryOp::Gelu => {
+                // UnaryOp::Gelu => "approximate=\"tanh\"",
+                a.f_gelu("tanh")?
+            }
+            UnaryOp::Tanh => a.tanh(),
+            UnaryOp::Exp => a.exp(),
+            UnaryOp::Log => a.log(),
+            UnaryOp::Sin => a.sin(),
+            UnaryOp::Cos => a.cos(),
+            UnaryOp::Abs => a.abs(),
+            UnaryOp::Sqrt => a.sqrt(),
+            UnaryOp::Relu => a.relu(),
+            UnaryOp::Floor => a.floor(),
+            UnaryOp::Ceil => a.ceil(),
+            UnaryOp::Neg => a.neg(),
+            UnaryOp::Silu => a.silu(),
+            UnaryOp::Sigmoid => a.sigmoid(),
+        };
+        Tensor::try_from(&result)
     }
 
     thread_local! {
@@ -212,11 +206,7 @@ mod tests {
         println!("op: {:?}, B: {}, M: {}, N: {}", op, B, M, N);
         let a = Tensor::randn::<f32>(shape![B, M], Device::CPU);
 
-        let args = match op {
-            UnaryOp::Gelu => "approximate=\"tanh\"",
-            _ => "",
-        };
-        let ground = ground_truth(&a, &op, args)?;
+        let ground = ground_truth(&a, &op)?;
 
         let a_gpu = a.to(&device)?;
         let c_gpu = match op {

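`f_gelu("tanh")` carries over the removed `approximate="tanh"` argument: both select the tanh approximation

$$
\mathrm{GELU}_{\tanh}(x) = \frac{x}{2}\left(1 + \tanh\!\left(\sqrt{2/\pi}\,\left(x + 0.044715\,x^{3}\right)\right)\right)
$$

rather than the exact erf-based definition, keeping the reference aligned with kernels that use the same approximation.
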
From f62e3f0f1f65c257e88e661e68064b414eb05063 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Tue, 14 May 2024 22:00:35 +0100
Subject: [PATCH 08/21] chore: migrate more ops

---
 crates/ratchet-core/src/ops/concat.rs         | 30 +++++++------------
 crates/ratchet-core/src/ops/conv.rs           | 23 ++++-----------
 crates/ratchet-core/src/ops/norm/groupnorm.rs | 18 +++++------
 crates/ratchet-core/src/ops/norm/mod.rs       |  6 +----
 crates/ratchet-core/src/ops/softmax.rs        |  3 +--
 crates/ratchet-core/src/tensor.rs             |  8 ++---
 6 files changed, 30 insertions(+), 58 deletions(-)

diff --git a/crates/ratchet-core/src/ops/concat.rs b/crates/ratchet-core/src/ops/concat.rs
index 0ab39cd0..cd190572 100644
--- a/crates/ratchet-core/src/ops/concat.rs
+++ b/crates/ratchet-core/src/ops/concat.rs
@@ -141,7 +141,8 @@ impl MetaOperation for Concat {
 
 #[cfg(all(test, feature = "pyo3"))]
 mod tests {
-    use crate::{rvec, shape, test_util::run_py_prg, Device, DeviceRequest, Tensor};
+    use crate::{rvec, shape, Device, DeviceRequest, Tensor};
+    use tch::Tensor as TchTensor;
 
     thread_local! {
         static GPU_DEVICE: Device = Device::request_device(DeviceRequest::GPU).unwrap();
@@ -157,22 +158,13 @@ mod tests {
         dim: usize,
     }
 
-    fn ground_truth(to_cat: &[&Tensor], args: &str) -> anyhow::Result<Tensor> {
-        let prg = format!(
-            r#"
-import torch
-import numpy as np
-def permute(t0, t1, t2, t3, t4):
-    t0 = torch.from_numpy(t0)
-    t1 = torch.from_numpy(t1)
-    t2 = torch.from_numpy(t2)
-    t3 = torch.from_numpy(t3)
-    t4 = torch.from_numpy(t4)
-    return np.ascontiguousarray(torch.cat((t0, t1, t2, t3, t4), dim={}).numpy())
-"#,
-            args
-        );
-        run_py_prg(prg.to_string(), to_cat, &[])
+    fn ground_truth(to_cat: &[&Tensor], dim: i64) -> anyhow::Result<Tensor> {
+        let tch_tensors = to_cat
+            .iter()
+            .map(|x| x.to_tch::<f32>())
+            .collect::<Result<Vec<_>, _>>()?;
+
+        Tensor::try_from(TchTensor::cat(&tch_tensors, dim))
     }
 
     fn run_concat_trial(prob: ConcatProblem) -> anyhow::Result<()> {
@@ -185,9 +177,7 @@ def permute(t0, t1, t2, t3, t4):
             dim,
         } = prob;
         let device = GPU_DEVICE.with(|d| d.clone());
-
-        let arg_str = format!("{}", dim);
-        let ground = ground_truth(&[&t0, &t1, &t2, &t3, &t4], arg_str.as_str())?;
+        let ground = ground_truth(&[&t0, &t1, &t2, &t3, &t4], dim as _)?;
 
         t0 = t0.to(&device)?;
         t1 = t1.to(&device)?;

diff --git a/crates/ratchet-core/src/ops/conv.rs b/crates/ratchet-core/src/ops/conv.rs
index ba2de4d4..f14e14dc 100644
--- a/crates/ratchet-core/src/ops/conv.rs
+++ b/crates/ratchet-core/src/ops/conv.rs
@@ -127,10 +127,8 @@ impl MetaOperation for Conv {
 
 #[cfg(all(test, feature = "pyo3"))]
 mod tests {
-    use test_strategy::{proptest, Arbitrary};
-
-    use crate::test_util::run_py_prg;
     use crate::{shape, Device, DeviceRequest, Tensor};
+    use test_strategy::{proptest, Arbitrary};
 
     fn ground_truth(
         input: &Tensor,
@@ -139,20 +137,11 @@ mod tests {
         stride: usize,
         padding: usize,
     ) -> anyhow::Result<Tensor> {
-        let prg = r#"
-import torch
-import torch.nn.functional as F
-def conv(input, filters, bias, stride, padding):
-    input = torch.from_numpy(input)
-    filters = torch.from_numpy(filters)
-    bias = torch.from_numpy(bias)
-    return F.conv1d(input, filters, bias, stride=stride, padding=padding).numpy()
-"#;
-        run_py_prg(
-            prg.to_string(),
-            &[input, filters, bias],
-            &[&stride, &padding],
-        )
+        let i_tch = input.to_tch::<f32>()?;
+        let f_tch = filters.to_tch::<f32>()?;
+        let b_tch = bias.to_tch::<f32>()?;
+
+        Tensor::try_from(i_tch.conv1d(&f_tch, Some(b_tch), stride as i64, padding as i64, 0, 1))
     }
 
     fn run_conv_trial(device: &Device, problem: ConvProblem) {

diff --git a/crates/ratchet-core/src/ops/norm/groupnorm.rs b/crates/ratchet-core/src/ops/norm/groupnorm.rs
index e74451c9..759d0a61 100644
--- a/crates/ratchet-core/src/ops/norm/groupnorm.rs
+++ b/crates/ratchet-core/src/ops/norm/groupnorm.rs
@@ -1,8 +1,6 @@
+use crate::{DType, Norm, OpGuards, Operation, OperationError, StorageView};
 use derive_new::new;
 
-use super::*;
-use crate::{DType, OpGuards, Operation, OperationError, StorageView, Tensor};
-
 #[derive(new, Debug, Clone)]
 pub struct GroupNorm {
     pub norm: Norm,
@@ -19,9 +17,10 @@ impl OpGuards for GroupNorm {
     fn check_dtypes(&self) {
         assert!(self.norm.input.dt() == DType::F32);
         assert!(self.norm.scale.dt() == DType::F32);
-        if self.norm.bias.is_some() {
-            assert!(self.norm.bias.as_ref().unwrap().dt() == DType::F32);
-        }
+        self.norm
+            .bias
+            .as_ref()
+            .map(|b| assert!(b.dt() == DType::F32));
     }
 }
 
@@ -32,9 +31,8 @@ impl Operation for GroupNorm {
     }
 }
 #[cfg(all(test, feature = "testing"))]
 mod tests {
-    use test_strategy::{proptest, Arbitrary};
-
     use crate::{rvec, shape, Device, DeviceRequest, Tensor};
+    use test_strategy::{proptest, Arbitrary};
 
     fn ground_truth(
         input: &Tensor,
@@ -50,7 +48,7 @@ mod tests {
         };
         let result =
             input.f_group_norm(num_groups as i64, Some(&scale), bias.as_ref(), 1e-5, false)?;
-        Tensor::try_from(&result)
+        Tensor::try_from(result)
     }
 
     fn run_norm_trial(device: &Device, problem: GroupNormProblem) -> anyhow::Result<()> {
@@ -83,7 +81,7 @@ mod tests {
 
     #[derive(Arbitrary, Debug)]
     struct GroupNormProblem {
-        #[map(|num_groups: u32| #C/2 )]
+        #[map(|_num_groups: u32| #C/2 )]
         num_groups: usize,
         #[strategy(1..=1usize)]
         B: usize,

diff --git a/crates/ratchet-core/src/ops/norm/mod.rs b/crates/ratchet-core/src/ops/norm/mod.rs
index 7598cf41..eb44195b 100644
--- a/crates/ratchet-core/src/ops/norm/mod.rs
+++ b/crates/ratchet-core/src/ops/norm/mod.rs
@@ -207,10 +207,6 @@ mod tests {
             )?,
             NormVariant::RMSNorm => {
-                // (input, scale) = (torch.from_numpy(input), torch.from_numpy(scale))
-                // variance = input.to(torch.float32).pow(2).mean(dim=-1, keepdim=True)
-                // input = input * torch.rsqrt(variance + 1e-5)
-                // return (scale * input).numpy()
                 let variance = input
                     .f_pow_tensor_scalar(2)?
                     .mean_dim(-1, true, input.kind());
@@ -218,7 +214,7 @@ mod tests {
                 scale.multiply(&input)
             }
         };
-        Tensor::try_from(&result)
+        Tensor::try_from(result)
     }
 
     fn run_norm_trial(device: &Device, problem: NormProblem) -> anyhow::Result<()> {

diff --git a/crates/ratchet-core/src/ops/softmax.rs b/crates/ratchet-core/src/ops/softmax.rs
index 1addc7ad..cdbf81af 100644
--- a/crates/ratchet-core/src/ops/softmax.rs
+++ b/crates/ratchet-core/src/ops/softmax.rs
@@ -107,7 +107,6 @@ impl MetaOperation for Softmax {
 #[cfg(all(test, feature = "testing"))]
 mod tests {
     use crate::{shape, Device, DeviceRequest, Tensor};
-    use tch;
     use test_strategy::{proptest, Arbitrary};
 
     thread_local! {
@@ -116,7 +115,7 @@ mod tests {
 
     fn ground_truth(a: &Tensor) -> anyhow::Result<Tensor> {
         let t = a.to_tch::<f32>()?;
-        Tensor::try_from(&t.softmax(-1, Some(tch::kind::Kind::Float)))
+        Tensor::try_from(t.softmax(-1, Some(tch::kind::Kind::Float)))
     }
 
     fn run_softmax_trial(problem: SoftmaxProblem) {

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index 86add21b..2baa0483 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -865,13 +865,13 @@ impl Tensor {
 }
 
 #[cfg(feature = "testing")]
-impl TryFrom<&tch::Tensor> for Tensor {
+impl TryFrom<tch::Tensor> for Tensor {
     type Error = anyhow::Error;
-    fn try_from(array: &tch::Tensor) -> anyhow::Result<Self> {
-        let kind = array.kind();
+    fn try_from(t: tch::Tensor) -> anyhow::Result<Self> {
+        let kind = t.kind();
         match kind {
             tch::Kind::Float => {
-                let base: ArrayD<f32> = array.try_into()?;
+                let base: ArrayD<f32> = (&t).try_into()?;
                 Ok(Self::from(base))
             }
             tch::Kind::QInt8 => todo!(),

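One detail worth noting in this patch: `TryFrom` now takes `tch::Tensor` by value, since tch ops return owned tensors and the call sites no longer need a `&result` binding. A standalone illustration of that ergonomic point using plain tch (the ratchet types are elided; this is a sketch, not project code):

```rust
// tch ops return owned tensors, so a by-value conversion lets the result feed
// straight into `Tensor::try_from(...)` without a temporary.
use tch::{Kind, Tensor};

fn main() -> anyhow::Result<()> {
    let t = Tensor::from_slice(&[1f32, 2.0, 3.0]);
    let soft = t.softmax(-1, Some(Kind::Float)); // owned value, ready to move
    let probs = Vec::<f32>::try_from(&soft)?;
    assert!((probs.iter().sum::<f32>() - 1.0).abs() < 1e-6);
    Ok(())
}
```
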
From 8ed7e938a6e0bcd8ebfe12fea0eb8177f3164ebe Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Tue, 14 May 2024 22:17:36 +0100
Subject: [PATCH 09/21] chore: migrate more ops

---
 crates/ratchet-core/src/ops/binary.rs         |  2 +-
 crates/ratchet-core/src/ops/matmul.rs         |  4 ++--
 .../ratchet-core/src/ops/reindex/broadcast.rs | 19 ++++++-------------
 .../ratchet-core/src/ops/reindex/permute.rs   | 18 ++++++------------
 crates/ratchet-core/src/ops/unary.rs          |  2 +-
 5 files changed, 16 insertions(+), 29 deletions(-)

diff --git a/crates/ratchet-core/src/ops/binary.rs b/crates/ratchet-core/src/ops/binary.rs
index 0e1fd0a7..561c172f 100644
--- a/crates/ratchet-core/src/ops/binary.rs
+++ b/crates/ratchet-core/src/ops/binary.rs
@@ -176,7 +176,7 @@ mod tests {
             BinaryOp::Mul => a.f_mul(&b)?,
             BinaryOp::Div => a.f_div(&b)?,
         };
-        Tensor::try_from(&result)
+        Tensor::try_from(result)
     }
 
     fn run_binary_trial(prob: BinaryProblem) -> anyhow::Result<()> {

diff --git a/crates/ratchet-core/src/ops/matmul.rs b/crates/ratchet-core/src/ops/matmul.rs
index 51d7229a..b0dadbd0 100644
--- a/crates/ratchet-core/src/ops/matmul.rs
+++ b/crates/ratchet-core/src/ops/matmul.rs
@@ -623,9 +623,9 @@ mod tests {
             None => a.matmul(&b),
         };
         if trans_out {
-            return Tensor::try_from(&result.permute([0, 2, 1]).contiguous());
+            return Tensor::try_from(result.permute([0, 2, 1]).contiguous());
         }
-        Tensor::try_from(&result)
+        Tensor::try_from(result)
     }
 
     #[derive(Arbitrary, Clone, Debug)]

diff --git a/crates/ratchet-core/src/ops/reindex/broadcast.rs b/crates/ratchet-core/src/ops/reindex/broadcast.rs
index 1c92f014..14c43934 100644
--- a/crates/ratchet-core/src/ops/reindex/broadcast.rs
+++ b/crates/ratchet-core/src/ops/reindex/broadcast.rs
@@ -84,18 +84,11 @@ mod tests {
         op: Broadcast,
     }
 
-    fn ground_truth(a: &Tensor, args: &str) -> anyhow::Result<Tensor> {
-        let prg = format!(
-            r#"
-import torch
-import numpy as np
-def slice(a):
-    torch_a = torch.from_numpy(a)
-    return np.ascontiguousarray(torch_a.broadcast_to({}).numpy())
-"#,
-            args
-        );
-        run_py_prg(prg.to_string(), &[a], &[])
+    fn ground_truth(a: &Tensor, shape: &Shape) -> anyhow::Result<Tensor> {
+        let a_tch = a.to_tch::<f32>()?;
+        let dims = shape.iter().map(|&x| x as i64).collect::<Vec<_>>();
+        let broadcasted = a_tch.broadcast_to(dims).contiguous();
+        Tensor::try_from(broadcasted)
     }
 
     fn run_reindex_trial(prob: BroadcastProblem) -> anyhow::Result<()> {
@@ -105,7 +98,7 @@ mod tests {
         let device = GPU_DEVICE.with(|d| d.clone());
         let a_gpu = a.to(&device)?;
 
-        let ground = ground_truth(&a, &op.to.as_torch())?;
+        let ground = ground_truth(&a, &op.to)?;
         let ours = a_gpu.broadcast_to(op.to.clone())?.resolve()?;
         let d_gpu = ours.to(&Device::CPU)?;
         ground.all_close(&d_gpu, 1e-5, 1e-5)?;

diff --git a/crates/ratchet-core/src/ops/reindex/permute.rs b/crates/ratchet-core/src/ops/reindex/permute.rs
index 6e73fdcf..0f38451f 100644
--- a/crates/ratchet-core/src/ops/reindex/permute.rs
+++ b/crates/ratchet-core/src/ops/reindex/permute.rs
@@ -83,17 +83,11 @@ mod tests {
         op: Permute,
     }
 
-    fn ground_truth(a: &Tensor, args: &str) -> anyhow::Result<Tensor> {
-        let prg = format!(
-            r#"
-import torch
-import numpy as np
-def permute(a):
-    return np.ascontiguousarray(torch.permute(torch.from_numpy(a), {}).numpy())
-"#,
-            args
-        );
-        run_py_prg(prg.to_string(), &[a], &[])
+    fn ground_truth(a: &Tensor, dims: &[usize]) -> anyhow::Result<Tensor> {
+        let tch_dims = dims.iter().map(|&x| x as i64).collect::<Vec<_>>();
+        let a_tch = a.to_tch::<f32>()?;
+        let permuted = a_tch.permute(&tch_dims).contiguous();
+        Tensor::try_from(permuted)
     }
 
     fn run_reindex_trial(prob: PermuteProblem) -> anyhow::Result<()> {
@@ -102,7 +96,7 @@ mod tests {
         let a = op.src.clone();
 
         let a_gpu = a.to(&device)?;
-        let ground = ground_truth(&a, format!("{:?}", op.dims).as_str())?;
+        let ground = ground_truth(&a, &op.dims)?;
         let ours = a_gpu.permute(&op.dims)?.resolve()?;
         let d_gpu = ours.to(&Device::CPU)?;
         ground.all_close(&d_gpu, 1e-5, 1e-5)?;

diff --git a/crates/ratchet-core/src/ops/unary.rs b/crates/ratchet-core/src/ops/unary.rs
index bed6d8b0..af61efef 100644
--- a/crates/ratchet-core/src/ops/unary.rs
+++ b/crates/ratchet-core/src/ops/unary.rs
@@ -193,7 +193,7 @@ mod tests {
             UnaryOp::Silu => a.silu(),
             UnaryOp::Sigmoid => a.sigmoid(),
         };
-        Tensor::try_from(&result)
+        Tensor::try_from(result)
     }
 
     thread_local! {

From 9c4318f5a81e4ebb94f12eb3d8311a157e3fe9c1 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Tue, 14 May 2024 22:31:55 +0100
Subject: [PATCH 10/21] chore: migrate more ops

---
 .../ratchet-core/src/ops/reindex/broadcast.rs |  2 +-
 crates/ratchet-core/src/ops/reindex/slice.rs  | 43 ++++++-------------
 2 files changed, 14 insertions(+), 31 deletions(-)

diff --git a/crates/ratchet-core/src/ops/reindex/broadcast.rs b/crates/ratchet-core/src/ops/reindex/broadcast.rs
index 14c43934..50ffc69e 100644
--- a/crates/ratchet-core/src/ops/reindex/broadcast.rs
+++ b/crates/ratchet-core/src/ops/reindex/broadcast.rs
@@ -43,7 +43,7 @@ mod tests {
     };
     use test_strategy::proptest;
 
-    use crate::{shape, test_util::run_py_prg, Broadcast, Device, DeviceRequest, Shape, Tensor};
+    use crate::{shape, Broadcast, Device, DeviceRequest, Shape, Tensor};
 
     thread_local! {
         static GPU_DEVICE: Device = Device::request_device(DeviceRequest::GPU).unwrap();
     }

diff --git a/crates/ratchet-core/src/ops/reindex/slice.rs b/crates/ratchet-core/src/ops/reindex/slice.rs
index d0f924aa..37e7624b 100644
--- a/crates/ratchet-core/src/ops/reindex/slice.rs
+++ b/crates/ratchet-core/src/ops/reindex/slice.rs
@@ -50,29 +50,16 @@ impl Operation for Slice {
 mod tests {
     use std::ops::Range;
 
-    use crate::{test_util::run_py_prg, Device, DeviceRequest, Tensor};
-    use crate::{Shape, Slice};
+    use crate::{Device, DeviceRequest, Tensor};
+    use crate::{RVec, Shape, Slice};
     use proptest::prelude::*;
+    use tch::IndexOp;
     use test_strategy::proptest;
 
     thread_local! {
         static GPU_DEVICE: Device = Device::request_device(DeviceRequest::GPU).unwrap();
     }
 
-    impl Slice {
-        fn as_torch(&self) -> String {
-            let mut s = String::from("[");
-            for (idx, range) in self.indices.iter().enumerate() {
-                if idx > 0 {
-                    s.push_str(", ");
-                }
-                s.push_str(&format!("{}:{}", range.start, range.end));
-            }
-            s.push(']');
-            s
-        }
-    }
-
     #[derive(Debug, Clone)]
     pub struct SubSlice(pub Range<usize>);
 
@@ -126,28 +113,24 @@ mod tests {
         }
     }
 
-    fn ground_truth(a: &Tensor, args: &str) -> anyhow::Result<Tensor> {
-        let prg = format!(
-            r#"
-import torch
-import numpy as np
-def slice(a):
-    torch_a = torch.from_numpy(a)
-    return np.ascontiguousarray(torch_a{})
-"#,
-            args
-        );
-        run_py_prg(prg.to_string(), &[a], &[])
+    fn ground_truth(a: &Tensor, indices: &[Range<usize>]) -> anyhow::Result<Tensor> {
+        let a_tch = a.to_tch::<f32>()?;
+        let mut ci = indices
+            .iter()
+            .map(|range| (range.start as i64)..(range.end as i64))
+            .collect::<Vec<_>>();
+        let tch_indices = (ci.remove(0), ci.remove(0), ci.remove(0), ci.remove(0));
+        let sliced = a_tch.i(tch_indices).contiguous();
+        Tensor::try_from(sliced)
     }
 
     fn run_reindex_trial(prob: SliceProblem) -> anyhow::Result<()> {
         let SliceProblem { op } = prob;
-        println!("SLICE PROBLEM: {:?}", op);
         let device = GPU_DEVICE.with(|d| d.clone());
         let a = op.src.clone();
         let a_gpu = a.to(&device)?;
-        let ground = ground_truth(&a, &op.as_torch())?;
+        let ground = ground_truth(&a, &op.indices)?;
         let ours = a_gpu.slice(&op.indices)?.resolve()?;
         let d_gpu = ours.to(&Device::CPU)?;
         ground.all_close(&d_gpu, 1e-5, 1e-5)?;

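The tuple handed to `.i(...)` pins the input rank at four, one range per dimension, which assumes the generated slice problems are always rank-4. A self-contained illustration with plain tch — names and shapes here are made up for the example:

```rust
// Tuple indexing with tch's IndexOp: each range slices one dimension,
// mirroring `t[a:b, c:d, e:f, g:h]` in Python.
use tch::{IndexOp, Tensor};

fn main() {
    let t = Tensor::arange(16, (tch::Kind::Float, tch::Device::Cpu)).reshape([2, 2, 2, 2]);
    let s = t.i((0..1, 0..2, 1..2, 0..2)); // shape [1, 2, 1, 2]
    assert_eq!(s.size(), vec![1, 2, 1, 2]);
}
```
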
From 2bc17e17a02f3b28eed656b75496cad4a7158042 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Tue, 14 May 2024 22:40:43 +0100
Subject: [PATCH 11/21] chore: migrate more ops

---
 crates/ratchet-core/src/dtype/mod.rs           | 17 ++++++++++-------
 crates/ratchet-core/src/ops/reindex/permute.rs |  2 +-
 crates/ratchet-core/src/ops/select.rs          | 13 ++++---------
 3 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/crates/ratchet-core/src/dtype/mod.rs b/crates/ratchet-core/src/dtype/mod.rs
index 6a03a2ee..dbd2b5b6 100644
--- a/crates/ratchet-core/src/dtype/mod.rs
+++ b/crates/ratchet-core/src/dtype/mod.rs
@@ -19,6 +19,7 @@ pub enum DType {
     I32,
     U32,
     GGUF(gguf::GGUFDType),
+    I64,
 }
 
 impl DType {
@@ -52,6 +53,7 @@ impl DType {
             DType::I32 => 4,
             DType::U32 => 4,
             DType::GGUF(g) => g.size_of(),
+            DType::I64 => 8,
         }
     }
 
@@ -111,7 +113,7 @@ pub trait TensorDType:
     fn one() -> Self;
 }
 
-macro_rules! map_type {
+macro_rules! tensor_dt {
     ($t:ty, $v:ident) => {
         impl TensorDType for $t {
             fn dt() -> DType {
@@ -125,7 +127,7 @@ macro_rules! map_type {
     };
 }
 
-macro_rules! map_half_type {
+macro_rules! tensor_half_dt {
     ($t:ty, $v:ident) => {
         impl TensorDType for $t {
             fn dt() -> DType {
@@ -139,11 +141,12 @@ macro_rules! map_half_type {
     };
 }
 
-map_type!(f32, F32);
-map_type!(i32, I32);
-map_type!(u32, U32);
-map_half_type!(f16, F16);
-map_half_type!(bf16, BF16);
+tensor_dt!(f32, F32);
+tensor_dt!(i32, I32);
+tensor_dt!(u32, U32);
+tensor_dt!(i64, I64);
+tensor_half_dt!(f16, F16);
+tensor_half_dt!(bf16, BF16);
 
 //Handy trait for WebGPU buffer alignment
 pub trait Align {

diff --git a/crates/ratchet-core/src/ops/reindex/permute.rs b/crates/ratchet-core/src/ops/reindex/permute.rs
index 0f38451f..6cdd0caf 100644
--- a/crates/ratchet-core/src/ops/reindex/permute.rs
+++ b/crates/ratchet-core/src/ops/reindex/permute.rs
@@ -55,7 +55,7 @@ impl OpGuards for Permute {
 
 #[cfg(all(test, feature = "pyo3"))]
 mod tests {
-    use crate::{test_util::run_py_prg, Device, DeviceRequest, Permute, Shape, Tensor};
+    use crate::{Device, DeviceRequest, Permute, Shape, Tensor};
     use proptest::prelude::*;
     use test_strategy::{proptest, Arbitrary};
 

diff --git a/crates/ratchet-core/src/ops/select.rs b/crates/ratchet-core/src/ops/select.rs
index 191f82da..2c46e820 100644
--- a/crates/ratchet-core/src/ops/select.rs
+++ b/crates/ratchet-core/src/ops/select.rs
@@ -123,6 +123,7 @@ mod tests {
 
     use crate::test_util::run_py_prg;
     use crate::{rvec, shape, Device, DeviceRequest, Quantization, Quantizer, Shape, Tensor};
+    use tch::Tensor as TchTensor;
 
     thread_local! {
         static GPU_DEVICE: Device = Device::request_device(DeviceRequest::GPU).unwrap();
@@ -148,15 +149,9 @@ mod tests {
     }
 
     fn ground_truth(input: &Tensor, indices: &Tensor, dim: usize) -> anyhow::Result<Tensor> {
-        let prg = format!(
-            r#"
-import torch
-def index_select(input, indices):
-    return torch.index_select(torch.from_numpy(input),{},torch.from_numpy(indices)).numpy()
-"#,
-            dim
-        );
-        run_py_prg(prg.to_string(), &[input, indices], &[])
+        let tch_input = input.to_tch::<f32>()?;
+        let tch_indices = indices.to_tch::<i32>()?;
+        Tensor::try_from(TchTensor::index_select(&tch_input, dim as i64, &tch_indices).contiguous())
     }
 
     fn run_index_select_trial(problem: IndexSelectProblem, quantize: bool) {

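Converting the indices with a 32-bit kind works because `torch.index_select` accepts `IntTensor` as well as `LongTensor` indices. A standalone check with plain tch, purely illustrative:

```rust
// index_select with Kind::Int (i32) indices; Int64 indices behave the same.
use tch::Tensor;

fn main() -> anyhow::Result<()> {
    let x = Tensor::from_slice(&[10f32, 20.0, 30.0, 40.0]).reshape([2, 2]);
    let idx = Tensor::from_slice(&[1i32]);
    let row = x.index_select(0, &idx); // second row: [30, 40]
    assert_eq!(Vec::<f32>::try_from(&row.flatten(0, -1))?, vec![30.0, 40.0]);
    Ok(())
}
```
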
From c4ae38b4a53b01976b58688405b6a7619dcb9641 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Tue, 14 May 2024 22:43:06 +0100
Subject: [PATCH 12/21] chore: migrate more ops

---
 crates/ratchet-core/src/ops/select.rs | 1 -
 crates/ratchet-core/src/ops/unary.rs  | 5 +----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/crates/ratchet-core/src/ops/select.rs b/crates/ratchet-core/src/ops/select.rs
index 2c46e820..196ca42d 100644
--- a/crates/ratchet-core/src/ops/select.rs
+++ b/crates/ratchet-core/src/ops/select.rs
@@ -121,7 +121,6 @@ mod tests {
     use proptest::strategy::{BoxedStrategy, Just, Strategy};
     use test_strategy::proptest;
 
-    use crate::test_util::run_py_prg;
     use crate::{rvec, shape, Device, DeviceRequest, Quantization, Quantizer, Shape, Tensor};
     use tch::Tensor as TchTensor;
 

diff --git a/crates/ratchet-core/src/ops/unary.rs b/crates/ratchet-core/src/ops/unary.rs
index af61efef..315bddeb 100644
--- a/crates/ratchet-core/src/ops/unary.rs
+++ b/crates/ratchet-core/src/ops/unary.rs
@@ -175,10 +175,7 @@ mod tests {
     fn ground_truth(a: &Tensor, op: &UnaryOp) -> anyhow::Result<Tensor> {
         let a = a.to_tch::<f32>()?;
         let result = match op {
-            UnaryOp::Gelu => {
-                // UnaryOp::Gelu => "approximate=\"tanh\"",
-                a.f_gelu("tanh")?
-            }
+            UnaryOp::Gelu => a.f_gelu("tanh")?,
            UnaryOp::Tanh => a.tanh(),
             UnaryOp::Exp => a.exp(),
             UnaryOp::Log => a.log(),

From 4f3b06dc801e90fa07e98a607b7925f3824ae860 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Tue, 14 May 2024 22:43:34 +0100
Subject: [PATCH 13/21] chore: clip

---
 crates/ratchet-core/src/gpu/buffer_allocator/allocator.rs   | 6 ++++++
 crates/ratchet-core/src/gpu/pools/bind_group_layout_pool.rs | 6 ++++++
 crates/ratchet-core/src/gpu/pools/bind_group_pool.rs        | 6 ++++++
 crates/ratchet-core/src/gpu/pools/buffer_pool.rs            | 6 ++++++
 crates/ratchet-core/src/gpu/pools/pipeline_pool.rs          | 6 ++++++
 crates/ratchet-core/src/ops/reindex/permute.rs              | 2 +-
 crates/ratchet-core/src/ops/reindex/slice.rs                | 2 +-
 7 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/crates/ratchet-core/src/gpu/buffer_allocator/allocator.rs b/crates/ratchet-core/src/gpu/buffer_allocator/allocator.rs
index 4ee9f65f..f62d3a3e 100644
--- a/crates/ratchet-core/src/gpu/buffer_allocator/allocator.rs
+++ b/crates/ratchet-core/src/gpu/buffer_allocator/allocator.rs
@@ -21,6 +21,12 @@ pub struct BufferAllocator {
     pool: RwLock,
 }
 
+impl Default for BufferAllocator {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl BufferAllocator {
     pub fn new() -> Self {
         Self {

diff --git a/crates/ratchet-core/src/gpu/pools/bind_group_layout_pool.rs b/crates/ratchet-core/src/gpu/pools/bind_group_layout_pool.rs
index 7940cfa3..056d93a7 100644
--- a/crates/ratchet-core/src/gpu/pools/bind_group_layout_pool.rs
+++ b/crates/ratchet-core/src/gpu/pools/bind_group_layout_pool.rs
@@ -106,6 +106,12 @@ pub struct BindGroupLayoutPool {
         StaticResourcePool,
 }
 
+impl Default for BindGroupLayoutPool {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl BindGroupLayoutPool {
     pub fn new() -> Self {
         Self {

diff --git a/crates/ratchet-core/src/gpu/pools/bind_group_pool.rs b/crates/ratchet-core/src/gpu/pools/bind_group_pool.rs
index 05f6d647..850d9752 100644
--- a/crates/ratchet-core/src/gpu/pools/bind_group_pool.rs
+++ b/crates/ratchet-core/src/gpu/pools/bind_group_pool.rs
@@ -85,6 +85,12 @@ pub struct BindGroupPool {
     inner: DynamicResourcePool,
 }
 
+impl Default for BindGroupPool {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl BindGroupPool {
     pub fn new() -> Self {
         Self {

diff --git a/crates/ratchet-core/src/gpu/pools/buffer_pool.rs b/crates/ratchet-core/src/gpu/pools/buffer_pool.rs
index fdfa4614..8ff79349 100644
--- a/crates/ratchet-core/src/gpu/pools/buffer_pool.rs
+++ b/crates/ratchet-core/src/gpu/pools/buffer_pool.rs
@@ -60,6 +60,12 @@ pub struct BufferPool {
     inner: DynamicResourcePool,
 }
 
+impl Default for BufferPool {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl BufferPool {
     pub fn new() -> Self {
         Self {

diff --git a/crates/ratchet-core/src/gpu/pools/pipeline_pool.rs b/crates/ratchet-core/src/gpu/pools/pipeline_pool.rs
index e22580e6..c1d5b662 100644
--- a/crates/ratchet-core/src/gpu/pools/pipeline_pool.rs
+++ b/crates/ratchet-core/src/gpu/pools/pipeline_pool.rs
@@ -21,6 +21,12 @@ pub struct ComputePipelinePool {
         StaticResourcePool,
 }
 
+impl Default for ComputePipelinePool {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl ComputePipelinePool {
     pub fn new() -> Self {
         Self {

diff --git a/crates/ratchet-core/src/ops/reindex/permute.rs b/crates/ratchet-core/src/ops/reindex/permute.rs
index 6cdd0caf..38c04cd7 100644
--- a/crates/ratchet-core/src/ops/reindex/permute.rs
+++ b/crates/ratchet-core/src/ops/reindex/permute.rs
@@ -86,7 +86,7 @@ mod tests {
     fn ground_truth(a: &Tensor, dims: &[usize]) -> anyhow::Result<Tensor> {
         let tch_dims = dims.iter().map(|&x| x as i64).collect::<Vec<_>>();
         let a_tch = a.to_tch::<f32>()?;
-        let permuted = a_tch.permute(&tch_dims).contiguous();
+        let permuted = a_tch.permute(tch_dims).contiguous();
         Tensor::try_from(permuted)
     }
 

diff --git a/crates/ratchet-core/src/ops/reindex/slice.rs b/crates/ratchet-core/src/ops/reindex/slice.rs
index 37e7624b..94aca7b3 100644
--- a/crates/ratchet-core/src/ops/reindex/slice.rs
+++ b/crates/ratchet-core/src/ops/reindex/slice.rs
@@ -51,7 +51,7 @@ mod tests {
     use std::ops::Range;
 
     use crate::{Device, DeviceRequest, Tensor};
-    use crate::{RVec, Shape, Slice};
+    use crate::{Shape, Slice};
     use proptest::prelude::*;
     use tch::IndexOp;
     use test_strategy::proptest;

From 5421f202fbb2675d5d5341ca6353b82fdc9ff575 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Tue, 14 May 2024 22:52:15 +0100
Subject: [PATCH 14/21] chore: py

---
 .github/workflows/rust.yml | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 170f6e5e..48665314 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -132,6 +132,25 @@ jobs:
           fi
         shell: bash
 
+      - name: Download and install libtorch
+        run: |
+          import requests
+          import zipfile
+          import os
+
+          with open('requirements.txt') as f:
+              for line in f:
+                  if 'torch' in line:
+                      version = line.split('==')[1].strip()
+                      break
+          response = requests.get(f'https://download.pytorch.org/libtorch/cpu/libtorch-macos-arm64-{version}.zip')
+          with open('libtorch.zip', 'wb') as f:
+              f.write(response.content)
+          with zipfile.ZipFile('libtorch.zip', 'r') as zip_ref:
+              zip_ref.extractall()
+          os.environ['LIBTORCH'] = os.path.join(os.getcwd(), f'libtorch')
+        shell: python
+
       - uses: FedericoCarboni/setup-ffmpeg@v3
         if: matrix.os != 'macos-14'

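A caveat on the step above, and the reason the next several commits keep adjusting it: `os.environ` only mutates the environment of the step's own Python process, so `LIBTORCH` is gone by the time `cargo` runs in a later step. The standard way to persist a variable across steps is appending `NAME=value` to the file GitHub exposes as `$GITHUB_ENV`; this series instead eventually sidesteps the problem with `LIBTORCH_USE_PYTORCH=1` plus an explicit `DYLD_LIBRARY_PATH` on the test step itself.
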
From 52b1f591f89c893a19d7a4348145adac566d9645 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Tue, 14 May 2024 23:00:00 +0100
Subject: [PATCH 15/21] chore: py

---
 .github/workflows/rust.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 48665314..cff6617d 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -138,9 +138,10 @@ jobs:
           import zipfile
           import os
 
+          version = '2.3.0' # default version
           with open('requirements.txt') as f:
               for line in f:
-                  if 'torch' in line:
+                  if 'torch==' in line:
                       version = line.split('==')[1].strip()
                       break
           response = requests.get(f'https://download.pytorch.org/libtorch/cpu/libtorch-macos-arm64-{version}.zip')

From 7c615774610b687caf01f23ec4a012182b8a139f Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Tue, 14 May 2024 23:08:18 +0100
Subject: [PATCH 16/21] chore: try simple

---
 .github/workflows/rust.yml | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index cff6617d..9ef710de 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -14,6 +14,7 @@ env:
   DXC_RELEASE: "v1.7.2308"
   DXC_FILENAME: "dxc_2023_08_14.zip"
   WASM_BINDGEN_TEST_TIMEOUT: 300 # 5 minutes
+  LIBTORCH_USE_PYTORCH: 1
 
 jobs:
   build:
@@ -132,26 +133,6 @@ jobs:
           fi
         shell: bash
 
-      - name: Download and install libtorch
-        run: |
-          import requests
-          import zipfile
-          import os
-
-          version = '2.3.0' # default version
-          with open('requirements.txt') as f:
-              for line in f:
-                  if 'torch==' in line:
-                      version = line.split('==')[1].strip()
-                      break
-          response = requests.get(f'https://download.pytorch.org/libtorch/cpu/libtorch-macos-arm64-{version}.zip')
-          with open('libtorch.zip', 'wb') as f:
-              f.write(response.content)
-          with zipfile.ZipFile('libtorch.zip', 'r') as zip_ref:
-              zip_ref.extractall()
-          os.environ['LIBTORCH'] = os.path.join(os.getcwd(), f'libtorch')
-        shell: python
-
       - uses: FedericoCarboni/setup-ffmpeg@v3
         if: matrix.os != 'macos-14'

From b95001a077279311d19d2b631cdb683497788a12 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Tue, 14 May 2024 23:17:12 +0100
Subject: [PATCH 17/21] chore: does it work

---
 .github/workflows/rust.yml | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 9ef710de..cff6617d 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -14,7 +14,6 @@ env:
   DXC_RELEASE: "v1.7.2308"
   DXC_FILENAME: "dxc_2023_08_14.zip"
   WASM_BINDGEN_TEST_TIMEOUT: 300 # 5 minutes
-  LIBTORCH_USE_PYTORCH: 1
 
 jobs:
   build:
@@ -133,6 +132,26 @@ jobs:
           fi
         shell: bash
 
+      - name: Download and install libtorch
+        run: |
+          import requests
+          import zipfile
+          import os
+
+          version = '2.3.0' # default version
+          with open('requirements.txt') as f:
+              for line in f:
+                  if 'torch==' in line:
+                      version = line.split('==')[1].strip()
+                      break
+          response = requests.get(f'https://download.pytorch.org/libtorch/cpu/libtorch-macos-arm64-{version}.zip')
+          with open('libtorch.zip', 'wb') as f:
+              f.write(response.content)
+          with zipfile.ZipFile('libtorch.zip', 'r') as zip_ref:
+              zip_ref.extractall()
+          os.environ['LIBTORCH'] = os.path.join(os.getcwd(), f'libtorch')
+        shell: python
+
       - uses: FedericoCarboni/setup-ffmpeg@v3
         if: matrix.os != 'macos-14'

From 997ca5617855101ef1c328a479a27cafd44c679e Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Wed, 15 May 2024 09:51:56 +0100
Subject: [PATCH 18/21] chore: try

---
 .github/workflows/rust.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index cff6617d..496090c5 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -150,6 +150,7 @@ jobs:
           with zipfile.ZipFile('libtorch.zip', 'r') as zip_ref:
               zip_ref.extractall()
           os.environ['LIBTORCH'] = os.path.join(os.getcwd(), f'libtorch')
+          os.environ['DYLD_LIBRARY_PATH'] = os.path.join(os.getcwd(), f'libtorch')
         shell: python
 
       - uses: FedericoCarboni/setup-ffmpeg@v3

From 87e37eff69c1cf083b89465f89c2ba9e3bc33c63 Mon Sep 17 00:00:00 2001
From: AmineDiro
Date: Wed, 15 May 2024 12:06:58 +0200
Subject: [PATCH 19/21] Update rust.yml use installed torch==2.3.0 version

Added
---
 .github/workflows/rust.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 496090c5..e86225a3 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -166,9 +166,11 @@ jobs:
 
       - name: run tests
         shell: bash
+        env:
+          LIBTORCH_USE_PYTORCH: 1
         run: |
           set -e
-          cargo nextest run -j 1 --no-fail-fast --features=ci,pyo3
+          cargo nextest run -j 1 --no-fail-fast --features=ci,pyo3,testing
 
       - name: Set up WebDriver for Ubuntu
         if: matrix.os == 'ubuntu-22.04'

From 15a3b2c99d3d79c5ec65fbbc00fb1a919b90bc7b Mon Sep 17 00:00:00 2001
From: AmineDiro
Date: Wed, 15 May 2024 12:33:09 +0200
Subject: [PATCH 20/21] Added DyLD_LIB

---
 .github/workflows/rust.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index e86225a3..e755740a 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -167,7 +167,9 @@ jobs:
       - name: run tests
         shell: bash
         env:
-          LIBTORCH_USE_PYTORCH: 1
+          LIBTORCH_USE_PYTORCH: 1
+          SITE_PACKAGES: $(python -c "import site; print(site.getsitepackages()[0])")
+          DYLD_LIBRARY_PATH: $SITE_PACKAGES/torch/lib:$DYLD_LIBRARY_PATH
         run: |
           set -e
           cargo nextest run -j 1 --no-fail-fast --features=ci,pyo3,testing

From 88f63b55ddd84d339823b7851568e25df6b5d79d Mon Sep 17 00:00:00 2001
From: AmineDiro
Date: Wed, 15 May 2024 15:07:51 +0200
Subject: [PATCH 21/21] DYLD_LIBRARY_PATH fix

---
 .github/workflows/rust.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index e755740a..34093114 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -169,7 +169,7 @@ jobs:
         env:
           LIBTORCH_USE_PYTORCH: 1
           SITE_PACKAGES: $(python -c "import site; print(site.getsitepackages()[0])")
-          DYLD_LIBRARY_PATH: $SITE_PACKAGES/torch/lib:$DYLD_LIBRARY_PATH
+          DYLD_LIBRARY_PATH: ${{ env.SITE_PACKAGES }}/torch/lib:$DYLD_LIBRARY_PATH
         run: |
           set -e
           cargo nextest run -j 1 --no-fail-fast --features=ci,pyo3,testing