From db02a8eeba840b273fcc73cbe97b8d588b4530ce Mon Sep 17 00:00:00 2001
From: CarrotzRule123 <jwtan02@gmail.com>
Date: Mon, 6 Nov 2023 18:35:35 +0800
Subject: [PATCH] gpu dense & activation

---
 crates/core-gpu/src/ffi.rs                   |  18 +-
 crates/core-gpu/src/gpu/activation.rs        |  94 +-------
 crates/core-gpu/src/gpu/backend.rs           | 126 ++++++-----
 crates/core-gpu/src/gpu/cost.rs              | 122 ++++------
 crates/core-gpu/src/gpu/gpu.rs               |  84 ++++---
 crates/core-gpu/src/gpu/layers/activation.rs | 162 +++++++-------
 crates/core-gpu/src/gpu/layers/dense.rs      | 224 +++++++++++++++----
 crates/core-gpu/src/gpu/layers/mod.rs        |  50 +++--
 crates/core-gpu/src/gpu/mod.rs               |   2 -
 crates/core-gpu/src/gpu/optimizers/adam.rs   |  74 ------
 crates/core-gpu/src/gpu/optimizers/mod.rs    |  75 -------
 crates/core-gpu/src/gpu/optimizers/sgd.rs    |  27 ---
 crates/core-gpu/src/tensor.rs                |   1 +
 src/backends/gpu/backend.ts                  |   5 +
 src/backends/gpu/mod.ts                      |   6 +-
 15 files changed, 513 insertions(+), 557 deletions(-)
 delete mode 100644 crates/core-gpu/src/gpu/optimizers/adam.rs
 delete mode 100644 crates/core-gpu/src/gpu/optimizers/mod.rs
 delete mode 100644 crates/core-gpu/src/gpu/optimizers/sgd.rs

diff --git a/crates/core-gpu/src/ffi.rs b/crates/core-gpu/src/ffi.rs
index 094a68a..20aa13e 100644
--- a/crates/core-gpu/src/ffi.rs
+++ b/crates/core-gpu/src/ffi.rs
@@ -27,6 +27,12 @@ pub extern "C" fn ffi_backend_create(ptr: *const u8, len: usize, alloc: AllocBuf
         len = backend.len();
         backend.push(net_backend);
     });
+
+    std::panic::set_hook(Box::new(|info| {
+        println!("{}", info);
+        ffi_backend_drop(0);
+    }));
+
     len
 }
 
@@ -79,7 +85,7 @@ pub extern "C" fn ffi_backend_predict(
 #[no_mangle]
 pub extern "C" fn ffi_backend_save(id: usize, alloc: AllocBufferFn) {
     RESOURCES.with(|cell| {
-        let backend = cell.backend.borrow_mut();
+        let mut backend = cell.backend.borrow_mut();
         let data = backend[id].save();
         let file_ptr = alloc(data.len());
         let file = unsafe { from_raw_parts_mut(file_ptr, data.len()) };
@@ -109,3 +115,13 @@ pub extern "C" fn ffi_backend_load(
     });
     len
 }
+
+#[no_mangle]
+pub extern "C" fn ffi_backend_drop(id: usize) {
+    RESOURCES.with(|cell| {
+        let mut backend = cell.backend.borrow_mut();
+        if backend.len() > id {
+            backend.remove(id);
+        }
+    });
+}
diff --git a/crates/core-gpu/src/gpu/activation.rs b/crates/core-gpu/src/gpu/activation.rs
index 204cd07..cdf93fe 100644
--- a/crates/core-gpu/src/gpu/activation.rs
+++ b/crates/core-gpu/src/gpu/activation.rs
@@ -1,29 +1,21 @@
 use crate::Activation;
 pub struct GPUActivation {
     pub activation: Activation,
-    pub activate: ActivationFn,
-    pub prime: ActivationFn,
+    pub activate: String,
+    pub prime: String,
 }
 
-type ActivationFn = fn(x: &f32) -> f32;
-
 impl GPUActivation {
     pub fn from(activation: Activation) -> Self {
-        let (activate, prime): (ActivationFn, ActivationFn) = match activation {
-            Activation::Elu => (elu, elu_prime),
-            Activation::LeakyRelu => (leaky_relu, leaky_relu_prime),
-            Activation::Linear => (linear, linear_prime),
-            Activation::Relu => (relu, relu_prime),
-            Activation::Relu6 => (relu6, relu6_prime),
-            Activation::Selu => (selu, selu_prime),
-            Activation::Sigmoid => (sigmoid, sigmoid_prime),
-            Activation::Tanh => (tanh, tanh_prime),
+        let (activate, prime): (&str, &str) = match activation {
+            Activation::Sigmoid => (SIGMOID, SIGMOID_PRIME),
+            _ => unimplemented!()
         };
 
         Self {
             activation,
-            activate,
-            prime,
+            activate: String::from(activate),
+            prime: String::from(prime),
         }
     }
 
@@ -35,78 +27,14 @@ impl GPUActivation {
         }
     }
 
-    pub fn memoize_output(activation: &GPUActivation) -> bool {
-        match activation.activation {
+    pub fn memoize_output(activation: &Activation) -> bool {
+        match activation {
             Activation::Sigmoid | Activation::Tanh => true,
             _ => true,
         }
     }
 }
 
-fn sigmoid(x: &f32) -> f32 {
-    return 1.0 / (1.0 + (-x).exp());
-}
-
-fn sigmoid_prime(x: &f32) -> f32 {
-    return x * (1.0 - x);
-}
-
-fn tanh(x: &f32) -> f32 {
-    return x.tanh();
-}
-
-fn tanh_prime(x: &f32) -> f32 {
-    return 1.0 - tanh(x).powi(2);
-}
-
-fn linear(x: &f32) -> f32 {
-    return *x;
-}
-
-fn linear_prime(_x: &f32) -> f32 {
-    return 1.0;
-}
-
-fn relu(x: &f32) -> f32 {
-    return x.max(0.0);
-}
-
-fn relu_prime(x: &f32) -> f32 {
-    return if *x > 0.0 { 1.0 } else { 0.0 };
-}
+const SIGMOID: &str = "1.0 / (1.0 + exp(-x))";
 
-fn relu6(x: &f32) -> f32 {
-    return x.max(0.0).min(6.0);
-}
-
-fn relu6_prime(x: &f32) -> f32 {
-    return if *x > 0.0 && *x < 6.0 { 1.0 } else { 0.0 };
-}
-
-fn leaky_relu(x: &f32) -> f32 {
-    return if *x > 0.0 { *x } else { x.max(0.01 * x) };
-}
-
-fn leaky_relu_prime(x: &f32) -> f32 {
-    return if *x > 0.0 { 1.0 } else { 0.01 };
-}
-
-fn elu(x: &f32) -> f32 {
-    return if *x >= 0.0 { *x } else { x.exp() - 1.0 };
-}
-
-fn elu_prime(x: &f32) -> f32 {
-    return if *x > 0.0 { 1.0 } else { x.exp() };
-}
-
-fn selu(x: &f32) -> f32 {
-    return if *x >= 0.0 {
-        *x
-    } else {
-        1.0507 * (x.exp() - 1.0)
-    };
-}
-
-fn selu_prime(x: &f32) -> f32 {
-    return if *x > 0.0 { 1.0 } else { 1.0507 * x.exp() };
-}
+const SIGMOID_PRIME: &str = "x * (1.0 - x)";
\ No newline at end of file
diff --git a/crates/core-gpu/src/gpu/backend.rs b/crates/core-gpu/src/gpu/backend.rs
index 483092d..ce498ef 100644
--- a/crates/core-gpu/src/gpu/backend.rs
+++ b/crates/core-gpu/src/gpu/backend.rs
@@ -1,11 +1,12 @@
 use std::collections::HashMap;
 
-use ndarray::{ArrayD, ArrayViewD, IxDyn};
+use ndarray::{ArrayD, IxDyn, Dimension};
 use safetensors::{serialize, SafeTensors};
 
 use crate::{
     to_arr, ActivationGPULayer, BackendConfig, Dataset, DenseGPULayer, DenseTensors, GPUCost,
-    GPULayer, GPUOptimizer, GPUScheduler, GetTensor, Layer, Logger, Tensor, Tensors, WGPUBackend,
+    GPULayer, GPUScheduler, GetTensor, Layer, Logger, Tensor, Tensors, WGPUBackend, WGPUBuffer,
+    WGPUDataset,
 };
 
 pub struct Backend {
@@ -15,86 +16,97 @@ pub struct Backend {
     pub layers: Vec<GPULayer>,
     pub size: Vec<usize>,
     pub cost: GPUCost,
-    pub optimizer: GPUOptimizer,
     pub scheduler: GPUScheduler,
     pub logger: Logger,
 }
 
 impl Backend {
     pub fn new(
-        backend: WGPUBackend,
+        mut backend: WGPUBackend,
         config: BackendConfig,
         logger: Logger,
         mut tensors: Option<Vec<Tensors>>,
     ) -> Self {
         let mut layers = Vec::new();
-        let mut size = config.size.clone();
+        let mut size = IxDyn(&config.size);
         for layer in config.layers.iter() {
             match layer.clone() {
                 Layer::Activation(config) => {
-                    let layer = ActivationGPULayer::new(config, IxDyn(&size));
+                    let layer = ActivationGPULayer::new(&mut backend, config, &mut size);
                     layers.push(GPULayer::Activation(layer));
                 }
                 Layer::Dense(config) => {
-                    let layer = DenseGPULayer::new(config, IxDyn(&size), tensors.get());
-                    size = layer.output_size().to_vec();
+                    let layer = DenseGPULayer::new(&mut backend, config, &mut size, tensors.get());
                     layers.push(GPULayer::Dense(layer));
                 }
                 _ => unimplemented!(),
-            }
+            };
         }
-        let optimizer = GPUOptimizer::from(config.optimizer.clone(), &mut layers);
-        let scheduler = GPUScheduler::from(&config.scheduler);
-        let cost = GPUCost::from(config.cost.clone());
-        let silent = config.silent.is_some_and(|x| x == true);
+
         Self {
-            backend,
             logger,
-            silent,
-            config,
             layers,
-            cost,
-            optimizer,
-            scheduler,
-            size,
+            size: size.as_array_view().to_vec(),
+            silent: config.silent.is_some_and(|x| x == true),
+            cost: GPUCost::from(&mut backend, config.cost.clone(), size),
+            scheduler: GPUScheduler::from(&config.scheduler),
+            config,
+            backend,
         }
     }
 
-    pub fn forward_propagate(&mut self, mut inputs: ArrayD<f32>, training: bool) -> ArrayD<f32> {
+    pub fn forward_propagate<'a>(&'a mut self, mut inputs: &'a WGPUBuffer, training: bool) {
         for layer in &mut self.layers {
-            inputs = layer.forward_propagate(inputs, training);
+            layer.forward_propagate(&mut self.backend, inputs, training);
+            inputs = layer.outputs()
         }
-        inputs
     }
 
-    pub fn backward_propagate<'b>(
-        &mut self,
-        outputs: ArrayViewD<'b, f32>,
-        data: ArrayViewD<'b, f32>,
-    ) -> ArrayD<f32> {
-        let mut d_outputs = (self.cost.prime)(data, outputs);
-        for layer in self.layers.iter_mut().rev() {
-            d_outputs = layer.backward_propagate(d_outputs);
+    pub fn backward_propagate(&mut self, inputs: &WGPUBuffer, dataset: &WGPUBuffer) {
+        let outputs = self.layers.last().unwrap().outputs();
+        self.cost.prime(&mut self.backend, dataset, outputs);
+        let mut d_outputs = &self.cost.d_inputs;
+
+        for i in (1..self.layers.len()).rev() {
+            let (left, right) = self.layers.split_at(i);
+            let inputs = left.last().unwrap().outputs();
+            right[0].backward_propagate(&mut self.backend, &inputs, d_outputs);
+            d_outputs = right[0].d_inputs()
         }
-        d_outputs
+
+        self.layers[0].backward_propagate(&mut self.backend, &inputs, d_outputs);
     }
 
-    pub fn train(&mut self, datasets: Vec<Dataset>, epochs: usize, batches: usize, rate: f32) {
+    pub fn train(&mut self, datasets: Vec<Dataset>, epochs: usize, batches: usize, _rate: f32) {
         let mut epoch = 0;
+
+        let mut gpu_datasets = Vec::new();
+        for dataset in datasets {
+            gpu_datasets.push(WGPUDataset {
+                inputs: WGPUBuffer::from(&mut self.backend, dataset.inputs),
+                outputs: WGPUBuffer::from(&mut self.backend, dataset.outputs),
+            })
+        }
+
         while epoch < epochs {
             let mut total = 0.0;
-            for (i, dataset) in datasets.iter().enumerate() {
-                let outputs = self.forward_propagate(dataset.inputs.clone(), true);
-                self.backward_propagate(outputs.view(), dataset.outputs.view());
-                self.optimizer
-                    .update_grads(&mut self.layers, &self.scheduler, rate, epoch);
-                total += (self.cost.cost)(outputs.view(), dataset.outputs.view());
-                let minibatch = outputs.dim()[0];
-                if !self.silent && ((i + 1) * minibatch) % batches == 0 {
-                    let cost = total / (batches) as f32;
-                    let msg = format!("Epoch={}, Dataset={}, Cost={}", epoch, i * minibatch, cost);
-                    (self.logger.log)(msg);
-                    total = 0.0;
+            for (i, dataset) in gpu_datasets.iter().enumerate() {
+                self.forward_propagate(&dataset.inputs, true);
+                self.backward_propagate(&dataset.inputs, &dataset.outputs);
+
+                if !self.silent {
+                    let outputs = self.layers.last().unwrap().outputs();
+                    total += self
+                        .cost
+                        .cost(&mut self.backend, &outputs, &dataset.outputs);
+                    let minibatch = outputs.shape[0];
+                    if ((i + 1) * minibatch) % batches == 0 {
+                        let cost = total / (batches) as f32;
+                        let msg =
+                            format!("Epoch={}, Dataset={}, Cost={}", epoch, i * minibatch, cost);
+                        (self.logger.log)(msg);
+                        total = 0.0;
+                    }
                 }
             }
             epoch += 1
@@ -103,18 +115,28 @@ impl Backend {
 
     pub fn predict(&mut self, data: ArrayD<f32>) -> ArrayD<f32> {
         for layer in &mut self.layers {
-            layer.reset(1)
+            layer.reset(&mut self.backend, 1)
         }
-        self.forward_propagate(data, false)
+        let inputs = WGPUBuffer::from(&mut self.backend, data);
+        self.forward_propagate(&inputs, false);
+        self.layers
+            .last()
+            .unwrap()
+            .outputs()
+            .read(&mut self.backend)
     }
 
-    pub fn save(&self) -> Vec<u8> {
+    pub fn save(&mut self) -> Vec<u8> {
+        let mut layers = Vec::new();
+        for layer in &self.layers {
+            layers.push(layer.save(&mut self.backend))
+        }
         let mut tensors = Vec::new();
-        for (i, layer) in self.layers.iter().enumerate() {
+        for (i, layer) in layers.iter().enumerate() {
             match layer {
-                GPULayer::Dense(layer) => {
-                    let weights = Tensor::new(layer.weights.view().into_dyn());
-                    let biases = Tensor::new(layer.biases.view().into_dyn());
+                Tensors::Dense(layer) => {
+                    let weights = Tensor::new(layer.weights.view());
+                    let biases = Tensor::new(layer.biases.view());
                     tensors.push((format!("{}w", i), weights));
                     tensors.push((format!("{}b", i), biases));
                 }
diff --git a/crates/core-gpu/src/gpu/cost.rs b/crates/core-gpu/src/gpu/cost.rs
index 53e2cdf..6e0f813 100644
--- a/crates/core-gpu/src/gpu/cost.rs
+++ b/crates/core-gpu/src/gpu/cost.rs
@@ -1,89 +1,65 @@
-use std::ops::{Div, Mul, Sub};
+use ndarray::{Dimension, IxDyn};
 
-use ndarray::{s, ArrayD, ArrayViewD};
-
-use crate::Cost;
+use crate::{Cost, WGPUBackend, WGPUBuffer, WGPUKernel};
 
 pub struct GPUCost {
-    pub cost: for<'a> fn(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32,
-    pub prime: for<'a> fn(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32>,
+    pub d_inputs: WGPUBuffer,
+    pub cost_kernel: WGPUKernel,
+    pub prime_kernel: WGPUKernel,
 }
 
 impl GPUCost {
-    pub fn from(cost: Cost) -> GPUCost {
-        match cost {
-            Cost::MSE => GPUCost {
-                cost: mse,
-                prime: mse_prime,
-            },
-            Cost::CrossEntropy => GPUCost {
-                cost: cross_entropy,
-                prime: cross_entropy_prime,
-            },
-            Cost::BinCrossEntropy => GPUCost {
-                cost: bin_cross_entropy,
-                prime: bin_cross_entropy_prime,
-            },
-            Cost::Hinge => GPUCost {
-                cost: hinge,
-                prime: hinge_prime,
-            },
+    pub fn from(backend: &mut WGPUBackend, cost: Cost, size: IxDyn) -> GPUCost {
+        let (cost, prime) = match cost {
+            Cost::MSE => (MSE, MSE_PRIME),
+            _ => unimplemented!(),
+        };
+        GPUCost {
+            d_inputs: WGPUBuffer::new(backend, size.clone()),
+            cost_kernel: kernel_cost(backend, cost.to_string(), size.size()),
+            prime_kernel: kernel_cost(backend, prime.to_string(), size.size()),
         }
     }
-}
-
-fn mse<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
-    let sub = y.sub(&y_hat);
-    return sub.clone().mul(sub).sum();
-}
 
-fn mse_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
-    return y.sub(&y_hat);
-}
-
-fn cross_entropy<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
-    let batches = y_hat.dim()[0];
-    let mut total = 0.0;
-    for b in 0..batches {
-        total -= y_hat.slice(s![b, ..]).mul(&y.slice(s![b, ..])).sum().ln()
+    pub fn cost(
+        &self,
+        backend: &mut WGPUBackend,
+        dataset: &WGPUBuffer,
+        outputs: &WGPUBuffer,
+    ) -> f32 {
+        backend.execute(&self.cost_kernel, vec![dataset, outputs, &self.d_inputs]);
+        self.d_inputs.read(backend)[0]
     }
-    return total / batches as f32;
-}
-
-fn cross_entropy_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
-    return -y_hat.div(&y);
-}
 
-fn bin_cross_entropy<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
-    return -y_hat
-        .mul(y.map(|x| x.ln()))
-        .sub(((1.0).sub(&y_hat)).mul(y.map(|x| 1.0 - x.ln())))
-        .sum()
-        / y.len() as f32;
+    pub fn prime(&self, backend: &mut WGPUBackend, dataset: &WGPUBuffer, outputs: &WGPUBuffer) {
+        backend.execute(&self.prime_kernel, vec![dataset, outputs, &self.d_inputs]);
+    }
 }
 
-fn bin_cross_entropy_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
-    return y.sub(&y_hat).div(y.mul(1.0.sub(&y)));
-}
+const MSE: &str = "cost.values[global_id.x] = y.values[global_id.x] - y_hat.values[global_id.x];";
 
-fn hinge<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
-    let mut sum = 0.0;
-    for (y_hat_i, y_i) in y_hat.iter().zip(y.iter()) {
-        let margin = 1.0 - y_hat_i * y_i;
-        if margin > 0.0 {
-            sum += margin;
-        }
-    }
-    return sum;
-}
+const MSE_PRIME: &str =
+    "cost.values[global_id.x] = y.values[global_id.x] - y_hat.values[global_id.x];";
 
-fn hinge_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
-    let mut result = ArrayD::zeros(y_hat.shape());
-    for ((result_i, y_hat_i), y_i) in result.iter_mut().zip(y_hat.iter()).zip(y.iter()) {
-        let margin = 1.0 - y_hat_i * y_i;
-        if margin > 0.0 {
-            *result_i = -y_i;
-        }
-    }
-    return result;
+fn kernel_cost(backend: &mut WGPUBackend, cost: String, size: usize) -> WGPUKernel {
+    let source = format!(
+        "struct Matrix {{
+            values: array<f32>
+        }};
+        
+        @group(0) @binding(0)
+        var<storage, read> y_hat: Matrix;
+        @group(0) @binding(1)
+        var<storage, read> y: Matrix;
+        @group(0) @binding(2)
+        var<storage, read_write> cost: Matrix;
+        
+        @compute @workgroup_size(64, 1, 1)
+        fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {{
+            if (global_id.x < {size}u) {{
+                {cost}
+            }}
+        }}"
+    );
+    backend.register(source, ((size as f64 / 64.0).ceil() as u32, 1, 1))
 }
diff --git a/crates/core-gpu/src/gpu/gpu.rs b/crates/core-gpu/src/gpu/gpu.rs
index ce19c25..966637f 100644
--- a/crates/core-gpu/src/gpu/gpu.rs
+++ b/crates/core-gpu/src/gpu/gpu.rs
@@ -1,31 +1,30 @@
 use std::borrow::Cow;
 
+use ndarray::{ArrayD, Dimension, IxDyn};
+
+pub struct WGPUDataset {
+    pub inputs: WGPUBuffer,
+    pub outputs: WGPUBuffer,
+}
+
 pub struct WGPUBackend {
     pub device: wgpu::Device,
     pub queue: wgpu::Queue,
-    pub kernels: Vec<WGPUKernel>,
 }
 
 pub struct WGPUKernel {
     pub pipeline: wgpu::ComputePipeline,
     pub layout: wgpu::BindGroupLayout,
+    pub workgroups: (u32, u32, u32),
 }
 
 impl WGPUBackend {
     pub fn new() -> Self {
-        let instance = wgpu::Instance::new(wgpu::InstanceDescriptor {
-            backends: wgpu::Backends::all(),
-            dx12_shader_compiler: Default::default(),
-            flags: wgpu::InstanceFlags::empty(),
-            gles_minor_version: wgpu::Gles3MinorVersion::Automatic,
-        });
+        let instance = wgpu::Instance::default();
 
-        let adapter = pollster::block_on(instance.request_adapter(&wgpu::RequestAdapterOptions {
-            power_preference: wgpu::PowerPreference::default(),
-            compatible_surface: None,
-            force_fallback_adapter: false,
-        }))
-        .unwrap();
+        let adapter =
+            pollster::block_on(instance.request_adapter(&wgpu::RequestAdapterOptions::default()))
+                .unwrap();
 
         let (device, queue) = pollster::block_on(adapter.request_device(
             &wgpu::DeviceDescriptor {
@@ -37,19 +36,15 @@ impl WGPUBackend {
         ))
         .unwrap();
 
-        Self {
-            device,
-            queue,
-            kernels: Vec::new(),
-        }
+        Self { device, queue }
     }
 
-    pub fn register(&mut self, source: Cow<'_, str>) {
+    pub fn register(&mut self, source: String, workgroups: (u32, u32, u32)) -> WGPUKernel {
         let module = self
             .device
             .create_shader_module(wgpu::ShaderModuleDescriptor {
                 label: None,
-                source: wgpu::ShaderSource::Wgsl(source),
+                source: wgpu::ShaderSource::Wgsl(Cow::Owned(source)),
             });
 
         let pipeline = self
@@ -62,10 +57,14 @@ impl WGPUBackend {
             });
 
         let layout = pipeline.get_bind_group_layout(0);
-        self.kernels.push(WGPUKernel { pipeline, layout });
+        WGPUKernel {
+            pipeline,
+            layout,
+            workgroups,
+        }
     }
 
-    pub fn execute(&mut self, kernel: usize, buffers: Vec<WGPUBuffer>) {
+    pub fn execute(&mut self, kernel: &WGPUKernel, buffers: Vec<&WGPUBuffer>) {
         let entries: Vec<wgpu::BindGroupEntry<'_>> = buffers
             .iter()
             .enumerate()
@@ -80,7 +79,7 @@ impl WGPUBackend {
             .collect();
         let bindgroup = self.device.create_bind_group(&wgpu::BindGroupDescriptor {
             label: None,
-            layout: &self.kernels[kernel].layout,
+            layout: &kernel.layout,
             entries: &entries,
         });
 
@@ -93,8 +92,9 @@ impl WGPUBackend {
                 timestamp_writes: None,
             });
             pass.set_bind_group(0, &bindgroup, &[]);
-            pass.set_pipeline(&self.kernels[kernel].pipeline);
-            pass.dispatch_workgroups(8, 8, 8);
+            pass.set_pipeline(&kernel.pipeline);
+            let (group_x, group_y, group_z) = kernel.workgroups;
+            pass.dispatch_workgroups(group_x, group_y, group_z);
         }
         self.queue.submit([encoder.finish()]);
     }
@@ -103,27 +103,40 @@ impl WGPUBackend {
 pub struct WGPUBuffer {
     pub buffer: wgpu::Buffer,
     pub size: u64,
+    pub shape: IxDyn,
 }
 
 impl WGPUBuffer {
-    pub fn new(backend: &mut WGPUBackend, size: u64) -> Self {
+    pub fn new(backend: &mut WGPUBackend, shape: IxDyn) -> Self {
         let buffer = backend.device.create_buffer(&wgpu::BufferDescriptor {
             label: None,
-            size,
+            size: shape.size() as u64 * 4,
             usage: wgpu::BufferUsages::STORAGE
                 | wgpu::BufferUsages::COPY_DST
                 | wgpu::BufferUsages::COPY_SRC,
             mapped_at_creation: false,
         });
 
-        Self { buffer, size }
+        Self {
+            buffer,
+            size: shape.size() as u64 * 4,
+            shape,
+        }
+    }
+
+    pub fn from(backend: &mut WGPUBackend, data: ArrayD<f32>) -> Self {
+        let slice = data.as_slice().unwrap();
+        let buffer = WGPUBuffer::new(backend, data.dim());
+        let (_, bytes, _) = unsafe { slice.align_to() };
+        backend.queue.write_buffer(&buffer.buffer, 0, bytes);
+        buffer
     }
 
-    pub fn read(&self, backend: &mut WGPUBackend) -> Vec<u8> {
+    pub fn read(&self, backend: &mut WGPUBackend) -> ArrayD<f32> {
         let buffer = backend.device.create_buffer(&wgpu::BufferDescriptor {
             label: None,
             size: self.size,
-            usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_SRC,
+            usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
             mapped_at_creation: false,
         });
 
@@ -140,13 +153,16 @@ impl WGPUBuffer {
         });
         backend.device.poll(wgpu::Maintain::Wait);
         receiver.recv().unwrap().unwrap();
-        let data = slice.get_mapped_range();
 
-        data.to_vec()
+        let bytes = slice.get_mapped_range();
+        let (_, data, _) = unsafe { bytes.align_to() };
+        ArrayD::from_shape_vec(self.shape.clone(), data.to_vec()).unwrap()
     }
 
-    pub fn write(&self, backend: &mut WGPUBackend, data: &[u8]) {
-        backend.queue.write_buffer(&self.buffer, 0, data)
+    pub fn write(&self, backend: &mut WGPUBackend, data: ArrayD<f32>) {
+        let slice = data.as_slice().unwrap();
+        let (_, bytes, _) = unsafe { slice.align_to() };
+        backend.queue.write_buffer(&self.buffer, 0, bytes)
     }
 }
 
diff --git a/crates/core-gpu/src/gpu/layers/activation.rs b/crates/core-gpu/src/gpu/layers/activation.rs
index 0536efc..aa767f8 100644
--- a/crates/core-gpu/src/gpu/layers/activation.rs
+++ b/crates/core-gpu/src/gpu/layers/activation.rs
@@ -1,99 +1,107 @@
-use ndarray::{s, ArrayD, Dimension, IxDyn};
-use std::ops::{Div, Mul, Sub};
+use ndarray::{Dimension, IxDyn};
 
-use crate::{ActivationLayer, GPUActivation};
+use crate::{ActivationLayer, GPUActivation, WGPUBackend, WGPUBuffer, WGPUKernel};
 
 pub struct ActivationGPULayer {
-    pub outputs: ArrayD<f32>,
-    pub activation: GPUActivation,
+    // data
+    pub memoize_output: bool,
+    pub outputs: WGPUBuffer,
+
+    // gradients
+    pub d_inputs: WGPUBuffer,
+
+    // kernels
+    pub forward_kernel: WGPUKernel,
+    pub backward_kernel: WGPUKernel,
 }
 
 impl ActivationGPULayer {
-    pub fn new(config: ActivationLayer, size: IxDyn) -> Self {
+    pub fn new(backend: &mut WGPUBackend, config: ActivationLayer, size: &mut IxDyn) -> Self {
+        let activation = GPUActivation::from(config.activation);
+        let forward_kernel = kernel_forward(backend, size.size(), activation.activate);
+        let backward_kernel = kernel_backward(backend, size.size(), activation.prime);
+
         Self {
-            outputs: ArrayD::zeros(size),
-            activation: GPUActivation::from(config.activation),
+            memoize_output: GPUActivation::memoize_output(&activation.activation),
+            outputs: WGPUBuffer::new(backend, size.clone()),
+            d_inputs: WGPUBuffer::new(backend, size.clone()),
+            forward_kernel,
+            backward_kernel,
         }
     }
 
-    pub fn output_size(&self) -> Vec<usize> {
-        self.outputs.shape().to_vec()
+    pub fn reset(&mut self, backend: &mut WGPUBackend, batches: usize) {
+        let output_size = self.outputs.shape.as_array_view()[1];
+        self.outputs = WGPUBuffer::new(backend, IxDyn(&[batches, output_size]))
     }
 
-    pub fn reset(&mut self, batches: usize) {
-        let mut output_size = self.outputs.shape().to_vec();
-        output_size[0] = batches;
-        self.outputs = ArrayD::zeros(output_size);
+    pub fn forward_propagate(&self, backend: &mut WGPUBackend, inputs: &WGPUBuffer) {
+        backend.execute(&self.forward_kernel, vec![inputs, &self.outputs]);
     }
 
-    pub fn forward_propagate(&mut self, inputs: ArrayD<f32>) -> ArrayD<f32> {
-        let outputs = if GPUActivation::memoize_output(&self.activation) {
-            self.outputs = inputs.map(self.activation.activate);
-            self.outputs.clone()
+    pub fn backward_propagate(
+        &self,
+        backend: &mut WGPUBackend,
+        inputs: &WGPUBuffer,
+        d_outputs: &WGPUBuffer,
+    ) {
+        if self.memoize_output {
+            backend.execute(
+                &self.backward_kernel,
+                vec![&self.outputs, d_outputs, &self.d_inputs],
+            );
         } else {
-            self.outputs = inputs.clone();
-            inputs.map(self.activation.activate)
+            backend.execute(
+                &self.backward_kernel,
+                vec![inputs, d_outputs, &self.d_inputs],
+            );
         };
-        outputs.into_dyn()
-    }
-
-    pub fn backward_propagate(&mut self, d_outputs: ArrayD<f32>) -> ArrayD<f32> {
-        let d_inputs = d_outputs.mul(self.outputs.map(self.activation.prime));
-        d_inputs.into_dyn()
     }
 }
 
-pub struct SoftmaxGPULayer {
-    pub outputs: ArrayD<f32>,
+fn kernel_forward(backend: &mut WGPUBackend, size: usize, activation: String) -> WGPUKernel {
+    let source = format!(
+        "struct Matrix {{
+            values: array<f32>
+        }};
+          
+        @group(0) @binding(0)
+        var<storage, read> inputs: Matrix;
+        @group(0) @binding(1)
+        var<storage, read_write> outputs: Matrix;
+          
+        @compute @workgroup_size(64, 1, 1)
+        fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {{
+            if (global_id.x < {size}u) {{
+                var x = inputs.values[global_id.x];
+                outputs.values[global_id.x] = {activation};
+            }}
+        }}"
+    );
+    backend.register(source, ((size as f64 / 64.0).ceil() as u32, 1, 1))
 }
 
-impl SoftmaxGPULayer {
-    pub fn new(size: IxDyn) -> Self {
-        Self {
-            outputs: ArrayD::zeros(size),
-        }
-    }
-
-    pub fn output_size(&self) -> Vec<usize> {
-        self.outputs.shape().to_vec()
-    }
-
-    pub fn reset(&mut self, batches: usize) {
-        let mut output_size = self.outputs.shape().to_vec();
-        output_size[0] = batches;
-        self.outputs = ArrayD::zeros(output_size);
-    }
-
-    pub fn forward_propagate(&mut self, inputs: ArrayD<f32>) -> ArrayD<f32> {
-        let batches = self.outputs.dim()[0];
-        for b in 0..batches {
-            let exp = inputs.slice(s![b, ..]).map(|x| x.exp());
-            self.outputs
-                .slice_mut(s![b, ..])
-                .assign(&exp.clone().div(exp.sum()));
-        }
-        self.outputs.clone().into_dyn()
-    }
-
-    pub fn backward_propagate(&mut self, d_outputs: ArrayD<f32>) -> ArrayD<f32> {
-        let batches = self.outputs.dim()[0];
-        let array_size = self.outputs.dim().size() / batches;
-
-        let mut d_inputs = ArrayD::zeros(self.outputs.dim());
-        for b in 0..batches {
-            for y in 0..array_size {
-                for x in 0..array_size {
-                    let out1 = self.outputs[[b, y]];
-                    let out2 = self.outputs[[b, x]];
-                    let d_out = d_outputs[[b, x]];
-                    if x == y {
-                        d_inputs[[b, y]] += out1.sub(out1.powi(2)).mul(d_out);
-                    } else {
-                        d_inputs[[b, y]] += -out1.mul(out2).mul(d_out);
-                    }
-                }
-            }
-        }
-        d_inputs
-    }
+fn kernel_backward(backend: &mut WGPUBackend, size: usize, activation: String) -> WGPUKernel {
+    let source = format!(
+        "struct Matrix {{
+            values: array<f32>
+        }};
+          
+        @group(0) @binding(0)
+        var<storage, read> inputs: Matrix;
+        @group(0) @binding(1)
+        var<storage, read> d_outputs: Matrix;
+        @group(0) @binding(2)
+        var<storage, read_write> d_inputs: Matrix;
+          
+        @compute @workgroup_size(64, 1, 1)
+        fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {{
+            if (global_id.x < {size}u) {{
+                var d_output = d_outputs.values[global_id.x];
+                var x = inputs.values[global_id.x];
+                d_inputs.values[global_id.x] = {activation} * d_output;
+            }}
+        }}"
+    );
+    backend.register(source, ((size as f64 / 64.0).ceil() as u32, 1, 1))
 }
diff --git a/crates/core-gpu/src/gpu/layers/dense.rs b/crates/core-gpu/src/gpu/layers/dense.rs
index 9d664a7..a1b8297 100644
--- a/crates/core-gpu/src/gpu/layers/dense.rs
+++ b/crates/core-gpu/src/gpu/layers/dense.rs
@@ -1,71 +1,205 @@
-use ndarray::{Array1, Array2, ArrayD, Axis, Dimension, Ix1, Ix2, IxDyn};
-use std::ops::Add;
+use ndarray::{ArrayD, Dimension, IxDyn};
 
-use crate::{DenseLayer, GPUInit, Init, Tensors};
+use crate::{
+    DenseLayer, DenseTensors, GPUInit, Init, Tensors, WGPUBackend, WGPUBuffer, WGPUKernel,
+};
 
 pub struct DenseGPULayer {
-    // cache
-    pub output_size: Ix2,
-    pub inputs: Array2<f32>,
+    // data
+    pub outputs: WGPUBuffer,
 
     // parameters
-    pub weights: Array2<f32>,
-    pub biases: Array1<f32>,
+    pub weights: WGPUBuffer,
+    pub biases: WGPUBuffer,
 
     // gradients
-    pub d_weights: Array2<f32>,
-    pub d_biases: Array1<f32>,
+    pub d_weights: WGPUBuffer,
+    pub d_biases: WGPUBuffer,
+    pub d_inputs: WGPUBuffer,
+
+    // kernels
+    pub forward_kernel: WGPUKernel,
+    pub backward_kernel: WGPUKernel,
 }
 
 impl DenseGPULayer {
-    pub fn new(config: DenseLayer, size: IxDyn, tensors: Option<Tensors>) -> Self {
+    pub fn new(
+        backend: &mut WGPUBackend,
+        config: DenseLayer,
+        size: &mut IxDyn,
+        tensors: Option<Tensors>,
+    ) -> Self {
         let init = GPUInit::from_default(config.init, Init::Uniform);
-        let input_size = Ix2(size[0], size[1]);
-        let weight_size = Ix2(size[1], config.size[0]);
-        let output_size = Ix2(size[0], config.size[0]);
+        let input_size = IxDyn(&[size[0], size[1]]);
+        let weight_size = IxDyn(&[size[1], config.size[0]]);
+        let bias_size = IxDyn(&[config.size[0]]);
+        let output_size = IxDyn(&[size[0], config.size[0]]);
+        *size = output_size.clone();
 
-        let (weights, biases) = if let Some(Tensors::Dense(tensors)) = tensors {
-            (tensors.weights, tensors.biases)
-        } else {
-            let weights = init.init(weight_size.into_dyn(), size[1], config.size[0]);
-            let biases = ArrayD::zeros(config.size.clone());
-            (weights, biases)
+        let tensors = match tensors {
+            Some(Tensors::Dense(tensors)) => tensors,
+            _ => DenseTensors {
+                weights: init.init(weight_size.clone(), input_size[1], config.size[0]),
+                biases: ArrayD::zeros(config.size.clone()),
+            },
         };
 
+        let forward_kernel = kernel_forward(backend, input_size.clone(), output_size.clone());
+        let backward_kernel = kernel_backward(backend, input_size.clone(), output_size.clone());
+
         Self {
-            output_size,
-            inputs: Array2::zeros(input_size),
-            weights: weights.into_dimensionality::<Ix2>().unwrap(),
-            biases: biases.into_dimensionality::<Ix1>().unwrap(),
-            d_weights: Array2::zeros(weight_size),
-            d_biases: Array1::zeros(config.size[0]),
+            outputs: WGPUBuffer::new(backend, output_size),
+            weights: WGPUBuffer::from(backend, tensors.weights),
+            biases: WGPUBuffer::from(backend, tensors.biases),
+            d_weights: WGPUBuffer::new(backend, weight_size),
+            d_biases: WGPUBuffer::new(backend, bias_size),
+            d_inputs: WGPUBuffer::new(backend, input_size),
+            forward_kernel,
+            backward_kernel,
         }
     }
 
-    pub fn output_size(&self) -> Vec<usize> {
-        self.output_size.as_array_view().to_vec()
+    pub fn reset(&mut self, backend: &mut WGPUBackend, batches: usize) {
+        let output_size = self.outputs.shape.as_array_view()[1];
+        self.outputs = WGPUBuffer::new(backend, IxDyn(&[batches, output_size]))
     }
 
-    pub fn reset(&mut self, batches: usize) {
-        let input_size = self.inputs.dim().1;
-        self.inputs = Array2::zeros((batches, input_size));
-        self.output_size[0] = batches;
+    pub fn forward_propagate(&self, backend: &mut WGPUBackend, inputs: &WGPUBuffer) {
+        backend.execute(
+            &self.forward_kernel,
+            vec![inputs, &self.weights, &self.biases, &self.outputs],
+        );
     }
 
-    pub fn forward_propagate(&mut self, inputs: ArrayD<f32>) -> ArrayD<f32> {
-        self.inputs = inputs.into_dimensionality::<Ix2>().unwrap();
-        self.inputs.dot(&self.weights).add(&self.biases).into_dyn()
+    pub fn backward_propagate(
+        &self,
+        backend: &mut WGPUBackend,
+        inputs: &WGPUBuffer,
+        d_outputs: &WGPUBuffer,
+    ) {
+        backend.execute(
+            &self.backward_kernel,
+            vec![
+                inputs,
+                &self.weights,
+                &self.biases,
+                d_outputs,
+                &self.d_inputs,
+            ],
+        );
     }
 
-    pub fn backward_propagate(&mut self, d_outputs: ArrayD<f32>) -> ArrayD<f32> {
-        let d_outputs = d_outputs.into_dimensionality::<Ix2>().unwrap();
-        let mut weights_t = self.weights.view();
-        weights_t.swap_axes(0, 1);
-        let d_inputs = d_outputs.dot(&weights_t);
-        let mut inputs_t = self.inputs.view();
-        inputs_t.swap_axes(0, 1);
-        self.d_weights = inputs_t.dot(&d_outputs);
-        self.d_biases = d_outputs.sum_axis(Axis(0));
-        d_inputs.into_dyn()
+    pub fn save(&self, backend: &mut WGPUBackend) -> Tensors {
+        Tensors::Dense(DenseTensors {
+            weights: self.weights.read(backend),
+            biases: self.biases.read(backend),
+        })
     }
 }
+
+fn kernel_forward(backend: &mut WGPUBackend, input_size: IxDyn, output_size: IxDyn) -> WGPUKernel {
+    let input = input_size[1];
+    let output = output_size[1];
+    let batches = input_size[0];
+    let source = format!(
+        "struct Matrix {{
+            values: array<f32>
+        }};
+        
+        @group(0) @binding(0)
+        var<storage, read> inputs: Matrix;
+        @group(0) @binding(1)
+        var<storage, read> weights: Matrix;
+        @group(0) @binding(2)
+        var<storage, read> biases: Matrix;
+        @group(0) @binding(3)
+        var<storage, read_write> outputs: Matrix;
+        
+        @compute @workgroup_size(8, 8, 1)
+        fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {{
+            if (global_id.x < {output}u && global_id.y < {batches}u) {{
+            var weighted_sum = biases.values[global_id.x];
+            for (var k = 0u; k < {input}u; k += 1u) {{
+                var a = k + global_id.y * {input}u;
+                var b = global_id.x + k * {output}u;    
+                weighted_sum += inputs.values[a] * weights.values[b];
+            }};
+            
+            let idx = global_id.x + global_id.y * {output}u;
+            outputs.values[idx] = weighted_sum;
+            }}
+        }}"
+    );
+    backend.register(
+        source,
+        (
+            (output_size[1] as f64 / 8.0).ceil() as u32,
+            (output_size[0] as f64 / 8.0).ceil() as u32,
+            1,
+        ),
+    )
+}
+
+fn kernel_backward(backend: &mut WGPUBackend, input_size: IxDyn, output_size: IxDyn) -> WGPUKernel {
+    let input = input_size[1];
+    let output = output_size[1];
+    let batches = input_size[0];
+    let source = format!(
+        "struct Matrix {{
+            values: array<f32>,
+        }};
+        
+        @group(0) @binding(0)
+        var<storage, read> inputs: Matrix;
+        @group(0) @binding(1)
+        var<storage, read_write> weights: Matrix;
+        @group(0) @binding(2)
+        var<storage, read_write> biases: Matrix;
+        @group(0) @binding(3)
+        var<storage, read_write> d_outputs: Matrix;
+        @group(0) @binding(4)
+        var<storage, read_write> d_inputs: Matrix;
+        
+        @compute @workgroup_size(8, 8, 1)
+        fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {{
+            if (global_id.x < {input}u && global_id.y < {batches}u) {{
+                var d_input = 0.0;
+                for (var k = 0u; k < {output}u; k++) {{
+                    var a = k + global_id.x * {output}u;
+                    var b = k + global_id.y * {output}u;
+                    d_input += d_outputs.values[b] * weights.values[a];
+                }};
+                let idx = global_id.x + global_id.y * {input}u;
+                d_inputs.values[idx] = d_input;
+            }}
+        
+            if (global_id.x < {output}u && global_id.y < 1u) {{
+                for (var k = 0u; k < {batches}u; k++) {{
+                    let idx = global_id.x + k * {output}u;
+                    biases.values[global_id.x] -= d_outputs.values[idx] * 0.1;
+                }}
+            }};
+        
+            if (global_id.x < {input}u && global_id.y < {output}u) {{
+                var d_weight = 0.0;
+                for (var k = 0u; k < {batches}u; k++) {{
+                    var a = global_id.x + k * {input}u;
+                    var b = global_id.y + k * {output}u;    
+                    d_weight += d_outputs.values[b] * inputs.values[a];
+                }};
+                let idx = global_id.y + global_id.x * {output}u;
+                weights.values[idx] -= d_weight * 0.1;
+            }};
+        }}"
+    );
+    let max_x = std::cmp::max(input_size[1], output_size[1]);
+    let max_y = std::cmp::max(input_size[0], output_size[1]);
+    backend.register(
+        source,
+        (
+            (max_x as f64 / 8.0).ceil() as u32,
+            (max_y as f64 / 8.0).ceil() as u32,
+            1,
+        ),
+    )
+}
diff --git a/crates/core-gpu/src/gpu/layers/mod.rs b/crates/core-gpu/src/gpu/layers/mod.rs
index 4465a75..136b71a 100644
--- a/crates/core-gpu/src/gpu/layers/mod.rs
+++ b/crates/core-gpu/src/gpu/layers/mod.rs
@@ -4,7 +4,7 @@ mod dense;
 pub use activation::*;
 pub use dense::*;
 
-use ndarray::ArrayD;
+use crate::{Tensors, WGPUBackend, WGPUBuffer};
 
 pub enum GPULayer {
     Activation(ActivationGPULayer),
@@ -12,31 +12,55 @@ pub enum GPULayer {
 }
 
 impl GPULayer {
-    pub fn output_size(&mut self) -> Vec<usize> {
+    pub fn outputs<'a>(&'a self) -> &'a WGPUBuffer {
         match self {
-            GPULayer::Activation(layer) => layer.output_size(),
-            GPULayer::Dense(layer) => layer.output_size(),
+            GPULayer::Activation(layer) => &layer.outputs,
+            GPULayer::Dense(layer) => &layer.outputs,
         }
     }
 
-    pub fn forward_propagate(&mut self, inputs: ArrayD<f32>, _training: bool) -> ArrayD<f32> {
+    pub fn d_inputs<'a>(&'a self) -> &'a WGPUBuffer {
         match self {
-            GPULayer::Activation(layer) => layer.forward_propagate(inputs),
-            GPULayer::Dense(layer) => layer.forward_propagate(inputs),
+            GPULayer::Activation(layer) => &layer.d_inputs,
+            GPULayer::Dense(layer) => &layer.d_inputs,
         }
     }
 
-    pub fn backward_propagate(&mut self, d_outputs: ArrayD<f32>) -> ArrayD<f32> {
+    pub fn forward_propagate(
+        &self,
+        backend: &mut WGPUBackend,
+        inputs: &WGPUBuffer,
+        _training: bool,
+    ) {
         match self {
-            GPULayer::Activation(layer) => layer.backward_propagate(d_outputs),
-            GPULayer::Dense(layer) => layer.backward_propagate(d_outputs),
+            GPULayer::Activation(layer) => layer.forward_propagate(backend, inputs),
+            GPULayer::Dense(layer) => layer.forward_propagate(backend, inputs),
         }
     }
 
-    pub fn reset(&mut self, batches: usize) {
+    pub fn backward_propagate(
+        &self,
+        backend: &mut WGPUBackend,
+        inputs: &WGPUBuffer,
+        d_outputs: &WGPUBuffer,
+    ) {
         match self {
-            GPULayer::Activation(layer) => layer.reset(batches),
-            GPULayer::Dense(layer) => layer.reset(batches),
+            GPULayer::Activation(layer) => layer.backward_propagate(backend, inputs, d_outputs),
+            GPULayer::Dense(layer) => layer.backward_propagate(backend, inputs, d_outputs),
+        }
+    }
+
+    pub fn reset(&mut self, backend: &mut WGPUBackend, batches: usize) {
+        match self {
+            GPULayer::Activation(layer) => layer.reset(backend, batches),
+            GPULayer::Dense(layer) => layer.reset(backend, batches),
+        }
+    }
+
+    pub fn save(&self, backend: &mut WGPUBackend) -> Tensors {
+        match self {
+            GPULayer::Activation(_) => Tensors::None,
+            GPULayer::Dense(layer) => layer.save(backend),
         }
     }
 }
diff --git a/crates/core-gpu/src/gpu/mod.rs b/crates/core-gpu/src/gpu/mod.rs
index a7a3ab5..9de364a 100644
--- a/crates/core-gpu/src/gpu/mod.rs
+++ b/crates/core-gpu/src/gpu/mod.rs
@@ -4,7 +4,6 @@ mod cost;
 mod gpu;
 mod init;
 mod layers;
-mod optimizers;
 mod schedulers;
 
 pub use activation::*;
@@ -13,5 +12,4 @@ pub use cost::*;
 pub use gpu::*;
 pub use init::*;
 pub use layers::*;
-pub use optimizers::*;
 pub use schedulers::*;
diff --git a/crates/core-gpu/src/gpu/optimizers/adam.rs b/crates/core-gpu/src/gpu/optimizers/adam.rs
deleted file mode 100644
index f77365f..0000000
--- a/crates/core-gpu/src/gpu/optimizers/adam.rs
+++ /dev/null
@@ -1,74 +0,0 @@
-use std::ops::{Add, Div, Mul, SubAssign};
-
-use ndarray::{ArrayD, ArrayViewD, ArrayViewMutD};
-
-use crate::{AdamOptimizer, GPUScheduler};
-
-pub struct GPUAdamOptimizer {
-    pub beta1: f32,
-    pub beta2: f32,
-    pub epsilon: f32,
-    pub m: Vec<Vec<ArrayD<f32>>>,
-    pub v: Vec<Vec<ArrayD<f32>>>,
-    pub t: f32,
-}
-
-impl GPUAdamOptimizer {
-    pub fn new(config: AdamOptimizer, params: Vec<Vec<ArrayViewMutD<f32>>>) -> Self {
-        let mut m = Vec::new();
-        let mut v = Vec::new();
-        for params in params {
-            m.push(
-                params
-                    .iter()
-                    .map(|param| ArrayD::zeros(param.dim()))
-                    .collect(),
-            );
-            v.push(
-                params
-                    .iter()
-                    .map(|param| ArrayD::zeros(param.dim()))
-                    .collect(),
-            );
-        }
-        Self {
-            beta1: config.beta1,
-            beta2: config.beta2,
-            epsilon: config.epsilon,
-            m,
-            v,
-            t: 0.0,
-        }
-    }
-
-    pub fn update_grads(
-        &mut self,
-        mut params: Vec<ArrayViewMutD<f32>>,
-        grads: Vec<ArrayViewD<f32>>,
-        idx: usize,
-        scheduler: &GPUScheduler,
-        rate: f32,
-    ) {
-        for (j, (param, grad)) in params.iter_mut().zip(grads).enumerate() {
-            self.m[idx][j] = self
-                .beta1
-                .mul(&self.m[idx][j])
-                .add((1.0 - self.beta1).mul(&grad));
-            self.v[idx][j] = self
-                .beta2
-                .mul(&self.v[idx][j])
-                .add((1.0 - self.beta2).mul(&grad.map(|x| x.powi(2))));
-
-            let m_hat = self.m[idx][j].view().div(1.0 - self.beta1.powf(self.t));
-            let v_hat = self.v[idx][j].view().div(1.0 - self.beta2.powf(self.t));
-
-            let rate = scheduler.eta(rate, self.t as usize);
-
-            param.sub_assign(
-                &rate
-                    .mul(m_hat)
-                    .div(v_hat.map(|x| x.sqrt()).add(self.epsilon)),
-            )
-        }
-    }
-}
diff --git a/crates/core-gpu/src/gpu/optimizers/mod.rs b/crates/core-gpu/src/gpu/optimizers/mod.rs
deleted file mode 100644
index cb3d936..0000000
--- a/crates/core-gpu/src/gpu/optimizers/mod.rs
+++ /dev/null
@@ -1,75 +0,0 @@
-mod adam;
-mod sgd;
-
-pub use adam::*;
-use ndarray::{ArrayViewD, ArrayViewMutD};
-pub use sgd::*;
-
-use crate::{GPULayer, GPUScheduler, Optimizer};
-
-pub enum GPUOptimizer {
-    SGD(GPUSGDOptimizer),
-    Adam(GPUAdamOptimizer),
-}
-
-impl GPUOptimizer {
-    pub fn from(optimizer: Optimizer, layers: &mut Vec<GPULayer>) -> Self {
-        let mut all_params = Vec::new();
-        for layer in layers {
-            if let Some((params, _)) = GPUOptimizer::get_params(layer) {
-                all_params.push(params)
-            }
-        }
-        match optimizer {
-            Optimizer::SGD => GPUOptimizer::SGD(GPUSGDOptimizer::new()),
-            Optimizer::Adam(config) => {
-                GPUOptimizer::Adam(GPUAdamOptimizer::new(config, all_params))
-            }
-        }
-    }
-
-    pub fn update_grads(
-        &mut self,
-        layers: &mut Vec<GPULayer>,
-        scheduler: &GPUScheduler,
-        rate: f32,
-        epoch: usize,
-    ) {
-        match self {
-            GPUOptimizer::Adam(adam) => adam.t += 1.0,
-            _ => {}
-        }
-        let mut idx = 0;
-        for layer in layers.iter_mut() {
-            if let Some((params, grads)) = GPUOptimizer::get_params(layer) {
-                match self {
-                    GPUOptimizer::SGD(sgd) => {
-                        sgd.update_grads(params, grads, scheduler, rate, epoch)
-                    }
-                    GPUOptimizer::Adam(adam) => {
-                        adam.update_grads(params, grads, idx, scheduler, rate)
-                    }
-                }
-                idx += 1;
-            }
-        }
-    }
-
-    pub fn get_params<'a>(
-        layer: &'a mut GPULayer,
-    ) -> Option<(Vec<ArrayViewMutD<'a, f32>>, Vec<ArrayViewD<'a, f32>>)> {
-        match layer {
-            GPULayer::Dense(layer) => Some((
-                vec![
-                    layer.weights.view_mut().into_dyn(),
-                    layer.biases.view_mut().into_dyn(),
-                ],
-                vec![
-                    layer.d_weights.view().into_dyn(),
-                    layer.d_biases.view().into_dyn(),
-                ],
-            )),
-            _ => return None,
-        }
-    }
-}
diff --git a/crates/core-gpu/src/gpu/optimizers/sgd.rs b/crates/core-gpu/src/gpu/optimizers/sgd.rs
deleted file mode 100644
index 995679c..0000000
--- a/crates/core-gpu/src/gpu/optimizers/sgd.rs
+++ /dev/null
@@ -1,27 +0,0 @@
-use std::ops::{Mul, SubAssign};
-
-use ndarray::{ArrayViewD, ArrayViewMutD};
-
-use crate::GPUScheduler;
-
-pub struct GPUSGDOptimizer {}
-
-impl GPUSGDOptimizer {
-    pub fn new() -> Self {
-        Self {}
-    }
-
-    pub fn update_grads(
-        &mut self,
-        mut params: Vec<ArrayViewMutD<f32>>,
-        grads: Vec<ArrayViewD<f32>>,
-        scheduler: &GPUScheduler,
-        rate: f32,
-        epoch: usize,
-    ) {
-        let eta = scheduler.eta(rate, epoch);
-        for (param, grad) in params.iter_mut().zip(grads) {
-            param.sub_assign(&grad.mul(eta));
-        }
-    }
-}
diff --git a/crates/core-gpu/src/tensor.rs b/crates/core-gpu/src/tensor.rs
index f2a0d2c..9c7b72f 100644
--- a/crates/core-gpu/src/tensor.rs
+++ b/crates/core-gpu/src/tensor.rs
@@ -59,6 +59,7 @@ pub enum Tensors {
     Dense(DenseTensors),
     Conv(ConvTensors),
     BatchNorm(BatchNormTensors),
+    None,
 }
 
 pub trait GetTensor {
diff --git a/src/backends/gpu/backend.ts b/src/backends/gpu/backend.ts
index f9a5b42..f22cbe0 100644
--- a/src/backends/gpu/backend.ts
+++ b/src/backends/gpu/backend.ts
@@ -38,6 +38,11 @@ export class GPUBackend implements Backend {
       shape.allocBuffer,
     ) as bigint;
     const outputShape = Array.from(shape.buffer.slice(1)) as Shape[Rank];
+    
+    // free gpu resources when program loop exits
+    globalThis.onunload = () => {
+      library.symbols.ffi_backend_drop(id)
+    };
 
     return new GPUBackend(library, outputShape, id);
   }
diff --git a/src/backends/gpu/mod.ts b/src/backends/gpu/mod.ts
index 04669ea..a24259a 100644
--- a/src/backends/gpu/mod.ts
+++ b/src/backends/gpu/mod.ts
@@ -18,7 +18,7 @@ const options: FetchOptions = {
       "https://github.com/denosaurs/netsaur/releases/download/0.2.14/",
       import.meta.url,
     )
-    : "./target/release/",
+    : "./target/release/",
   cache: "reloadAll",
 };
 
@@ -43,6 +43,10 @@ const symbols = {
     parameters: ["buffer", "usize", "pointer"],
     result: "usize",
   } as const,
+  ffi_backend_drop: {
+    parameters: ["usize"],
+    result: "void",
+  } as const,
 };
 
 export type Library = Deno.DynamicLibrary<typeof symbols>;