gpu dense & activation
JinWeiTan committed Nov 6, 2023
1 parent 0a4b23a commit db02a8e
Showing 15 changed files with 513 additions and 557 deletions.
18 changes: 17 additions & 1 deletion crates/core-gpu/src/ffi.rs
@@ -27,6 +27,12 @@ pub extern "C" fn ffi_backend_create(ptr: *const u8, len: usize, alloc: AllocBuf
len = backend.len();
backend.push(net_backend);
});

std::panic::set_hook(Box::new(|info| {
println!("{}", info);
ffi_backend_drop(0);
}));

len
}
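The panic hook registered above uses the standard library's std::panic::set_hook; the hook runs before unwinding begins, so it is a chance to release resources the FFI caller would otherwise never reclaim. A standalone illustration of the pattern (not part of this commit):

    fn main() {
        // the hook fires before unwinding starts
        std::panic::set_hook(Box::new(|info| {
            println!("backend panicked: {}", info);
            // ffi.rs calls ffi_backend_drop(0) here to release the GPU backend
        }));

        let result = std::panic::catch_unwind(|| panic!("boom"));
        assert!(result.is_err());
    }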

@@ -79,7 +85,7 @@ pub extern "C" fn ffi_backend_predict(
#[no_mangle]
pub extern "C" fn ffi_backend_save(id: usize, alloc: AllocBufferFn) {
RESOURCES.with(|cell| {
let backend = cell.backend.borrow_mut();
let mut backend = cell.backend.borrow_mut();
let data = backend[id].save();
let file_ptr = alloc(data.len());
let file = unsafe { from_raw_parts_mut(file_ptr, data.len()) };
@@ -109,3 +115,13 @@ pub extern "C" fn ffi_backend_load(
});
len
}

#[no_mangle]
pub extern "C" fn ffi_backend_drop(id: usize) {
RESOURCES.with(|cell| {
let mut backend = cell.backend.borrow_mut();
if backend.len() > id {
backend.remove(id);
}
});
}
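ffi_backend_drop indexes into the thread-local RESOURCES registry, whose definition is not part of this diff. A hypothetical sketch of the shape these FFI functions assume (field names and types may differ from the actual crates/core-gpu code):

    use std::cell::RefCell;

    struct Backend; // placeholder for the GPU backend type kept in the registry

    #[derive(Default)]
    struct Resources {
        backend: RefCell<Vec<Backend>>,
    }

    thread_local! {
        static RESOURCES: Resources = Resources::default();
    }

    fn drop_backend(id: usize) {
        RESOURCES.with(|cell| {
            let mut backend = cell.backend.borrow_mut();
            if backend.len() > id {
                backend.remove(id); // dropping the entry releases its GPU resources
            }
        });
    }

Since Vec::remove shifts later entries down, ids handed out for higher indices would no longer line up after a drop; the panic hook's ffi_backend_drop(0) presumably targets the common single-backend case.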
94 changes: 11 additions & 83 deletions crates/core-gpu/src/gpu/activation.rs
@@ -1,29 +1,21 @@
use crate::Activation;
pub struct GPUActivation {
pub activation: Activation,
pub activate: ActivationFn,
pub prime: ActivationFn,
pub activate: String,
pub prime: String,
}

type ActivationFn = fn(x: &f32) -> f32;

impl GPUActivation {
pub fn from(activation: Activation) -> Self {
let (activate, prime): (ActivationFn, ActivationFn) = match activation {
Activation::Elu => (elu, elu_prime),
Activation::LeakyRelu => (leaky_relu, leaky_relu_prime),
Activation::Linear => (linear, linear_prime),
Activation::Relu => (relu, relu_prime),
Activation::Relu6 => (relu6, relu6_prime),
Activation::Selu => (selu, selu_prime),
Activation::Sigmoid => (sigmoid, sigmoid_prime),
Activation::Tanh => (tanh, tanh_prime),
let (activate, prime): (&str, &str) = match activation {
Activation::Sigmoid => (SIGMOID, SIGMOID_PRIME),
_ => unimplemented!()
};

Self {
activation,
activate,
prime,
activate: String::from(activate),
prime: String::from(prime),
}
}
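The GPU layer now stores its activation as WGSL expression source (the activate and prime strings) rather than Rust function pointers, and only sigmoid is wired up so far. Other activations could be expressed the same way; hypothetical strings for illustration only, not part of the commit, with the *_PRIME forms following SIGMOID_PRIME below in being written against the already-activated value x:

    const RELU: &str = "max(x, 0.0)";
    const RELU_PRIME: &str = "select(0.0, 1.0, x > 0.0)";
    const TANH: &str = "tanh(x)";
    const TANH_PRIME: &str = "1.0 - x * x";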

@@ -35,78 +27,14 @@ impl GPUActivation {
}
}

pub fn memoize_output(activation: &GPUActivation) -> bool {
match activation.activation {
pub fn memoize_output(activation: &Activation) -> bool {
match activation {
Activation::Sigmoid | Activation::Tanh => true,
_ => true,
}
}
}

fn sigmoid(x: &f32) -> f32 {
return 1.0 / (1.0 + (-x).exp());
}

fn sigmoid_prime(x: &f32) -> f32 {
return x * (1.0 - x);
}

fn tanh(x: &f32) -> f32 {
return x.tanh();
}

fn tanh_prime(x: &f32) -> f32 {
return 1.0 - tanh(x).powi(2);
}

fn linear(x: &f32) -> f32 {
return *x;
}

fn linear_prime(_x: &f32) -> f32 {
return 1.0;
}

fn relu(x: &f32) -> f32 {
return x.max(0.0);
}

fn relu_prime(x: &f32) -> f32 {
return if *x > 0.0 { 1.0 } else { 0.0 };
}
const SIGMOID: &str = "1.0 / (1.0 + exp(-x))";

fn relu6(x: &f32) -> f32 {
return x.max(0.0).min(6.0);
}

fn relu6_prime(x: &f32) -> f32 {
return if *x > 0.0 && *x < 6.0 { 1.0 } else { 0.0 };
}

fn leaky_relu(x: &f32) -> f32 {
return if *x > 0.0 { *x } else { x.max(0.01 * x) };
}

fn leaky_relu_prime(x: &f32) -> f32 {
return if *x > 0.0 { 1.0 } else { 0.01 };
}

fn elu(x: &f32) -> f32 {
return if *x >= 0.0 { *x } else { x.exp() - 1.0 };
}

fn elu_prime(x: &f32) -> f32 {
return if *x > 0.0 { 1.0 } else { x.exp() };
}

fn selu(x: &f32) -> f32 {
return if *x >= 0.0 {
*x
} else {
1.0507 * (x.exp() - 1.0)
};
}

fn selu_prime(x: &f32) -> f32 {
return if *x > 0.0 { 1.0 } else { 1.0507 * x.exp() };
}
const SIGMOID_PRIME: &str = "x * (1.0 - x)";
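SIGMOID_PRIME is written in terms of the layer's output (x is the already-activated value), which lines up with memoize_output reporting that sigmoid and tanh outputs should be kept. How such an expression string reaches the GPU is not shown in this hunk; a minimal sketch of one way to interpolate it into a WGSL compute shader, where the template and binding layout are assumptions for illustration rather than the repo's actual codegen:

    fn activation_shader(expr: &str) -> String {
        format!(
            r#"@group(0) @binding(0) var<storage, read> inputs: array<f32>;
    @group(0) @binding(1) var<storage, read_write> outputs: array<f32>;

    @compute @workgroup_size(64)
    fn main(@builtin(global_invocation_id) id: vec3<u32>) {{
        let x = inputs[id.x];
        outputs[id.x] = {expr};
    }}"#
        )
    }

Calling activation_shader(SIGMOID) would then yield a sigmoid kernel, and activation_shader(SIGMOID_PRIME) its derivative.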
126 changes: 74 additions & 52 deletions crates/core-gpu/src/gpu/backend.rs
@@ -1,11 +1,12 @@
use std::collections::HashMap;

use ndarray::{ArrayD, ArrayViewD, IxDyn};
use ndarray::{ArrayD, IxDyn, Dimension};
use safetensors::{serialize, SafeTensors};

use crate::{
to_arr, ActivationGPULayer, BackendConfig, Dataset, DenseGPULayer, DenseTensors, GPUCost,
GPULayer, GPUOptimizer, GPUScheduler, GetTensor, Layer, Logger, Tensor, Tensors, WGPUBackend,
GPULayer, GPUScheduler, GetTensor, Layer, Logger, Tensor, Tensors, WGPUBackend, WGPUBuffer,
WGPUDataset,
};

pub struct Backend {
@@ -15,86 +16,97 @@ pub struct Backend {
pub layers: Vec<GPULayer>,
pub size: Vec<usize>,
pub cost: GPUCost,
pub optimizer: GPUOptimizer,
pub scheduler: GPUScheduler,
pub logger: Logger,
}

impl Backend {
pub fn new(
backend: WGPUBackend,
mut backend: WGPUBackend,
config: BackendConfig,
logger: Logger,
mut tensors: Option<Vec<Tensors>>,
) -> Self {
let mut layers = Vec::new();
let mut size = config.size.clone();
let mut size = IxDyn(&config.size);
for layer in config.layers.iter() {
match layer.clone() {
Layer::Activation(config) => {
let layer = ActivationGPULayer::new(config, IxDyn(&size));
let layer = ActivationGPULayer::new(&mut backend, config, &mut size);
layers.push(GPULayer::Activation(layer));
}
Layer::Dense(config) => {
let layer = DenseGPULayer::new(config, IxDyn(&size), tensors.get());
size = layer.output_size().to_vec();
let layer = DenseGPULayer::new(&mut backend, config, &mut size, tensors.get());
layers.push(GPULayer::Dense(layer));
}
_ => unimplemented!(),
}
};
}
let optimizer = GPUOptimizer::from(config.optimizer.clone(), &mut layers);
let scheduler = GPUScheduler::from(&config.scheduler);
let cost = GPUCost::from(config.cost.clone());
let silent = config.silent.is_some_and(|x| x == true);

Self {
backend,
logger,
silent,
config,
layers,
cost,
optimizer,
scheduler,
size,
size: size.as_array_view().to_vec(),
silent: config.silent.is_some_and(|x| x == true),
cost: GPUCost::from(&mut backend, config.cost.clone(), size),
scheduler: GPUScheduler::from(&config.scheduler),
config,
backend,
}
}

pub fn forward_propagate(&mut self, mut inputs: ArrayD<f32>, training: bool) -> ArrayD<f32> {
pub fn forward_propagate<'a>(&'a mut self, mut inputs: &'a WGPUBuffer, training: bool) {
for layer in &mut self.layers {
inputs = layer.forward_propagate(inputs, training);
layer.forward_propagate(&mut self.backend, inputs, training);
inputs = layer.outputs()
}
inputs
}

pub fn backward_propagate<'b>(
&mut self,
outputs: ArrayViewD<'b, f32>,
data: ArrayViewD<'b, f32>,
) -> ArrayD<f32> {
let mut d_outputs = (self.cost.prime)(data, outputs);
for layer in self.layers.iter_mut().rev() {
d_outputs = layer.backward_propagate(d_outputs);
pub fn backward_propagate(&mut self, inputs: &WGPUBuffer, dataset: &WGPUBuffer) {
let outputs = self.layers.last().unwrap().outputs();
self.cost.prime(&mut self.backend, dataset, outputs);
let mut d_outputs = &self.cost.d_inputs;

for i in (1..self.layers.len()).rev() {
let (left, right) = self.layers.split_at(i);
let inputs = left.last().unwrap().outputs();
right[0].backward_propagate(&mut self.backend, &inputs, d_outputs);
d_outputs = right[0].d_inputs()
}
d_outputs

self.layers[0].backward_propagate(&mut self.backend, &inputs, d_outputs);
}

pub fn train(&mut self, datasets: Vec<Dataset>, epochs: usize, batches: usize, rate: f32) {
pub fn train(&mut self, datasets: Vec<Dataset>, epochs: usize, batches: usize, _rate: f32) {
let mut epoch = 0;

let mut gpu_datasets = Vec::new();
for dataset in datasets {
gpu_datasets.push(WGPUDataset {
inputs: WGPUBuffer::from(&mut self.backend, dataset.inputs),
outputs: WGPUBuffer::from(&mut self.backend, dataset.outputs),
})
}

while epoch < epochs {
let mut total = 0.0;
for (i, dataset) in datasets.iter().enumerate() {
let outputs = self.forward_propagate(dataset.inputs.clone(), true);
self.backward_propagate(outputs.view(), dataset.outputs.view());
self.optimizer
.update_grads(&mut self.layers, &self.scheduler, rate, epoch);
total += (self.cost.cost)(outputs.view(), dataset.outputs.view());
let minibatch = outputs.dim()[0];
if !self.silent && ((i + 1) * minibatch) % batches == 0 {
let cost = total / (batches) as f32;
let msg = format!("Epoch={}, Dataset={}, Cost={}", epoch, i * minibatch, cost);
(self.logger.log)(msg);
total = 0.0;
for (i, dataset) in gpu_datasets.iter().enumerate() {
self.forward_propagate(&dataset.inputs, true);
self.backward_propagate(&dataset.inputs, &dataset.outputs);

if !self.silent {
let outputs = self.layers.last().unwrap().outputs();
total += self
.cost
.cost(&mut self.backend, &outputs, &dataset.outputs);
let minibatch = outputs.shape[0];
if ((i + 1) * minibatch) % batches == 0 {
let cost = total / (batches) as f32;
let msg =
format!("Epoch={}, Dataset={}, Cost={}", epoch, i * minibatch, cost);
(self.logger.log)(msg);
total = 0.0;
}
}
}
epoch += 1
@@ -103,18 +115,28 @@ impl Backend {

pub fn predict(&mut self, data: ArrayD<f32>) -> ArrayD<f32> {
for layer in &mut self.layers {
layer.reset(1)
layer.reset(&mut self.backend, 1)
}
self.forward_propagate(data, false)
let inputs = WGPUBuffer::from(&mut self.backend, data);
self.forward_propagate(&inputs, false);
self.layers
.last()
.unwrap()
.outputs()
.read(&mut self.backend)
}

pub fn save(&self) -> Vec<u8> {
pub fn save(&mut self) -> Vec<u8> {
let mut layers = Vec::new();
for layer in &self.layers {
layers.push(layer.save(&mut self.backend))
}
let mut tensors = Vec::new();
for (i, layer) in self.layers.iter().enumerate() {
for (i, layer) in layers.iter().enumerate() {
match layer {
GPULayer::Dense(layer) => {
let weights = Tensor::new(layer.weights.view().into_dyn());
let biases = Tensor::new(layer.biases.view().into_dyn());
Tensors::Dense(layer) => {
let weights = Tensor::new(layer.weights.view());
let biases = Tensor::new(layer.biases.view());
tensors.push((format!("{}w", i), weights));
tensors.push((format!("{}b", i), biases));
}
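backward_propagate above walks the layers in reverse and uses split_at so that the previous layer's outputs and the current layer can be taken from the same Vec without overlapping borrows; the mutation goes through the &mut backend argument, so the layers themselves are only read. A self-contained illustration of that pattern with placeholder types, not the repo's GPULayer:

    struct DemoLayer {
        outputs: Vec<f32>,
    }

    impl DemoLayer {
        fn backward_propagate(&self, inputs: &[f32], d_outputs: &[f32]) {
            // a real layer would dispatch its backward kernel here
            let _ = (inputs, d_outputs);
        }
    }

    fn backward(layers: &[DemoLayer], cost_d_inputs: &[f32]) {
        let mut d_outputs = cost_d_inputs;
        for i in (1..layers.len()).rev() {
            let (left, right) = layers.split_at(i);
            let inputs = left.last().unwrap().outputs.as_slice(); // outputs of layer i - 1
            right[0].backward_propagate(inputs, d_outputs);
            d_outputs = right[0].outputs.as_slice(); // stand-in for d_inputs()
        }
        // layer 0 is finally driven with the network inputs, as in the diff above
        let _ = d_outputs;
    }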
