diff --git a/crates/core-gpu/src/ffi.rs b/crates/core-gpu/src/ffi.rs index 094a68a..20aa13e 100644 --- a/crates/core-gpu/src/ffi.rs +++ b/crates/core-gpu/src/ffi.rs @@ -27,6 +27,12 @@ pub extern "C" fn ffi_backend_create(ptr: *const u8, len: usize, alloc: AllocBuf len = backend.len(); backend.push(net_backend); }); + + std::panic::set_hook(Box::new(|info| { + println!("{}", info); + ffi_backend_drop(0); + })); + len } @@ -79,7 +85,7 @@ pub extern "C" fn ffi_backend_predict( #[no_mangle] pub extern "C" fn ffi_backend_save(id: usize, alloc: AllocBufferFn) { RESOURCES.with(|cell| { - let backend = cell.backend.borrow_mut(); + let mut backend = cell.backend.borrow_mut(); let data = backend[id].save(); let file_ptr = alloc(data.len()); let file = unsafe { from_raw_parts_mut(file_ptr, data.len()) }; @@ -109,3 +115,13 @@ pub extern "C" fn ffi_backend_load( }); len } + +#[no_mangle] +pub extern "C" fn ffi_backend_drop(id: usize) { + RESOURCES.with(|cell| { + let mut backend = cell.backend.borrow_mut(); + if backend.len() > id { + backend.remove(id); + } + }); +} diff --git a/crates/core-gpu/src/gpu/activation.rs b/crates/core-gpu/src/gpu/activation.rs index 204cd07..cdf93fe 100644 --- a/crates/core-gpu/src/gpu/activation.rs +++ b/crates/core-gpu/src/gpu/activation.rs @@ -1,29 +1,21 @@ use crate::Activation; pub struct GPUActivation { pub activation: Activation, - pub activate: ActivationFn, - pub prime: ActivationFn, + pub activate: String, + pub prime: String, } -type ActivationFn = fn(x: &f32) -> f32; - impl GPUActivation { pub fn from(activation: Activation) -> Self { - let (activate, prime): (ActivationFn, ActivationFn) = match activation { - Activation::Elu => (elu, elu_prime), - Activation::LeakyRelu => (leaky_relu, leaky_relu_prime), - Activation::Linear => (linear, linear_prime), - Activation::Relu => (relu, relu_prime), - Activation::Relu6 => (relu6, relu6_prime), - Activation::Selu => (selu, selu_prime), - Activation::Sigmoid => (sigmoid, sigmoid_prime), - 
Activation::Tanh => (tanh, tanh_prime), + let (activate, prime): (&str, &str) = match activation { + Activation::Sigmoid => (SIGMOID, SIGMOID_PRIME), + _ => unimplemented!() }; Self { activation, - activate, - prime, + activate: String::from(activate), + prime: String::from(prime), } } @@ -35,78 +27,14 @@ impl GPUActivation { } } - pub fn memoize_output(activation: &GPUActivation) -> bool { - match activation.activation { + pub fn memoize_output(activation: &Activation) -> bool { + match activation { Activation::Sigmoid | Activation::Tanh => true, _ => true, } } } -fn sigmoid(x: &f32) -> f32 { - return 1.0 / (1.0 + (-x).exp()); -} - -fn sigmoid_prime(x: &f32) -> f32 { - return x * (1.0 - x); -} - -fn tanh(x: &f32) -> f32 { - return x.tanh(); -} - -fn tanh_prime(x: &f32) -> f32 { - return 1.0 - tanh(x).powi(2); -} - -fn linear(x: &f32) -> f32 { - return *x; -} - -fn linear_prime(_x: &f32) -> f32 { - return 1.0; -} - -fn relu(x: &f32) -> f32 { - return x.max(0.0); -} - -fn relu_prime(x: &f32) -> f32 { - return if *x > 0.0 { 1.0 } else { 0.0 }; -} +const SIGMOID: &str = "1.0 / (1.0 + exp(-x))"; -fn relu6(x: &f32) -> f32 { - return x.max(0.0).min(6.0); -} - -fn relu6_prime(x: &f32) -> f32 { - return if *x > 0.0 && *x < 6.0 { 1.0 } else { 0.0 }; -} - -fn leaky_relu(x: &f32) -> f32 { - return if *x > 0.0 { *x } else { x.max(0.01 * x) }; -} - -fn leaky_relu_prime(x: &f32) -> f32 { - return if *x > 0.0 { 1.0 } else { 0.01 }; -} - -fn elu(x: &f32) -> f32 { - return if *x >= 0.0 { *x } else { x.exp() - 1.0 }; -} - -fn elu_prime(x: &f32) -> f32 { - return if *x > 0.0 { 1.0 } else { x.exp() }; -} - -fn selu(x: &f32) -> f32 { - return if *x >= 0.0 { - *x - } else { - 1.0507 * (x.exp() - 1.0) - }; -} - -fn selu_prime(x: &f32) -> f32 { - return if *x > 0.0 { 1.0 } else { 1.0507 * x.exp() }; -} +const SIGMOID_PRIME: &str = "x * (1.0 - x)"; \ No newline at end of file diff --git a/crates/core-gpu/src/gpu/backend.rs b/crates/core-gpu/src/gpu/backend.rs index 483092d..ce498ef 100644 
--- a/crates/core-gpu/src/gpu/backend.rs +++ b/crates/core-gpu/src/gpu/backend.rs @@ -1,11 +1,12 @@ use std::collections::HashMap; -use ndarray::{ArrayD, ArrayViewD, IxDyn}; +use ndarray::{ArrayD, IxDyn, Dimension}; use safetensors::{serialize, SafeTensors}; use crate::{ to_arr, ActivationGPULayer, BackendConfig, Dataset, DenseGPULayer, DenseTensors, GPUCost, - GPULayer, GPUOptimizer, GPUScheduler, GetTensor, Layer, Logger, Tensor, Tensors, WGPUBackend, + GPULayer, GPUScheduler, GetTensor, Layer, Logger, Tensor, Tensors, WGPUBackend, WGPUBuffer, + WGPUDataset, }; pub struct Backend { @@ -15,86 +16,97 @@ pub struct Backend { pub layers: Vec, pub size: Vec, pub cost: GPUCost, - pub optimizer: GPUOptimizer, pub scheduler: GPUScheduler, pub logger: Logger, } impl Backend { pub fn new( - backend: WGPUBackend, + mut backend: WGPUBackend, config: BackendConfig, logger: Logger, mut tensors: Option>, ) -> Self { let mut layers = Vec::new(); - let mut size = config.size.clone(); + let mut size = IxDyn(&config.size); for layer in config.layers.iter() { match layer.clone() { Layer::Activation(config) => { - let layer = ActivationGPULayer::new(config, IxDyn(&size)); + let layer = ActivationGPULayer::new(&mut backend, config, &mut size); layers.push(GPULayer::Activation(layer)); } Layer::Dense(config) => { - let layer = DenseGPULayer::new(config, IxDyn(&size), tensors.get()); - size = layer.output_size().to_vec(); + let layer = DenseGPULayer::new(&mut backend, config, &mut size, tensors.get()); layers.push(GPULayer::Dense(layer)); } _ => unimplemented!(), - } + }; } - let optimizer = GPUOptimizer::from(config.optimizer.clone(), &mut layers); - let scheduler = GPUScheduler::from(&config.scheduler); - let cost = GPUCost::from(config.cost.clone()); - let silent = config.silent.is_some_and(|x| x == true); + Self { - backend, logger, - silent, - config, layers, - cost, - optimizer, - scheduler, - size, + size: size.as_array_view().to_vec(), + silent: config.silent.is_some_and(|x| x 
== true), + cost: GPUCost::from(&mut backend, config.cost.clone(), size), + scheduler: GPUScheduler::from(&config.scheduler), + config, + backend, } } - pub fn forward_propagate(&mut self, mut inputs: ArrayD, training: bool) -> ArrayD { + pub fn forward_propagate<'a>(&'a mut self, mut inputs: &'a WGPUBuffer, training: bool) { for layer in &mut self.layers { - inputs = layer.forward_propagate(inputs, training); + layer.forward_propagate(&mut self.backend, inputs, training); + inputs = layer.outputs() } - inputs } - pub fn backward_propagate<'b>( - &mut self, - outputs: ArrayViewD<'b, f32>, - data: ArrayViewD<'b, f32>, - ) -> ArrayD { - let mut d_outputs = (self.cost.prime)(data, outputs); - for layer in self.layers.iter_mut().rev() { - d_outputs = layer.backward_propagate(d_outputs); + pub fn backward_propagate(&mut self, inputs: &WGPUBuffer, dataset: &WGPUBuffer) { + let outputs = self.layers.last().unwrap().outputs(); + self.cost.prime(&mut self.backend, dataset, outputs); + let mut d_outputs = &self.cost.d_inputs; + + for i in (1..self.layers.len()).rev() { + let (left, right) = self.layers.split_at(i); + let inputs = left.last().unwrap().outputs(); + right[0].backward_propagate(&mut self.backend, &inputs, d_outputs); + d_outputs = right[0].d_inputs() } - d_outputs + + self.layers[0].backward_propagate(&mut self.backend, &inputs, d_outputs); } - pub fn train(&mut self, datasets: Vec, epochs: usize, batches: usize, rate: f32) { + pub fn train(&mut self, datasets: Vec, epochs: usize, batches: usize, _rate: f32) { let mut epoch = 0; + + let mut gpu_datasets = Vec::new(); + for dataset in datasets { + gpu_datasets.push(WGPUDataset { + inputs: WGPUBuffer::from(&mut self.backend, dataset.inputs), + outputs: WGPUBuffer::from(&mut self.backend, dataset.outputs), + }) + } + while epoch < epochs { let mut total = 0.0; - for (i, dataset) in datasets.iter().enumerate() { - let outputs = self.forward_propagate(dataset.inputs.clone(), true); - 
self.backward_propagate(outputs.view(), dataset.outputs.view()); - self.optimizer - .update_grads(&mut self.layers, &self.scheduler, rate, epoch); - total += (self.cost.cost)(outputs.view(), dataset.outputs.view()); - let minibatch = outputs.dim()[0]; - if !self.silent && ((i + 1) * minibatch) % batches == 0 { - let cost = total / (batches) as f32; - let msg = format!("Epoch={}, Dataset={}, Cost={}", epoch, i * minibatch, cost); - (self.logger.log)(msg); - total = 0.0; + for (i, dataset) in gpu_datasets.iter().enumerate() { + self.forward_propagate(&dataset.inputs, true); + self.backward_propagate(&dataset.inputs, &dataset.outputs); + + if !self.silent { + let outputs = self.layers.last().unwrap().outputs(); + total += self + .cost + .cost(&mut self.backend, &outputs, &dataset.outputs); + let minibatch = outputs.shape[0]; + if ((i + 1) * minibatch) % batches == 0 { + let cost = total / (batches) as f32; + let msg = + format!("Epoch={}, Dataset={}, Cost={}", epoch, i * minibatch, cost); + (self.logger.log)(msg); + total = 0.0; + } } } epoch += 1 @@ -103,18 +115,28 @@ impl Backend { pub fn predict(&mut self, data: ArrayD) -> ArrayD { for layer in &mut self.layers { - layer.reset(1) + layer.reset(&mut self.backend, 1) } - self.forward_propagate(data, false) + let inputs = WGPUBuffer::from(&mut self.backend, data); + self.forward_propagate(&inputs, false); + self.layers + .last() + .unwrap() + .outputs() + .read(&mut self.backend) } - pub fn save(&self) -> Vec { + pub fn save(&mut self) -> Vec { + let mut layers = Vec::new(); + for layer in &self.layers { + layers.push(layer.save(&mut self.backend)) + } let mut tensors = Vec::new(); - for (i, layer) in self.layers.iter().enumerate() { + for (i, layer) in layers.iter().enumerate() { match layer { - GPULayer::Dense(layer) => { - let weights = Tensor::new(layer.weights.view().into_dyn()); - let biases = Tensor::new(layer.biases.view().into_dyn()); + Tensors::Dense(layer) => { + let weights = 
Tensor::new(layer.weights.view()); + let biases = Tensor::new(layer.biases.view()); tensors.push((format!("{}w", i), weights)); tensors.push((format!("{}b", i), biases)); } diff --git a/crates/core-gpu/src/gpu/cost.rs b/crates/core-gpu/src/gpu/cost.rs index 53e2cdf..6e0f813 100644 --- a/crates/core-gpu/src/gpu/cost.rs +++ b/crates/core-gpu/src/gpu/cost.rs @@ -1,89 +1,65 @@ -use std::ops::{Div, Mul, Sub}; +use ndarray::{Dimension, IxDyn}; -use ndarray::{s, ArrayD, ArrayViewD}; - -use crate::Cost; +use crate::{Cost, WGPUBackend, WGPUBuffer, WGPUKernel}; pub struct GPUCost { - pub cost: for<'a> fn(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32, - pub prime: for<'a> fn(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD, + pub d_inputs: WGPUBuffer, + pub cost_kernel: WGPUKernel, + pub prime_kernel: WGPUKernel, } impl GPUCost { - pub fn from(cost: Cost) -> GPUCost { - match cost { - Cost::MSE => GPUCost { - cost: mse, - prime: mse_prime, - }, - Cost::CrossEntropy => GPUCost { - cost: cross_entropy, - prime: cross_entropy_prime, - }, - Cost::BinCrossEntropy => GPUCost { - cost: bin_cross_entropy, - prime: bin_cross_entropy_prime, - }, - Cost::Hinge => GPUCost { - cost: hinge, - prime: hinge_prime, - }, + pub fn from(backend: &mut WGPUBackend, cost: Cost, size: IxDyn) -> GPUCost { + let (cost, prime) = match cost { + Cost::MSE => (MSE, MSE_PRIME), + _ => unimplemented!(), + }; + GPUCost { + d_inputs: WGPUBuffer::new(backend, size.clone()), + cost_kernel: kernel_cost(backend, cost.to_string(), size.size()), + prime_kernel: kernel_cost(backend, prime.to_string(), size.size()), } } -} - -fn mse<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 { - let sub = y.sub(&y_hat); - return sub.clone().mul(sub).sum(); -} -fn mse_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD { - return y.sub(&y_hat); -} - -fn cross_entropy<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 { - let batches = y_hat.dim()[0]; - let 
mut total = 0.0; - for b in 0..batches { - total -= y_hat.slice(s![b, ..]).mul(&y.slice(s![b, ..])).sum().ln() + pub fn cost( + &self, + backend: &mut WGPUBackend, + dataset: &WGPUBuffer, + outputs: &WGPUBuffer, + ) -> f32 { + backend.execute(&self.cost_kernel, vec![dataset, outputs, &self.d_inputs]); + self.d_inputs.read(backend)[0] } - return total / batches as f32; -} - -fn cross_entropy_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD { - return -y_hat.div(&y); -} -fn bin_cross_entropy<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 { - return -y_hat - .mul(y.map(|x| x.ln())) - .sub(((1.0).sub(&y_hat)).mul(y.map(|x| 1.0 - x.ln()))) - .sum() - / y.len() as f32; + pub fn prime(&self, backend: &mut WGPUBackend, dataset: &WGPUBuffer, outputs: &WGPUBuffer) { + backend.execute(&self.prime_kernel, vec![dataset, outputs, &self.d_inputs]); + } } -fn bin_cross_entropy_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD { - return y.sub(&y_hat).div(y.mul(1.0.sub(&y))); -} +const MSE: &str = "cost.values[global_id.x] = y.values[global_id.x] - y_hat.values[global_id.x];"; -fn hinge<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 { - let mut sum = 0.0; - for (y_hat_i, y_i) in y_hat.iter().zip(y.iter()) { - let margin = 1.0 - y_hat_i * y_i; - if margin > 0.0 { - sum += margin; - } - } - return sum; -} +const MSE_PRIME: &str = + "cost.values[global_id.x] = y.values[global_id.x] - y_hat.values[global_id.x];"; -fn hinge_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD { - let mut result = ArrayD::zeros(y_hat.shape()); - for ((result_i, y_hat_i), y_i) in result.iter_mut().zip(y_hat.iter()).zip(y.iter()) { - let margin = 1.0 - y_hat_i * y_i; - if margin > 0.0 { - *result_i = -y_i; - } - } - return result; +fn kernel_cost(backend: &mut WGPUBackend, cost: String, size: usize) -> WGPUKernel { + let source = format!( + "struct Matrix {{ + values: array + }}; + + @group(0) @binding(0) + var 
y_hat: Matrix; + @group(0) @binding(1) + var y: Matrix; + @group(0) @binding(2) + var cost: Matrix; + + @compute @workgroup_size(64, 1, 1) + fn main(@builtin(global_invocation_id) global_id: vec3) {{ + if (global_id.x < {size}u) {{ + {cost} + }} + }}" + ); + backend.register(source, ((size as f64 / 64.0).ceil() as u32, 1, 1)) } diff --git a/crates/core-gpu/src/gpu/gpu.rs b/crates/core-gpu/src/gpu/gpu.rs index ce19c25..966637f 100644 --- a/crates/core-gpu/src/gpu/gpu.rs +++ b/crates/core-gpu/src/gpu/gpu.rs @@ -1,31 +1,30 @@ use std::borrow::Cow; +use ndarray::{ArrayD, Dimension, IxDyn}; + +pub struct WGPUDataset { + pub inputs: WGPUBuffer, + pub outputs: WGPUBuffer, +} + pub struct WGPUBackend { pub device: wgpu::Device, pub queue: wgpu::Queue, - pub kernels: Vec, } pub struct WGPUKernel { pub pipeline: wgpu::ComputePipeline, pub layout: wgpu::BindGroupLayout, + pub workgroups: (u32, u32, u32), } impl WGPUBackend { pub fn new() -> Self { - let instance = wgpu::Instance::new(wgpu::InstanceDescriptor { - backends: wgpu::Backends::all(), - dx12_shader_compiler: Default::default(), - flags: wgpu::InstanceFlags::empty(), - gles_minor_version: wgpu::Gles3MinorVersion::Automatic, - }); + let instance = wgpu::Instance::default(); - let adapter = pollster::block_on(instance.request_adapter(&wgpu::RequestAdapterOptions { - power_preference: wgpu::PowerPreference::default(), - compatible_surface: None, - force_fallback_adapter: false, - })) - .unwrap(); + let adapter = + pollster::block_on(instance.request_adapter(&wgpu::RequestAdapterOptions::default())) + .unwrap(); let (device, queue) = pollster::block_on(adapter.request_device( &wgpu::DeviceDescriptor { @@ -37,19 +36,15 @@ impl WGPUBackend { )) .unwrap(); - Self { - device, - queue, - kernels: Vec::new(), - } + Self { device, queue } } - pub fn register(&mut self, source: Cow<'_, str>) { + pub fn register(&mut self, source: String, workgroups: (u32, u32, u32)) -> WGPUKernel { let module = self .device 
.create_shader_module(wgpu::ShaderModuleDescriptor { label: None, - source: wgpu::ShaderSource::Wgsl(source), + source: wgpu::ShaderSource::Wgsl(Cow::Owned(source)), }); let pipeline = self @@ -62,10 +57,14 @@ impl WGPUBackend { }); let layout = pipeline.get_bind_group_layout(0); - self.kernels.push(WGPUKernel { pipeline, layout }); + WGPUKernel { + pipeline, + layout, + workgroups, + } } - pub fn execute(&mut self, kernel: usize, buffers: Vec) { + pub fn execute(&mut self, kernel: &WGPUKernel, buffers: Vec<&WGPUBuffer>) { let entries: Vec> = buffers .iter() .enumerate() @@ -80,7 +79,7 @@ impl WGPUBackend { .collect(); let bindgroup = self.device.create_bind_group(&wgpu::BindGroupDescriptor { label: None, - layout: &self.kernels[kernel].layout, + layout: &kernel.layout, entries: &entries, }); @@ -93,8 +92,9 @@ impl WGPUBackend { timestamp_writes: None, }); pass.set_bind_group(0, &bindgroup, &[]); - pass.set_pipeline(&self.kernels[kernel].pipeline); - pass.dispatch_workgroups(8, 8, 8); + pass.set_pipeline(&kernel.pipeline); + let (group_x, group_y, group_z) = kernel.workgroups; + pass.dispatch_workgroups(group_x, group_y, group_z); } self.queue.submit([encoder.finish()]); } @@ -103,27 +103,40 @@ impl WGPUBackend { pub struct WGPUBuffer { pub buffer: wgpu::Buffer, pub size: u64, + pub shape: IxDyn, } impl WGPUBuffer { - pub fn new(backend: &mut WGPUBackend, size: u64) -> Self { + pub fn new(backend: &mut WGPUBackend, shape: IxDyn) -> Self { let buffer = backend.device.create_buffer(&wgpu::BufferDescriptor { label: None, - size, + size: shape.size() as u64 * 4, usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::COPY_SRC, mapped_at_creation: false, }); - Self { buffer, size } + Self { + buffer, + size: shape.size() as u64 * 4, + shape, + } + } + + pub fn from(backend: &mut WGPUBackend, data: ArrayD) -> Self { + let slice = data.as_slice().unwrap(); + let buffer = WGPUBuffer::new(backend, data.dim()); + let (_, bytes, _) = unsafe { 
slice.align_to() }; + backend.queue.write_buffer(&buffer.buffer, 0, bytes); + buffer } - pub fn read(&self, backend: &mut WGPUBackend) -> Vec { + pub fn read(&self, backend: &mut WGPUBackend) -> ArrayD { let buffer = backend.device.create_buffer(&wgpu::BufferDescriptor { label: None, size: self.size, - usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_SRC, + usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, mapped_at_creation: false, }); @@ -140,13 +153,16 @@ impl WGPUBuffer { }); backend.device.poll(wgpu::Maintain::Wait); receiver.recv().unwrap().unwrap(); - let data = slice.get_mapped_range(); - data.to_vec() + let bytes = slice.get_mapped_range(); + let (_, data, _) = unsafe { bytes.align_to() }; + ArrayD::from_shape_vec(self.shape.clone(), data.to_vec()).unwrap() } - pub fn write(&self, backend: &mut WGPUBackend, data: &[u8]) { - backend.queue.write_buffer(&self.buffer, 0, data) + pub fn write(&self, backend: &mut WGPUBackend, data: ArrayD) { + let slice = data.as_slice().unwrap(); + let (_, bytes, _) = unsafe { slice.align_to() }; + backend.queue.write_buffer(&self.buffer, 0, bytes) } } diff --git a/crates/core-gpu/src/gpu/layers/activation.rs b/crates/core-gpu/src/gpu/layers/activation.rs index 0536efc..aa767f8 100644 --- a/crates/core-gpu/src/gpu/layers/activation.rs +++ b/crates/core-gpu/src/gpu/layers/activation.rs @@ -1,99 +1,107 @@ -use ndarray::{s, ArrayD, Dimension, IxDyn}; -use std::ops::{Div, Mul, Sub}; +use ndarray::{Dimension, IxDyn}; -use crate::{ActivationLayer, GPUActivation}; +use crate::{ActivationLayer, GPUActivation, WGPUBackend, WGPUBuffer, WGPUKernel}; pub struct ActivationGPULayer { - pub outputs: ArrayD, - pub activation: GPUActivation, + // data + pub memoize_output: bool, + pub outputs: WGPUBuffer, + + // gradients + pub d_inputs: WGPUBuffer, + + // kernels + pub forward_kernel: WGPUKernel, + pub backward_kernel: WGPUKernel, } impl ActivationGPULayer { - pub fn new(config: ActivationLayer, size: IxDyn) -> 
Self { + pub fn new(backend: &mut WGPUBackend, config: ActivationLayer, size: &mut IxDyn) -> Self { + let activation = GPUActivation::from(config.activation); + let forward_kernel = kernel_forward(backend, size.size(), activation.activate); + let backward_kernel = kernel_backward(backend, size.size(), activation.prime); + Self { - outputs: ArrayD::zeros(size), - activation: GPUActivation::from(config.activation), + memoize_output: GPUActivation::memoize_output(&activation.activation), + outputs: WGPUBuffer::new(backend, size.clone()), + d_inputs: WGPUBuffer::new(backend, size.clone()), + forward_kernel, + backward_kernel, } } - pub fn output_size(&self) -> Vec { - self.outputs.shape().to_vec() + pub fn reset(&mut self, backend: &mut WGPUBackend, batches: usize) { + let output_size = self.outputs.shape.as_array_view()[1]; + self.outputs = WGPUBuffer::new(backend, IxDyn(&[batches, output_size])) } - pub fn reset(&mut self, batches: usize) { - let mut output_size = self.outputs.shape().to_vec(); - output_size[0] = batches; - self.outputs = ArrayD::zeros(output_size); + pub fn forward_propagate(&self, backend: &mut WGPUBackend, inputs: &WGPUBuffer) { + backend.execute(&self.forward_kernel, vec![inputs, &self.outputs]); } - pub fn forward_propagate(&mut self, inputs: ArrayD) -> ArrayD { - let outputs = if GPUActivation::memoize_output(&self.activation) { - self.outputs = inputs.map(self.activation.activate); - self.outputs.clone() + pub fn backward_propagate( + &self, + backend: &mut WGPUBackend, + inputs: &WGPUBuffer, + d_outputs: &WGPUBuffer, + ) { + if self.memoize_output { + backend.execute( + &self.backward_kernel, + vec![&self.outputs, d_outputs, &self.d_inputs], + ); } else { - self.outputs = inputs.clone(); - inputs.map(self.activation.activate) + backend.execute( + &self.backward_kernel, + vec![inputs, d_outputs, &self.d_inputs], + ); }; - outputs.into_dyn() - } - - pub fn backward_propagate(&mut self, d_outputs: ArrayD) -> ArrayD { - let d_inputs = 
d_outputs.mul(self.outputs.map(self.activation.prime)); - d_inputs.into_dyn() } } -pub struct SoftmaxGPULayer { - pub outputs: ArrayD, +fn kernel_forward(backend: &mut WGPUBackend, size: usize, activation: String) -> WGPUKernel { + let source = format!( + "struct Matrix {{ + values: array + }}; + + @group(0) @binding(0) + var inputs: Matrix; + @group(0) @binding(1) + var outputs: Matrix; + + @compute @workgroup_size(64, 1, 1) + fn main(@builtin(global_invocation_id) global_id: vec3) {{ + if (global_id.x < {size}u) {{ + var x = inputs.values[global_id.x]; + outputs.values[global_id.x] = {activation}; + }} + }}" + ); + backend.register(source, ((size as f64 / 64.0).ceil() as u32, 1, 1)) } -impl SoftmaxGPULayer { - pub fn new(size: IxDyn) -> Self { - Self { - outputs: ArrayD::zeros(size), - } - } - - pub fn output_size(&self) -> Vec { - self.outputs.shape().to_vec() - } - - pub fn reset(&mut self, batches: usize) { - let mut output_size = self.outputs.shape().to_vec(); - output_size[0] = batches; - self.outputs = ArrayD::zeros(output_size); - } - - pub fn forward_propagate(&mut self, inputs: ArrayD) -> ArrayD { - let batches = self.outputs.dim()[0]; - for b in 0..batches { - let exp = inputs.slice(s![b, ..]).map(|x| x.exp()); - self.outputs - .slice_mut(s![b, ..]) - .assign(&exp.clone().div(exp.sum())); - } - self.outputs.clone().into_dyn() - } - - pub fn backward_propagate(&mut self, d_outputs: ArrayD) -> ArrayD { - let batches = self.outputs.dim()[0]; - let array_size = self.outputs.dim().size() / batches; - - let mut d_inputs = ArrayD::zeros(self.outputs.dim()); - for b in 0..batches { - for y in 0..array_size { - for x in 0..array_size { - let out1 = self.outputs[[b, y]]; - let out2 = self.outputs[[b, x]]; - let d_out = d_outputs[[b, x]]; - if x == y { - d_inputs[[b, y]] += out1.sub(out1.powi(2)).mul(d_out); - } else { - d_inputs[[b, y]] += -out1.mul(out2).mul(d_out); - } - } - } - } - d_inputs - } +fn kernel_backward(backend: &mut WGPUBackend, size: usize, 
activation: String) -> WGPUKernel { + let source = format!( + "struct Matrix {{ + values: array + }}; + + @group(0) @binding(0) + var inputs: Matrix; + @group(0) @binding(1) + var d_outputs: Matrix; + @group(0) @binding(2) + var d_inputs: Matrix; + + @compute @workgroup_size(64, 1, 1) + fn main(@builtin(global_invocation_id) global_id: vec3) {{ + if (global_id.x < {size}u) {{ + var d_output = d_outputs.values[global_id.x]; + var x = inputs.values[global_id.x]; + d_inputs.values[global_id.x] = {activation} * d_output; + }} + }}" + ); + backend.register(source, ((size as f64 / 64.0).ceil() as u32, 1, 1)) } diff --git a/crates/core-gpu/src/gpu/layers/dense.rs b/crates/core-gpu/src/gpu/layers/dense.rs index 9d664a7..a1b8297 100644 --- a/crates/core-gpu/src/gpu/layers/dense.rs +++ b/crates/core-gpu/src/gpu/layers/dense.rs @@ -1,71 +1,205 @@ -use ndarray::{Array1, Array2, ArrayD, Axis, Dimension, Ix1, Ix2, IxDyn}; -use std::ops::Add; +use ndarray::{ArrayD, Dimension, IxDyn}; -use crate::{DenseLayer, GPUInit, Init, Tensors}; +use crate::{ + DenseLayer, DenseTensors, GPUInit, Init, Tensors, WGPUBackend, WGPUBuffer, WGPUKernel, +}; pub struct DenseGPULayer { - // cache - pub output_size: Ix2, - pub inputs: Array2, + // data + pub outputs: WGPUBuffer, // parameters - pub weights: Array2, - pub biases: Array1, + pub weights: WGPUBuffer, + pub biases: WGPUBuffer, // gradients - pub d_weights: Array2, - pub d_biases: Array1, + pub d_weights: WGPUBuffer, + pub d_biases: WGPUBuffer, + pub d_inputs: WGPUBuffer, + + // kernels + pub forward_kernel: WGPUKernel, + pub backward_kernel: WGPUKernel, } impl DenseGPULayer { - pub fn new(config: DenseLayer, size: IxDyn, tensors: Option) -> Self { + pub fn new( + backend: &mut WGPUBackend, + config: DenseLayer, + size: &mut IxDyn, + tensors: Option, + ) -> Self { let init = GPUInit::from_default(config.init, Init::Uniform); - let input_size = Ix2(size[0], size[1]); - let weight_size = Ix2(size[1], config.size[0]); - let output_size = 
Ix2(size[0], config.size[0]); + let input_size = IxDyn(&[size[0], size[1]]); + let weight_size = IxDyn(&[size[1], config.size[0]]); + let bias_size = IxDyn(&[size[0]]); + let output_size = IxDyn(&[size[0], config.size[0]]); + *size = output_size.clone(); - let (weights, biases) = if let Some(Tensors::Dense(tensors)) = tensors { - (tensors.weights, tensors.biases) - } else { - let weights = init.init(weight_size.into_dyn(), size[1], config.size[0]); - let biases = ArrayD::zeros(config.size.clone()); - (weights, biases) + let tensors = match tensors { + Some(Tensors::Dense(tensors)) => tensors, + _ => DenseTensors { + weights: init.init(weight_size.clone(), input_size[1], config.size[0]), + biases: ArrayD::zeros(config.size.clone()), + }, }; + let forward_kernel = kernel_forward(backend, input_size.clone(), output_size.clone()); + let backward_kernel = kernel_backward(backend, input_size.clone(), output_size.clone()); + Self { - output_size, - inputs: Array2::zeros(input_size), - weights: weights.into_dimensionality::().unwrap(), - biases: biases.into_dimensionality::().unwrap(), - d_weights: Array2::zeros(weight_size), - d_biases: Array1::zeros(config.size[0]), + outputs: WGPUBuffer::new(backend, output_size), + weights: WGPUBuffer::from(backend, tensors.weights), + biases: WGPUBuffer::from(backend, tensors.biases), + d_weights: WGPUBuffer::new(backend, weight_size), + d_biases: WGPUBuffer::new(backend, bias_size), + d_inputs: WGPUBuffer::new(backend, input_size), + forward_kernel, + backward_kernel, } } - pub fn output_size(&self) -> Vec { - self.output_size.as_array_view().to_vec() + pub fn reset(&mut self, backend: &mut WGPUBackend, batches: usize) { + let output_size = self.outputs.shape.as_array_view()[1]; + self.outputs = WGPUBuffer::new(backend, IxDyn(&[batches, output_size])) } - pub fn reset(&mut self, batches: usize) { - let input_size = self.inputs.dim().1; - self.inputs = Array2::zeros((batches, input_size)); - self.output_size[0] = batches; + pub fn 
forward_propagate(&self, backend: &mut WGPUBackend, inputs: &WGPUBuffer) { + backend.execute( + &self.forward_kernel, + vec![inputs, &self.weights, &self.biases, &self.outputs], + ); } - pub fn forward_propagate(&mut self, inputs: ArrayD) -> ArrayD { - self.inputs = inputs.into_dimensionality::().unwrap(); - self.inputs.dot(&self.weights).add(&self.biases).into_dyn() + pub fn backward_propagate( + &self, + backend: &mut WGPUBackend, + inputs: &WGPUBuffer, + d_outputs: &WGPUBuffer, + ) { + backend.execute( + &self.backward_kernel, + vec![ + inputs, + &self.weights, + &self.biases, + d_outputs, + &self.d_inputs, + ], + ); } - pub fn backward_propagate(&mut self, d_outputs: ArrayD) -> ArrayD { - let d_outputs = d_outputs.into_dimensionality::().unwrap(); - let mut weights_t = self.weights.view(); - weights_t.swap_axes(0, 1); - let d_inputs = d_outputs.dot(&weights_t); - let mut inputs_t = self.inputs.view(); - inputs_t.swap_axes(0, 1); - self.d_weights = inputs_t.dot(&d_outputs); - self.d_biases = d_outputs.sum_axis(Axis(0)); - d_inputs.into_dyn() + pub fn save(&self, backend: &mut WGPUBackend) -> Tensors { + Tensors::Dense(DenseTensors { + weights: self.weights.read(backend), + biases: self.weights.read(backend), + }) } } + +fn kernel_forward(backend: &mut WGPUBackend, input_size: IxDyn, output_size: IxDyn) -> WGPUKernel { + let input = input_size[1]; + let output = output_size[1]; + let batches = input_size[0]; + let source = format!( + "struct Matrix {{ + values: array + }}; + + @group(0) @binding(0) + var inputs: Matrix; + @group(0) @binding(1) + var weights: Matrix; + @group(0) @binding(2) + var biases: Matrix; + @group(0) @binding(3) + var outputs: Matrix; + + @compute @workgroup_size(8, 8, 1) + fn main(@builtin(global_invocation_id) global_id: vec3) {{ + if (global_id.x < {output}u && global_id.y < {batches}u) {{ + var weighted_sum = biases.values[global_id.x]; + for (var k = 0u; k < {input}u; k += 1u) {{ + var a = k + global_id.y * {input}u; + var b = 
global_id.x + k * {output}u; + weighted_sum += inputs.values[a] * weights.values[b]; + }}; + + let idx = global_id.x + global_id.y * {output}u; + outputs.values[idx] = weighted_sum; + }} + }}" + ); + backend.register( + source, + ( + (output_size[1] as f64 / 8.0).ceil() as u32, + (output_size[0] as f64 / 8.0).ceil() as u32, + 1, + ), + ) +} + +fn kernel_backward(backend: &mut WGPUBackend, input_size: IxDyn, output_size: IxDyn) -> WGPUKernel { + let input = input_size[1]; + let output = output_size[1]; + let batches = input_size[0]; + let source = format!( + "struct Matrix {{ + values: array, + }}; + + @group(0) @binding(0) + var inputs: Matrix; + @group(0) @binding(1) + var weights: Matrix; + @group(0) @binding(2) + var biases: Matrix; + @group(0) @binding(3) + var d_outputs: Matrix; + @group(0) @binding(4) + var d_inputs: Matrix; + + @compute @workgroup_size(8, 8, 1) + fn main(@builtin(global_invocation_id) global_id: vec3) {{ + if (global_id.x < 3u && global_id.y < 4u) {{ + var d_input = 0.0; + for (var k = 0u; k < 1u; k++) {{ + var a = k + global_id.x * 1u; + var b = k + global_id.y * 1u; + d_input = d_outputs.values[b] * weights.values[a]; + }}; + let idx = global_id.x + global_id.y * {input}u; + d_inputs.values[idx] = d_input; + }} + + if (global_id.x < {output}u && global_id.y < 1u) {{ + for (var k = 0u; k < {batches}u; k++) {{ + let idx = global_id.x + k * {output}u; + biases.values[global_id.x] -= d_outputs.values[idx] * 0.1; + }} + }}; + + if (global_id.x < {input}u && global_id.y < {output}u) {{ + var d_weight = 0.0; + for (var k = 0u; k < {batches}u; k++) {{ + var a = global_id.x + k * {input}u; + var b = global_id.y + k * {output}u; + d_weight += d_outputs.values[b] * inputs.values[a]; + }}; + let idx = global_id.y + global_id.x * {output}u; + weights.values[idx] -= d_weight * 0.1; + }}; + }}" + ); + let max_x = std::cmp::max(input_size[1], output_size[1]); + let max_y = std::cmp::max(input_size[1], output_size[0]); + backend.register( + source, + ( + 
(max_x as f64 / 8.0).ceil() as u32, + (max_y as f64 / 8.0).ceil() as u32, + 1, + ), + ) +} diff --git a/crates/core-gpu/src/gpu/layers/mod.rs b/crates/core-gpu/src/gpu/layers/mod.rs index 4465a75..136b71a 100644 --- a/crates/core-gpu/src/gpu/layers/mod.rs +++ b/crates/core-gpu/src/gpu/layers/mod.rs @@ -4,7 +4,7 @@ mod dense; pub use activation::*; pub use dense::*; -use ndarray::ArrayD; +use crate::{Tensors, WGPUBackend, WGPUBuffer}; pub enum GPULayer { Activation(ActivationGPULayer), @@ -12,31 +12,55 @@ pub enum GPULayer { } impl GPULayer { - pub fn output_size(&mut self) -> Vec { + pub fn outputs<'a>(&'a self) -> &'a WGPUBuffer { match self { - GPULayer::Activation(layer) => layer.output_size(), - GPULayer::Dense(layer) => layer.output_size(), + GPULayer::Activation(layer) => &layer.outputs, + GPULayer::Dense(layer) => &layer.outputs, } } - pub fn forward_propagate(&mut self, inputs: ArrayD, _training: bool) -> ArrayD { + pub fn d_inputs<'a>(&'a self) -> &'a WGPUBuffer { match self { - GPULayer::Activation(layer) => layer.forward_propagate(inputs), - GPULayer::Dense(layer) => layer.forward_propagate(inputs), + GPULayer::Activation(layer) => &layer.d_inputs, + GPULayer::Dense(layer) => &layer.d_inputs, } } - pub fn backward_propagate(&mut self, d_outputs: ArrayD) -> ArrayD { + pub fn forward_propagate( + &self, + backend: &mut WGPUBackend, + inputs: &WGPUBuffer, + _training: bool, + ) { match self { - GPULayer::Activation(layer) => layer.backward_propagate(d_outputs), - GPULayer::Dense(layer) => layer.backward_propagate(d_outputs), + GPULayer::Activation(layer) => layer.forward_propagate(backend, inputs), + GPULayer::Dense(layer) => layer.forward_propagate(backend, inputs), } } - pub fn reset(&mut self, batches: usize) { + pub fn backward_propagate( + &self, + backend: &mut WGPUBackend, + inputs: &WGPUBuffer, + d_outputs: &WGPUBuffer, + ) { match self { - GPULayer::Activation(layer) => layer.reset(batches), - GPULayer::Dense(layer) => layer.reset(batches), + 
GPULayer::Activation(layer) => layer.backward_propagate(backend, inputs, d_outputs), + GPULayer::Dense(layer) => layer.backward_propagate(backend, inputs, d_outputs), + } + } + + pub fn reset(&mut self, backend: &mut WGPUBackend, batches: usize) { + match self { + GPULayer::Activation(layer) => layer.reset(backend, batches), + GPULayer::Dense(layer) => layer.reset(backend, batches), + } + } + + pub fn save(&self, backend: &mut WGPUBackend) -> Tensors { + match self { + GPULayer::Activation(_) => Tensors::None, + GPULayer::Dense(layer) => layer.save(backend), } } } diff --git a/crates/core-gpu/src/gpu/mod.rs b/crates/core-gpu/src/gpu/mod.rs index a7a3ab5..9de364a 100644 --- a/crates/core-gpu/src/gpu/mod.rs +++ b/crates/core-gpu/src/gpu/mod.rs @@ -4,7 +4,6 @@ mod cost; mod gpu; mod init; mod layers; -mod optimizers; mod schedulers; pub use activation::*; @@ -13,5 +12,4 @@ pub use cost::*; pub use gpu::*; pub use init::*; pub use layers::*; -pub use optimizers::*; pub use schedulers::*; diff --git a/crates/core-gpu/src/gpu/optimizers/adam.rs b/crates/core-gpu/src/gpu/optimizers/adam.rs deleted file mode 100644 index f77365f..0000000 --- a/crates/core-gpu/src/gpu/optimizers/adam.rs +++ /dev/null @@ -1,74 +0,0 @@ -use std::ops::{Add, Div, Mul, SubAssign}; - -use ndarray::{ArrayD, ArrayViewD, ArrayViewMutD}; - -use crate::{AdamOptimizer, GPUScheduler}; - -pub struct GPUAdamOptimizer { - pub beta1: f32, - pub beta2: f32, - pub epsilon: f32, - pub m: Vec>>, - pub v: Vec>>, - pub t: f32, -} - -impl GPUAdamOptimizer { - pub fn new(config: AdamOptimizer, params: Vec>>) -> Self { - let mut m = Vec::new(); - let mut v = Vec::new(); - for params in params { - m.push( - params - .iter() - .map(|param| ArrayD::zeros(param.dim())) - .collect(), - ); - v.push( - params - .iter() - .map(|param| ArrayD::zeros(param.dim())) - .collect(), - ); - } - Self { - beta1: config.beta1, - beta2: config.beta2, - epsilon: config.epsilon, - m, - v, - t: 0.0, - } - } - - pub fn update_grads( - &mut 
self, - mut params: Vec>, - grads: Vec>, - idx: usize, - scheduler: &GPUScheduler, - rate: f32, - ) { - for (j, (param, grad)) in params.iter_mut().zip(grads).enumerate() { - self.m[idx][j] = self - .beta1 - .mul(&self.m[idx][j]) - .add((1.0 - self.beta1).mul(&grad)); - self.v[idx][j] = self - .beta2 - .mul(&self.v[idx][j]) - .add((1.0 - self.beta2).mul(&grad.map(|x| x.powi(2)))); - - let m_hat = self.m[idx][j].view().div(1.0 - self.beta1.powf(self.t)); - let v_hat = self.v[idx][j].view().div(1.0 - self.beta2.powf(self.t)); - - let rate = scheduler.eta(rate, self.t as usize); - - param.sub_assign( - &rate - .mul(m_hat) - .div(v_hat.map(|x| x.sqrt()).add(self.epsilon)), - ) - } - } -} diff --git a/crates/core-gpu/src/gpu/optimizers/mod.rs b/crates/core-gpu/src/gpu/optimizers/mod.rs deleted file mode 100644 index cb3d936..0000000 --- a/crates/core-gpu/src/gpu/optimizers/mod.rs +++ /dev/null @@ -1,75 +0,0 @@ -mod adam; -mod sgd; - -pub use adam::*; -use ndarray::{ArrayViewD, ArrayViewMutD}; -pub use sgd::*; - -use crate::{GPULayer, GPUScheduler, Optimizer}; - -pub enum GPUOptimizer { - SGD(GPUSGDOptimizer), - Adam(GPUAdamOptimizer), -} - -impl GPUOptimizer { - pub fn from(optimizer: Optimizer, layers: &mut Vec) -> Self { - let mut all_params = Vec::new(); - for layer in layers { - if let Some((params, _)) = GPUOptimizer::get_params(layer) { - all_params.push(params) - } - } - match optimizer { - Optimizer::SGD => GPUOptimizer::SGD(GPUSGDOptimizer::new()), - Optimizer::Adam(config) => { - GPUOptimizer::Adam(GPUAdamOptimizer::new(config, all_params)) - } - } - } - - pub fn update_grads( - &mut self, - layers: &mut Vec, - scheduler: &GPUScheduler, - rate: f32, - epoch: usize, - ) { - match self { - GPUOptimizer::Adam(adam) => adam.t += 1.0, - _ => {} - } - let mut idx = 0; - for layer in layers.iter_mut() { - if let Some((params, grads)) = GPUOptimizer::get_params(layer) { - match self { - GPUOptimizer::SGD(sgd) => { - sgd.update_grads(params, grads, scheduler, rate, 
epoch) - } - GPUOptimizer::Adam(adam) => { - adam.update_grads(params, grads, idx, scheduler, rate) - } - } - idx += 1; - } - } - } - - pub fn get_params<'a>( - layer: &'a mut GPULayer, - ) -> Option<(Vec>, Vec>)> { - match layer { - GPULayer::Dense(layer) => Some(( - vec![ - layer.weights.view_mut().into_dyn(), - layer.biases.view_mut().into_dyn(), - ], - vec![ - layer.d_weights.view().into_dyn(), - layer.d_biases.view().into_dyn(), - ], - )), - _ => return None, - } - } -} diff --git a/crates/core-gpu/src/gpu/optimizers/sgd.rs b/crates/core-gpu/src/gpu/optimizers/sgd.rs deleted file mode 100644 index 995679c..0000000 --- a/crates/core-gpu/src/gpu/optimizers/sgd.rs +++ /dev/null @@ -1,27 +0,0 @@ -use std::ops::{Mul, SubAssign}; - -use ndarray::{ArrayViewD, ArrayViewMutD}; - -use crate::GPUScheduler; - -pub struct GPUSGDOptimizer {} - -impl GPUSGDOptimizer { - pub fn new() -> Self { - Self {} - } - - pub fn update_grads( - &mut self, - mut params: Vec>, - grads: Vec>, - scheduler: &GPUScheduler, - rate: f32, - epoch: usize, - ) { - let eta = scheduler.eta(rate, epoch); - for (param, grad) in params.iter_mut().zip(grads) { - param.sub_assign(&grad.mul(eta)); - } - } -} diff --git a/crates/core-gpu/src/tensor.rs b/crates/core-gpu/src/tensor.rs index f2a0d2c..9c7b72f 100644 --- a/crates/core-gpu/src/tensor.rs +++ b/crates/core-gpu/src/tensor.rs @@ -59,6 +59,7 @@ pub enum Tensors { Dense(DenseTensors), Conv(ConvTensors), BatchNorm(BatchNormTensors), + None, } pub trait GetTensor { diff --git a/src/backends/gpu/backend.ts b/src/backends/gpu/backend.ts index f9a5b42..f22cbe0 100644 --- a/src/backends/gpu/backend.ts +++ b/src/backends/gpu/backend.ts @@ -38,6 +38,11 @@ export class GPUBackend implements Backend { shape.allocBuffer, ) as bigint; const outputShape = Array.from(shape.buffer.slice(1)) as Shape[Rank]; + + // free gpu resources when program loop exits + globalThis.onunload = () => { + library.symbols.ffi_backend_drop(id) + }; return new GPUBackend(library, 
outputShape, id); } diff --git a/src/backends/gpu/mod.ts b/src/backends/gpu/mod.ts index 04669ea..a24259a 100644 --- a/src/backends/gpu/mod.ts +++ b/src/backends/gpu/mod.ts @@ -18,7 +18,7 @@ const options: FetchOptions = { "https://github.com/denosaurs/netsaur/releases/download/0.2.14/", import.meta.url, ) - : "./target/release/", + : "./target/release/", cache: "reloadAll", }; @@ -43,6 +43,10 @@ const symbols = { parameters: ["buffer", "usize", "pointer"], result: "usize", } as const, + ffi_backend_drop: { + parameters: ["usize"], + result: "void", + } as const, }; export type Library = Deno.DynamicLibrary;