gpu dense & activation
JinWeiTan committed Nov 6, 2023
1 parent 0a4b23a commit db02a8e
Showing 15 changed files with 513 additions and 557 deletions.
18 changes: 17 additions & 1 deletion crates/core-gpu/src/ffi.rs
@@ -27,6 +27,12 @@ pub extern "C" fn ffi_backend_create(ptr: *const u8, len: usize, alloc: AllocBuf
len = backend.len();
backend.push(net_backend);
});

std::panic::set_hook(Box::new(|info| {
println!("{}", info);
ffi_backend_drop(0);
}));

len
}
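The panic hook registered above uses the standard library's std::panic::set_hook; the hook runs before unwinding begins, so it is a chance to release resources the FFI caller would otherwise never reclaim. A standalone illustration of the pattern (not part of this commit):

    fn main() {
        // the hook fires before unwinding starts
        std::panic::set_hook(Box::new(|info| {
            println!("backend panicked: {}", info);
            // ffi.rs calls ffi_backend_drop(0) here to release the GPU backend
        }));

        let result = std::panic::catch_unwind(|| panic!("boom"));
        assert!(result.is_err());
    }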

@@ -79,7 +85,7 @@ pub extern "C" fn ffi_backend_predict(
#[no_mangle]
pub extern "C" fn ffi_backend_save(id: usize, alloc: AllocBufferFn) {
RESOURCES.with(|cell| {
let backend = cell.backend.borrow_mut();
let mut backend = cell.backend.borrow_mut();
let data = backend[id].save();
let file_ptr = alloc(data.len());
let file = unsafe { from_raw_parts_mut(file_ptr, data.len()) };
@@ -109,3 +115,13 @@ pub extern "C" fn ffi_backend_load(
});
len
}

#[no_mangle]
pub extern "C" fn ffi_backend_drop(id: usize) {
RESOURCES.with(|cell| {
let mut backend = cell.backend.borrow_mut();
if backend.len() > id {
backend.remove(id);
}
});
}
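ffi_backend_drop indexes into the thread-local RESOURCES registry, whose definition is not part of this diff. A hypothetical sketch of the shape these FFI functions assume (field names and types may differ from the actual crates/core-gpu code):

    use std::cell::RefCell;

    struct Backend; // placeholder for the GPU backend type kept in the registry

    #[derive(Default)]
    struct Resources {
        backend: RefCell<Vec<Backend>>,
    }

    thread_local! {
        static RESOURCES: Resources = Resources::default();
    }

    fn drop_backend(id: usize) {
        RESOURCES.with(|cell| {
            let mut backend = cell.backend.borrow_mut();
            if backend.len() > id {
                backend.remove(id); // dropping the entry releases its GPU resources
            }
        });
    }

Since Vec::remove shifts later entries down, ids handed out for higher indices would no longer line up after a drop; the panic hook's ffi_backend_drop(0) presumably targets the common single-backend case.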
94 changes: 11 additions & 83 deletions crates/core-gpu/src/gpu/activation.rs
@@ -1,29 +1,21 @@
use crate::Activation;
pub struct GPUActivation {
pub activation: Activation,
pub activate: ActivationFn,
pub prime: ActivationFn,
pub activate: String,
pub prime: String,
}

type ActivationFn = fn(x: &f32) -> f32;

impl GPUActivation {
pub fn from(activation: Activation) -> Self {
let (activate, prime): (ActivationFn, ActivationFn) = match activation {
Activation::Elu => (elu, elu_prime),
Activation::LeakyRelu => (leaky_relu, leaky_relu_prime),
Activation::Linear => (linear, linear_prime),
Activation::Relu => (relu, relu_prime),
Activation::Relu6 => (relu6, relu6_prime),
Activation::Selu => (selu, selu_prime),
Activation::Sigmoid => (sigmoid, sigmoid_prime),
Activation::Tanh => (tanh, tanh_prime),
let (activate, prime): (&str, &str) = match activation {
Activation::Sigmoid => (SIGMOID, SIGMOID_PRIME),
_ => unimplemented!()
};

Self {
activation,
activate,
prime,
activate: String::from(activate),
prime: String::from(prime),
}
}
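The GPU layer now stores its activation as WGSL expression source (the activate and prime strings) rather than Rust function pointers, and only sigmoid is wired up so far. Other activations could be expressed the same way; hypothetical strings for illustration only, not part of the commit, with the *_PRIME forms following SIGMOID_PRIME below in being written against the already-activated value x:

    const RELU: &str = "max(x, 0.0)";
    const RELU_PRIME: &str = "select(0.0, 1.0, x > 0.0)";
    const TANH: &str = "tanh(x)";
    const TANH_PRIME: &str = "1.0 - x * x";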

@@ -35,78 +27,14 @@ impl GPUActivation {
}
}

pub fn memoize_output(activation: &GPUActivation) -> bool {
match activation.activation {
pub fn memoize_output(activation: &Activation) -> bool {
match activation {
Activation::Sigmoid | Activation::Tanh => true,
_ => true,
}
}
}

fn sigmoid(x: &f32) -> f32 {
return 1.0 / (1.0 + (-x).exp());
}

fn sigmoid_prime(x: &f32) -> f32 {
return x * (1.0 - x);
}

fn tanh(x: &f32) -> f32 {
return x.tanh();
}

fn tanh_prime(x: &f32) -> f32 {
return 1.0 - tanh(x).powi(2);
}

fn linear(x: &f32) -> f32 {
return *x;
}

fn linear_prime(_x: &f32) -> f32 {
return 1.0;
}

fn relu(x: &f32) -> f32 {
return x.max(0.0);
}

fn relu_prime(x: &f32) -> f32 {
return if *x > 0.0 { 1.0 } else { 0.0 };
}
const SIGMOID: &str = "1.0 / (1.0 + exp(-x))";

fn relu6(x: &f32) -> f32 {
return x.max(0.0).min(6.0);
}

fn relu6_prime(x: &f32) -> f32 {
return if *x > 0.0 && *x < 6.0 { 1.0 } else { 0.0 };
}

fn leaky_relu(x: &f32) -> f32 {
return if *x > 0.0 { *x } else { x.max(0.01 * x) };
}

fn leaky_relu_prime(x: &f32) -> f32 {
return if *x > 0.0 { 1.0 } else { 0.01 };
}

fn elu(x: &f32) -> f32 {
return if *x >= 0.0 { *x } else { x.exp() - 1.0 };
}

fn elu_prime(x: &f32) -> f32 {
return if *x > 0.0 { 1.0 } else { x.exp() };
}

fn selu(x: &f32) -> f32 {
return if *x >= 0.0 {
*x
} else {
1.0507 * (x.exp() - 1.0)
};
}

fn selu_prime(x: &f32) -> f32 {
return if *x > 0.0 { 1.0 } else { 1.0507 * x.exp() };
}
const SIGMOID_PRIME: &str = "x * (1.0 - x)";
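SIGMOID_PRIME is written in terms of the layer's output (x is the already-activated value), which lines up with memoize_output reporting that sigmoid and tanh outputs should be kept. How such an expression string reaches the GPU is not shown in this hunk; a minimal sketch of one way to interpolate it into a WGSL compute shader, where the template and binding layout are assumptions for illustration rather than the repo's actual codegen:

    fn activation_shader(expr: &str) -> String {
        format!(
            r#"@group(0) @binding(0) var<storage, read> inputs: array<f32>;
    @group(0) @binding(1) var<storage, read_write> outputs: array<f32>;

    @compute @workgroup_size(64)
    fn main(@builtin(global_invocation_id) id: vec3<u32>) {{
        let x = inputs[id.x];
        outputs[id.x] = {expr};
    }}"#
        )
    }

Calling activation_shader(SIGMOID) would then yield a sigmoid kernel, and activation_shader(SIGMOID_PRIME) its derivative.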
126 changes: 74 additions & 52 deletions crates/core-gpu/src/gpu/backend.rs
@@ -1,11 +1,12 @@
use std::collections::HashMap;

use ndarray::{ArrayD, ArrayViewD, IxDyn};
use ndarray::{ArrayD, IxDyn, Dimension};
use safetensors::{serialize, SafeTensors};

use crate::{
to_arr, ActivationGPULayer, BackendConfig, Dataset, DenseGPULayer, DenseTensors, GPUCost,
GPULayer, GPUOptimizer, GPUScheduler, GetTensor, Layer, Logger, Tensor, Tensors, WGPUBackend,
GPULayer, GPUScheduler, GetTensor, Layer, Logger, Tensor, Tensors, WGPUBackend, WGPUBuffer,
WGPUDataset,
};

pub struct Backend {
@@ -15,86 +16,97 @@ pub struct Backend {
pub layers: Vec<GPULayer>,
pub size: Vec<usize>,
pub cost: GPUCost,
pub optimizer: GPUOptimizer,
pub scheduler: GPUScheduler,
pub logger: Logger,
}

impl Backend {
pub fn new(
backend: WGPUBackend,
mut backend: WGPUBackend,
config: BackendConfig,
logger: Logger,
mut tensors: Option<Vec<Tensors>>,
) -> Self {
let mut layers = Vec::new();
let mut size = config.size.clone();
let mut size = IxDyn(&config.size);
for layer in config.layers.iter() {
match layer.clone() {
Layer::Activation(config) => {
let layer = ActivationGPULayer::new(config, IxDyn(&size));
let layer = ActivationGPULayer::new(&mut backend, config, &mut size);
layers.push(GPULayer::Activation(layer));
}
Layer::Dense(config) => {
let layer = DenseGPULayer::new(config, IxDyn(&size), tensors.get());
size = layer.output_size().to_vec();
let layer = DenseGPULayer::new(&mut backend, config, &mut size, tensors.get());
layers.push(GPULayer::Dense(layer));
}
_ => unimplemented!(),
}
};
}
let optimizer = GPUOptimizer::from(config.optimizer.clone(), &mut layers);
let scheduler = GPUScheduler::from(&config.scheduler);
let cost = GPUCost::from(config.cost.clone());
let silent = config.silent.is_some_and(|x| x == true);

Self {
backend,
logger,
silent,
config,
layers,
cost,
optimizer,
scheduler,
size,
size: size.as_array_view().to_vec(),
silent: config.silent.is_some_and(|x| x == true),
cost: GPUCost::from(&mut backend, config.cost.clone(), size),
scheduler: GPUScheduler::from(&config.scheduler),
config,
backend,
}
}

pub fn forward_propagate(&mut self, mut inputs: ArrayD<f32>, training: bool) -> ArrayD<f32> {
pub fn forward_propagate<'a>(&'a mut self, mut inputs: &'a WGPUBuffer, training: bool) {
for layer in &mut self.layers {
inputs = layer.forward_propagate(inputs, training);
layer.forward_propagate(&mut self.backend, inputs, training);
inputs = layer.outputs()
}
inputs
}

pub fn backward_propagate<'b>(
&mut self,
outputs: ArrayViewD<'b, f32>,
data: ArrayViewD<'b, f32>,
) -> ArrayD<f32> {
let mut d_outputs = (self.cost.prime)(data, outputs);
for layer in self.layers.iter_mut().rev() {
d_outputs = layer.backward_propagate(d_outputs);
pub fn backward_propagate(&mut self, inputs: &WGPUBuffer, dataset: &WGPUBuffer) {
let outputs = self.layers.last().unwrap().outputs();
self.cost.prime(&mut self.backend, dataset, outputs);
let mut d_outputs = &self.cost.d_inputs;

for i in (1..self.layers.len()).rev() {
let (left, right) = self.layers.split_at(i);
let inputs = left.last().unwrap().outputs();
right[0].backward_propagate(&mut self.backend, &inputs, d_outputs);
d_outputs = right[0].d_inputs()
}
d_outputs

self.layers[0].backward_propagate(&mut self.backend, &inputs, d_outputs);
}

pub fn train(&mut self, datasets: Vec<Dataset>, epochs: usize, batches: usize, rate: f32) {
pub fn train(&mut self, datasets: Vec<Dataset>, epochs: usize, batches: usize, _rate: f32) {
let mut epoch = 0;

let mut gpu_datasets = Vec::new();
for dataset in datasets {
gpu_datasets.push(WGPUDataset {
inputs: WGPUBuffer::from(&mut self.backend, dataset.inputs),
outputs: WGPUBuffer::from(&mut self.backend, dataset.outputs),
})
}

while epoch < epochs {
let mut total = 0.0;
for (i, dataset) in datasets.iter().enumerate() {
let outputs = self.forward_propagate(dataset.inputs.clone(), true);
self.backward_propagate(outputs.view(), dataset.outputs.view());
self.optimizer
.update_grads(&mut self.layers, &self.scheduler, rate, epoch);
total += (self.cost.cost)(outputs.view(), dataset.outputs.view());
let minibatch = outputs.dim()[0];
if !self.silent && ((i + 1) * minibatch) % batches == 0 {
let cost = total / (batches) as f32;
let msg = format!("Epoch={}, Dataset={}, Cost={}", epoch, i * minibatch, cost);
(self.logger.log)(msg);
total = 0.0;
for (i, dataset) in gpu_datasets.iter().enumerate() {
self.forward_propagate(&dataset.inputs, true);
self.backward_propagate(&dataset.inputs, &dataset.outputs);

if !self.silent {
let outputs = self.layers.last().unwrap().outputs();
total += self
.cost
.cost(&mut self.backend, &outputs, &dataset.outputs);
let minibatch = outputs.shape[0];
if ((i + 1) * minibatch) % batches == 0 {
let cost = total / (batches) as f32;
let msg =
format!("Epoch={}, Dataset={}, Cost={}", epoch, i * minibatch, cost);
(self.logger.log)(msg);
total = 0.0;
}
}
}
epoch += 1
@@ -103,18 +115,28 @@ impl Backend {

pub fn predict(&mut self, data: ArrayD<f32>) -> ArrayD<f32> {
for layer in &mut self.layers {
layer.reset(1)
layer.reset(&mut self.backend, 1)
}
self.forward_propagate(data, false)
let inputs = WGPUBuffer::from(&mut self.backend, data);
self.forward_propagate(&inputs, false);
self.layers
.last()
.unwrap()
.outputs()
.read(&mut self.backend)
}

pub fn save(&self) -> Vec<u8> {
pub fn save(&mut self) -> Vec<u8> {
let mut layers = Vec::new();
for layer in &self.layers {
layers.push(layer.save(&mut self.backend))
}
let mut tensors = Vec::new();
for (i, layer) in self.layers.iter().enumerate() {
for (i, layer) in layers.iter().enumerate() {
match layer {
GPULayer::Dense(layer) => {
let weights = Tensor::new(layer.weights.view().into_dyn());
let biases = Tensor::new(layer.biases.view().into_dyn());
Tensors::Dense(layer) => {
let weights = Tensor::new(layer.weights.view());
let biases = Tensor::new(layer.biases.view());
tensors.push((format!("{}w", i), weights));
tensors.push((format!("{}b", i), biases));
}
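backward_propagate above walks the layers in reverse and uses split_at so that the previous layer's outputs and the current layer can be taken from the same Vec without overlapping borrows; the mutation goes through the &mut backend argument, so the layers themselves are only read. A self-contained illustration of that pattern with placeholder types, not the repo's GPULayer:

    struct DemoLayer {
        outputs: Vec<f32>,
    }

    impl DemoLayer {
        fn backward_propagate(&self, inputs: &[f32], d_outputs: &[f32]) {
            // a real layer would dispatch its backward kernel here
            let _ = (inputs, d_outputs);
        }
    }

    fn backward(layers: &[DemoLayer], cost_d_inputs: &[f32]) {
        let mut d_outputs = cost_d_inputs;
        for i in (1..layers.len()).rev() {
            let (left, right) = layers.split_at(i);
            let inputs = left.last().unwrap().outputs.as_slice(); // outputs of layer i - 1
            right[0].backward_propagate(inputs, d_outputs);
            d_outputs = right[0].outputs.as_slice(); // stand-in for d_inputs()
        }
        // layer 0 is finally driven with the network inputs, as in the diff above
        let _ = d_outputs;
    }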
