
[feat] Additional mechanisms #53

Merged · 15 commits · Sep 4, 2024
8 changes: 8 additions & 0 deletions crates/core-gpu/src/types.rs
@@ -133,12 +133,20 @@ pub struct AdamOptimizer {
pub epsilon: f32,
}

#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(rename_all = "lowercase")]
pub struct RMSPropOptimizer {
pub decay_rate: f32,
pub epsilon: f32,
}

#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(tag = "type", content = "config")]
#[serde(rename_all = "lowercase")]
pub enum Optimizer {
SGD,
Adam(AdamOptimizer),
RMSProp(RMSPropOptimizer),
}

#[derive(Serialize, Deserialize, Debug, Clone)]
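For reference, the adjacently tagged representation above (`tag = "type"`, `content = "config"`, variants lowercased) means the new `RMSProp` variant travels as `{"type":"rmsprop","config":{...}}`. A minimal round-trip sketch with `serde_json`; field values are placeholders and the `Adam` variant is omitted for brevity:

```rust
// Sketch only: a trimmed-down copy of the types above, used to show the JSON shape.
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(rename_all = "lowercase")]
pub struct RMSPropOptimizer {
    pub decay_rate: f32,
    pub epsilon: f32,
}

#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(tag = "type", content = "config")]
#[serde(rename_all = "lowercase")]
pub enum Optimizer {
    SGD,
    RMSProp(RMSPropOptimizer),
}

fn main() -> serde_json::Result<()> {
    // The variant name is lowercased into "type"; the payload sits under "config".
    let json = r#"{"type":"rmsprop","config":{"decay_rate":0.9,"epsilon":1e-8}}"#;
    let optimizer: Optimizer = serde_json::from_str(json)?;
    println!("{:?}", optimizer); // RMSProp(RMSPropOptimizer { decay_rate: 0.9, epsilon: 1e-8 })
    println!("{}", serde_json::to_string(&optimizer)?);
    Ok(())
}
```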
37 changes: 35 additions & 2 deletions crates/core/src/cpu/backend.rs
@@ -14,6 +14,8 @@ use crate::{
pub struct Backend {
pub silent: bool,
pub config: BackendConfig,
pub tolerance: f32,
pub patience: usize,
pub layers: Vec<CPULayer>,
pub size: Vec<usize>,
pub cost: CPUCost,
@@ -83,10 +85,14 @@ impl Backend {
let scheduler = CPUScheduler::from(&config.scheduler);
let cost = CPUCost::from(config.cost.clone());
let silent = config.silent.is_some_and(|x| x == true);
let tolerance = config.tolerance.unwrap_or(0.0);
let patience = config.patience.unwrap_or(0);
Self {
logger,
silent,
config,
tolerance,
patience,
layers,
cost,
optimizer,
@@ -122,7 +128,7 @@ impl Backend {
outputs: ArrayViewD<'b, f32>,
data: ArrayViewD<'b, f32>,
) -> ArrayD<f32> {
let mut d_outputs = (self.cost.prime)(data, outputs);
let mut d_outputs = (self.cost.prime)(outputs, data);
for layer in self.layers.iter_mut().rev() {
d_outputs = layer.backward_propagate(d_outputs);
}
@@ -131,6 +137,10 @@

pub fn train(&mut self, datasets: Vec<Dataset>, epochs: usize, batches: usize, rate: f32) {
let mut epoch = 0;
let mut best_cost = -1f32;
let mut disappointments = 0;
let mut best_net = self.save();
let mut cost = 0f32;
while epoch < epochs {
let mut total = 0.0;
for (i, dataset) in datasets.iter().enumerate() {
@@ -141,12 +151,35 @@
total += (self.cost.cost)(outputs.view(), dataset.outputs.view());
let minibatch = outputs.dim()[0];
if !self.silent && ((i + 1) * minibatch) % batches == 0 {
let cost = total / (batches) as f32;
cost = total / (batches) as f32;
let msg = format!("Epoch={}, Dataset={}, Cost={}", epoch, i * minibatch, cost);
(self.logger.log)(msg);
total = 0.0;
}
}
if self.patience != 0 {
if best_cost < 0.0 {
best_cost = cost;
}
if cost < best_cost - self.tolerance {
disappointments = 0;
best_cost = cost;
best_net = self.save();
} else {
disappointments += 1;
if !self.silent {
println!("Patience counter: {} disappointing epochs out of {}.", disappointments, self.patience);
}
}
if disappointments >= self.patience {
if !self.silent {
println!("No improvement for {} epochs. Stopping early at cost={}", disappointments, best_cost);
}
let net = Self::load(&best_net, Logger { log: |x| println!("{}", x) });
self.layers = net.layers;
break;
}
}
epoch += 1
}
}
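The new `tolerance`/`patience` fields implement a simple early-stopping policy: track the best cost seen so far, count epochs that fail to beat it by more than `tolerance` as disappointments, and once `patience` of them accumulate, restore the best snapshot and stop. Per the `unwrap_or` calls, `tolerance` defaults to 0.0 and `patience` to 0 (disabled). A standalone sketch of the same policy; the `EarlyStopping` helper below is illustrative only, not an API of this crate:

```rust
/// Illustrative early-stopping tracker mirroring the patience/tolerance logic in `train`.
/// Not part of the crate's public API.
struct EarlyStopping {
    tolerance: f32,
    patience: usize,
    best_cost: Option<f32>,
    disappointments: usize,
}

impl EarlyStopping {
    fn new(tolerance: f32, patience: usize) -> Self {
        Self { tolerance, patience, best_cost: None, disappointments: 0 }
    }

    /// Feed the epoch cost; returns true when training should stop.
    /// A patience of 0 disables early stopping, matching the backend's default.
    fn update(&mut self, cost: f32) -> bool {
        match self.best_cost {
            // Only an improvement larger than `tolerance` resets the counter.
            Some(best) if cost < best - self.tolerance => {
                self.best_cost = Some(cost);
                self.disappointments = 0;
            }
            Some(_) => self.disappointments += 1,
            None => self.best_cost = Some(cost),
        }
        self.patience != 0 && self.disappointments >= self.patience
    }
}

fn main() {
    let mut stopper = EarlyStopping::new(0.001, 3);
    for (epoch, cost) in [0.9, 0.5, 0.499, 0.5, 0.5, 0.5].iter().enumerate() {
        if stopper.update(*cost) {
            // Three epochs in a row fail to improve on 0.5 by more than 0.001.
            println!("stopping after epoch {} at best cost {:?}", epoch, stopper.best_cost);
            break;
        }
    }
}
```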
138 changes: 131 additions & 7 deletions crates/core/src/cpu/cost.rs
@@ -3,10 +3,13 @@ use std::{
ops::{Mul, Sub},
};

use ndarray::{ArrayD, ArrayViewD};
use ndarray::{Array1, ArrayD, ArrayViewD};

use crate::Cost;

const HUBER_DELTA: f32 = 1.5;
const TUKEY_C: f32 = 4.685;

pub struct CPUCost {
pub cost: for<'a> fn(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32,
pub prime: for<'a> fn(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32>,
@@ -19,6 +22,10 @@ impl CPUCost {
cost: mse,
prime: mse_prime,
},
Cost::MAE => CPUCost {
cost: mae,
prime: mae_prime,
},
Cost::CrossEntropy => CPUCost {
cost: cross_entropy,
prime: cross_entropy_prime,
@@ -31,27 +38,48 @@
cost: hinge,
prime: hinge_prime,
},
Cost::Huber => CPUCost {
cost: huber,
prime: huber_prime,
},
Cost::SmoothHinge => CPUCost {
cost: smooth_hinge,
prime: smooth_hinge_prime,
},
Cost::Tukey => CPUCost {
cost: tukey,
prime: tukey_prime,
},
}
}
}

fn mse<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
let sub = y.sub(&y_hat);
let sub = y_hat.sub(&y);
return sub.clone().mul(sub).sum() / y.len() as f32;
}

fn mse_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
return y.sub(&y_hat);
return y_hat.sub(&y);
}

fn mae<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
let sub = y_hat.sub(&y);
return sub.map(|x| x.abs()).sum() / y.len() as f32;
}

fn mae_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
return y_hat.sub(&y).map(|x| x.signum());
}

fn cross_entropy<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
let batches = y_hat.dim()[0];
let total = (-&y_hat * (y.map(|x| x.max(EPSILON).min(1f32 - EPSILON).ln()))).sum();
let total = (-&y * (y_hat.map(|x| x.max(EPSILON).min(1f32 - EPSILON).ln()))).sum();
return total / batches as f32;
}

fn cross_entropy_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
return -&y_hat / (&y + EPSILON);
return -&y / (&y_hat + EPSILON);
}

fn bin_cross_entropy<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
@@ -63,7 +91,7 @@ fn bin_cross_entropy<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) ->
}

fn bin_cross_entropy_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
return (-&y_hat / (&y + EPSILON)) + (1.0 - &y_hat) / (1.0 - &y + EPSILON);
return (-&y / (&y_hat + EPSILON)) + (1.0 - &y) / (1.0 - &y_hat + EPSILON);
}

fn hinge<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
@@ -85,5 +113,101 @@ fn hinge_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD
*result_i = -y_i;
}
}
return result;
return result
}

pub fn smooth_hinge<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
y_hat
.iter()
.zip(y.iter())
.map(|(y_hat_i, y_i)| {
let margin = y_i * y_hat_i;
if margin > -1f32 {
(1.0 - margin).max(0.0)
} else {
-4f32 * margin
}
})
.collect::<Array1<f32>>()
.to_shape(y.shape())
.unwrap()
.to_owned()
.sum()
/ y.len() as f32
}

pub fn smooth_hinge_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
y_hat
.iter()
.zip(y.iter())
.map(|(y_hat_i, y_i)| {
let margin = y_i * y_hat_i;
if margin > -1f32 {
-y_i
} else {
-4f32 * y_i
}
})
.collect::<Array1<f32>>()
.to_shape(y.shape())
.unwrap()
.to_owned()
}

pub fn tukey<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
let c_squared = TUKEY_C * TUKEY_C / 6.0;
y.sub(&y_hat)
.map(|el| {
let r = el.abs();
if r <= TUKEY_C {
c_squared * (1.0 - (1.0 - (r / TUKEY_C).powi(2)).powi(3))
} else {
c_squared
}
})
.sum()
/ y.len() as f32
}

pub fn tukey_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
y.sub(&y_hat).map(|el| {
let r = el.abs();
if r <= TUKEY_C {
r * (1.0 - ((r / TUKEY_C).powi(2))).powi(2)
} else {
0f32
}
})
}

pub fn huber<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
let loss: Array1<f32> = y_hat
.iter()
.zip(y.iter())
.map(|(y_hat_i, y_i)| {
let residual = y_i - y_hat_i;
if residual.abs() <= HUBER_DELTA {
0.5 * residual.powi(2)
} else {
HUBER_DELTA * (residual.abs() - 0.5 * HUBER_DELTA)
}
})
.collect();
loss.to_shape(y.shape()).unwrap().sum() / y.len() as f32
}

pub fn huber_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
let gradient: Array1<f32> = y_hat
.iter()
.zip(y.iter())
.map(|(y_hat_i, y_i)| {
let residual = y_i - y_hat_i;
if residual.abs() <= HUBER_DELTA {
-residual
} else {
-HUBER_DELTA * residual.signum()
}
})
.collect();
gradient.to_shape(y.shape()).unwrap().to_owned()
}
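For reference, the robust losses added here follow their usual per-residual definitions, with the constants fixed above (δ = 1.5 for Huber, c = 4.685 for Tukey); both costs are then averaged over the batch:

```latex
% Huber loss on a residual r = y - \hat{y}, with \delta = 1.5:
L_\delta(r) =
\begin{cases}
  \tfrac{1}{2}\, r^2 & \text{if } |r| \le \delta \\
  \delta \left( |r| - \tfrac{1}{2}\delta \right) & \text{otherwise}
\end{cases}

% Tukey biweight loss, with c = 4.685:
\rho_c(r) =
\begin{cases}
  \tfrac{c^2}{6} \left[ 1 - \left( 1 - (r/c)^2 \right)^3 \right] & \text{if } |r| \le c \\
  \tfrac{c^2}{6} & \text{otherwise}
\end{cases}
```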
7 changes: 7 additions & 0 deletions crates/core/src/cpu/layers/batchnorm1d.rs
@@ -35,6 +35,10 @@ pub struct BatchNorm1DCPULayer {
// gradients
pub d_gamma: Array2<f32>,
pub d_beta: Array2<f32>,

// regularization
pub l_gamma: Array2<f32>,
pub l_beta: Array2<f32>,
}

impl BatchNorm1DCPULayer {
@@ -74,6 +78,9 @@ impl BatchNorm1DCPULayer {

d_gamma: Array2::zeros((1, size[1])),
d_beta: Array2::zeros((1, size[1])),

l_gamma: Array2::zeros((1, size[1])),
l_beta: Array2::zeros((1, size[1])),
}
}

7 changes: 7 additions & 0 deletions crates/core/src/cpu/layers/batchnorm2d.rs
@@ -32,6 +32,10 @@ pub struct BatchNorm2DCPULayer {
// gradients
pub d_gamma: Array4<f32>,
pub d_beta: Array4<f32>,

// regularization
pub l_gamma: Array4<f32>,
pub l_beta: Array4<f32>,
}

impl BatchNorm2DCPULayer {
@@ -71,6 +75,9 @@ impl BatchNorm2DCPULayer {

d_gamma: Array4::zeros((1, size[1], 1, 1)),
d_beta: Array4::zeros((1, size[1], 1, 1)),

l_gamma: Array4::zeros((1, size[1], 1, 1)),
l_beta: Array4::zeros((1, size[1], 1, 1)),
}
}

18 changes: 15 additions & 3 deletions crates/core/src/cpu/layers/conv2d.rs
@@ -1,7 +1,7 @@
use ndarray::{s, Array1, Array4, ArrayD, Dimension, Ix1, Ix4, IxDyn};
use std::ops::{Add, AddAssign, Mul};

use crate::{CPUInit, Conv2DLayer, Init, Tensors};
use crate::{CPUInit, CPURegularizer, Conv2DLayer, Init, Tensors};

pub struct Conv2DCPULayer {
// cache
@@ -17,6 +17,12 @@ pub struct Conv2DCPULayer {
// gradients
pub d_weights: Array4<f32>,
pub d_biases: Array1<f32>,

// regularization
pub l_weights: Array4<f32>,
pub l_biases: Array1<f32>,

pub regularizer: CPURegularizer,
}

impl Conv2DCPULayer {
@@ -30,7 +36,6 @@ impl Conv2DCPULayer {
let input_size = Ix4(size[0], size[1], input_y, input_x);
let weight_size = IxDyn(config.kernel_size.as_slice());
let output_size = Ix4(size[0], weight_size[0], output_y, output_x);

let (weights, biases) = if let Some(Tensors::Conv(tensors)) = tensors {
(tensors.weights, tensors.biases)
} else {
@@ -54,10 +59,15 @@ impl Conv2DCPULayer {
inputs: Array4::zeros(input_size),
weights: weights.into_dimensionality::<Ix4>().unwrap(),
biases: biases.into_dimensionality::<Ix1>().unwrap(),
d_weights: ArrayD::zeros(weight_size)
d_weights: ArrayD::zeros(weight_size.clone())
.into_dimensionality::<Ix4>()
.unwrap(),
d_biases: Array1::zeros(config.kernel_size[0]),
l_weights: ArrayD::zeros(weight_size)
.into_dimensionality::<Ix4>()
.unwrap(),
l_biases: Array1::zeros(config.kernel_size[0]),
regularizer: CPURegularizer::from(config.c.unwrap_or(0.0), config.l1_ratio.unwrap_or(1.0))
}
}

@@ -138,6 +148,8 @@ impl Conv2DCPULayer {
}
}
}
self.l_weights = self.regularizer.coeff(&self.weights.clone().into_dyn()).into_dimensionality::<Ix4>().unwrap();
self.l_biases = self.regularizer.coeff(&self.biases.clone().into_dyn()).into_dimensionality::<Ix1>().unwrap();

d_inputs.into_dyn()
}
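The `Conv2DCPULayer` now owns a `CPURegularizer` built from `config.c` and `config.l1_ratio` and stores its per-parameter output in `l_weights`/`l_biases` during the backward pass. The regularizer's internals are not part of this diff; the two parameters suggest an elastic-net-style penalty, roughly as sketched below (an assumption for illustration, not the crate's actual implementation):

```rust
use ndarray::ArrayD;

/// Hypothetical elastic-net regularizer: `c` scales the penalty, `l1_ratio`
/// blends L1 (ratio = 1.0) and L2 (ratio = 0.0). The real `CPURegularizer`
/// in this crate may differ in detail.
pub struct ElasticNetSketch {
    c: f32,
    l1_ratio: f32,
}

impl ElasticNetSketch {
    pub fn from(c: f32, l1_ratio: f32) -> Self {
        Self { c, l1_ratio }
    }

    /// Gradient contribution of c * (l1_ratio * |w| + (1 - l1_ratio) * w^2 / 2)
    /// per parameter, to be added to the data-term gradients during the update.
    pub fn coeff(&self, params: &ArrayD<f32>) -> ArrayD<f32> {
        params.map(|w| {
            let w = *w;
            self.c * (self.l1_ratio * w.signum() + (1.0 - self.l1_ratio) * w)
        })
    }
}
```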