[feat] Additional mechanisms (#53)
* add RMSProp optimizer

* add regularizer to dense layer

* add nadam (see the update-rule sketch after this list)

* fix regularizer

* fix reg

* fix nadam

* finalize nadam

* add regularization params to ts

* add tolerance and patience

* add huber, tukey, smooth hinge

* fix variable naming for cost functions

* fix multi-linreg example

* fix text classifier

* fix filters example

* update autoencoder example
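
The Nadam optimizer mentioned in the list above does not appear in any of the diffs expanded below, so a rough sketch of the usual update rule is included here for orientation. This is not the commit's implementation: the function and variable names are assumptions, and the exact bias-correction form differs slightly between implementations.

// Illustrative sketch only; not part of this commit.
// Nadam = Adam with a Nesterov-style lookahead applied to the first moment.
fn nadam_step(
    weight: &mut f32,
    grad: f32,
    m: &mut f32,   // first-moment (mean) estimate
    v: &mut f32,   // second-moment (uncentered variance) estimate
    t: i32,        // 1-based step counter
    rate: f32,
    beta1: f32,
    beta2: f32,
    epsilon: f32,
) {
    *m = beta1 * *m + (1.0 - beta1) * grad;
    *v = beta2 * *v + (1.0 - beta2) * grad * grad;
    let m_hat = *m / (1.0 - beta1.powi(t));
    let v_hat = *v / (1.0 - beta2.powi(t));
    // Nesterov lookahead: mix the bias-corrected momentum with the current gradient
    let m_nesterov = beta1 * m_hat + (1.0 - beta1) * grad / (1.0 - beta1.powi(t));
    *weight -= rate * m_nesterov / (v_hat.sqrt() + epsilon);
}

fn main() {
    let (mut w, mut m, mut v) = (0.5_f32, 0.0_f32, 0.0_f32);
    nadam_step(&mut w, 0.2, &mut m, &mut v, 1, 0.01, 0.9, 0.999, 1e-8);
    println!("w after one step = {w}"); // ≈ 0.481
}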
retraigo authored Sep 4, 2024
1 parent 3153acf commit 42dfb49
Showing 28 changed files with 713 additions and 53 deletions.
8 changes: 8 additions & 0 deletions crates/core-gpu/src/types.rs
@@ -133,12 +133,20 @@ pub struct AdamOptimizer {
pub epsilon: f32,
}

#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(rename_all = "lowercase")]
pub struct RMSPropOptimizer {
pub decay_rate: f32,
pub epsilon: f32,
}

#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(tag = "type", content = "config")]
#[serde(rename_all = "lowercase")]
pub enum Optimizer {
SGD,
Adam(AdamOptimizer),
RMSProp(RMSPropOptimizer),
}

#[derive(Serialize, Deserialize, Debug, Clone)]
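
Note on the RMSProp plumbing above: with `#[serde(tag = "type", content = "config")]` and `rename_all = "lowercase"`, the new variant is selected by a payload of the form `{"type": "rmsprop", "config": {"decay_rate": 0.9, "epsilon": 1e-8}}`. The snippet below is only a sketch of the textbook RMSProp update that these two fields parameterize; it is not the optimizer code from this commit (that lives in files not expanded on this page), and the names are assumptions.

// Illustrative sketch only; not part of this commit. Shows how decay_rate and
// epsilon are typically used in an RMSProp step, with the learning rate
// supplied separately (as in Backend::train below).
fn rmsprop_step(weight: &mut f32, grad: f32, cache: &mut f32, rate: f32, decay_rate: f32, epsilon: f32) {
    // exponential moving average of squared gradients
    *cache = decay_rate * *cache + (1.0 - decay_rate) * grad * grad;
    // raw gradient scaled by the root of that running average
    *weight -= rate * grad / (cache.sqrt() + epsilon);
}

fn main() {
    let (mut w, mut cache) = (0.5_f32, 0.0_f32);
    rmsprop_step(&mut w, 0.2, &mut cache, 0.01, 0.9, 1e-8);
    println!("w after one step = {w}"); // ≈ 0.4684
}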
37 changes: 35 additions & 2 deletions crates/core/src/cpu/backend.rs
@@ -14,6 +14,8 @@ use crate::{
pub struct Backend {
pub silent: bool,
pub config: BackendConfig,
pub tolerance: f32,
pub patience: usize,
pub layers: Vec<CPULayer>,
pub size: Vec<usize>,
pub cost: CPUCost,
@@ -83,10 +85,14 @@ impl Backend {
let scheduler = CPUScheduler::from(&config.scheduler);
let cost = CPUCost::from(config.cost.clone());
let silent = config.silent.is_some_and(|x| x == true);
let tolerance = config.tolerance.unwrap_or(0.0);
let patience = config.patience.unwrap_or(0);
Self {
logger,
silent,
config,
tolerance,
patience,
layers,
cost,
optimizer,
@@ -122,7 +128,7 @@ impl Backend {
outputs: ArrayViewD<'b, f32>,
data: ArrayViewD<'b, f32>,
) -> ArrayD<f32> {
let mut d_outputs = (self.cost.prime)(data, outputs);
let mut d_outputs = (self.cost.prime)(outputs, data);
for layer in self.layers.iter_mut().rev() {
d_outputs = layer.backward_propagate(d_outputs);
}
@@ -131,6 +137,10 @@

pub fn train(&mut self, datasets: Vec<Dataset>, epochs: usize, batches: usize, rate: f32) {
let mut epoch = 0;
let mut best_cost = -1f32;
let mut disappointments = 0;
let mut best_net = self.save();
let mut cost = 0f32;
while epoch < epochs {
let mut total = 0.0;
for (i, dataset) in datasets.iter().enumerate() {
@@ -141,12 +151,35 @@
total += (self.cost.cost)(outputs.view(), dataset.outputs.view());
let minibatch = outputs.dim()[0];
if !self.silent && ((i + 1) * minibatch) % batches == 0 {
let cost = total / (batches) as f32;
cost = total / (batches) as f32;
let msg = format!("Epoch={}, Dataset={}, Cost={}", epoch, i * minibatch, cost);
(self.logger.log)(msg);
total = 0.0;
}
}
if self.patience != 0 {
if best_cost < 0.0 {
best_cost = cost;
}
if cost < best_cost - self.tolerance {
disappointments = 0;
best_cost = cost;
best_net = self.save();
} else {
disappointments += 1;
if !self.silent {
println!("Patience counter: {} disappointing epochs out of {}.", disappointments, self.patience);
}
}
if disappointments >= self.patience {
if !self.silent {
println!("No improvement for {} epochs. Stopping early at cost={}", disappointments, best_cost);
}
let net = Self::load(&best_net, Logger { log: |x| println!("{}", x) });
self.layers = net.layers;
break;
}
}
epoch += 1
}
}
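
The tolerance/patience additions above implement early stopping: training halts, and the best saved network is restored, once the cost has failed to beat the best seen cost by more than `tolerance` for `patience` consecutive epochs. Below is a standalone sketch of just that bookkeeping, simplified (the best cost starts at infinity here rather than being seeded from the first epoch) and with purely illustrative names; it is not code from this commit.

// Illustrative only: mirrors the early-stopping bookkeeping in Backend::train above.
fn stops_early(costs: &[f32], tolerance: f32, patience: usize) -> Option<usize> {
    let mut best_cost = f32::INFINITY;
    let mut disappointments = 0;
    for (epoch, &cost) in costs.iter().enumerate() {
        if cost < best_cost - tolerance {
            disappointments = 0;
            best_cost = cost;
        } else {
            disappointments += 1;
        }
        if patience != 0 && disappointments >= patience {
            return Some(epoch); // epoch index at which training would stop
        }
    }
    None
}

fn main() {
    // Cost stalls after epoch 2; with patience = 3 training stops at epoch 5.
    let costs = [0.9, 0.5, 0.4, 0.41, 0.40, 0.42];
    assert_eq!(stops_early(&costs, 0.001, 3), Some(5));
}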
138 changes: 131 additions & 7 deletions crates/core/src/cpu/cost.rs
@@ -3,10 +3,13 @@ use std::{
ops::{Mul, Sub},
};

use ndarray::{ArrayD, ArrayViewD};
use ndarray::{Array1, ArrayD, ArrayViewD};

use crate::Cost;

const HUBER_DELTA: f32 = 1.5;
const TUKEY_C: f32 = 4.685;

pub struct CPUCost {
pub cost: for<'a> fn(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32,
pub prime: for<'a> fn(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32>,
@@ -19,6 +22,10 @@ impl CPUCost {
cost: mse,
prime: mse_prime,
},
Cost::MAE => CPUCost {
cost: mae,
prime: mae_prime,
},
Cost::CrossEntropy => CPUCost {
cost: cross_entropy,
prime: cross_entropy_prime,
@@ -31,27 +38,48 @@
cost: hinge,
prime: hinge_prime,
},
Cost::Huber => CPUCost {
cost: huber,
prime: huber_prime,
},
Cost::SmoothHinge => CPUCost {
cost: smooth_hinge,
prime: smooth_hinge_prime,
},
Cost::Tukey => CPUCost {
cost: tukey,
prime: tukey_prime,
},
}
}
}

fn mse<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
let sub = y.sub(&y_hat);
let sub = y_hat.sub(&y);
return sub.clone().mul(sub).sum() / y.len() as f32;
}

fn mse_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
return y.sub(&y_hat);
return y_hat.sub(&y);
}

fn mae<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
let sub = y_hat.sub(&y);
return sub.map(|x| x.abs()).sum() / y.len() as f32;
}

fn mae_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
return y_hat.sub(&y).map(|x| x.signum());
}

fn cross_entropy<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
let batches = y_hat.dim()[0];
let total = (-&y_hat * (y.map(|x| x.max(EPSILON).min(1f32 - EPSILON).ln()))).sum();
let total = (-&y * (y_hat.map(|x| x.max(EPSILON).min(1f32 - EPSILON).ln()))).sum();
return total / batches as f32;
}

fn cross_entropy_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
return -&y_hat / (&y + EPSILON);
return -&y / (&y_hat + EPSILON);
}

fn bin_cross_entropy<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
@@ -63,7 +91,7 @@ fn bin_cross_entropy<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) ->
}

fn bin_cross_entropy_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
return (-&y_hat / (&y + EPSILON)) + (1.0 - &y_hat) / (1.0 - &y + EPSILON);
return (-&y / (&y_hat + EPSILON)) + (1.0 - &y) / (1.0 - &y_hat + EPSILON);
}

fn hinge<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
@@ -85,5 +113,101 @@ fn hinge_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD
*result_i = -y_i;
}
}
return result;
return result
}

pub fn smooth_hinge<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
y_hat
.iter()
.zip(y.iter())
.map(|(y_hat_i, y_i)| {
let margin = y_i * y_hat_i;
if margin > -1f32 {
(1.0 - margin).max(0.0)
} else {
-4f32 * margin
}
})
.collect::<Array1<f32>>()
.to_shape(y.shape())
.unwrap()
.to_owned()
.sum()
/ y.len() as f32
}

pub fn smooth_hinge_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
y_hat
.iter()
.zip(y.iter())
.map(|(y_hat_i, y_i)| {
let margin = y_i * y_hat_i;
if margin > -1f32 {
-y_i
} else {
-4f32 * y_i
}
})
.collect::<Array1<f32>>()
.to_shape(y.shape())
.unwrap()
.to_owned()
}

pub fn tukey<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
let c_squared = TUKEY_C * TUKEY_C / 6.0;
y.sub(&y_hat)
.map(|el| {
let r = el.abs();
if r <= TUKEY_C {
c_squared * (1.0 - (1.0 - (r / TUKEY_C).powi(2)).powi(3))
} else {
c_squared
}
})
.sum()
/ y.len() as f32
}

pub fn tukey_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
y.sub(&y_hat).map(|el| {
let r = el.abs();
if r <= TUKEY_C {
r * (1.0 - ((r / TUKEY_C).powi(2))).powi(2)
} else {
0f32
}
})
}

pub fn huber<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
let loss: Array1<f32> = y_hat
.iter()
.zip(y.iter())
.map(|(y_hat_i, y_i)| {
let residual = y_i - y_hat_i;
if residual.abs() <= HUBER_DELTA {
0.5 * residual.powi(2)
} else {
HUBER_DELTA * (residual.abs() - 0.5 * HUBER_DELTA)
}
})
.collect();
loss.to_shape(y.shape()).unwrap().sum() / y.len() as f32
}

pub fn huber_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
let gradient: Array1<f32> = y_hat
.iter()
.zip(y.iter())
.map(|(y_hat_i, y_i)| {
let residual = y_i - y_hat_i;
if residual.abs() <= HUBER_DELTA {
-residual
} else {
-HUBER_DELTA * residual.signum()
}
})
.collect();
gradient.to_shape(y.shape()).unwrap().to_owned()
}
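
A quick worked example of the Huber cost added above, written here as a hypothetical unit test inside this module (it is not part of the commit, and assumes `huber` and HUBER_DELTA = 1.5 as defined in this file): residuals with magnitude at or below the delta take the quadratic branch, larger ones the linear branch.

// Hypothetical usage of the huber cost defined above; not part of the commit.
#[cfg(test)]
mod huber_example {
    use super::*;
    use ndarray::array;

    #[test]
    fn huber_splits_quadratic_and_linear_branches() {
        let y_hat = array![[0.5_f32, 2.0, -1.0]].into_dyn();
        let y = array![[0.0_f32, 0.0, -1.0]].into_dyn();
        // residuals are -0.5, -2.0 and 0.0:
        //   |r| = 0.5 <= 1.5 -> 0.5 * 0.5^2        = 0.125
        //   |r| = 2.0 >  1.5 -> 1.5 * (2.0 - 0.75) = 1.875
        //   |r| = 0.0 <= 1.5 -> 0.0
        // mean over 3 elements = 2.0 / 3
        let loss = huber(y_hat.view(), y.view());
        assert!((loss - 2.0 / 3.0).abs() < 1e-6);
    }
}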
7 changes: 7 additions & 0 deletions crates/core/src/cpu/layers/batchnorm1d.rs
@@ -35,6 +35,10 @@ pub struct BatchNorm1DCPULayer {
// gradients
pub d_gamma: Array2<f32>,
pub d_beta: Array2<f32>,

// regularization
pub l_gamma: Array2<f32>,
pub l_beta: Array2<f32>,
}

impl BatchNorm1DCPULayer {
@@ -74,6 +78,9 @@ impl BatchNorm1DCPULayer {

d_gamma: Array2::zeros((1, size[1])),
d_beta: Array2::zeros((1, size[1])),

l_gamma: Array2::zeros((1, size[1])),
l_beta: Array2::zeros((1, size[1])),
}
}

7 changes: 7 additions & 0 deletions crates/core/src/cpu/layers/batchnorm2d.rs
@@ -32,6 +32,10 @@ pub struct BatchNorm2DCPULayer {
// gradients
pub d_gamma: Array4<f32>,
pub d_beta: Array4<f32>,

// regularization
pub l_gamma: Array4<f32>,
pub l_beta: Array4<f32>,
}

impl BatchNorm2DCPULayer {
@@ -71,6 +75,9 @@ impl BatchNorm2DCPULayer {

d_gamma: Array4::zeros((1, size[1], 1, 1)),
d_beta: Array4::zeros((1, size[1], 1, 1)),

l_gamma: Array4::zeros((1, size[1], 1, 1)),
l_beta: Array4::zeros((1, size[1], 1, 1)),
}
}

18 changes: 15 additions & 3 deletions crates/core/src/cpu/layers/conv2d.rs
@@ -1,7 +1,7 @@
use ndarray::{s, Array1, Array4, ArrayD, Dimension, Ix1, Ix4, IxDyn};
use std::ops::{Add, AddAssign, Mul};

use crate::{CPUInit, Conv2DLayer, Init, Tensors};
use crate::{CPUInit, CPURegularizer, Conv2DLayer, Init, Tensors};

pub struct Conv2DCPULayer {
// cache
@@ -17,6 +17,12 @@ pub struct Conv2DCPULayer {
// gradients
pub d_weights: Array4<f32>,
pub d_biases: Array1<f32>,

// regularization
pub l_weights: Array4<f32>,
pub l_biases: Array1<f32>,

pub regularizer: CPURegularizer,
}

impl Conv2DCPULayer {
@@ -30,7 +36,6 @@ impl Conv2DCPULayer {
let input_size = Ix4(size[0], size[1], input_y, input_x);
let weight_size = IxDyn(config.kernel_size.as_slice());
let output_size = Ix4(size[0], weight_size[0], output_y, output_x);

let (weights, biases) = if let Some(Tensors::Conv(tensors)) = tensors {
(tensors.weights, tensors.biases)
} else {
@@ -54,10 +59,15 @@ impl Conv2DCPULayer {
inputs: Array4::zeros(input_size),
weights: weights.into_dimensionality::<Ix4>().unwrap(),
biases: biases.into_dimensionality::<Ix1>().unwrap(),
d_weights: ArrayD::zeros(weight_size)
d_weights: ArrayD::zeros(weight_size.clone())
.into_dimensionality::<Ix4>()
.unwrap(),
d_biases: Array1::zeros(config.kernel_size[0]),
l_weights: ArrayD::zeros(weight_size)
.into_dimensionality::<Ix4>()
.unwrap(),
l_biases: Array1::zeros(config.kernel_size[0]),
regularizer: CPURegularizer::from(config.c.unwrap_or(0.0), config.l1_ratio.unwrap_or(1.0))
}
}

@@ -138,6 +148,8 @@ impl Conv2DCPULayer {
}
}
}
self.l_weights = self.regularizer.coeff(&self.weights.clone().into_dyn()).into_dimensionality::<Ix4>().unwrap();
self.l_biases = self.regularizer.coeff(&self.biases.clone().into_dyn()).into_dimensionality::<Ix1>().unwrap();

d_inputs.into_dyn()
}
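
The backward pass above only computes the regularization coefficients (`l_weights`, `l_biases`); how they are consumed lives in optimizer files that are not expanded on this page, and the `c` / `l1_ratio` parameters suggest, but do not confirm, an elastic-net-style penalty. The sketch below is purely an assumption about one plausible way the coefficients could be folded into a plain SGD step; it is not code from this commit.

// Assumption only: a plain SGD-style update that adds the regularization
// coefficients to the data gradients. The commit's actual optimizer code is
// in files not shown in this view.
use ndarray::ArrayD;

fn sgd_update(
    weights: &mut ArrayD<f32>,
    d_weights: &ArrayD<f32>,
    l_weights: &ArrayD<f32>, // regularizer.coeff(weights), as computed above
    rate: f32,
) {
    // effective gradient = data gradient + regularization coefficient
    let grad = d_weights + l_weights;
    *weights -= &(grad * rate);
}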
(The remaining changed files in this commit are not expanded in this view.)
