
[feat] Additional mechanisms #53

Merged · 15 commits · Sep 4, 2024
8 changes: 8 additions & 0 deletions crates/core-gpu/src/types.rs
@@ -133,12 +133,20 @@ pub struct AdamOptimizer {
pub epsilon: f32,
}

#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(rename_all = "lowercase")]
pub struct RMSPropOptimizer {
pub decay_rate: f32,
pub epsilon: f32,
}

#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(tag = "type", content = "config")]
#[serde(rename_all = "lowercase")]
pub enum Optimizer {
SGD,
Adam(AdamOptimizer),
RMSProp(RMSPropOptimizer),
}

#[derive(Serialize, Deserialize, Debug, Clone)]
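For reference, the adjacently tagged representation above (`tag = "type"`, `content = "config"`, variants lowercased) means the new `RMSProp` variant travels as `{"type":"rmsprop","config":{...}}`. A minimal round-trip sketch with `serde_json`; field values are placeholders and the `Adam` variant is omitted for brevity:

```rust
// Sketch only: a trimmed-down copy of the types above, used to show the JSON shape.
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(rename_all = "lowercase")]
pub struct RMSPropOptimizer {
    pub decay_rate: f32,
    pub epsilon: f32,
}

#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(tag = "type", content = "config")]
#[serde(rename_all = "lowercase")]
pub enum Optimizer {
    SGD,
    RMSProp(RMSPropOptimizer),
}

fn main() -> serde_json::Result<()> {
    // The variant name is lowercased into "type"; the payload sits under "config".
    let json = r#"{"type":"rmsprop","config":{"decay_rate":0.9,"epsilon":1e-8}}"#;
    let optimizer: Optimizer = serde_json::from_str(json)?;
    println!("{:?}", optimizer); // RMSProp(RMSPropOptimizer { decay_rate: 0.9, epsilon: 1e-8 })
    println!("{}", serde_json::to_string(&optimizer)?);
    Ok(())
}
```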
37 changes: 35 additions & 2 deletions crates/core/src/cpu/backend.rs
@@ -14,6 +14,8 @@ use crate::{
pub struct Backend {
pub silent: bool,
pub config: BackendConfig,
pub tolerance: f32,
pub patience: usize,
pub layers: Vec<CPULayer>,
pub size: Vec<usize>,
pub cost: CPUCost,
@@ -83,10 +85,14 @@ impl Backend {
let scheduler = CPUScheduler::from(&config.scheduler);
let cost = CPUCost::from(config.cost.clone());
let silent = config.silent.is_some_and(|x| x == true);
let tolerance = config.tolerance.unwrap_or(0.0);
let patience = config.patience.unwrap_or(0);
Self {
logger,
silent,
config,
tolerance,
patience,
layers,
cost,
optimizer,
@@ -122,7 +128,7 @@ impl Backend {
outputs: ArrayViewD<'b, f32>,
data: ArrayViewD<'b, f32>,
) -> ArrayD<f32> {
let mut d_outputs = (self.cost.prime)(data, outputs);
let mut d_outputs = (self.cost.prime)(outputs, data);
for layer in self.layers.iter_mut().rev() {
d_outputs = layer.backward_propagate(d_outputs);
}
@@ -131,6 +137,10 @@

pub fn train(&mut self, datasets: Vec<Dataset>, epochs: usize, batches: usize, rate: f32) {
let mut epoch = 0;
let mut best_cost = -1f32;
let mut disappointments = 0;
let mut best_net = self.save();
let mut cost = 0f32;
while epoch < epochs {
let mut total = 0.0;
for (i, dataset) in datasets.iter().enumerate() {
@@ -141,12 +151,35 @@
total += (self.cost.cost)(outputs.view(), dataset.outputs.view());
let minibatch = outputs.dim()[0];
if !self.silent && ((i + 1) * minibatch) % batches == 0 {
let cost = total / (batches) as f32;
cost = total / (batches) as f32;
let msg = format!("Epoch={}, Dataset={}, Cost={}", epoch, i * minibatch, cost);
(self.logger.log)(msg);
total = 0.0;
}
}
if self.patience != 0 {
if best_cost < 0.0 {
best_cost = cost;
}
if cost < best_cost - self.tolerance {
disappointments = 0;
best_cost = cost;
best_net = self.save();
} else {
disappointments += 1;
if !self.silent {
println!("Patience counter: {} disappointing epochs out of {}.", disappointments, self.patience);
}
}
if disappointments >= self.patience {
if !self.silent {
println!("No improvement for {} epochs. Stopping early at cost={}", disappointments, best_cost);
}
let net = Self::load(&best_net, Logger { log: |x| println!("{}", x) });
self.layers = net.layers;
break;
}
}
epoch += 1
}
}
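The new `tolerance`/`patience` fields implement a simple early-stopping policy: track the best cost seen so far, count epochs that fail to beat it by more than `tolerance` as disappointments, and once `patience` of them accumulate, restore the best snapshot and stop. Per the `unwrap_or` calls, `tolerance` defaults to 0.0 and `patience` to 0 (disabled). A standalone sketch of the same policy; the `EarlyStopping` helper below is illustrative only, not an API of this crate:

```rust
/// Illustrative early-stopping tracker mirroring the patience/tolerance logic in `train`.
/// Not part of the crate's public API.
struct EarlyStopping {
    tolerance: f32,
    patience: usize,
    best_cost: Option<f32>,
    disappointments: usize,
}

impl EarlyStopping {
    fn new(tolerance: f32, patience: usize) -> Self {
        Self { tolerance, patience, best_cost: None, disappointments: 0 }
    }

    /// Feed the epoch cost; returns true when training should stop.
    /// A patience of 0 disables early stopping, matching the backend's default.
    fn update(&mut self, cost: f32) -> bool {
        match self.best_cost {
            // Only an improvement larger than `tolerance` resets the counter.
            Some(best) if cost < best - self.tolerance => {
                self.best_cost = Some(cost);
                self.disappointments = 0;
            }
            Some(_) => self.disappointments += 1,
            None => self.best_cost = Some(cost),
        }
        self.patience != 0 && self.disappointments >= self.patience
    }
}

fn main() {
    let mut stopper = EarlyStopping::new(0.001, 3);
    for (epoch, cost) in [0.9, 0.5, 0.499, 0.5, 0.5, 0.5].iter().enumerate() {
        if stopper.update(*cost) {
            // Three epochs in a row fail to improve on 0.5 by more than 0.001.
            println!("stopping after epoch {} at best cost {:?}", epoch, stopper.best_cost);
            break;
        }
    }
}
```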
138 changes: 131 additions & 7 deletions crates/core/src/cpu/cost.rs
@@ -3,10 +3,13 @@ use std::{
ops::{Mul, Sub},
};

use ndarray::{ArrayD, ArrayViewD};
use ndarray::{Array1, ArrayD, ArrayViewD};

use crate::Cost;

const HUBER_DELTA: f32 = 1.5;
const TUKEY_C: f32 = 4.685;

pub struct CPUCost {
pub cost: for<'a> fn(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32,
pub prime: for<'a> fn(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32>,
@@ -19,6 +22,10 @@ impl CPUCost {
cost: mse,
prime: mse_prime,
},
Cost::MAE => CPUCost {
cost: mae,
prime: mae_prime,
},
Cost::CrossEntropy => CPUCost {
cost: cross_entropy,
prime: cross_entropy_prime,
@@ -31,27 +38,48 @@
cost: hinge,
prime: hinge_prime,
},
Cost::Huber => CPUCost {
cost: huber,
prime: huber_prime,
},
Cost::SmoothHinge => CPUCost {
cost: smooth_hinge,
prime: smooth_hinge_prime,
},
Cost::Tukey => CPUCost {
cost: tukey,
prime: tukey_prime,
},
}
}
}

fn mse<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
let sub = y.sub(&y_hat);
let sub = y_hat.sub(&y);
return sub.clone().mul(sub).sum() / y.len() as f32;
}

fn mse_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
return y.sub(&y_hat);
return y_hat.sub(&y);
}

fn mae<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
let sub = y_hat.sub(&y);
return sub.map(|x| x.abs()).sum() / y.len() as f32;
}

fn mae_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
return y_hat.sub(&y).map(|x| x.signum());
}

fn cross_entropy<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
let batches = y_hat.dim()[0];
let total = (-&y_hat * (y.map(|x| x.max(EPSILON).min(1f32 - EPSILON).ln()))).sum();
let total = (-&y * (y_hat.map(|x| x.max(EPSILON).min(1f32 - EPSILON).ln()))).sum();
return total / batches as f32;
}

fn cross_entropy_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
return -&y_hat / (&y + EPSILON);
return -&y / (&y_hat + EPSILON);
}

fn bin_cross_entropy<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
@@ -63,7 +91,7 @@ fn bin_cross_entropy<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) ->
}

fn bin_cross_entropy_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
return (-&y_hat / (&y + EPSILON)) + (1.0 - &y_hat) / (1.0 - &y + EPSILON);
return (-&y / (&y_hat + EPSILON)) + (1.0 - &y) / (1.0 - &y_hat + EPSILON);
}

fn hinge<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
@@ -85,5 +113,101 @@ fn hinge_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD
*result_i = -y_i;
}
}
return result;
return result
}

pub fn smooth_hinge<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
y_hat
.iter()
.zip(y.iter())
.map(|(y_hat_i, y_i)| {
let margin = y_i * y_hat_i;
if margin > -1f32 {
(1.0 - margin).max(0.0)
} else {
-4f32 * margin
}
})
.collect::<Array1<f32>>()
.to_shape(y.shape())
.unwrap()
.to_owned()
.sum()
/ y.len() as f32
}

pub fn smooth_hinge_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
y_hat
.iter()
.zip(y.iter())
.map(|(y_hat_i, y_i)| {
let margin = y_i * y_hat_i;
if margin > -1f32 {
-y_i
} else {
-4f32 * y_i
}
})
.collect::<Array1<f32>>()
.to_shape(y.shape())
.unwrap()
.to_owned()
}

pub fn tukey<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
let c_squared = TUKEY_C * TUKEY_C / 6.0;
y.sub(&y_hat)
.map(|el| {
let r = el.abs();
if r <= TUKEY_C {
c_squared * (1.0 - (1.0 - (r / TUKEY_C).powi(2)).powi(3))
} else {
c_squared
}
})
.sum()
/ y.len() as f32
}

pub fn tukey_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
y.sub(&y_hat).map(|el| {
let r = el.abs();
if r <= TUKEY_C {
r * (1.0 - ((r / TUKEY_C).powi(2))).powi(2)
} else {
0f32
}
})
}

pub fn huber<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> f32 {
let loss: Array1<f32> = y_hat
.iter()
.zip(y.iter())
.map(|(y_hat_i, y_i)| {
let residual = y_i - y_hat_i;
if residual.abs() <= HUBER_DELTA {
0.5 * residual.powi(2)
} else {
HUBER_DELTA * (residual.abs() - 0.5 * HUBER_DELTA)
}
})
.collect();
loss.to_shape(y.shape()).unwrap().sum() / y.len() as f32
}

pub fn huber_prime<'a>(y_hat: ArrayViewD<'a, f32>, y: ArrayViewD<'a, f32>) -> ArrayD<f32> {
let gradient: Array1<f32> = y_hat
.iter()
.zip(y.iter())
.map(|(y_hat_i, y_i)| {
let residual = y_i - y_hat_i;
if residual.abs() <= HUBER_DELTA {
-residual
} else {
-HUBER_DELTA * residual.signum()
}
})
.collect();
gradient.to_shape(y.shape()).unwrap().to_owned()
}
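For reference, the robust losses added here follow their usual per-residual definitions, with the constants fixed above (δ = 1.5 for Huber, c = 4.685 for Tukey); both costs are then averaged over the batch:

```latex
% Huber loss on a residual r = y - \hat{y}, with \delta = 1.5:
L_\delta(r) =
\begin{cases}
  \tfrac{1}{2}\, r^2 & \text{if } |r| \le \delta \\
  \delta \left( |r| - \tfrac{1}{2}\delta \right) & \text{otherwise}
\end{cases}

% Tukey biweight loss, with c = 4.685:
\rho_c(r) =
\begin{cases}
  \tfrac{c^2}{6} \left[ 1 - \left( 1 - (r/c)^2 \right)^3 \right] & \text{if } |r| \le c \\
  \tfrac{c^2}{6} & \text{otherwise}
\end{cases}
```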
7 changes: 7 additions & 0 deletions crates/core/src/cpu/layers/batchnorm1d.rs
@@ -35,6 +35,10 @@ pub struct BatchNorm1DCPULayer {
// gradients
pub d_gamma: Array2<f32>,
pub d_beta: Array2<f32>,

// regularization
pub l_gamma: Array2<f32>,
pub l_beta: Array2<f32>,
}

impl BatchNorm1DCPULayer {
@@ -74,6 +78,9 @@ impl BatchNorm1DCPULayer {

d_gamma: Array2::zeros((1, size[1])),
d_beta: Array2::zeros((1, size[1])),

l_gamma: Array2::zeros((1, size[1])),
l_beta: Array2::zeros((1, size[1])),
}
}

7 changes: 7 additions & 0 deletions crates/core/src/cpu/layers/batchnorm2d.rs
@@ -32,6 +32,10 @@ pub struct BatchNorm2DCPULayer {
// gradients
pub d_gamma: Array4<f32>,
pub d_beta: Array4<f32>,

// regularization
pub l_gamma: Array4<f32>,
pub l_beta: Array4<f32>,
}

impl BatchNorm2DCPULayer {
@@ -71,6 +75,9 @@ impl BatchNorm2DCPULayer {

d_gamma: Array4::zeros((1, size[1], 1, 1)),
d_beta: Array4::zeros((1, size[1], 1, 1)),

l_gamma: Array4::zeros((1, size[1], 1, 1)),
l_beta: Array4::zeros((1, size[1], 1, 1)),
}
}

18 changes: 15 additions & 3 deletions crates/core/src/cpu/layers/conv2d.rs
@@ -1,7 +1,7 @@
use ndarray::{s, Array1, Array4, ArrayD, Dimension, Ix1, Ix4, IxDyn};
use std::ops::{Add, AddAssign, Mul};

use crate::{CPUInit, Conv2DLayer, Init, Tensors};
use crate::{CPUInit, CPURegularizer, Conv2DLayer, Init, Tensors};

pub struct Conv2DCPULayer {
// cache
@@ -17,6 +17,12 @@ pub struct Conv2DCPULayer {
// gradients
pub d_weights: Array4<f32>,
pub d_biases: Array1<f32>,

// regularization
pub l_weights: Array4<f32>,
pub l_biases: Array1<f32>,

pub regularizer: CPURegularizer,
}

impl Conv2DCPULayer {
@@ -30,7 +36,6 @@ impl Conv2DCPULayer {
let input_size = Ix4(size[0], size[1], input_y, input_x);
let weight_size = IxDyn(config.kernel_size.as_slice());
let output_size = Ix4(size[0], weight_size[0], output_y, output_x);

let (weights, biases) = if let Some(Tensors::Conv(tensors)) = tensors {
(tensors.weights, tensors.biases)
} else {
@@ -54,10 +59,15 @@ impl Conv2DCPULayer {
inputs: Array4::zeros(input_size),
weights: weights.into_dimensionality::<Ix4>().unwrap(),
biases: biases.into_dimensionality::<Ix1>().unwrap(),
d_weights: ArrayD::zeros(weight_size)
d_weights: ArrayD::zeros(weight_size.clone())
.into_dimensionality::<Ix4>()
.unwrap(),
d_biases: Array1::zeros(config.kernel_size[0]),
l_weights: ArrayD::zeros(weight_size)
.into_dimensionality::<Ix4>()
.unwrap(),
l_biases: Array1::zeros(config.kernel_size[0]),
regularizer: CPURegularizer::from(config.c.unwrap_or(0.0), config.l1_ratio.unwrap_or(1.0))
}
}

@@ -138,6 +148,8 @@ impl Conv2DCPULayer {
}
}
}
self.l_weights = self.regularizer.coeff(&self.weights.clone().into_dyn()).into_dimensionality::<Ix4>().unwrap();
self.l_biases = self.regularizer.coeff(&self.biases.clone().into_dyn()).into_dimensionality::<Ix1>().unwrap();

d_inputs.into_dyn()
}
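The `Conv2DCPULayer` now owns a `CPURegularizer` built from `config.c` and `config.l1_ratio` and stores its per-parameter output in `l_weights`/`l_biases` during the backward pass. The regularizer's internals are not part of this diff; the two parameters suggest an elastic-net-style penalty, roughly as sketched below (an assumption for illustration, not the crate's actual implementation):

```rust
use ndarray::ArrayD;

/// Hypothetical elastic-net regularizer: `c` scales the penalty, `l1_ratio`
/// blends L1 (ratio = 1.0) and L2 (ratio = 0.0). The real `CPURegularizer`
/// in this crate may differ in detail.
pub struct ElasticNetSketch {
    c: f32,
    l1_ratio: f32,
}

impl ElasticNetSketch {
    pub fn from(c: f32, l1_ratio: f32) -> Self {
        Self { c, l1_ratio }
    }

    /// Gradient contribution of c * (l1_ratio * |w| + (1 - l1_ratio) * w^2 / 2)
    /// per parameter, to be added to the data-term gradients during the update.
    pub fn coeff(&self, params: &ArrayD<f32>) -> ArrayD<f32> {
        params.map(|w| {
            let w = *w;
            self.c * (self.l1_ratio * w.signum() + (1.0 - self.l1_ratio) * w)
        })
    }
}
```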