From 00f3de3bf43124e1ae460097c61e594a2ba97c44 Mon Sep 17 00:00:00 2001
From: Mateusz Sluszniak <56299341+msluszniak@users.noreply.github.com>
Date: Wed, 20 Dec 2023 15:07:24 +0100
Subject: [PATCH] Add other non-encoding preprocessing utilities as separate modules (#222)

* Add other non-encoding preprocessing utilities as separate modules

* Delete test/scholar/preprocessing/normalizer_test.exs

* Delete lib/scholar/preprocessing/normalizer.ex

* Format

* Remove redundant module attribute
---
 lib/scholar/preprocessing.ex                |  79 +-------
 lib/scholar/preprocessing/max_abs_scaler.ex | 119 ++++++++++++
 lib/scholar/preprocessing/min_max_scaler.ex | 169 ++++++++++++++++++
 .../preprocessing/max_abs_scaler_test.exs   |  28 +++
 .../preprocessing/min_max_scaler_test.exs   |  45 +++++
 5 files changed, 367 insertions(+), 73 deletions(-)
 create mode 100644 lib/scholar/preprocessing/max_abs_scaler.ex
 create mode 100644 lib/scholar/preprocessing/min_max_scaler.ex
 create mode 100644 test/scholar/preprocessing/max_abs_scaler_test.exs
 create mode 100644 test/scholar/preprocessing/min_max_scaler_test.exs

diff --git a/lib/scholar/preprocessing.ex b/lib/scholar/preprocessing.ex
index eaf322dc..915a5f54 100644
--- a/lib/scholar/preprocessing.ex
+++ b/lib/scholar/preprocessing.ex
@@ -26,25 +26,6 @@ defmodule Scholar.Preprocessing do
       ]
     ]
 
-  min_max_schema =
-    general_schema ++
-      [
-        min: [
-          type: {:or, [:integer, :float]},
-          default: 0,
-          doc: """
-          The lower boundary of the desired range of transformed data.
-          """
-        ],
-        max: [
-          type: {:or, [:integer, :float]},
-          default: 1,
-          doc: """
-          The upper boundary of the desired range of transformed data.
-          """
-        ]
-      ]
-
   normalize_schema =
     general_schema ++
       [
@@ -75,8 +56,6 @@ defmodule Scholar.Preprocessing do
       ]
     ]
 
-  @general_schema NimbleOptions.new!(general_schema)
-  @min_max_schema NimbleOptions.new!(min_max_schema)
   @normalize_schema NimbleOptions.new!(normalize_schema)
   @binarize_schema NimbleOptions.new!(binarize_schema)
   @encode_schema NimbleOptions.new!(encode_schema)
@@ -101,11 +80,9 @@
   end
 
   @doc """
-  Scales a tensor by dividing each sample in batch by maximum absolute value in the batch
+  It is a shortcut for `Scholar.Preprocessing.MaxAbsScaler.fit_transform/2`.
+  See `Scholar.Preprocessing.MaxAbsScaler` for more information.
 
-  ## Options
-
-  #{NimbleOptions.docs(@general_schema)}
 
   ## Examples
@@ -133,20 +110,12 @@
       >
  """
  deftransform max_abs_scale(tensor, opts \\ []) do
-    max_abs_scale_n(tensor, NimbleOptions.validate!(opts, @general_schema))
-  end
-
-  defnp max_abs_scale_n(tensor, opts) do
-    max_abs = Nx.abs(tensor) |> Nx.reduce_max(axes: opts[:axes], keep_axes: true)
-    tensor / Nx.select(max_abs == 0, 1, max_abs)
+    Scholar.Preprocessing.MaxAbsScaler.fit_transform(tensor, opts)
  end
 
  @doc """
-  Transform a tensor by scaling each batch to the given range.
-
-  ## Options
-
-  #{NimbleOptions.docs(@min_max_schema)}
+  It is a shortcut for `Scholar.Preprocessing.MinMaxScaler.fit_transform/2`.
+  See `Scholar.Preprocessing.MinMaxScaler` for more information.
 
  ## Examples
@@ -156,28 +125,6 @@ defmodule Scholar.Preprocessing do
        [0.0, 0.5, 1.0]
      >
 
-      iex> Scholar.Preprocessing.min_max_scale(Nx.tensor([[1, -1, 2], [3, 0, 0], [0, 1, -1], [2, 3, 1]]), axes: [0])
-      #Nx.Tensor<
-        f32[4][3]
-        [
-          [0.3333333432674408, 0.0, 1.0],
-          [1.0, 0.25, 0.3333333432674408],
-          [0.0, 0.5, 0.0],
-          [0.6666666865348816, 1.0, 0.6666666865348816]
-        ]
-      >
-
-      iex> Scholar.Preprocessing.min_max_scale(Nx.tensor([[1, -1, 2], [3, 0, 0], [0, 1, -1], [2, 3, 1]]), axes: [0], min: 1, max: 3)
-      #Nx.Tensor<
-        f32[4][3]
-        [
-          [1.6666667461395264, 1.0, 3.0],
-          [3.0, 1.5, 1.6666667461395264],
-          [1.0, 2.0, 1.0],
-          [2.3333334922790527, 3.0, 2.3333334922790527]
-        ]
-      >
-
      iex> Scholar.Preprocessing.min_max_scale(42)
      #Nx.Tensor<
        f32
        0.0
      >
  """
  deftransform min_max_scale(tensor, opts \\ []) do
-    min_max_scale_n(tensor, NimbleOptions.validate!(opts, @min_max_schema))
-  end
-
-  defnp min_max_scale_n(tensor, opts) do
-    if opts[:max] <= opts[:min] do
-      raise ArgumentError,
-            "expected :max to be greater than :min"
-    else
-      reduced_max = Nx.reduce_max(tensor, axes: opts[:axes], keep_axes: true)
-      reduced_min = Nx.reduce_min(tensor, axes: opts[:axes], keep_axes: true)
-      denominator = reduced_max - reduced_min
-      denominator = Nx.select(denominator == 0, 1, denominator)
-      x_std = (tensor - reduced_min) / denominator
-      x_std * (opts[:max] - opts[:min]) + opts[:min]
-    end
+    Scholar.Preprocessing.MinMaxScaler.fit_transform(tensor, opts)
  end
 
  @doc """
diff --git a/lib/scholar/preprocessing/max_abs_scaler.ex b/lib/scholar/preprocessing/max_abs_scaler.ex
new file mode 100644
index 00000000..b84ed029
--- /dev/null
+++ b/lib/scholar/preprocessing/max_abs_scaler.ex
@@ -0,0 +1,119 @@
+defmodule Scholar.Preprocessing.MaxAbsScaler do
+  @moduledoc """
+  Scales a tensor by dividing each sample in the batch by the maximum
+  absolute value in the batch.
+
+  Scaling happens independently on each feature by computing the relevant
+  statistics on the samples in the training set. The maximum absolute value
+  is then stored to be used on new samples.
+  """
+
+  import Nx.Defn
+
+  @derive {Nx.Container, containers: [:max_abs]}
+  defstruct [:max_abs]
+
+  opts_schema = [
+    axes: [
+      type: {:custom, Scholar.Options, :axes, []},
+      doc: """
+      Axes to calculate the maximum absolute value over. By default the
+      value is calculated over the whole tensor.
+      """
+    ]
+  ]
+
+  @opts_schema NimbleOptions.new!(opts_schema)
+
+  @doc """
+  Computes the maximum absolute value of samples to be used for later scaling.
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Return values
+
+  Returns a struct with the following parameters:
+
+    * `max_abs`: the calculated maximum absolute value of samples.
+
+  ## Examples
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> Scholar.Preprocessing.MaxAbsScaler.fit(t)
+      %Scholar.Preprocessing.MaxAbsScaler{
+        max_abs: Nx.tensor(
+          [
+            [2]
+          ]
+        )
+      }
+  """
+  deftransform fit(tensor, opts \\ []) do
+    fit_n(tensor, NimbleOptions.validate!(opts, @opts_schema))
+  end
+
+  defnp fit_n(tensor, opts) do
+    max_abs =
+      Nx.abs(tensor)
+      |> Nx.reduce_max(axes: opts[:axes], keep_axes: true)
+
+    max_abs = Nx.select(max_abs == 0, 1, max_abs)
+
+    %__MODULE__{max_abs: max_abs}
+  end
+
+  @doc """
+  Scales the tensor using a fitted scaler.
+
+  ## Examples
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> scaler = Scholar.Preprocessing.MaxAbsScaler.fit(t)
+      iex> Scholar.Preprocessing.MaxAbsScaler.transform(scaler, t)
+      #Nx.Tensor<
+        f32[3][3]
+        [
+          [0.5, -0.5, 1.0],
+          [1.0, 0.0, 0.0],
+          [0.0, 0.5, -0.5]
+        ]
+      >
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> scaler = Scholar.Preprocessing.MaxAbsScaler.fit(t)
+      iex> new_tensor = Nx.tensor([[0.5, 1, -1], [0.3, 0.8, -1.6]])
+      iex> Scholar.Preprocessing.MaxAbsScaler.transform(scaler, new_tensor)
+      #Nx.Tensor<
+        f32[2][3]
+        [
+          [0.25, 0.5, -0.5],
+          [0.15000000596046448, 0.4000000059604645, -0.800000011920929]
+        ]
+      >
+  """
+  defn transform(%__MODULE__{max_abs: max_abs}, tensor) do
+    tensor / max_abs
+  end
+
+  @doc """
+  Computes the maximum absolute value of samples and scales the tensor in one step.
+
+  ## Examples
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> Scholar.Preprocessing.MaxAbsScaler.fit_transform(t)
+      #Nx.Tensor<
+        f32[3][3]
+        [
+          [0.5, -0.5, 1.0],
+          [1.0, 0.0, 0.0],
+          [0.0, 0.5, -0.5]
+        ]
+      >
+  """
+  defn fit_transform(tensor, opts \\ []) do
+    tensor
+    |> fit(opts)
+    |> transform(tensor)
+  end
+end
diff --git a/lib/scholar/preprocessing/min_max_scaler.ex b/lib/scholar/preprocessing/min_max_scaler.ex
new file mode 100644
index 00000000..9c98a103
--- /dev/null
+++ b/lib/scholar/preprocessing/min_max_scaler.ex
@@ -0,0 +1,169 @@
+defmodule Scholar.Preprocessing.MinMaxScaler do
+  @moduledoc """
+  Scales a tensor so that each sample in the batch lies in the provided range
+  (the default range is `[0, 1]`).
+
+  Scaling happens independently on each feature by computing the minimum and
+  maximum of the samples in the training set. These statistics are then
+  stored to be used on new samples.
+  """
+
+  import Nx.Defn
+
+  @derive {Nx.Container, containers: [:min_data, :max_data, :min_bound, :max_bound]}
+  defstruct [:min_data, :max_data, :min_bound, :max_bound]
+
+  opts_schema = [
+    axes: [
+      type: {:custom, Scholar.Options, :axes, []},
+      doc: """
+      Axes to calculate the minimum and maximum values over. By default the
+      values are calculated over the whole tensor.
+      """
+    ],
+    min_bound: [
+      type: {:or, [:integer, :float]},
+      default: 0,
+      doc: """
+      The lower boundary of the desired range of transformed data.
+      """
+    ],
+    max_bound: [
+      type: {:or, [:integer, :float]},
+      default: 1,
+      doc: """
+      The upper boundary of the desired range of transformed data.
+      """
+    ]
+  ]
+
+  @opts_schema NimbleOptions.new!(opts_schema)
+
+  @doc """
+  Computes the minimum and maximum values of samples to be used for later scaling.
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Return values
+
+  Returns a struct with the following parameters:
+
+    * `min_data`: the calculated minimum value of samples.
+
+    * `max_data`: the calculated maximum value of samples.
+
+    * `min_bound`: the lower boundary of the desired range of transformed data.
+
+    * `max_bound`: the upper boundary of the desired range of transformed data.
+
+  ## Examples
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> Scholar.Preprocessing.MinMaxScaler.fit(t)
+      %Scholar.Preprocessing.MinMaxScaler{
+        min_data: Nx.tensor(
+          [
+            [-1]
+          ]
+        ),
+        max_data: Nx.tensor(
+          [
+            [2]
+          ]
+        ),
+        min_bound: Nx.tensor(
+          0
+        ),
+        max_bound: Nx.tensor(
+          1
+        )
+      }
+  """
+  deftransform fit(tensor, opts \\ []) do
+    fit_n(tensor, NimbleOptions.validate!(opts, @opts_schema))
+  end
+
+  defnp fit_n(tensor, opts) do
+    if opts[:max_bound] <= opts[:min_bound] do
+      raise ArgumentError,
+            "expected :max_bound to be greater than :min_bound"
+    else
+      reduced_max = Nx.reduce_max(tensor, axes: opts[:axes], keep_axes: true)
+      reduced_min = Nx.reduce_min(tensor, axes: opts[:axes], keep_axes: true)
+
+      %__MODULE__{
+        min_data: reduced_min,
+        max_data: reduced_max,
+        min_bound: opts[:min_bound],
+        max_bound: opts[:max_bound]
+      }
+    end
+  end
+
+  @doc """
+  Scales the tensor to the configured range using a fitted scaler.
+
+  ## Examples
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> scaler = Scholar.Preprocessing.MinMaxScaler.fit(t)
+      iex> Scholar.Preprocessing.MinMaxScaler.transform(scaler, t)
+      #Nx.Tensor<
+        f32[3][3]
+        [
+          [0.6666666865348816, 0.0, 1.0],
+          [1.0, 0.3333333432674408, 0.3333333432674408],
+          [0.3333333432674408, 0.6666666865348816, 0.0]
+        ]
+      >
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> scaler = Scholar.Preprocessing.MinMaxScaler.fit(t)
+      iex> new_tensor = Nx.tensor([[0.5, 1, -1], [0.3, 0.8, -1.6]])
+      iex> Scholar.Preprocessing.MinMaxScaler.transform(scaler, new_tensor)
+      #Nx.Tensor<
+        f32[2][3]
+        [
+          [0.5, 0.6666666865348816, 0.0],
+          [0.43333330750465393, 0.5999999642372131, -0.20000000298023224]
+        ]
+      >
+  """
+  defn transform(
+         %__MODULE__{
+           min_data: min_data,
+           max_data: max_data,
+           min_bound: min_bound,
+           max_bound: max_bound
+         },
+         tensor
+       ) do
+    denominator = max_data - min_data
+    denominator = Nx.select(denominator == 0, 1, denominator)
+    x_std = (tensor - min_data) / denominator
+    x_std * (max_bound - min_bound) + min_bound
+  end
+
+  @doc """
+  Computes the minimum and maximum values of samples and scales the tensor to the given range in one step.
+
+  ## Examples
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> Scholar.Preprocessing.MinMaxScaler.fit_transform(t)
+      #Nx.Tensor<
+        f32[3][3]
+        [
+          [0.6666666865348816, 0.0, 1.0],
+          [1.0, 0.3333333432674408, 0.3333333432674408],
+          [0.3333333432674408, 0.6666666865348816, 0.0]
+        ]
+      >
+  """
+  defn fit_transform(tensor, opts \\ []) do
+    tensor
+    |> fit(opts)
+    |> transform(tensor)
+  end
+end
diff --git a/test/scholar/preprocessing/max_abs_scaler_test.exs b/test/scholar/preprocessing/max_abs_scaler_test.exs
new file mode 100644
index 00000000..38daf5d0
--- /dev/null
+++ b/test/scholar/preprocessing/max_abs_scaler_test.exs
@@ -0,0 +1,28 @@
+defmodule Scholar.Preprocessing.MaxAbsScalerTest do
+  use Scholar.Case, async: true
+  alias Scholar.Preprocessing.MaxAbsScaler
+
+  doctest MaxAbsScaler
+
+  describe "fit_transform/2" do
+    test "set axes to [0]" do
+      data = Nx.tensor([[1, -1, 2], [3, 0, 0], [0, 1, -1], [2, 3, 1]])
+
+      expected =
+        Nx.tensor([
+          [0.3333333432674408, -0.3333333432674408, 1.0],
+          [1.0, 0.0, 0.0],
+          [0.0, 0.3333333432674408, -0.5],
+          [0.6666666865348816, 1.0, 0.5]
+        ])
+
+      assert_all_close(MaxAbsScaler.fit_transform(data, axes: [0]), expected)
+    end
+
+    test "works when the tensor contains only zeros" do
+      data = Nx.broadcast(Nx.f32(0), {3, 3})
+      expected = data
+      assert MaxAbsScaler.fit_transform(data) == expected
+    end
+  end
+end
diff --git a/test/scholar/preprocessing/min_max_scaler_test.exs b/test/scholar/preprocessing/min_max_scaler_test.exs
new file mode 100644
index 00000000..bafe4406
--- /dev/null
+++ b/test/scholar/preprocessing/min_max_scaler_test.exs
@@ -0,0 +1,45 @@
+defmodule Scholar.Preprocessing.MinMaxScalerTest do
+  use Scholar.Case, async: true
+  alias Scholar.Preprocessing.MinMaxScaler
+
+  doctest MinMaxScaler
+
+  describe "fit_transform/2" do
+    test "set axes to [0]" do
+      data = Nx.tensor([[1, -1, 2], [3, 0, 0], [0, 1, -1], [2, 3, 1]])
+
+      expected =
+        Nx.tensor([
+          [0.3333333432674408, 0.0, 1.0],
+          [1.0, 0.25, 0.3333333432674408],
+          [0.0, 0.5, 0.0],
+          [0.6666666865348816, 1.0, 0.6666666865348816]
+        ])
+
+      assert_all_close(MinMaxScaler.fit_transform(data, axes: [0]), expected)
+    end
+
+    test "set axes to [0], min_bound to 1, and max_bound to 3" do
+      data = Nx.tensor([[1, -1, 2], [3, 0, 0], [0, 1, -1], [2, 3, 1]])
+
+      expected =
+        Nx.tensor([
+          [1.6666667461395264, 1.0, 3.0],
+          [3.0, 1.5, 1.6666667461395264],
+          [1.0, 2.0, 1.0],
+          [2.3333334922790527, 3.0, 2.3333334922790527]
+        ])
+
+      assert_all_close(
+        MinMaxScaler.fit_transform(data, axes: [0], min_bound: 1, max_bound: 3),
+        expected
+      )
+    end
+
+    test "works when the tensor contains only zeros" do
+      data = Nx.broadcast(Nx.f32(0), {3, 3})
+      expected = data
+      assert MinMaxScaler.fit_transform(data) == expected
+    end
+  end
+end
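
---

Usage sketch for the fit/transform split this patch introduces, based on the doctests above; the tensor values are illustrative:

    # Fit once on training data, then reuse the fitted scaler on new samples.
    train = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
    scaler = Scholar.Preprocessing.MinMaxScaler.fit(train, min_bound: 0, max_bound: 1)

    new_data = Nx.tensor([[0.5, 1, -1]])
    Scholar.Preprocessing.MinMaxScaler.transform(scaler, new_data)

    # The one-shot shortcuts in Scholar.Preprocessing remain available:
    Scholar.Preprocessing.min_max_scale(train)
    Scholar.Preprocessing.max_abs_scale(train)

Separating fit/2 from transform/2 lets statistics computed on the training set be applied to new samples, which the one-shot shortcuts cannot do.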