Add other non-encoding preprocessing utilities as separate modules (elixir-nx#222)

* Add other non-encoding preprocessing utilities as separate modules

* Delete test/scholar/preprocessing/normalizer_test.exs

* Delete lib/scholar/preprocessing/normalizer.ex

* Format

* Remove redundant module attribute
msluszniak authored Dec 20, 2023
1 parent 6c60968 commit 00f3de3
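
The refactor keeps the existing one-shot helpers in `Scholar.Preprocessing` but delegates them to dedicated scaler modules, so a scaler can also be fitted once and reused on new data. A minimal sketch of both call styles, based on the diff below (outputs in the comments are approximate, not verified doctest output):

iex> t = Nx.tensor([1, 2, 3])
iex> Scholar.Preprocessing.max_abs_scale(t)                      # shortcut, same call site as before
# => roughly [0.33, 0.67, 1.0]
iex> scaler = Scholar.Preprocessing.MaxAbsScaler.fit(t)          # fit once on training data
iex> Scholar.Preprocessing.MaxAbsScaler.transform(scaler, Nx.tensor([4, 5, 6]))
# => roughly [1.33, 1.67, 2.0], scaled with the max_abs stored from `t`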
Showing 5 changed files with 367 additions and 73 deletions.
79 changes: 6 additions & 73 deletions lib/scholar/preprocessing.ex
@@ -26,25 +26,6 @@ defmodule Scholar.Preprocessing do
]
]

min_max_schema =
general_schema ++
[
min: [
type: {:or, [:integer, :float]},
default: 0,
doc: """
The lower boundary of the desired range of transformed data.
"""
],
max: [
type: {:or, [:integer, :float]},
default: 1,
doc: """
The upper boundary of the desired range of transformed data.
"""
]
]

normalize_schema =
general_schema ++
[
@@ -75,8 +56,6 @@ defmodule Scholar.Preprocessing do
]
]

@general_schema NimbleOptions.new!(general_schema)
@min_max_schema NimbleOptions.new!(min_max_schema)
@normalize_schema NimbleOptions.new!(normalize_schema)
@binarize_schema NimbleOptions.new!(binarize_schema)
@encode_schema NimbleOptions.new!(encode_schema)
@@ -101,11 +80,9 @@ defmodule Scholar.Preprocessing do
end

@doc """
Scales a tensor by dividing each sample in the batch by the maximum absolute value in the batch.
It is a shortcut for `Scholar.Preprocessing.MaxAbsScaler.fit_transform/2`.
See `Scholar.Preprocessing.MaxAbsScaler` for more information.
## Options
#{NimbleOptions.docs(@general_schema)}
## Examples
@@ -133,20 +110,12 @@ defmodule Scholar.Preprocessing do
>
"""
deftransform max_abs_scale(tensor, opts \\ []) do
max_abs_scale_n(tensor, NimbleOptions.validate!(opts, @general_schema))
end

defnp max_abs_scale_n(tensor, opts) do
max_abs = Nx.abs(tensor) |> Nx.reduce_max(axes: opts[:axes], keep_axes: true)
tensor / Nx.select(max_abs == 0, 1, max_abs)
Scholar.Preprocessing.MaxAbsScaler.fit_transform(tensor, opts)
end

@doc """
Transforms a tensor by scaling each batch to the given range.
## Options
#{NimbleOptions.docs(@min_max_schema)}
It is a shortcut for `Scholar.Preprocessing.MinMaxScaler.fit_transform/2`.
See `Scholar.Preprocessing.MinMaxScaler` for more information.
## Examples
@@ -156,50 +125,14 @@ defmodule Scholar.Preprocessing do
[0.0, 0.5, 1.0]
>
iex> Scholar.Preprocessing.min_max_scale(Nx.tensor([[1, -1, 2], [3, 0, 0], [0, 1, -1], [2, 3, 1]]), axes: [0])
#Nx.Tensor<
f32[4][3]
[
[0.3333333432674408, 0.0, 1.0],
[1.0, 0.25, 0.3333333432674408],
[0.0, 0.5, 0.0],
[0.6666666865348816, 1.0, 0.6666666865348816]
]
>
iex> Scholar.Preprocessing.min_max_scale(Nx.tensor([[1, -1, 2], [3, 0, 0], [0, 1, -1], [2, 3, 1]]), axes: [0], min: 1, max: 3)
#Nx.Tensor<
f32[4][3]
[
[1.6666667461395264, 1.0, 3.0],
[3.0, 1.5, 1.6666667461395264],
[1.0, 2.0, 1.0],
[2.3333334922790527, 3.0, 2.3333334922790527]
]
>
iex> Scholar.Preprocessing.min_max_scale(42)
#Nx.Tensor<
f32
0.0
>
"""
deftransform min_max_scale(tensor, opts \\ []) do
min_max_scale_n(tensor, NimbleOptions.validate!(opts, @min_max_schema))
end

defnp min_max_scale_n(tensor, opts) do
if opts[:max] <= opts[:min] do
raise ArgumentError,
"expected :max to be greater than :min"
else
reduced_max = Nx.reduce_max(tensor, axes: opts[:axes], keep_axes: true)
reduced_min = Nx.reduce_min(tensor, axes: opts[:axes], keep_axes: true)
denominator = reduced_max - reduced_min
denominator = Nx.select(denominator == 0, 1, denominator)
x_std = (tensor - reduced_min) / denominator
x_std * (opts[:max] - opts[:min]) + opts[:min]
end
Scholar.Preprocessing.MinMaxScaler.fit_transform(tensor, opts)
end

@doc """
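With the delegation above, the `:min`/`:max` options that were removed from this file presumably move into the new module. A hedged sketch, assuming `Scholar.Preprocessing.MinMaxScaler.fit_transform/2` accepts the same `:min` and `:max` options as the old shortcut (the MinMaxScaler source itself is not part of the excerpt shown here):

iex> t = Nx.tensor([1, 2, 3])
iex> Scholar.Preprocessing.MinMaxScaler.fit_transform(t, min: 1, max: 3)  # option names assumed, not shown in this excerpt
# expected to match the old min_max_scale/2 result: [1.0, 2.0, 3.0]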
119 changes: 119 additions & 0 deletions lib/scholar/preprocessing/max_abs_scaler.ex
@@ -0,0 +1,119 @@
defmodule Scholar.Preprocessing.MaxAbsScaler do
@moduledoc """
Scales a tensor by dividing each sample in the batch by the maximum absolute value in the batch.
Scaling happens independently on each feature by computing the relevant
statistics on the samples in the training set. The maximum absolute value is then
stored to be used on new samples.
"""

import Nx.Defn

@derive {Nx.Container, containers: [:max_abs]}
defstruct [:max_abs]

opts_schema = [
axes: [
type: {:custom, Scholar.Options, :axes, []},
doc: """
Axes to calculate the max absolute value over. By default the absolute values
are calculated between the whole tensors.
"""
]
]

@opts_schema NimbleOptions.new!(opts_schema)

@doc """
Compute the maximum absolute value of samples to be used for later scaling.
## Options
#{NimbleOptions.docs(@opts_schema)}
## Return values
Returns a struct with the following parameters:
* `max_abs`: the calculated maximum absolute value of samples.
## Examples
iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
iex> Scholar.Preprocessing.MaxAbsScaler.fit(t)
%Scholar.Preprocessing.MaxAbsScaler{
max_abs: Nx.tensor(
[
[2]
]
)
}
"""
deftransform fit(tensor, opts \\ []) do
fit_n(tensor, NimbleOptions.validate!(opts, @opts_schema))
end

defnp fit_n(tensor, opts) do
max_abs =
Nx.abs(tensor)
|> Nx.reduce_max(axes: opts[:axes], keep_axes: true)

max_abs = Nx.select(max_abs == 0, 1, max_abs)

%__MODULE__{max_abs: max_abs}
end

@doc """
Performs scaling of the tensor using a fitted scaler.
## Examples
iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
iex> scaler = Scholar.Preprocessing.MaxAbsScaler.fit(t)
iex> Scholar.Preprocessing.MaxAbsScaler.transform(scaler, t)
#Nx.Tensor<
f32[3][3]
[
[0.5, -0.5, 1.0],
[1.0, 0.0, 0.0],
[0.0, 0.5, -0.5]
]
>
iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
iex> scaler = Scholar.Preprocessing.MaxAbsScaler.fit(t)
iex> new_tensor = Nx.tensor([[0.5, 1, -1], [0.3, 0.8, -1.6]])
iex> Scholar.Preprocessing.MaxAbsScaler.transform(scaler, new_tensor)
#Nx.Tensor<
f32[2][3]
[
[0.25, 0.5, -0.5],
[0.15000000596046448, 0.4000000059604645, -0.800000011920929]
]
>
"""
defn transform(%__MODULE__{max_abs: max_abs}, tensor) do
tensor / max_abs
end

@doc """
Computes the maximum absolute value of the samples and scales the tensor in a single step.
## Examples
iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
iex> Scholar.Preprocessing.MaxAbsScaler.fit_transform(t)
#Nx.Tensor<
f32[3][3]
[
[0.5, -0.5, 1.0],
[1.0, 0.0, 0.0],
[0.0, 0.5, -0.5]
]
>
"""
defn fit_transform(tensor, opts \\ []) do
tensor
|> fit(opts)
|> transform(tensor)
end
end
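
None of the doctests above exercise the `:axes` option. A small sketch of per-feature scaling follows; the values in the comments are computed by hand from the `fit_n`/`transform` logic in this file, so treat them as expected rather than verified output:

iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
iex> scaler = Scholar.Preprocessing.MaxAbsScaler.fit(t, axes: [0])   # per-column maximum absolute values: [[2, 1, 2]]
iex> Scholar.Preprocessing.MaxAbsScaler.transform(scaler, t)
# expected: [[0.5, -1.0, 1.0], [1.0, 0.0, 0.0], [0.0, 1.0, -0.5]]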