From 00f3de3bf43124e1ae460097c61e594a2ba97c44 Mon Sep 17 00:00:00 2001
From: Mateusz Sluszniak <56299341+msluszniak@users.noreply.github.com>
Date: Wed, 20 Dec 2023 15:07:24 +0100
Subject: [PATCH] Add other non-encoding preprocessing utilities as separate modules (#222)

* Add other non-encoding preprocessing utilities as separate modules

* Delete test/scholar/preprocessing/normalizer_test.exs

* Delete lib/scholar/preprocessing/normalizer.ex

* Format

* Remove redundant module attribute
---
 lib/scholar/preprocessing.ex                |  79 +-------
 lib/scholar/preprocessing/max_abs_scaler.ex | 119 ++++++++++++
 lib/scholar/preprocessing/min_max_scaler.ex | 169 ++++++++++++++++++
 .../preprocessing/max_abs_scaler_test.exs   |  28 +++
 .../preprocessing/min_max_scaler_test.exs   |  45 +++++
 5 files changed, 367 insertions(+), 73 deletions(-)
 create mode 100644 lib/scholar/preprocessing/max_abs_scaler.ex
 create mode 100644 lib/scholar/preprocessing/min_max_scaler.ex
 create mode 100644 test/scholar/preprocessing/max_abs_scaler_test.exs
 create mode 100644 test/scholar/preprocessing/min_max_scaler_test.exs

diff --git a/lib/scholar/preprocessing.ex b/lib/scholar/preprocessing.ex
index eaf322dc..915a5f54 100644
--- a/lib/scholar/preprocessing.ex
+++ b/lib/scholar/preprocessing.ex
@@ -26,25 +26,6 @@ defmodule Scholar.Preprocessing do
       ]
     ]
 
-  min_max_schema =
-    general_schema ++
-      [
-        min: [
-          type: {:or, [:integer, :float]},
-          default: 0,
-          doc: """
-          The lower boundary of the desired range of transformed data.
-          """
-        ],
-        max: [
-          type: {:or, [:integer, :float]},
-          default: 1,
-          doc: """
-          The upper boundary of the desired range of transformed data.
-          """
-        ]
-      ]
-
   normalize_schema =
     general_schema ++
       [
@@ -75,8 +56,6 @@ defmodule Scholar.Preprocessing do
       ]
     ]
 
-  @general_schema NimbleOptions.new!(general_schema)
-  @min_max_schema NimbleOptions.new!(min_max_schema)
   @normalize_schema NimbleOptions.new!(normalize_schema)
   @binarize_schema NimbleOptions.new!(binarize_schema)
   @encode_schema NimbleOptions.new!(encode_schema)
@@ -101,11 +80,9 @@
   end
 
   @doc """
-  Scales a tensor by dividing each sample in batch by maximum absolute value in the batch
+  It is a shortcut for `Scholar.Preprocessing.MaxAbsScaler.fit_transform/2`.
+  See `Scholar.Preprocessing.MaxAbsScaler` for more information.
 
-  ## Options
-
-  #{NimbleOptions.docs(@general_schema)}
 
   ## Examples
@@ -133,20 +110,12 @@
       >
  """
  deftransform max_abs_scale(tensor, opts \\ []) do
-    max_abs_scale_n(tensor, NimbleOptions.validate!(opts, @general_schema))
-  end
-
-  defnp max_abs_scale_n(tensor, opts) do
-    max_abs = Nx.abs(tensor) |> Nx.reduce_max(axes: opts[:axes], keep_axes: true)
-    tensor / Nx.select(max_abs == 0, 1, max_abs)
+    Scholar.Preprocessing.MaxAbsScaler.fit_transform(tensor, opts)
  end
 
  @doc """
-  Transform a tensor by scaling each batch to the given range.
-
-  ## Options
-
-  #{NimbleOptions.docs(@min_max_schema)}
+  It is a shortcut for `Scholar.Preprocessing.MinMaxScaler.fit_transform/2`.
+  See `Scholar.Preprocessing.MinMaxScaler` for more information.
 
  ## Examples
@@ -156,28 +125,6 @@ defmodule Scholar.Preprocessing do
        [0.0, 0.5, 1.0]
      >
 
-      iex> Scholar.Preprocessing.min_max_scale(Nx.tensor([[1, -1, 2], [3, 0, 0], [0, 1, -1], [2, 3, 1]]), axes: [0])
-      #Nx.Tensor<
-        f32[4][3]
-        [
-          [0.3333333432674408, 0.0, 1.0],
-          [1.0, 0.25, 0.3333333432674408],
-          [0.0, 0.5, 0.0],
-          [0.6666666865348816, 1.0, 0.6666666865348816]
-        ]
-      >
-
-      iex> Scholar.Preprocessing.min_max_scale(Nx.tensor([[1, -1, 2], [3, 0, 0], [0, 1, -1], [2, 3, 1]]), axes: [0], min: 1, max: 3)
-      #Nx.Tensor<
-        f32[4][3]
-        [
-          [1.6666667461395264, 1.0, 3.0],
-          [3.0, 1.5, 1.6666667461395264],
-          [1.0, 2.0, 1.0],
-          [2.3333334922790527, 3.0, 2.3333334922790527]
-        ]
-      >
-
      iex> Scholar.Preprocessing.min_max_scale(42)
      #Nx.Tensor<
        f32
        0.0
      >
  """
  deftransform min_max_scale(tensor, opts \\ []) do
-    min_max_scale_n(tensor, NimbleOptions.validate!(opts, @min_max_schema))
-  end
-
-  defnp min_max_scale_n(tensor, opts) do
-    if opts[:max] <= opts[:min] do
-      raise ArgumentError,
-            "expected :max to be greater than :min"
-    else
-      reduced_max = Nx.reduce_max(tensor, axes: opts[:axes], keep_axes: true)
-      reduced_min = Nx.reduce_min(tensor, axes: opts[:axes], keep_axes: true)
-      denominator = reduced_max - reduced_min
-      denominator = Nx.select(denominator == 0, 1, denominator)
-      x_std = (tensor - reduced_min) / denominator
-      x_std * (opts[:max] - opts[:min]) + opts[:min]
-    end
+    Scholar.Preprocessing.MinMaxScaler.fit_transform(tensor, opts)
  end
 
  @doc """
diff --git a/lib/scholar/preprocessing/max_abs_scaler.ex b/lib/scholar/preprocessing/max_abs_scaler.ex
new file mode 100644
index 00000000..b84ed029
--- /dev/null
+++ b/lib/scholar/preprocessing/max_abs_scaler.ex
@@ -0,0 +1,119 @@
+defmodule Scholar.Preprocessing.MaxAbsScaler do
+  @moduledoc """
+  Scales a tensor by dividing each sample in the batch by the maximum
+  absolute value in the batch.
+
+  Scaling happens independently on each feature by computing the relevant
+  statistics on the samples in the training set. The maximum absolute value
+  is then stored to be used on new samples.
+  """
+
+  import Nx.Defn
+
+  @derive {Nx.Container, containers: [:max_abs]}
+  defstruct [:max_abs]
+
+  opts_schema = [
+    axes: [
+      type: {:custom, Scholar.Options, :axes, []},
+      doc: """
+      Axes to calculate the maximum absolute value over. By default the
+      value is calculated over the whole tensor.
+      """
+    ]
+  ]
+
+  @opts_schema NimbleOptions.new!(opts_schema)
+
+  @doc """
+  Computes the maximum absolute value of samples to be used for later scaling.
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Return values
+
+  Returns a struct with the following parameters:
+
+    * `max_abs`: the calculated maximum absolute value of samples.
+
+  ## Examples
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> Scholar.Preprocessing.MaxAbsScaler.fit(t)
+      %Scholar.Preprocessing.MaxAbsScaler{
+        max_abs: Nx.tensor(
+          [
+            [2]
+          ]
+        )
+      }
+  """
+  deftransform fit(tensor, opts \\ []) do
+    fit_n(tensor, NimbleOptions.validate!(opts, @opts_schema))
+  end
+
+  defnp fit_n(tensor, opts) do
+    max_abs =
+      Nx.abs(tensor)
+      |> Nx.reduce_max(axes: opts[:axes], keep_axes: true)
+
+    max_abs = Nx.select(max_abs == 0, 1, max_abs)
+
+    %__MODULE__{max_abs: max_abs}
+  end
+
+  @doc """
+  Scales the tensor using a fitted scaler.
+
+  ## Examples
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> scaler = Scholar.Preprocessing.MaxAbsScaler.fit(t)
+      iex> Scholar.Preprocessing.MaxAbsScaler.transform(scaler, t)
+      #Nx.Tensor<
+        f32[3][3]
+        [
+          [0.5, -0.5, 1.0],
+          [1.0, 0.0, 0.0],
+          [0.0, 0.5, -0.5]
+        ]
+      >
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> scaler = Scholar.Preprocessing.MaxAbsScaler.fit(t)
+      iex> new_tensor = Nx.tensor([[0.5, 1, -1], [0.3, 0.8, -1.6]])
+      iex> Scholar.Preprocessing.MaxAbsScaler.transform(scaler, new_tensor)
+      #Nx.Tensor<
+        f32[2][3]
+        [
+          [0.25, 0.5, -0.5],
+          [0.15000000596046448, 0.4000000059604645, -0.800000011920929]
+        ]
+      >
+  """
+  defn transform(%__MODULE__{max_abs: max_abs}, tensor) do
+    tensor / max_abs
+  end
+
+  @doc """
+  Computes the maximum absolute value of samples and scales the tensor in one step.
+
+  ## Examples
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> Scholar.Preprocessing.MaxAbsScaler.fit_transform(t)
+      #Nx.Tensor<
+        f32[3][3]
+        [
+          [0.5, -0.5, 1.0],
+          [1.0, 0.0, 0.0],
+          [0.0, 0.5, -0.5]
+        ]
+      >
+  """
+  defn fit_transform(tensor, opts \\ []) do
+    tensor
+    |> fit(opts)
+    |> transform(tensor)
+  end
+end
diff --git a/lib/scholar/preprocessing/min_max_scaler.ex b/lib/scholar/preprocessing/min_max_scaler.ex
new file mode 100644
index 00000000..9c98a103
--- /dev/null
+++ b/lib/scholar/preprocessing/min_max_scaler.ex
@@ -0,0 +1,169 @@
+defmodule Scholar.Preprocessing.MinMaxScaler do
+  @moduledoc """
+  Scales a tensor so that each sample in the batch lies in the provided range
+  (the default range is `[0, 1]`).
+
+  Scaling happens independently on each feature by computing the minimum and
+  maximum of the samples in the training set. These statistics are then
+  stored to be used on new samples.
+  """
+
+  import Nx.Defn
+
+  @derive {Nx.Container, containers: [:min_data, :max_data, :min_bound, :max_bound]}
+  defstruct [:min_data, :max_data, :min_bound, :max_bound]
+
+  opts_schema = [
+    axes: [
+      type: {:custom, Scholar.Options, :axes, []},
+      doc: """
+      Axes to calculate the minimum and maximum values over. By default the
+      values are calculated over the whole tensor.
+      """
+    ],
+    min_bound: [
+      type: {:or, [:integer, :float]},
+      default: 0,
+      doc: """
+      The lower boundary of the desired range of transformed data.
+      """
+    ],
+    max_bound: [
+      type: {:or, [:integer, :float]},
+      default: 1,
+      doc: """
+      The upper boundary of the desired range of transformed data.
+      """
+    ]
+  ]
+
+  @opts_schema NimbleOptions.new!(opts_schema)
+
+  @doc """
+  Computes the minimum and maximum values of samples to be used for later scaling.
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Return values
+
+  Returns a struct with the following parameters:
+
+    * `min_data`: the calculated minimum value of samples.
+
+    * `max_data`: the calculated maximum value of samples.
+
+    * `min_bound`: the lower boundary of the desired range of transformed data.
+
+    * `max_bound`: the upper boundary of the desired range of transformed data.
+
+  ## Examples
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> Scholar.Preprocessing.MinMaxScaler.fit(t)
+      %Scholar.Preprocessing.MinMaxScaler{
+        min_data: Nx.tensor(
+          [
+            [-1]
+          ]
+        ),
+        max_data: Nx.tensor(
+          [
+            [2]
+          ]
+        ),
+        min_bound: Nx.tensor(
+          0
+        ),
+        max_bound: Nx.tensor(
+          1
+        )
+      }
+  """
+  deftransform fit(tensor, opts \\ []) do
+    fit_n(tensor, NimbleOptions.validate!(opts, @opts_schema))
+  end
+
+  defnp fit_n(tensor, opts) do
+    if opts[:max_bound] <= opts[:min_bound] do
+      raise ArgumentError,
+            "expected :max_bound to be greater than :min_bound"
+    else
+      reduced_max = Nx.reduce_max(tensor, axes: opts[:axes], keep_axes: true)
+      reduced_min = Nx.reduce_min(tensor, axes: opts[:axes], keep_axes: true)
+
+      %__MODULE__{
+        min_data: reduced_min,
+        max_data: reduced_max,
+        min_bound: opts[:min_bound],
+        max_bound: opts[:max_bound]
+      }
+    end
+  end
+
+  @doc """
+  Scales the tensor to the configured range using a fitted scaler.
+
+  ## Examples
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> scaler = Scholar.Preprocessing.MinMaxScaler.fit(t)
+      iex> Scholar.Preprocessing.MinMaxScaler.transform(scaler, t)
+      #Nx.Tensor<
+        f32[3][3]
+        [
+          [0.6666666865348816, 0.0, 1.0],
+          [1.0, 0.3333333432674408, 0.3333333432674408],
+          [0.3333333432674408, 0.6666666865348816, 0.0]
+        ]
+      >
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> scaler = Scholar.Preprocessing.MinMaxScaler.fit(t)
+      iex> new_tensor = Nx.tensor([[0.5, 1, -1], [0.3, 0.8, -1.6]])
+      iex> Scholar.Preprocessing.MinMaxScaler.transform(scaler, new_tensor)
+      #Nx.Tensor<
+        f32[2][3]
+        [
+          [0.5, 0.6666666865348816, 0.0],
+          [0.43333330750465393, 0.5999999642372131, -0.20000000298023224]
+        ]
+      >
+  """
+  defn transform(
+         %__MODULE__{
+           min_data: min_data,
+           max_data: max_data,
+           min_bound: min_bound,
+           max_bound: max_bound
+         },
+         tensor
+       ) do
+    denominator = max_data - min_data
+    denominator = Nx.select(denominator == 0, 1, denominator)
+    x_std = (tensor - min_data) / denominator
+    x_std * (max_bound - min_bound) + min_bound
+  end
+
+  @doc """
+  Computes the minimum and maximum values of samples and scales the tensor to the given range in one step.
+
+  ## Examples
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> Scholar.Preprocessing.MinMaxScaler.fit_transform(t)
+      #Nx.Tensor<
+        f32[3][3]
+        [
+          [0.6666666865348816, 0.0, 1.0],
+          [1.0, 0.3333333432674408, 0.3333333432674408],
+          [0.3333333432674408, 0.6666666865348816, 0.0]
+        ]
+      >
+  """
+  defn fit_transform(tensor, opts \\ []) do
+    tensor
+    |> fit(opts)
+    |> transform(tensor)
+  end
+end
diff --git a/test/scholar/preprocessing/max_abs_scaler_test.exs b/test/scholar/preprocessing/max_abs_scaler_test.exs
new file mode 100644
index 00000000..38daf5d0
--- /dev/null
+++ b/test/scholar/preprocessing/max_abs_scaler_test.exs
@@ -0,0 +1,28 @@
+defmodule Scholar.Preprocessing.MaxAbsScalerTest do
+  use Scholar.Case, async: true
+  alias Scholar.Preprocessing.MaxAbsScaler
+
+  doctest MaxAbsScaler
+
+  describe "fit_transform/2" do
+    test "set axes to [0]" do
+      data = Nx.tensor([[1, -1, 2], [3, 0, 0], [0, 1, -1], [2, 3, 1]])
+
+      expected =
+        Nx.tensor([
+          [0.3333333432674408, -0.3333333432674408, 1.0],
+          [1.0, 0.0, 0.0],
+          [0.0, 0.3333333432674408, -0.5],
+          [0.6666666865348816, 1.0, 0.5]
+        ])
+
+      assert_all_close(MaxAbsScaler.fit_transform(data, axes: [0]), expected)
+    end
+
+    test "works when the tensor contains only zeros" do
+      data = Nx.broadcast(Nx.f32(0), {3, 3})
+      expected = data
+      assert MaxAbsScaler.fit_transform(data) == expected
+    end
+  end
+end
diff --git a/test/scholar/preprocessing/min_max_scaler_test.exs b/test/scholar/preprocessing/min_max_scaler_test.exs
new file mode 100644
index 00000000..bafe4406
--- /dev/null
+++ b/test/scholar/preprocessing/min_max_scaler_test.exs
@@ -0,0 +1,45 @@
+defmodule Scholar.Preprocessing.MinMaxScalerTest do
+  use Scholar.Case, async: true
+  alias Scholar.Preprocessing.MinMaxScaler
+
+  doctest MinMaxScaler
+
+  describe "fit_transform/2" do
+    test "set axes to [0]" do
+      data = Nx.tensor([[1, -1, 2], [3, 0, 0], [0, 1, -1], [2, 3, 1]])
+
+      expected =
+        Nx.tensor([
+          [0.3333333432674408, 0.0, 1.0],
+          [1.0, 0.25, 0.3333333432674408],
+          [0.0, 0.5, 0.0],
+          [0.6666666865348816, 1.0, 0.6666666865348816]
+        ])
+
+      assert_all_close(MinMaxScaler.fit_transform(data, axes: [0]), expected)
+    end
+
+    test "set axes to [0], min_bound to 1, and max_bound to 3" do
+      data = Nx.tensor([[1, -1, 2], [3, 0, 0], [0, 1, -1], [2, 3, 1]])
+
+      expected =
+        Nx.tensor([
+          [1.6666667461395264, 1.0, 3.0],
+          [3.0, 1.5, 1.6666667461395264],
+          [1.0, 2.0, 1.0],
+          [2.3333334922790527, 3.0, 2.3333334922790527]
+        ])
+
+      assert_all_close(
+        MinMaxScaler.fit_transform(data, axes: [0], min_bound: 1, max_bound: 3),
+        expected
+      )
+    end
+
+    test "works when the tensor contains only zeros" do
+      data = Nx.broadcast(Nx.f32(0), {3, 3})
+      expected = data
+      assert MinMaxScaler.fit_transform(data) == expected
+    end
+  end
+end
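
---

Usage sketch for the fit/transform split this patch introduces, based on the doctests above; the tensor values are illustrative:

    # Fit once on training data, then reuse the fitted scaler on new samples.
    train = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
    scaler = Scholar.Preprocessing.MinMaxScaler.fit(train, min_bound: 0, max_bound: 1)

    new_data = Nx.tensor([[0.5, 1, -1]])
    Scholar.Preprocessing.MinMaxScaler.transform(scaler, new_data)

    # The one-shot shortcuts in Scholar.Preprocessing remain available:
    Scholar.Preprocessing.min_max_scale(train)
    Scholar.Preprocessing.max_abs_scale(train)

Separating fit/2 from transform/2 lets statistics computed on the training set be applied to new samples, which the one-shot shortcuts cannot do.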