From 981979835d86cdda2c073804fd50d6c13ebc1346 Mon Sep 17 00:00:00 2001 From: Mateusz Sluszniak <56299341+msluszniak@users.noreply.github.com> Date: Fri, 29 Dec 2023 11:40:07 +0100 Subject: [PATCH] Introduce encoders in separate modules (#225) * Introduce encoders in separate modules * Update preprocessing.ex * Add module docs --- lib/scholar/preprocessing.ex | 76 ++------- lib/scholar/preprocessing/one_hot_encoder.ex | 136 ++++++++++++++++ lib/scholar/preprocessing/ordinal_encoder.ex | 154 ++++++++++++++++++ .../preprocessing/one_hot_encoder_test.exs | 6 + .../preprocessing/ordinal_encoder_tes.exs | 6 + 5 files changed, 313 insertions(+), 65 deletions(-) create mode 100644 lib/scholar/preprocessing/one_hot_encoder.ex create mode 100644 lib/scholar/preprocessing/ordinal_encoder.ex create mode 100644 test/scholar/preprocessing/one_hot_encoder_test.exs create mode 100644 test/scholar/preprocessing/ordinal_encoder_tes.exs diff --git a/lib/scholar/preprocessing.ex b/lib/scholar/preprocessing.ex index 993f7250..acfeb2c1 100644 --- a/lib/scholar/preprocessing.ex +++ b/lib/scholar/preprocessing.ex @@ -16,16 +16,6 @@ defmodule Scholar.Preprocessing do ] ] - encode_schema = [ - num_classes: [ - required: true, - type: :pos_integer, - doc: """ - Number of classes to be encoded. - """ - ] - ] - normalize_schema = general_schema ++ [ @@ -58,7 +48,6 @@ defmodule Scholar.Preprocessing do @normalize_schema NimbleOptions.new!(normalize_schema) @binarize_schema NimbleOptions.new!(binarize_schema) - @encode_schema NimbleOptions.new!(encode_schema) @doc """ Standardizes the tensor by removing the mean and scaling to unit variance. 
@@ -75,7 +64,7 @@ defmodule Scholar.Preprocessing do > """ - deftransform standard_scale(tensor, opts \\ []) do + defn standard_scale(tensor, opts \\ []) do Scholar.Preprocessing.StandardScaler.fit_transform(tensor, opts) end @@ -110,7 +99,7 @@ defmodule Scholar.Preprocessing do 1.0 > """ - deftransform max_abs_scale(tensor, opts \\ []) do + defn max_abs_scale(tensor, opts \\ []) do Scholar.Preprocessing.MaxAbsScaler.fit_transform(tensor, opts) end @@ -134,7 +123,7 @@ defmodule Scholar.Preprocessing do 0.0 > """ - deftransform min_max_scale(tensor, opts \\ []) do + defn min_max_scale(tensor, opts \\ []) do Scholar.Preprocessing.MinMaxScaler.fit_transform(tensor, opts) end @@ -176,11 +165,8 @@ defmodule Scholar.Preprocessing do end @doc """ - Encodes a tensor's values into integers from range 0 to `:num_classes - 1`. - - ## Options - - #{NimbleOptions.docs(@encode_schema)} + It is a shortcut for `Scholar.Preprocessing.OrdinalEncoder.fit_transform/2`. + See `Scholar.Preprocessing.OrdinalEncoder` for more information. ## Examples @@ -190,42 +176,13 @@ defmodule Scholar.Preprocessing do [1, 0, 2, 3, 0, 2, 0] > """ - deftransform ordinal_encode(tensor, opts \\ []) do - ordinal_encode_n(tensor, NimbleOptions.validate!(opts, @encode_schema)) - end - - defnp ordinal_encode_n(tensor, opts) do - sorted = Nx.sort(tensor) - num_classes = opts[:num_classes] - - # A mask with a single 1 in every group of equal values - representative_mask = - Nx.concatenate([ - sorted[0..-2//1] != sorted[1..-1//1], - Nx.tensor([1]) - ]) - - representative_indices = - representative_mask - |> Nx.argsort(direction: :desc) - |> Nx.slice_along_axis(0, num_classes) - - representative_values = Nx.take(sorted, representative_indices) - - (Nx.new_axis(tensor, 1) == - Nx.new_axis(representative_values, 0)) - |> Nx.argmax(axis: 1) + defn ordinal_encode(tensor, opts \\ []) do + Scholar.Preprocessing.OrdinalEncoder.fit_transform(tensor, opts) end @doc """ - Encode labels as a one-hot numeric tensor. 
- - Labels must be integers from 0 to `:num_classes - 1`. If the data does - not meet the condition, please use `ordinal_encoding/2` first. - - ## Options - - #{NimbleOptions.docs(@encode_schema)} + It is a shortcut for `Scholar.Preprocessing.OneHotEncoder.fit_transform/2`. + See `Scholar.Preprocessing.OneHotEncoder` for more information. ## Examples @@ -243,19 +200,8 @@ defmodule Scholar.Preprocessing do ] > """ - deftransform one_hot_encode(tensor, opts \\ []) do - one_hot_encode_n(tensor, NimbleOptions.validate!(opts, @encode_schema)) - end - - defnp one_hot_encode_n(tensor, opts) do - {len} = Nx.shape(tensor) - - if opts[:num_classes] > len do - raise ArgumentError, - "expected :num_classes to be at most as length of label vector" - end - - Nx.new_axis(tensor, -1) == Nx.iota({1, opts[:num_classes]}) + defn one_hot_encode(tensor, opts \\ []) do + Scholar.Preprocessing.OneHotEncoder.fit_transform(tensor, opts) end @doc """ diff --git a/lib/scholar/preprocessing/one_hot_encoder.ex b/lib/scholar/preprocessing/one_hot_encoder.ex new file mode 100644 index 00000000..8d50fdb6 --- /dev/null +++ b/lib/scholar/preprocessing/one_hot_encoder.ex @@ -0,0 +1,136 @@ +defmodule Scholar.Preprocessing.OneHotEncoder do + @moduledoc """ + Implements encoder that converts integer value (substitute of categorical data in tensors) into 0-1 vector. + The index of 1 in the vector is aranged in sorted manner. This means that for x < y => one_index(x) < one_index(y). + + Currently the module supports only 1D tensors. + """ + import Nx.Defn + + @derive {Nx.Container, containers: [:encoder, :one_hot]} + defstruct [:encoder, :one_hot] + + encode_schema = [ + num_classes: [ + required: true, + type: :pos_integer, + doc: """ + Number of classes to be encoded. + """ + ] + ] + + @encode_schema NimbleOptions.new!(encode_schema) + + @doc """ + Creates mapping from values into one-hot vectors. 
+ + ## Options + + #{NimbleOptions.docs(@encode_schema)} + + ## Examples + + iex> t = Nx.tensor([3, 2, 4, 56, 2, 4, 2]) + iex> Scholar.Preprocessing.OneHotEncoder.fit(t, num_classes: 4) + %Scholar.Preprocessing.OneHotEncoder{ + encoder: %Scholar.Preprocessing.OrdinalEncoder{ + encoding_tensor: Nx.tensor( + [ + [0, 2], + [1, 3], + [2, 4], + [3, 56] + ] + ) + }, + one_hot: Nx.tensor( + [ + [1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 1] + ], type: :u8 + ) + } + """ + deftransform fit(tensor, opts \\ []) do + fit_n(tensor, NimbleOptions.validate!(opts, @encode_schema)) + end + + defnp fit_n(tensor, opts) do + encoder = Scholar.Preprocessing.OrdinalEncoder.fit(tensor, opts) + one_hot = Nx.iota({opts[:num_classes]}) == Nx.iota({opts[:num_classes], 1}) + %__MODULE__{encoder: encoder, one_hot: one_hot} + end + + @doc """ + Encode labels as a one-hot numeric tensor. All values provided to `transform/2` must be seen + in `fit/2` function, otherwise an error occurs. + + ## Examples + + iex> t = Nx.tensor([3, 2, 4, 56, 2, 4, 2]) + iex> enoder = Scholar.Preprocessing.OneHotEncoder.fit(t, num_classes: 4) + iex> Scholar.Preprocessing.OneHotEncoder.transform(enoder, t) + #Nx.Tensor< + u8[7][4] + [ + [0, 1, 0, 0], + [1, 0, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 1], + [1, 0, 0, 0], + [0, 0, 1, 0], + [1, 0, 0, 0] + ] + > + + iex> t = Nx.tensor([3, 2, 4, 56, 2, 4, 2]) + iex> enoder = Scholar.Preprocessing.OneHotEncoder.fit(t, num_classes: 4) + iex> new_tensor = Nx.tensor([2, 3, 4, 3, 4, 56, 2]) + iex> Scholar.Preprocessing.OneHotEncoder.transform(enoder, new_tensor) + #Nx.Tensor< + u8[7][4] + [ + [1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 1, 0], + [0, 1, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 1], + [1, 0, 0, 0] + ] + > + """ + defn transform(%__MODULE__{encoder: encoder, one_hot: one_hot}, tensor) do + decoded = Scholar.Preprocessing.OrdinalEncoder.transform(encoder, tensor) + Nx.take(one_hot, decoded) + end + + @doc """ + Apply encoding on the provided tensor directly. 
It's equivalent to `fit/2` and then `transform/2` on the same data. + + ## Examples + + iex> t = Nx.tensor([3, 2, 4, 56, 2, 4, 2]) + iex> Scholar.Preprocessing.OneHotEncoder.fit_transform(t, num_classes: 4) + #Nx.Tensor< + u8[7][4] + [ + [0, 1, 0, 0], + [1, 0, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 1], + [1, 0, 0, 0], + [0, 0, 1, 0], + [1, 0, 0, 0] + ] + > + """ + defn fit_transform(tensor, opts \\ []) do + tensor + |> fit(opts) + |> transform(tensor) + end +end diff --git a/lib/scholar/preprocessing/ordinal_encoder.ex b/lib/scholar/preprocessing/ordinal_encoder.ex new file mode 100644 index 00000000..5505f625 --- /dev/null +++ b/lib/scholar/preprocessing/ordinal_encoder.ex @@ -0,0 +1,154 @@ +defmodule Scholar.Preprocessing.OrdinalEncoder do + @moduledoc """ + Implements encoder that converts integer value (substitute of categorical data in tensors) into other integer value. + The values assigned starts from `0` and go up to `num_classes - 1`.They are maintained in sorted manner. + This means that for x < y => encoded_value(x) < encoded_value(y). + + Currently the module supports only 1D tensors. + """ + import Nx.Defn + + @derive {Nx.Container, containers: [:encoding_tensor]} + defstruct [:encoding_tensor] + + encode_schema = [ + num_classes: [ + required: true, + type: :pos_integer, + doc: """ + Number of classes to be encoded. + """ + ] + ] + + @encode_schema NimbleOptions.new!(encode_schema) + + @doc """ + Fit the ordinal encoder to provided data. The labels are assigned in a sorted manner. 
+ + ## Options + + #{NimbleOptions.docs(@encode_schema)} + + ## Examples + + iex> t = Nx.tensor([3, 2, 4, 56, 2, 4, 2]) + iex> Scholar.Preprocessing.OrdinalEncoder.fit(t, num_classes: 4) + %Scholar.Preprocessing.OrdinalEncoder{ + encoding_tensor: Nx.tensor( + [ + [0, 2], + [1, 3], + [2, 4], + [3, 56] + ] + ) + } + """ + deftransform fit(tensor, opts \\ []) do + fit_n(tensor, NimbleOptions.validate!(opts, @encode_schema)) + end + + defnp fit_n(tensor, opts) do + sorted = Nx.sort(tensor) + num_classes = opts[:num_classes] + + # A mask with a single 1 in every group of equal values + representative_mask = + Nx.concatenate([ + sorted[0..-2//1] != sorted[1..-1//1], + Nx.tensor([1]) + ]) + + representative_indices = + representative_mask + |> Nx.argsort(direction: :desc) + |> Nx.slice_along_axis(0, num_classes) + + representative_values = Nx.take(sorted, representative_indices) |> Nx.new_axis(-1) + + encoding_tensor = + Nx.concatenate([Nx.iota(Nx.shape(representative_values)), representative_values], axis: 1) + + %__MODULE__{encoding_tensor: encoding_tensor} + end + + @doc """ + Encodes a tensor's values into integers from range 0 to `:num_classes - 1` or -1 if the value did not occur in fitting process. 
## Examples + + iex> t = Nx.tensor([3, 2, 4, 56, 2, 4, 2]) + iex> encoder = Scholar.Preprocessing.OrdinalEncoder.fit(t, num_classes: 4) + iex> Scholar.Preprocessing.OrdinalEncoder.transform(encoder, t) + #Nx.Tensor< + s64[7] + [1, 0, 2, 3, 0, 2, 0] + > + + iex> t = Nx.tensor([3, 2, 4, 56, 2, 4, 2]) + iex> encoder = Scholar.Preprocessing.OrdinalEncoder.fit(t, num_classes: 4) + iex> new_tensor = Nx.tensor([2, 3, 4, 5, 4, 56, 2]) + iex> Scholar.Preprocessing.OrdinalEncoder.transform(encoder, new_tensor) + #Nx.Tensor< + s64[7] + [0, 1, 2, -1, 2, 3, 0] + >
+ + ## Examples + + iex> t = Nx.tensor([3, 2, 4, 56, 2, 4, 2]) + iex> Scholar.Preprocessing.OridinalEncoder.fit_transform(t, num_classes: 4) + #Nx.Tensor< + s64[7] + [1, 0, 2, 3, 0, 2, 0] + > + """ + defn fit_transform(tensor, opts \\ []) do + tensor + |> fit(opts) + |> transform(tensor) + end +end diff --git a/test/scholar/preprocessing/one_hot_encoder_test.exs b/test/scholar/preprocessing/one_hot_encoder_test.exs new file mode 100644 index 00000000..1b59e0f0 --- /dev/null +++ b/test/scholar/preprocessing/one_hot_encoder_test.exs @@ -0,0 +1,6 @@ +defmodule Scholar.Preprocessing.OneHotEncoderTest do + use Scholar.Case, async: true + alias Scholar.Preprocessing.OneHotEncoder + + doctest OneHotEncoder +end diff --git a/test/scholar/preprocessing/ordinal_encoder_tes.exs b/test/scholar/preprocessing/ordinal_encoder_tes.exs new file mode 100644 index 00000000..06482b93 --- /dev/null +++ b/test/scholar/preprocessing/ordinal_encoder_tes.exs @@ -0,0 +1,6 @@ +defmodule Scholar.Preprocessing.OrdinalEncoderTest do + use Scholar.Case, async: true + alias Scholar.Preprocessing.OrdinalEncoder + + doctest OrdinalEncoder +end