From d6c7a5536d0d5edf6b069c867db8ba21939b142a Mon Sep 17 00:00:00 2001 From: srzeszut Date: Sun, 20 Oct 2024 15:17:44 +0200 Subject: [PATCH 01/13] add KNNImputer --- lib/scholar/impute/knn_imputer.ex | 259 ++++++++++++++++++++++++++++++ 1 file changed, 259 insertions(+) create mode 100644 lib/scholar/impute/knn_imputer.ex diff --git a/lib/scholar/impute/knn_imputer.ex b/lib/scholar/impute/knn_imputer.ex new file mode 100644 index 00000000..6db2bc90 --- /dev/null +++ b/lib/scholar/impute/knn_imputer.ex @@ -0,0 +1,259 @@ +defmodule Scholar.Impute.KNNImputer do + @moduledoc """ + Imputer for completing missing values using k-Nearest Neighbors. + + Each sample's missing values are imputed using the mean value from + `n_neighbors` nearest neighbors found in the training set. Two samples are + close if the features that neither is missing are close. + """ + import Nx.Defn + import Scholar.Metrics.Distance + + @derive {Nx.Container, keep: [:missing_values], containers: [:statistics]} + defstruct [:statistics, :missing_values] + + opts_schema = [ + missing_values: [ + type: {:or, [:float, :integer, {:in, [:nan]}]}, + default: :nan, + doc: ~S""" + The placeholder for the missing values. All occurrences of `:missing_values` will be imputed. + """ + ], + number_of_neighbors: [ + type: :pos_integer, + default: 2, + doc: "The number of nearest neighbors." + ] + ] + + @opts_schema NimbleOptions.new!(opts_schema) + + @doc """ + Imputer for completing missing values using k-Nearest Neighbors. + + ## Options + + #{NimbleOptions.docs(@opts_schema)} + + ## Return Values + + The function returns a struct with the following parameters: + + * `:missing_values` - the same value as in `:missing_values` + + * `:statistics` - The imputation fill value for each feature. Computing statistics can result in + [`Nx.Constant.nan/0`](https://hexdocs.pm/nx/Nx.Constants.html#nan/0) values. + + ## Examples + + iex> x = Nx.tensor([[40.0, 2.0],[4.0, 5.0],[7.0, :nan],[:nan, 8.0],[11.0, 11.0]]) + iex> Scholar.Impute.KNNImputer.fit(x, number_of_neighbors: 2) + %Scholar.Impute.KNNImputer{ + statistics: #Nx.Tensor< + f32[5][2] + [ + [NaN, NaN], + [NaN, NaN], + [NaN, 8.0], + [7.5, NaN], + [NaN, NaN] + ] + >, + missing_values: :nan + } + + """ + + deftransform fit(x, opts \\ []) do + opts = NimbleOptions.validate!(opts, @opts_schema) + + input_rank = Nx.rank(x) + + if input_rank != 2 do + raise ArgumentError, "Wrong input rank. Expected: 2, got: #{inspect(input_rank)}" + end + + if opts[:missing_values] != :nan and + Nx.any(Nx.is_nan(x)) == Nx.tensor(1, type: :u8) do + raise ArgumentError, + ":missing_values other than :nan possible only if there is no Nx.Constant.nan() in the array" + end + + x = + if opts[:missing_values] != :nan, + do: Nx.select(Nx.equal(x, opts[:missing_values]), Nx.Constants.nan(), x), + else: x + + num_neighbors = opts[:number_of_neighbors] + + if num_neighbors < 1 do + raise ArgumentError, "Number of neighbors must be greater than 0" + end + + {rows, cols} = Nx.shape(x) + + # TODO calculate all nan rows + row_nan_count = Nx.sum(Nx.is_nan(x), axes: [1]) + # row with only 1 non nan value is also considered as all nan row + all_nan_rows = + Nx.select(Nx.greater_equal(row_nan_count, cols - 1), Nx.tensor(1), Nx.tensor(0)) + + all_nan_rows_count = Nx.sum(all_nan_rows) + + if num_neighbors > rows - 1 - Nx.to_number(all_nan_rows_count) do + raise ArgumentError, + "Number of neighbors rows must be less than number valid of rows - 1 (valid row is row with more than 1 non nan value)" + end + + statistics = knn_impute(x, num_neighbors: num_neighbors) + # statistics = all_nan_rows_count + missing_values = opts[:missing_values] + %__MODULE__{statistics: statistics, missing_values: missing_values} + end + + @doc """ + Impute all missing values in `x` using fitted imputer. + + ## Return Values + + The function returns input tensor with NaN replaced with values saved in fitted imputer. + + ## Examples + + iex> x = Nx.tensor([[40.0, 2.0],[4.0, 5.0],[7.0, :nan],[:nan, 8.0],[11.0, 11.0]]) + iex> imputer = Scholar.Impute.KNNImputer.fit(x, strategy: :mean) + iex> Scholar.Impute.KNNImputer.transform(imputer, x) + Nx.tensor( + f32[5][2] + [ + [40.0, 2.0], + [4.0, 5.0], + [7.0, 8.0], + [7.5, 8.0], + [11.0, 11.0] + ] + ) + """ + deftransform transform(%__MODULE__{statistics: statistics, missing_values: missing_values}, x) do + mask = if missing_values == :nan, do: Nx.is_nan(x), else: Nx.equal(x, missing_values) + Nx.select(mask, statistics, x) + end + + defn knn_impute(x, opts \\ []) do + mask = Nx.is_nan(x) + {num_rows, num_cols} = Nx.shape(x) + num_neighbors = opts[:num_neighbors] + + placeholder_value = Nx.Constants.nan() + values_to_impute = Nx.broadcast(Nx.tensor(placeholder_value), x) + + {_, values_to_impute} = + while {{row = 0, mask, num_neighbors, num_rows, x}, values_to_impute}, + Nx.less(row, num_rows) do + {_, values_to_impute} = + while {{col = 0, mask, num_neighbors, num_cols, row, x}, values_to_impute}, + Nx.less(col, num_cols) do + if mask[row][col] > 0 do + {rows, cols} = Nx.shape(x) + + neighbor_avg = + calculate_knn(x, row, col, rows: rows, num_neighbors: opts[:num_neighbors]) + + indices = + [Nx.stack(row), Nx.stack(col)] + |> Nx.concatenate() + |> Nx.stack() + + values_to_impute = Nx.indexed_put(values_to_impute, indices, Nx.stack(neighbor_avg)) + {{col + 1, mask, num_neighbors, cols, row, x}, values_to_impute} + else + {{col + 1, mask, num_neighbors, num_cols, row, x}, values_to_impute} + end + end + + {{row + 1, mask, num_neighbors, num_rows, x}, values_to_impute} + end + + values_to_impute + end + + defnp calculate_knn(x, nan_row, nan_col, opts \\ []) do + opts = keyword!(opts, rows: 1, num_neighbors: 2) + rows = opts[:rows] + num_neighbors = opts[:num_neighbors] + + row_distances = Nx.iota({rows}, type: {:f, 32}) + + row_with_value_to_fill = x[nan_row] + + # calculate distance between row with nan to fill and all other rows where distance + # to the row is under its index in the tensor + {_, row_distances} = + while {{i = 0, x, row_with_value_to_fill, rows, nan_row, nan_col}, row_distances}, + Nx.less(i, rows) do + potential_donor = x[i] + + if i == nan_row do + distance = Nx.Constants.infinity({:f, 32}) + row_distances = Nx.indexed_put(row_distances, Nx.new_axis(i, 0), distance) + {{i + 1, x, row_with_value_to_fill, rows, nan_row, nan_col}, row_distances} + else + distance = nan_euclidian(row_with_value_to_fill, nan_col, potential_donor) + row_distances = Nx.indexed_put(row_distances, Nx.new_axis(i, 0), distance) + {{i + 1, x, row_with_value_to_fill, rows, nan_row, nan_col}, row_distances} + end + end + + {_, indices} = Nx.top_k(-row_distances, k: num_neighbors) + + gather_indices = Nx.stack([indices, Nx.broadcast(nan_col, indices)], axis: 1) + values = Nx.gather(x, gather_indices) + Nx.sum(values) / num_neighbors + end + + # nan_col is the column of the value to impute + defnp nan_euclidian(row, nan_col, potential_neighbor) do + {coordinates} = Nx.shape(row) + + # minus nan column + coordinates = coordinates - 1 + + # inputes zeros in nan_col to calculate distance with squared_euclidean + new_row = Nx.indexed_put(row, Nx.new_axis(nan_col, 0), Nx.tensor(0)) + + # if potential neighbor has nan in nan_col, we don't want to calculate distance and the case if potential_neighbour is the row to impute + {potential_neighbor} = + if potential_neighbor[nan_col] == Nx.Constants.nan() do + potential_neighbor = Nx.broadcast(Nx.Constants.infinity({:f, 32}), potential_neighbor) + {potential_neighbor} + else + # inputes zeros in nan_col to calculate distance with squared_euclidean - distance will be 0 so no change to the distance value + potential_neighbor = + Nx.indexed_put(potential_neighbor, Nx.new_axis(nan_col, 0), Nx.tensor(0)) + + {potential_neighbor} + end + + # calculates how many values are present in the row without nan_col to calculate weight for the distance + present_coordinates = Nx.sum(Nx.logical_not(Nx.is_nan(potential_neighbor))) - 1 + + # if row has all nans we skip it + {weight, potential_neighbor} = + if present_coordinates == 0 do + potential_neighbor = Nx.broadcast(Nx.Constants.infinity({:f, 32}), potential_neighbor) + weight = 0 + {weight, potential_neighbor} + else + potential_neighbor = Nx.select(Nx.is_nan(potential_neighbor), new_row, potential_neighbor) + weight = coordinates / present_coordinates + {weight, potential_neighbor} + end + + # calculating weighted euclidian distance + distance = Nx.sqrt(weight * squared_euclidean(new_row, potential_neighbor)) + + # return inf if potential_row is row to impute + Nx.select(Nx.is_nan(distance), Nx.Constants.infinity({:f, 32}), distance) + end +end From 47b4a653106272e8c6f6fdf3a2ef3d547136d913 Mon Sep 17 00:00:00 2001 From: srzeszut Date: Sun, 20 Oct 2024 15:21:25 +0200 Subject: [PATCH 02/13] fix doctests --- lib/scholar/impute/knn_imputer.ex | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/lib/scholar/impute/knn_imputer.ex b/lib/scholar/impute/knn_imputer.ex index 6db2bc90..0ad71747 100644 --- a/lib/scholar/impute/knn_imputer.ex +++ b/lib/scholar/impute/knn_imputer.ex @@ -46,11 +46,10 @@ defmodule Scholar.Impute.KNNImputer do [`Nx.Constant.nan/0`](https://hexdocs.pm/nx/Nx.Constants.html#nan/0) values. ## Examples - - iex> x = Nx.tensor([[40.0, 2.0],[4.0, 5.0],[7.0, :nan],[:nan, 8.0],[11.0, 11.0]]) - iex> Scholar.Impute.KNNImputer.fit(x, number_of_neighbors: 2) - %Scholar.Impute.KNNImputer{ - statistics: #Nx.Tensor< + iex> x = Nx.tensor([[40.0, 2.0],[4.0, 5.0],[7.0, :nan],[:nan, 8.0],[11.0, 11.0]]) + iex> Scholar.Impute.KNNImputer.fit(x, number_of_neighbors: 2) + %Scholar.Impute.KNNImputer{ + statistics: #Nx.Tensor< f32[5][2] [ [NaN, NaN], @@ -59,10 +58,9 @@ defmodule Scholar.Impute.KNNImputer do [7.5, NaN], [NaN, NaN] ] - >, - missing_values: :nan - } - + >, + missing_values: :nan + } """ deftransform fit(x, opts \\ []) do @@ -93,7 +91,6 @@ defmodule Scholar.Impute.KNNImputer do {rows, cols} = Nx.shape(x) - # TODO calculate all nan rows row_nan_count = Nx.sum(Nx.is_nan(x), axes: [1]) # row with only 1 non nan value is also considered as all nan row all_nan_rows = @@ -125,7 +122,6 @@ defmodule Scholar.Impute.KNNImputer do iex> imputer = Scholar.Impute.KNNImputer.fit(x, strategy: :mean) iex> Scholar.Impute.KNNImputer.transform(imputer, x) Nx.tensor( - f32[5][2] [ [40.0, 2.0], [4.0, 5.0], From eb8f245f075c46b79b17c6658b32cdcf3bbd3135 Mon Sep 17 00:00:00 2001 From: srzeszut Date: Sun, 20 Oct 2024 15:30:13 +0200 Subject: [PATCH 03/13] mix format --- lib/scholar/impute/knn_imputer.ex | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/scholar/impute/knn_imputer.ex b/lib/scholar/impute/knn_imputer.ex index 0ad71747..bcdf5d0b 100644 --- a/lib/scholar/impute/knn_imputer.ex +++ b/lib/scholar/impute/knn_imputer.ex @@ -73,15 +73,15 @@ defmodule Scholar.Impute.KNNImputer do end if opts[:missing_values] != :nan and - Nx.any(Nx.is_nan(x)) == Nx.tensor(1, type: :u8) do + Nx.any(Nx.is_nan(x)) == Nx.tensor(1, type: :u8) do raise ArgumentError, ":missing_values other than :nan possible only if there is no Nx.Constant.nan() in the array" end x = if opts[:missing_values] != :nan, - do: Nx.select(Nx.equal(x, opts[:missing_values]), Nx.Constants.nan(), x), - else: x + do: Nx.select(Nx.equal(x, opts[:missing_values]), Nx.Constants.nan(), x), + else: x num_neighbors = opts[:number_of_neighbors] From 642b15e29cc6932adca4f194bb8f7b17135a4c85 Mon Sep 17 00:00:00 2001 From: srzeszut Date: Sun, 20 Oct 2024 15:45:48 +0200 Subject: [PATCH 04/13] change placeholder_value to tensor --- lib/scholar/impute/knn_imputer.ex | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/scholar/impute/knn_imputer.ex b/lib/scholar/impute/knn_imputer.ex index bcdf5d0b..dcc716bf 100644 --- a/lib/scholar/impute/knn_imputer.ex +++ b/lib/scholar/impute/knn_imputer.ex @@ -103,7 +103,9 @@ defmodule Scholar.Impute.KNNImputer do "Number of neighbors rows must be less than number valid of rows - 1 (valid row is row with more than 1 non nan value)" end - statistics = knn_impute(x, num_neighbors: num_neighbors) + placeholder_value = Nx.Constants.nan() |> Nx.tensor() + + statistics = knn_impute(x, placeholder_value, num_neighbors: num_neighbors) # statistics = all_nan_rows_count missing_values = opts[:missing_values] %__MODULE__{statistics: statistics, missing_values: missing_values} @@ -136,13 +138,12 @@ defmodule Scholar.Impute.KNNImputer do Nx.select(mask, statistics, x) end - defn knn_impute(x, opts \\ []) do + defnp knn_impute(x, placeholder_value, opts \\ []) do mask = Nx.is_nan(x) {num_rows, num_cols} = Nx.shape(x) num_neighbors = opts[:num_neighbors] - placeholder_value = Nx.Constants.nan() - values_to_impute = Nx.broadcast(Nx.tensor(placeholder_value), x) + values_to_impute = Nx.broadcast(placeholder_value, x) {_, values_to_impute} = while {{row = 0, mask, num_neighbors, num_rows, x}, values_to_impute}, From 520633ae8c0ab6d02266fbcbc3b1e6004f63f68d Mon Sep 17 00:00:00 2001 From: srzeszut Date: Sun, 20 Oct 2024 15:58:05 +0200 Subject: [PATCH 05/13] fix doctest --- lib/scholar/impute/knn_imputer.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/scholar/impute/knn_imputer.ex b/lib/scholar/impute/knn_imputer.ex index dcc716bf..0d602dc5 100644 --- a/lib/scholar/impute/knn_imputer.ex +++ b/lib/scholar/impute/knn_imputer.ex @@ -121,7 +121,7 @@ defmodule Scholar.Impute.KNNImputer do ## Examples iex> x = Nx.tensor([[40.0, 2.0],[4.0, 5.0],[7.0, :nan],[:nan, 8.0],[11.0, 11.0]]) - iex> imputer = Scholar.Impute.KNNImputer.fit(x, strategy: :mean) + iex> imputer = Scholar.Impute.KNNImputer.fit(x, number_of_neighbors: 2) iex> Scholar.Impute.KNNImputer.transform(imputer, x) Nx.tensor( [ From 926a1c70bb56d54dbcd3aea93aa87e071927cd0a Mon Sep 17 00:00:00 2001 From: srzeszut Date: Sun, 27 Oct 2024 11:20:37 +0100 Subject: [PATCH 06/13] apply suggested changes --- .../{knn_imputer.ex => knn_imputter.ex} | 57 ++++++++++--------- 1 file changed, 29 insertions(+), 28 deletions(-) rename lib/scholar/impute/{knn_imputer.ex => knn_imputter.ex} (86%) diff --git a/lib/scholar/impute/knn_imputer.ex b/lib/scholar/impute/knn_imputter.ex similarity index 86% rename from lib/scholar/impute/knn_imputer.ex rename to lib/scholar/impute/knn_imputter.ex index 0d602dc5..8d4cdef7 100644 --- a/lib/scholar/impute/knn_imputer.ex +++ b/lib/scholar/impute/knn_imputter.ex @@ -1,4 +1,4 @@ -defmodule Scholar.Impute.KNNImputer do +defmodule Scholar.Impute.KNNImputter do @moduledoc """ Imputer for completing missing values using k-Nearest Neighbors. @@ -46,21 +46,22 @@ defmodule Scholar.Impute.KNNImputer do [`Nx.Constant.nan/0`](https://hexdocs.pm/nx/Nx.Constants.html#nan/0) values. ## Examples - iex> x = Nx.tensor([[40.0, 2.0],[4.0, 5.0],[7.0, :nan],[:nan, 8.0],[11.0, 11.0]]) - iex> Scholar.Impute.KNNImputer.fit(x, number_of_neighbors: 2) - %Scholar.Impute.KNNImputer{ - statistics: #Nx.Tensor< - f32[5][2] - [ - [NaN, NaN], - [NaN, NaN], - [NaN, 8.0], - [7.5, NaN], - [NaN, NaN] - ] - >, - missing_values: :nan - } + + iex> x = Nx.tensor([[40.0, 2.0],[4.0, 5.0],[7.0, :nan],[:nan, 8.0],[11.0, 11.0]]) + iex> Scholar.Impute.KNNImputter.fit(x, number_of_neighbors: 2) + %Scholar.Impute.KNNImputter{ + statistics: Nx.tensor( + [ + [NaN, NaN], + [NaN, NaN], + [NaN, 8.0], + [7.5, NaN], + [NaN, NaN] + ] + ), + missing_values: :nan + } + """ deftransform fit(x, opts \\ []) do @@ -121,8 +122,8 @@ defmodule Scholar.Impute.KNNImputer do ## Examples iex> x = Nx.tensor([[40.0, 2.0],[4.0, 5.0],[7.0, :nan],[:nan, 8.0],[11.0, 11.0]]) - iex> imputer = Scholar.Impute.KNNImputer.fit(x, number_of_neighbors: 2) - iex> Scholar.Impute.KNNImputer.transform(imputer, x) + iex> imputer = Scholar.Impute.KNNImputter.fit(x, number_of_neighbors: 2) + iex> Scholar.Impute.KNNImputter.transform(imputer, x) Nx.tensor( [ [40.0, 2.0], @@ -191,15 +192,15 @@ defmodule Scholar.Impute.KNNImputer do Nx.less(i, rows) do potential_donor = x[i] - if i == nan_row do - distance = Nx.Constants.infinity({:f, 32}) - row_distances = Nx.indexed_put(row_distances, Nx.new_axis(i, 0), distance) - {{i + 1, x, row_with_value_to_fill, rows, nan_row, nan_col}, row_distances} - else - distance = nan_euclidian(row_with_value_to_fill, nan_col, potential_donor) - row_distances = Nx.indexed_put(row_distances, Nx.new_axis(i, 0), distance) - {{i + 1, x, row_with_value_to_fill, rows, nan_row, nan_col}, row_distances} - end + distance = + if i == nan_row do + Nx.Constants.infinity({:f, 32}) + else + nan_euclidian(row_with_value_to_fill, nan_col, potential_donor) + end + + row_distances = Nx.indexed_put(row_distances, Nx.new_axis(i, 0), distance) + {{i + 1, x, row_with_value_to_fill, rows, nan_row, nan_col}, row_distances} end {_, indices} = Nx.top_k(-row_distances, k: num_neighbors) @@ -221,7 +222,7 @@ defmodule Scholar.Impute.KNNImputer do # if potential neighbor has nan in nan_col, we don't want to calculate distance and the case if potential_neighbour is the row to impute {potential_neighbor} = - if potential_neighbor[nan_col] == Nx.Constants.nan() do + if Nx.is_nan(potential_neighbor[nan_col]) do potential_neighbor = Nx.broadcast(Nx.Constants.infinity({:f, 32}), potential_neighbor) {potential_neighbor} else From a3e0eba690d487f54263f0572f3c6fb04b0e26a2 Mon Sep 17 00:00:00 2001 From: srzeszut Date: Sun, 27 Oct 2024 11:37:23 +0100 Subject: [PATCH 07/13] added type tensors --- lib/scholar/impute/knn_imputter.ex | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/lib/scholar/impute/knn_imputter.ex b/lib/scholar/impute/knn_imputter.ex index 8d4cdef7..1b2a4984 100644 --- a/lib/scholar/impute/knn_imputter.ex +++ b/lib/scholar/impute/knn_imputter.ex @@ -194,7 +194,7 @@ defmodule Scholar.Impute.KNNImputter do distance = if i == nan_row do - Nx.Constants.infinity({:f, 32}) + Nx.Constants.infinity(Nx.type(row_with_value_to_fill)) else nan_euclidian(row_with_value_to_fill, nan_col, potential_donor) end @@ -223,12 +223,18 @@ defmodule Scholar.Impute.KNNImputter do # if potential neighbor has nan in nan_col, we don't want to calculate distance and the case if potential_neighbour is the row to impute {potential_neighbor} = if Nx.is_nan(potential_neighbor[nan_col]) do - potential_neighbor = Nx.broadcast(Nx.Constants.infinity({:f, 32}), potential_neighbor) + potential_neighbor = + Nx.broadcast(Nx.Constants.infinity(Nx.type(potential_neighbor)), potential_neighbor) + {potential_neighbor} else # inputes zeros in nan_col to calculate distance with squared_euclidean - distance will be 0 so no change to the distance value potential_neighbor = - Nx.indexed_put(potential_neighbor, Nx.new_axis(nan_col, 0), Nx.tensor(0)) + Nx.indexed_put( + potential_neighbor, + Nx.new_axis(nan_col, 0), + Nx.tensor(0, type: Nx.type(row)) + ) {potential_neighbor} end @@ -239,7 +245,9 @@ defmodule Scholar.Impute.KNNImputter do # if row has all nans we skip it {weight, potential_neighbor} = if present_coordinates == 0 do - potential_neighbor = Nx.broadcast(Nx.Constants.infinity({:f, 32}), potential_neighbor) + potential_neighbor = + Nx.broadcast(Nx.Constants.infinity(Nx.type(potential_neighbor)), potential_neighbor) + weight = 0 {weight, potential_neighbor} else @@ -252,6 +260,6 @@ defmodule Scholar.Impute.KNNImputter do distance = Nx.sqrt(weight * squared_euclidean(new_row, potential_neighbor)) # return inf if potential_row is row to impute - Nx.select(Nx.is_nan(distance), Nx.Constants.infinity({:f, 32}), distance) + Nx.select(Nx.is_nan(distance), Nx.Constants.infinity(Nx.type(distance)), distance) end end From 108475d0d6d12b0f7232336088fafdd2c67ed9f1 Mon Sep 17 00:00:00 2001 From: srzeszut Date: Mon, 28 Oct 2024 14:38:47 +0100 Subject: [PATCH 08/13] added tests and remove not working checks --- lib/scholar/impute/knn_imputter.ex | 44 +++----- test/scholar/impute/knn_imputter_test.exs | 128 ++++++++++++++++++++++ 2 files changed, 141 insertions(+), 31 deletions(-) create mode 100644 test/scholar/impute/knn_imputter_test.exs diff --git a/lib/scholar/impute/knn_imputter.ex b/lib/scholar/impute/knn_imputter.ex index 1b2a4984..4a0901ed 100644 --- a/lib/scholar/impute/knn_imputter.ex +++ b/lib/scholar/impute/knn_imputter.ex @@ -18,6 +18,8 @@ defmodule Scholar.Impute.KNNImputter do default: :nan, doc: ~S""" The placeholder for the missing values. All occurrences of `:missing_values` will be imputed. + + The default value expects there are no NaNs in the input tensor. """ ], number_of_neighbors: [ @@ -30,7 +32,12 @@ defmodule Scholar.Impute.KNNImputter do @opts_schema NimbleOptions.new!(opts_schema) @doc """ - Imputer for completing missing values using k-Nearest Neighbors. + Imputter for completing missing values using k-Nearest Neighbors. + + Preconditions: + * `number_of_neighbors` is a positive integer. + * number of neighbors must be less than number valid of rows - 1 (valid row is row with more than 1 non nan value) otherwise it is better to use simple imputter + * when you set a value different than :nan in `missing_values` there should be no NaNs in the input tensor ## Options @@ -52,11 +59,11 @@ defmodule Scholar.Impute.KNNImputter do %Scholar.Impute.KNNImputter{ statistics: Nx.tensor( [ - [NaN, NaN], - [NaN, NaN], - [NaN, 8.0], - [7.5, NaN], - [NaN, NaN] + [:nan, :nan], + [:nan, :nan], + [:nan, 8.0], + [7.5, :nan], + [:nan, :nan] ] ), missing_values: :nan @@ -73,12 +80,6 @@ defmodule Scholar.Impute.KNNImputter do raise ArgumentError, "Wrong input rank. Expected: 2, got: #{inspect(input_rank)}" end - if opts[:missing_values] != :nan and - Nx.any(Nx.is_nan(x)) == Nx.tensor(1, type: :u8) do - raise ArgumentError, - ":missing_values other than :nan possible only if there is no Nx.Constant.nan() in the array" - end - x = if opts[:missing_values] != :nan, do: Nx.select(Nx.equal(x, opts[:missing_values]), Nx.Constants.nan(), x), @@ -86,28 +87,9 @@ defmodule Scholar.Impute.KNNImputter do num_neighbors = opts[:number_of_neighbors] - if num_neighbors < 1 do - raise ArgumentError, "Number of neighbors must be greater than 0" - end - - {rows, cols} = Nx.shape(x) - - row_nan_count = Nx.sum(Nx.is_nan(x), axes: [1]) - # row with only 1 non nan value is also considered as all nan row - all_nan_rows = - Nx.select(Nx.greater_equal(row_nan_count, cols - 1), Nx.tensor(1), Nx.tensor(0)) - - all_nan_rows_count = Nx.sum(all_nan_rows) - - if num_neighbors > rows - 1 - Nx.to_number(all_nan_rows_count) do - raise ArgumentError, - "Number of neighbors rows must be less than number valid of rows - 1 (valid row is row with more than 1 non nan value)" - end - placeholder_value = Nx.Constants.nan() |> Nx.tensor() statistics = knn_impute(x, placeholder_value, num_neighbors: num_neighbors) - # statistics = all_nan_rows_count missing_values = opts[:missing_values] %__MODULE__{statistics: statistics, missing_values: missing_values} end diff --git a/test/scholar/impute/knn_imputter_test.exs b/test/scholar/impute/knn_imputter_test.exs new file mode 100644 index 00000000..677a039f --- /dev/null +++ b/test/scholar/impute/knn_imputter_test.exs @@ -0,0 +1,128 @@ +defmodule KNNImputterTest do + use Scholar.Case, async: true + alias Scholar.Impute.KNNImputter + doctest KNNImputter + + describe "general cases" do + def generate_data() do + x = Nx.iota({5, 4}) + x = Nx.select(Nx.equal(Nx.quotient(x, 5), 2), Nx.Constants.nan(), x) + Nx.indexed_put(x, Nx.tensor([[4, 2]]), Nx.tensor([6.0])) + end + + test "general KNN imputer" do + x = generate_data() + jit_fit = Nx.Defn.jit(&KNNImputter.fit/2) + jit_transform = Nx.Defn.jit(&KNNImputter.transform/2) + + knn_imputer = + %KNNImputter{statistics: statistics, missing_values: missing_values} = + jit_fit.(x, missing_values: :nan, number_of_neighbors: 2) + + assert missing_values == :nan + + assert statistics == + Nx.tensor([ + [:nan, :nan, :nan, :nan], + [:nan, :nan, :nan, :nan], + [:nan, :nan, 4.0, 5.0], + [2.0, 3.0, 4.0, :nan], + [:nan, :nan, :nan, :nan] + ]) + + assert jit_transform.(knn_imputer, x) == + Nx.tensor([ + [0.0, 1.0, 2.0, 3.0], + [4.0, 5.0, 6.0, 7.0], + [8.0, 9.0, 4.0, 5.0], + [2.0, 3.0, 4.0, 15.0], + [16.0, 17.0, 6.0, 19.0] + ]) + end + + test "general KNN imputer with different number of neighbors" do + x = generate_data() + jit_fit = Nx.Defn.jit(&KNNImputter.fit/2) + jit_transform = Nx.Defn.jit(&KNNImputter.transform/2) + + knn_imputter = + %KNNImputter{statistics: statistics, missing_values: missing_values} = + jit_fit.(x, missing_values: :nan, number_of_neighbors: 1) + + assert missing_values == :nan + + assert statistics == + Nx.tensor([ + [:nan, :nan, :nan, :nan], + [:nan, :nan, :nan, :nan], + [:nan, :nan, 2.0, 3.0], + [0.0, 1.0, 2.0, :nan], + [:nan, :nan, :nan, :nan] + ]) + + assert jit_transform.(knn_imputter, x) == + Nx.tensor([ + [0.0, 1.0, 2.0, 3.0], + [4.0, 5.0, 6.0, 7.0], + [8.0, 9.0, 2.0, 3.0], + [0.0, 1.0, 2.0, 15.0], + [16.0, 17.0, 6.0, 19.0] + ]) + end + + test "missing values different than :nan" do + x = generate_data() + x = Nx.select(Nx.is_nan(x), Nx.tensor(19.0), x) + jit_fit = Nx.Defn.jit(&KNNImputter.fit/2) + jit_transform = Nx.Defn.jit(&KNNImputter.transform/2) + + knn_imputter = + %KNNImputter{statistics: statistics, missing_values: missing_values} = + jit_fit.(x, missing_values: 19.0, number_of_neighbors: 2) + + assert missing_values == 19.0 + + assert statistics == + Nx.tensor([ + [:nan, :nan, :nan, :nan], + [:nan, :nan, :nan, :nan], + [:nan, :nan, 4.0, 5.0], + [2.0, 3.0, 4.0, :nan], + [:nan, :nan, :nan, 5.0] + ]) + + assert jit_transform.(knn_imputter, x) == + Nx.tensor([ + [0.0, 1.0, 2.0, 3.0], + [4.0, 5.0, 6.0, 7.0], + [8.0, 9.0, 4.0, 5.0], + [2.0, 3.0, 4.0, 15.0], + [16.0, 17.0, 6.0, 5.0] + ]) + end + end + + describe "errors" do + test "Wrong impute rank" do + x = Nx.tensor([1, 2, 2, 3]) + + assert_raise ArgumentError, + "Wrong input rank. Expected: 2, got: 1", + fn -> + KNNImputter.fit(x, missing_values: 1, number_of_neighbors: 2) + end + end + + test "Invalid n_neighbors value" do + x = generate_data() + + jit_fit = Nx.Defn.jit(&KNNImputter.fit/2) + + assert_raise NimbleOptions.ValidationError, + "invalid value for :number_of_neighbors option: expected positive integer, got: -1", + fn -> + jit_fit.(x, missing_values: 1.0, number_of_neighbors: -1) + end + end + end +end From 1a3aae7adc331772205a10431e2bf78801726ca5 Mon Sep 17 00:00:00 2001 From: srzeszut Date: Wed, 30 Oct 2024 11:01:48 +0100 Subject: [PATCH 09/13] change errors and refactor --- lib/scholar/impute/knn_imputter.ex | 27 ++++++++++++----------- test/scholar/impute/knn_imputter_test.exs | 9 ++++---- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/lib/scholar/impute/knn_imputter.ex b/lib/scholar/impute/knn_imputter.ex index 4a0901ed..b9d220f8 100644 --- a/lib/scholar/impute/knn_imputter.ex +++ b/lib/scholar/impute/knn_imputter.ex @@ -3,8 +3,8 @@ defmodule Scholar.Impute.KNNImputter do Imputer for completing missing values using k-Nearest Neighbors. Each sample's missing values are imputed using the mean value from - `n_neighbors` nearest neighbors found in the training set. Two samples are - close if the features that neither is missing are close. + `n_neighbors` nearest neighbors found in the training set. Two samples are + close if the features that neither is missing are close. """ import Nx.Defn import Scholar.Metrics.Distance @@ -36,7 +36,8 @@ defmodule Scholar.Impute.KNNImputter do Preconditions: * `number_of_neighbors` is a positive integer. - * number of neighbors must be less than number valid of rows - 1 (valid row is row with more than 1 non nan value) otherwise it is better to use simple imputter + * number of neighbors must be less than number valid of rows - 1 + (valid row is row with more than 1 non nan value) otherwise it is better to use simple imputter * when you set a value different than :nan in `missing_values` there should be no NaNs in the input tensor ## Options @@ -77,20 +78,18 @@ defmodule Scholar.Impute.KNNImputter do input_rank = Nx.rank(x) if input_rank != 2 do - raise ArgumentError, "Wrong input rank. Expected: 2, got: #{inspect(input_rank)}" + raise ArgumentError, "wrong input rank. Expected: 2, got: #{inspect(input_rank)}" end - x = - if opts[:missing_values] != :nan, - do: Nx.select(Nx.equal(x, opts[:missing_values]), Nx.Constants.nan(), x), - else: x + missing_values = opts[:missing_values] - num_neighbors = opts[:number_of_neighbors] + x = + if missing_values != :nan, + do: Nx.select(Nx.equal(x, missing_values), :nan, x), + else: x - placeholder_value = Nx.Constants.nan() |> Nx.tensor() - statistics = knn_impute(x, placeholder_value, num_neighbors: num_neighbors) - missing_values = opts[:missing_values] + statistics = knn_impute(x, num_neighbors: opts[:number_of_neighbors], missing_values: missing_values) %__MODULE__{statistics: statistics, missing_values: missing_values} end @@ -121,11 +120,13 @@ defmodule Scholar.Impute.KNNImputter do Nx.select(mask, statistics, x) end - defnp knn_impute(x, placeholder_value, opts \\ []) do + defnp knn_impute(x, opts \\ []) do mask = Nx.is_nan(x) {num_rows, num_cols} = Nx.shape(x) num_neighbors = opts[:num_neighbors] + placeholder_value = Nx.tensor(:nan) + values_to_impute = Nx.broadcast(placeholder_value, x) {_, values_to_impute} = diff --git a/test/scholar/impute/knn_imputter_test.exs b/test/scholar/impute/knn_imputter_test.exs index 677a039f..87b297cb 100644 --- a/test/scholar/impute/knn_imputter_test.exs +++ b/test/scholar/impute/knn_imputter_test.exs @@ -72,7 +72,8 @@ defmodule KNNImputterTest do test "missing values different than :nan" do x = generate_data() - x = Nx.select(Nx.is_nan(x), Nx.tensor(19.0), x) + x = Nx.select(Nx.is_nan(x), 19.0, x) +# x = Nx.select(Nx.equal(x,19), :nan, x) jit_fit = Nx.Defn.jit(&KNNImputter.fit/2) jit_transform = Nx.Defn.jit(&KNNImputter.transform/2) @@ -103,17 +104,17 @@ defmodule KNNImputterTest do end describe "errors" do - test "Wrong impute rank" do + test "invalid impute rank" do x = Nx.tensor([1, 2, 2, 3]) assert_raise ArgumentError, - "Wrong input rank. Expected: 2, got: 1", + "wrong input rank. Expected: 2, got: 1", fn -> KNNImputter.fit(x, missing_values: 1, number_of_neighbors: 2) end end - test "Invalid n_neighbors value" do + test "invalid n_neighbors value" do x = generate_data() jit_fit = Nx.Defn.jit(&KNNImputter.fit/2) From 366584ea96a10d77bc7467723d12936dbaa0b8fd Mon Sep 17 00:00:00 2001 From: srzeszut Date: Wed, 27 Nov 2024 13:26:29 +0100 Subject: [PATCH 10/13] apply suggestions --- lib/scholar/impute/knn_imputter.ex | 38 +++++++++++------------ test/scholar/impute/knn_imputter_test.exs | 12 +++---- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/lib/scholar/impute/knn_imputter.ex b/lib/scholar/impute/knn_imputter.ex index b9d220f8..4b0aa019 100644 --- a/lib/scholar/impute/knn_imputter.ex +++ b/lib/scholar/impute/knn_imputter.ex @@ -22,7 +22,7 @@ defmodule Scholar.Impute.KNNImputter do The default value expects there are no NaNs in the input tensor. """ ], - number_of_neighbors: [ + num_neighbors: [ type: :pos_integer, default: 2, doc: "The number of nearest neighbors." @@ -35,7 +35,7 @@ defmodule Scholar.Impute.KNNImputter do Imputter for completing missing values using k-Nearest Neighbors. Preconditions: - * `number_of_neighbors` is a positive integer. + * `num_neighbors` is a positive integer. * number of neighbors must be less than number valid of rows - 1 (valid row is row with more than 1 non nan value) otherwise it is better to use simple imputter * when you set a value different than :nan in `missing_values` there should be no NaNs in the input tensor @@ -50,13 +50,12 @@ defmodule Scholar.Impute.KNNImputter do * `:missing_values` - the same value as in `:missing_values` - * `:statistics` - The imputation fill value for each feature. Computing statistics can result in - [`Nx.Constant.nan/0`](https://hexdocs.pm/nx/Nx.Constants.html#nan/0) values. + * `:statistics` - The imputation fill value for each feature. Computing statistics can result in values. ## Examples iex> x = Nx.tensor([[40.0, 2.0],[4.0, 5.0],[7.0, :nan],[:nan, 8.0],[11.0, 11.0]]) - iex> Scholar.Impute.KNNImputter.fit(x, number_of_neighbors: 2) + iex> Scholar.Impute.KNNImputter.fit(x, num_neighbors: 2) %Scholar.Impute.KNNImputter{ statistics: Nx.tensor( [ @@ -89,7 +88,7 @@ defmodule Scholar.Impute.KNNImputter do else: x - statistics = knn_impute(x, num_neighbors: opts[:number_of_neighbors], missing_values: missing_values) + statistics = knn_impute(x, num_neighbors: opts[:num_neighbors], missing_values: missing_values) %__MODULE__{statistics: statistics, missing_values: missing_values} end @@ -103,7 +102,7 @@ defmodule Scholar.Impute.KNNImputter do ## Examples iex> x = Nx.tensor([[40.0, 2.0],[4.0, 5.0],[7.0, :nan],[:nan, 8.0],[11.0, 11.0]]) - iex> imputer = Scholar.Impute.KNNImputter.fit(x, number_of_neighbors: 2) + iex> imputer = Scholar.Impute.KNNImputter.fit(x, num_neighbors: 2) iex> Scholar.Impute.KNNImputter.transform(imputer, x) Nx.tensor( [ @@ -131,22 +130,17 @@ defmodule Scholar.Impute.KNNImputter do {_, values_to_impute} = while {{row = 0, mask, num_neighbors, num_rows, x}, values_to_impute}, - Nx.less(row, num_rows) do + row < num_rows do {_, values_to_impute} = while {{col = 0, mask, num_neighbors, num_cols, row, x}, values_to_impute}, - Nx.less(col, num_cols) do - if mask[row][col] > 0 do + col < num_cols do + if mask[row][col] do {rows, cols} = Nx.shape(x) neighbor_avg = calculate_knn(x, row, col, rows: rows, num_neighbors: opts[:num_neighbors]) - indices = - [Nx.stack(row), Nx.stack(col)] - |> Nx.concatenate() - |> Nx.stack() - - values_to_impute = Nx.indexed_put(values_to_impute, indices, Nx.stack(neighbor_avg)) + values_to_impute = Nx.put_slice(values_to_impute, [row, col], Nx.reshape(neighbor_avg, {1, 1})) {{col + 1, mask, num_neighbors, cols, row, x}, values_to_impute} else {{col + 1, mask, num_neighbors, num_cols, row, x}, values_to_impute} @@ -172,19 +166,25 @@ defmodule Scholar.Impute.KNNImputter do # to the row is under its index in the tensor {_, row_distances} = while {{i = 0, x, row_with_value_to_fill, rows, nan_row, nan_col}, row_distances}, - Nx.less(i, rows) do + i < rows do +# potential_donors = Nx.vectorize(x, :rows) +# distances = nan_euclidean(row_with_value_to_fill, nan_col, potential_donors) |> Nx.devectorize() +# row_distances = Nx.indexed_put(distances, Nx.new_axis(i, 0), Nx.Constants.infinity()) potential_donor = x[i] distance = if i == nan_row do Nx.Constants.infinity(Nx.type(row_with_value_to_fill)) else - nan_euclidian(row_with_value_to_fill, nan_col, potential_donor) + nan_euclidean(row_with_value_to_fill, nan_col, potential_donor) end row_distances = Nx.indexed_put(row_distances, Nx.new_axis(i, 0), distance) {{i + 1, x, row_with_value_to_fill, rows, nan_row, nan_col}, row_distances} end +# potential_donors = Nx.vectorize(x, :rows) +# distances = nan_euclidean(row_with_value_to_fill, nan_col, potential_donors) |> Nx.devectorize() +# row_distances = Nx.indexed_put(distances, [i], Nx.Constants.infinity()) {_, indices} = Nx.top_k(-row_distances, k: num_neighbors) @@ -194,7 +194,7 @@ defmodule Scholar.Impute.KNNImputter do end # nan_col is the column of the value to impute - defnp nan_euclidian(row, nan_col, potential_neighbor) do + defnp nan_euclidean(row, nan_col, potential_neighbor) do {coordinates} = Nx.shape(row) # minus nan column diff --git a/test/scholar/impute/knn_imputter_test.exs b/test/scholar/impute/knn_imputter_test.exs index 87b297cb..c76becb4 100644 --- a/test/scholar/impute/knn_imputter_test.exs +++ b/test/scholar/impute/knn_imputter_test.exs @@ -17,7 +17,7 @@ defmodule KNNImputterTest do knn_imputer = %KNNImputter{statistics: statistics, missing_values: missing_values} = - jit_fit.(x, missing_values: :nan, number_of_neighbors: 2) + jit_fit.(x, missing_values: :nan, num_neighbors: 2) assert missing_values == :nan @@ -47,7 +47,7 @@ defmodule KNNImputterTest do knn_imputter = %KNNImputter{statistics: statistics, missing_values: missing_values} = - jit_fit.(x, missing_values: :nan, number_of_neighbors: 1) + jit_fit.(x, missing_values: :nan, num_neighbors: 1) assert missing_values == :nan @@ -79,7 +79,7 @@ defmodule KNNImputterTest do knn_imputter = %KNNImputter{statistics: statistics, missing_values: missing_values} = - jit_fit.(x, missing_values: 19.0, number_of_neighbors: 2) + jit_fit.(x, missing_values: 19.0, num_neighbors: 2) assert missing_values == 19.0 @@ -110,7 +110,7 @@ defmodule KNNImputterTest do assert_raise ArgumentError, "wrong input rank. Expected: 2, got: 1", fn -> - KNNImputter.fit(x, missing_values: 1, number_of_neighbors: 2) + KNNImputter.fit(x, missing_values: 1, num_neighbors: 2) end end @@ -120,9 +120,9 @@ defmodule KNNImputterTest do jit_fit = Nx.Defn.jit(&KNNImputter.fit/2) assert_raise NimbleOptions.ValidationError, - "invalid value for :number_of_neighbors option: expected positive integer, got: -1", + "invalid value for :num_neighbors option: expected positive integer, got: -1", fn -> - jit_fit.(x, missing_values: 1.0, number_of_neighbors: -1) + jit_fit.(x, missing_values: 1.0, num_neighbors: -1) end end end From f4b6c3987f32286e77fd97fe83924fb8895bb2a9 Mon Sep 17 00:00:00 2001 From: srzeszut Date: Wed, 27 Nov 2024 14:59:58 +0100 Subject: [PATCH 11/13] apply suggestions --- lib/scholar/impute/knn_imputter.ex | 51 ++++++++++++++---------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/lib/scholar/impute/knn_imputter.ex b/lib/scholar/impute/knn_imputter.ex index 4b0aa019..d62bfaf2 100644 --- a/lib/scholar/impute/knn_imputter.ex +++ b/lib/scholar/impute/knn_imputter.ex @@ -14,7 +14,7 @@ defmodule Scholar.Impute.KNNImputter do opts_schema = [ missing_values: [ - type: {:or, [:float, :integer, {:in, [:nan]}]}, + type: {:or, [:float, :integer, {:in, [:infinity, :neg_infinity, :nan]}]}, default: :nan, doc: ~S""" The placeholder for the missing values. All occurrences of `:missing_values` will be imputed. @@ -35,10 +35,9 @@ defmodule Scholar.Impute.KNNImputter do Imputter for completing missing values using k-Nearest Neighbors. Preconditions: - * `num_neighbors` is a positive integer. - * number of neighbors must be less than number valid of rows - 1 - (valid row is row with more than 1 non nan value) otherwise it is better to use simple imputter - * when you set a value different than :nan in `missing_values` there should be no NaNs in the input tensor + * The number of neighbors must be less than the number of valid rows - 1. + * A valid row is a row with more than 1 non-NaN values. Otherwise it is better to use a simpler imputter. + * When you set a value different than :nan in `missing_values` there should be no NaNs in the input tensor ## Options @@ -48,7 +47,7 @@ defmodule Scholar.Impute.KNNImputter do The function returns a struct with the following parameters: - * `:missing_values` - the same value as in `:missing_values` + * `:missing_values` - the same value as in the `:missing_values` option * `:statistics` - The imputation fill value for each feature. Computing statistics can result in values. @@ -164,27 +163,25 @@ defmodule Scholar.Impute.KNNImputter do # calculate distance between row with nan to fill and all other rows where distance # to the row is under its index in the tensor - {_, row_distances} = - while {{i = 0, x, row_with_value_to_fill, rows, nan_row, nan_col}, row_distances}, - i < rows do -# potential_donors = Nx.vectorize(x, :rows) -# distances = nan_euclidean(row_with_value_to_fill, nan_col, potential_donors) |> Nx.devectorize() -# row_distances = Nx.indexed_put(distances, Nx.new_axis(i, 0), Nx.Constants.infinity()) - potential_donor = x[i] - - distance = - if i == nan_row do - Nx.Constants.infinity(Nx.type(row_with_value_to_fill)) - else - nan_euclidean(row_with_value_to_fill, nan_col, potential_donor) - end - - row_distances = Nx.indexed_put(row_distances, Nx.new_axis(i, 0), distance) - {{i + 1, x, row_with_value_to_fill, rows, nan_row, nan_col}, row_distances} - end -# potential_donors = Nx.vectorize(x, :rows) -# distances = nan_euclidean(row_with_value_to_fill, nan_col, potential_donors) |> Nx.devectorize() -# row_distances = Nx.indexed_put(distances, [i], Nx.Constants.infinity()) +# {_, row_distances} = +# while {{i = 0, x, row_with_value_to_fill, rows, nan_row, nan_col}, row_distances}, +# i < rows do +# +# potential_donor = x[i] +# +# distance = +# if i == nan_row do +# Nx.Constants.infinity(Nx.type(row_with_value_to_fill)) +# else +# nan_euclidean(row_with_value_to_fill, nan_col, potential_donor) +# end +# +# row_distances = Nx.indexed_put(row_distances, Nx.new_axis(i, 0), distance) +# {{i + 1, x, row_with_value_to_fill, rows, nan_row, nan_col}, row_distances} +# end + potential_donors = Nx.vectorize(x, :rows) + distances = nan_euclidean(row_with_value_to_fill, nan_col, potential_donors) |> Nx.devectorize() + row_distances = Nx.indexed_put(distances, Nx.tensor(nan_row), Nx.Constants.infinity()) {_, indices} = Nx.top_k(-row_distances, k: num_neighbors) From e23a9dd6c524c8397d77093159360b25af7f6bc9 Mon Sep 17 00:00:00 2001 From: srzeszut Date: Thu, 28 Nov 2024 12:45:51 +0100 Subject: [PATCH 12/13] apply suggested changes --- lib/scholar/impute/knn_imputter.ex | 38 +++++++++++++++--------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/lib/scholar/impute/knn_imputter.ex b/lib/scholar/impute/knn_imputter.ex index d62bfaf2..3107eca2 100644 --- a/lib/scholar/impute/knn_imputter.ex +++ b/lib/scholar/impute/knn_imputter.ex @@ -163,25 +163,18 @@ defmodule Scholar.Impute.KNNImputter do # calculate distance between row with nan to fill and all other rows where distance # to the row is under its index in the tensor -# {_, row_distances} = -# while {{i = 0, x, row_with_value_to_fill, rows, nan_row, nan_col}, row_distances}, -# i < rows do -# -# potential_donor = x[i] -# -# distance = -# if i == nan_row do -# Nx.Constants.infinity(Nx.type(row_with_value_to_fill)) -# else -# nan_euclidean(row_with_value_to_fill, nan_col, potential_donor) -# end -# -# row_distances = Nx.indexed_put(row_distances, Nx.new_axis(i, 0), distance) -# {{i + 1, x, row_with_value_to_fill, rows, nan_row, nan_col}, row_distances} -# end - potential_donors = Nx.vectorize(x, :rows) - distances = nan_euclidean(row_with_value_to_fill, nan_col, potential_donors) |> Nx.devectorize() - row_distances = Nx.indexed_put(distances, Nx.tensor(nan_row), Nx.Constants.infinity()) + {_, row_distances} = + while {{i = 0, x, row_with_value_to_fill, rows, nan_row, nan_col}, row_distances}, + i < rows do + + potential_donor = x[i] + + distance = + calculate_distance(row_with_value_to_fill, nan_col, potential_donor,nan_row) + + row_distances = Nx.indexed_put(row_distances, Nx.new_axis(i, 0), distance) + {{i + 1, x, row_with_value_to_fill, rows, nan_row, nan_col}, row_distances} + end {_, indices} = Nx.top_k(-row_distances, k: num_neighbors) @@ -190,6 +183,13 @@ defmodule Scholar.Impute.KNNImputter do Nx.sum(values) / num_neighbors end + defnp calculate_distance(row,nan_col,potential_donor,nan_row) do + case row do + ^nan_row -> Nx.Constants.infinity(Nx.type(row)) + _ -> nan_euclidean(row, nan_col, potential_donor) + end + end + # nan_col is the column of the value to impute defnp nan_euclidean(row, nan_col, potential_neighbor) do {coordinates} = Nx.shape(row) From d5913eb4d4b76d38a7ccb51aa1c423f6a8512966 Mon Sep 17 00:00:00 2001 From: srzeszut Date: Thu, 28 Nov 2024 12:55:25 +0100 Subject: [PATCH 13/13] mix format --- lib/scholar/impute/knn_imputter.ex | 16 +++++++++------- test/scholar/impute/knn_imputter_test.exs | 2 +- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/lib/scholar/impute/knn_imputter.ex b/lib/scholar/impute/knn_imputter.ex index 3107eca2..6b5a602e 100644 --- a/lib/scholar/impute/knn_imputter.ex +++ b/lib/scholar/impute/knn_imputter.ex @@ -83,11 +83,12 @@ defmodule Scholar.Impute.KNNImputter do x = if missing_values != :nan, - do: Nx.select(Nx.equal(x, missing_values), :nan, x), - else: x + do: Nx.select(Nx.equal(x, missing_values), :nan, x), + else: x + statistics = + knn_impute(x, num_neighbors: opts[:num_neighbors], missing_values: missing_values) - statistics = knn_impute(x, num_neighbors: opts[:num_neighbors], missing_values: missing_values) %__MODULE__{statistics: statistics, missing_values: missing_values} end @@ -139,7 +140,9 @@ defmodule Scholar.Impute.KNNImputter do neighbor_avg = calculate_knn(x, row, col, rows: rows, num_neighbors: opts[:num_neighbors]) - values_to_impute = Nx.put_slice(values_to_impute, [row, col], Nx.reshape(neighbor_avg, {1, 1})) + values_to_impute = + Nx.put_slice(values_to_impute, [row, col], Nx.reshape(neighbor_avg, {1, 1})) + {{col + 1, mask, num_neighbors, cols, row, x}, values_to_impute} else {{col + 1, mask, num_neighbors, num_cols, row, x}, values_to_impute} @@ -166,11 +169,10 @@ defmodule Scholar.Impute.KNNImputter do {_, row_distances} = while {{i = 0, x, row_with_value_to_fill, rows, nan_row, nan_col}, row_distances}, i < rows do - potential_donor = x[i] distance = - calculate_distance(row_with_value_to_fill, nan_col, potential_donor,nan_row) + calculate_distance(row_with_value_to_fill, nan_col, potential_donor, nan_row) row_distances = Nx.indexed_put(row_distances, Nx.new_axis(i, 0), distance) {{i + 1, x, row_with_value_to_fill, rows, nan_row, nan_col}, row_distances} @@ -183,7 +185,7 @@ defmodule Scholar.Impute.KNNImputter do Nx.sum(values) / num_neighbors end - defnp calculate_distance(row,nan_col,potential_donor,nan_row) do + defnp calculate_distance(row, nan_col, potential_donor, nan_row) do case row do ^nan_row -> Nx.Constants.infinity(Nx.type(row)) _ -> nan_euclidean(row, nan_col, potential_donor) diff --git a/test/scholar/impute/knn_imputter_test.exs b/test/scholar/impute/knn_imputter_test.exs index c76becb4..8937e7c9 100644 --- a/test/scholar/impute/knn_imputter_test.exs +++ b/test/scholar/impute/knn_imputter_test.exs @@ -73,7 +73,7 @@ defmodule KNNImputterTest do test "missing values different than :nan" do x = generate_data() x = Nx.select(Nx.is_nan(x), 19.0, x) -# x = Nx.select(Nx.equal(x,19), :nan, x) + # x = Nx.select(Nx.equal(x,19), :nan, x) jit_fit = Nx.Defn.jit(&KNNImputter.fit/2) jit_transform = Nx.Defn.jit(&KNNImputter.transform/2)