From ea158f196623e8f5cdaaa3cd2af5b01633576e38 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?=
Date: Tue, 14 May 2024 15:13:10 +0200
Subject: [PATCH] Add metric to RandomProjectionForest and LargeVis, more
 unit-tests, etc

---
 lib/scholar/neighbors/brute_knn.ex            |   2 +-
 lib/scholar/neighbors/knn_classifier.ex       | 138 ++++++++++--------
 lib/scholar/neighbors/large_vis.ex            |  32 +++-
 .../neighbors/random_projection_forest.ex     |  24 ++-
 lib/scholar/neighbors/utils.ex                |  18 ++-
 .../scholar/neighbors/knn_classifier_test.exs |  24 +++
 6 files changed, 161 insertions(+), 77 deletions(-)

diff --git a/lib/scholar/neighbors/brute_knn.ex b/lib/scholar/neighbors/brute_knn.ex
index bfb584af..97ecbaab 100644
--- a/lib/scholar/neighbors/brute_knn.ex
+++ b/lib/scholar/neighbors/brute_knn.ex
@@ -25,7 +25,7 @@ defmodule Scholar.Neighbors.BruteKNN do
       type: {:or, [{:custom, Scholar.Options, :metric, []}, {:fun, 2}]},
       default: {:minkowski, 2},
       doc: ~S"""
-      The function that measures distance between two points. Possible values:
+      The function that measures the distance between two points. Possible values:
 
       * `{:minkowski, p}` - Minkowski metric. By changing value of `p` parameter (a positive number or `:infinity`)
       we can set Manhattan (`1`), Euclidean (`2`), Chebyshev (`:infinity`), or any arbitrary $L_p$ metric.
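The `:metric` option documented in this hunk accepts either a predefined descriptor or an arbitrary 2-arity function. A minimal sketch of the three accepted forms (illustrative usage, not part of the patch):

    x = Nx.iota({5, 2}, type: :f32)
    # Minkowski with p = 2, i.e. the Euclidean metric (the default).
    Scholar.Neighbors.BruteKNN.fit(x, num_neighbors: 2, metric: {:minkowski, 2})
    # Cosine metric.
    Scholar.Neighbors.BruteKNN.fit(x, num_neighbors: 2, metric: :cosine)
    # Any 2-arity distance function, e.g. Manhattan from Scholar.Metrics.Distance.
    Scholar.Neighbors.BruteKNN.fit(x, num_neighbors: 2, metric: &Scholar.Metrics.Distance.manhattan/2)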
diff --git a/lib/scholar/neighbors/knn_classifier.ex b/lib/scholar/neighbors/knn_classifier.ex
index d5bfb0a7..a6a963b8 100644
--- a/lib/scholar/neighbors/knn_classifier.ex
+++ b/lib/scholar/neighbors/knn_classifier.ex
@@ -2,23 +2,22 @@ defmodule Scholar.Neighbors.KNNClassifier do
   @moduledoc """
   K-Nearest Neighbors Classifier.
 
-  Performs classifiction by looking at the k-nearest neighbors of a point and using (weighted) majority voting.
+  Performs classification by (weighted) majority voting among the k-nearest neighbors of a point.
   """
 
   import Nx.Defn
+  import Scholar.Shared
   require Nx
 
-  @derive {Nx.Container, keep: [:algorithm, :num_classes, :weights], containers: [:labels]}
+  @derive {Nx.Container, keep: [:num_classes, :weights], containers: [:algorithm, :labels]}
   defstruct [:algorithm, :num_classes, :weights, :labels]
 
   opts = [
     algorithm: [
-      type: {:or, [:atom, {:tuple, [:atom, :keyword_list]}]},
+      type: :atom,
       default: :brute,
       doc: """
-      k-NN algorithm to be used for finding the nearest neighbors. It can be provided as
-      an atom or a tuple containing an atom and algorithm specific options.
-      Possible values for the atom:
+      Algorithm used to compute the k-nearest neighbors. Possible values:
 
         * `:brute` - Brute-force search. See `Scholar.Neighbors.BruteKNN` for more details.
 
         * `:kd_tree` - k-d tree. See `Scholar.Neighbors.KDTree` for more details.
 
         * `:random_projection_forest` - Random projection forest. See `Scholar.Neighbors.RandomProjectionForest` for more details.
 
-        * Module implementing fit/2 and predict/2.
-      """
-    ],
-    num_neighbors: [
-      required: true,
-      type: :pos_integer,
-      doc: "The number of nearest neighbors."
-    ],
-    metric: [
-      type: {:or, [{:custom, Scholar.Options, :metric, []}, {:fun, 2}]},
-      default: {:minkowski, 2},
-      doc: """
-      The function that measures distance between two points. Possible values:
-
-      * `{:minkowski, p}` - Minkowski metric. By changing value of `p` parameter (a positive number or `:infinity`)
-      we can set Manhattan (`1`), Euclidean (`2`), Chebyshev (`:infinity`), or any arbitrary $L_p$ metric.
-
-      * `:cosine` - Cosine metric.
-
-      Keep in mind that different algorithms support different metrics. For more information have a look at the corresponding modules.
+        * Module implementing `fit(data, opts)` and `predict(model, query)`. `predict/2` must return a tuple
+          containing the indices of the k-nearest neighbors of the query points as well as the distances
+          between the query points and their k-nearest neighbors.
       """
     ],
     num_classes: [
@@ -76,6 +57,8 @@ defmodule Scholar.Neighbors.KNNClassifier do
 
   #{NimbleOptions.docs(@opts_schema)}
 
+  Algorithm-specific options (e.g. `:num_neighbors`, `:metric`) should be provided together with the classifier options.
+
   ## Examples
 
       iex> x = Nx.tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
@@ -85,31 +68,54 @@ defmodule Scholar.Neighbors.KNNClassifier do
       Scholar.Neighbors.BruteKNN.fit(x, num_neighbors: 3)
       iex> model.labels
       Nx.tensor([0, 0, 0, 1, 1])
+
+      iex> x = Nx.tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
+      iex> y = Nx.tensor([0, 0, 0, 1, 1])
+      iex> model = Scholar.Neighbors.KNNClassifier.fit(x, y, algorithm: :kd_tree, num_neighbors: 3, metric: {:minkowski, 1}, num_classes: 2)
+      iex> model.algorithm
+      Scholar.Neighbors.KDTree.fit(x, num_neighbors: 3, metric: {:minkowski, 1})
+      iex> model.labels
+      Nx.tensor([0, 0, 0, 1, 1])
+
+      iex> x = Nx.tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
+      iex> y = Nx.tensor([0, 0, 0, 1, 1])
+      iex> key = Nx.Random.key(12)
+      iex> model = Scholar.Neighbors.KNNClassifier.fit(x, y, algorithm: :random_projection_forest, num_neighbors: 2, num_classes: 2, num_trees: 4, key: key)
+      iex> model.algorithm
+      Scholar.Neighbors.RandomProjectionForest.fit(x, num_neighbors: 2, num_trees: 4, key: key)
+      iex> model.labels
+      Nx.tensor([0, 0, 0, 1, 1])
   """
   deftransform fit(x, y, opts) do
     if Nx.rank(x) != 2 do
       raise ArgumentError,
-            "expected x to have shape {num_samples, num_features},
-             got tensor with shape: #{inspect(Nx.shape(x))}"
+            """
+            expected x to have shape {num_samples, num_features}, \
+            got tensor with shape: #{inspect(Nx.shape(x))}
+            """
     end
 
-    if Nx.rank(y) != 1 and Nx.axis_size(x, 0) == Nx.axis_size(y, 0) do
+    if Nx.rank(y) != 1 do
       raise ArgumentError,
-            "expected y to have shape {num_samples},
-             got tensor with shape: #{inspect(Nx.shape(y))}"
+            """
+            expected y to have shape {num_samples}, \
+            got tensor with shape: #{inspect(Nx.shape(y))}
+            """
     end
 
-    opts = NimbleOptions.validate!(opts, @opts_schema)
+    if Nx.axis_size(x, 0) != Nx.axis_size(y, 0) do
+      raise ArgumentError,
+            """
+            expected x and y to have the same first dimension, \
+            got #{Nx.axis_size(x, 0)} and #{Nx.axis_size(y, 0)}
+            """
+    end
 
-    {algorithm_name, algorithm_opts} =
-      if is_atom(opts[:algorithm]) do
-        {opts[:algorithm], []}
-      else
-        opts[:algorithm]
-      end
+    {opts, algorithm_opts} = Keyword.split(opts, [:algorithm, :num_classes, :weights])
+    opts = NimbleOptions.validate!(opts, @opts_schema)
 
-    knn_module =
-      case algorithm_name do
+    algorithm_module =
+      case opts[:algorithm] do
         :brute ->
           Scholar.Neighbors.BruteKNN
 
@@ -119,22 +125,11 @@ defmodule Scholar.Neighbors.KNNClassifier do
         :random_projection_forest ->
           Scholar.Neighbors.RandomProjectionForest
 
-        knn_module when is_atom(knn_module) ->
-          knn_module
-
-        _ ->
-          raise ArgumentError,
-                """
-                not supported
-                """
+        module when is_atom(module) ->
+          module
       end
 
-    # TODO: Maybe raise an error if :num_neighbors or :metric is already in algorithm_opts?
-
-    algorithm_opts = Keyword.put(algorithm_opts, :num_neighbors, opts[:num_neighbors])
-    algorithm_opts = Keyword.put(algorithm_opts, :metric, opts[:metric])
-
-    algorithm = knn_module.fit(x, algorithm_opts)
+    algorithm = algorithm_module.fit(x, algorithm_opts)
 
     %__MODULE__{
       algorithm: algorithm,
@@ -156,9 +151,8 @@ defmodule Scholar.Neighbors.KNNClassifier do
       iex> Scholar.Neighbors.KNNClassifier.predict(model, x_test)
       Nx.tensor([0, 0, 1])
   """
-  deftransform predict(model, x) do
-    knn_module = model.algorithm.__struct__
-    {neighbors, distances} = knn_module.predict(model.algorithm, x)
+  defn predict(model, x) do
+    {neighbors, distances} = compute_knn(model.algorithm, x)
     labels_pred = Nx.take(model.labels, neighbors)
 
     case model.weights do
@@ -167,6 +161,36 @@ defmodule Scholar.Neighbors.KNNClassifier do
     end
   end
 
+  defn predict_proba(model, x) do
+    num_samples = Nx.axis_size(x, 0)
+    {neighbors, distances} = compute_knn(model.algorithm, x)
+    labels_pred = Nx.take(model.labels, neighbors)
+    type = Nx.Type.merge(to_float_type(x), {:f, 32})
+    proba = Nx.broadcast(Nx.tensor(0.0, type: type), {num_samples, model.num_classes})
+
+    weights =
+      case model.weights do
+        :distance -> check_weights(distances)
+        :uniform -> Nx.broadcast(1.0, neighbors)
+      end
+
+    indices =
+      Nx.stack(
+        [Nx.iota(Nx.shape(labels_pred), axis: 0), labels_pred],
+        axis: -1 # TODO: Replace -1 here
+      )
+      |> Nx.flatten(axes: [0, 1])
+
+    proba = Nx.indexed_add(proba, indices, Nx.flatten(weights))
+    normalizer = Nx.sum(proba, axes: [1])
+    normalizer = Nx.select(normalizer == 0, 1, normalizer)
+    proba / Nx.new_axis(normalizer, -1) # TODO: Replace -1 here
+  end
+
+  deftransformp compute_knn(algorithm, x) do
+    algorithm.__struct__.predict(algorithm, x)
+  end
+
   defnp check_weights(weights) do
     zero_mask = weights == 0
     zero_rows = zero_mask |> Nx.any(axes: [1], keep_axes: true) |> Nx.broadcast(weights)
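Because the `case` in `fit/3` falls through on any atom, `algorithm:` can also name a module directly, as long as it implements `fit/2` and `predict/2` returning `{neighbor_indices, distances}`. A sketch of that use (data is illustrative; the built-in `Scholar.Neighbors.BruteKNN` stands in for a custom backend):

    x = Nx.tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
    y = Nx.tensor([0, 0, 0, 1, 1])

    model =
      Scholar.Neighbors.KNNClassifier.fit(x, y,
        # Any module with fit/2 and predict/2 works here.
        algorithm: Scholar.Neighbors.BruteKNN,
        num_neighbors: 3,
        num_classes: 2
      )

    Scholar.Neighbors.KNNClassifier.predict(model, Nx.tensor([[1, 3]]))

The new `predict_proba/2` returns per-class probabilities; each row sums to one, with all-zero rows guarded by the `normalizer` clamp.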
diff --git a/lib/scholar/neighbors/large_vis.ex b/lib/scholar/neighbors/large_vis.ex
index 8f98fdbf..076a9c9c 100644
--- a/lib/scholar/neighbors/large_vis.ex
+++ b/lib/scholar/neighbors/large_vis.ex
@@ -14,7 +14,7 @@ defmodule Scholar.Neighbors.LargeVis do
   import Nx.Defn
   import Scholar.Shared
   require Nx
-  alias Scholar.Neighbors.RandomProjectionForest, as: Forest
+  alias Scholar.Neighbors.RandomProjectionForest
   alias Scholar.Neighbors.Utils
 
   opts = [
@@ -23,6 +23,11 @@ defmodule Scholar.Neighbors.LargeVis do
       type: :pos_integer,
       doc: "The number of neighbors in the graph."
     ],
+    metric: [
+      type: {:in, [:squared_euclidean, :euclidean]},
+      default: :euclidean,
+      doc: "The function that measures the distance between two points."
+    ],
     min_leaf_size: [
       type: :pos_integer,
       doc: """
@@ -63,7 +68,7 @@ defmodule Scholar.Neighbors.LargeVis do
 
       iex> key = Nx.Random.key(12)
      iex> tensor = Nx.iota({5, 2})
-      iex> {graph, distances} = Scholar.Neighbors.LargeVis.fit(tensor, num_neighbors: 2, min_leaf_size: 2, num_trees: 3, key: key)
+      iex> {graph, distances} = Scholar.Neighbors.LargeVis.fit(tensor, num_neighbors: 2, metric: :squared_euclidean, min_leaf_size: 2, num_trees: 3, key: key)
       iex> graph
       #Nx.Tensor<
         u32[5][2]
@@ -98,6 +103,13 @@ defmodule Scholar.Neighbors.LargeVis do
     opts = NimbleOptions.validate!(opts, @opts_schema)
     k = opts[:num_neighbors]
+
+    metric =
+      case opts[:metric] do
+        :euclidean -> &Scholar.Metrics.Distance.euclidean/2
+        :squared_euclidean -> &Scholar.Metrics.Distance.squared_euclidean/2
+      end
+
     min_leaf_size = opts[:min_leaf_size] || max(10, 2 * k)
 
     size = Nx.axis_size(tensor, 0)
@@ -108,6 +120,7 @@ defmodule Scholar.Neighbors.LargeVis do
       tensor,
       key,
       num_neighbors: k,
+      metric: metric,
       min_leaf_size: min_leaf_size,
       num_trees: num_trees,
       num_iters: opts[:num_iters]
@@ -116,15 +129,15 @@ defmodule Scholar.Neighbors.LargeVis do
 
   defnp fit_n(tensor, key, opts) do
     forest =
-      Forest.fit(tensor,
+      RandomProjectionForest.fit(tensor,
         num_neighbors: opts[:num_neighbors],
         min_leaf_size: opts[:min_leaf_size],
         num_trees: opts[:num_trees],
         key: key
       )
 
-    {graph, _} = Forest.predict(forest, tensor)
-    expand(graph, tensor, num_iters: opts[:num_iters])
+    {graph, _} = RandomProjectionForest.predict(forest, tensor)
+    expand(graph, tensor, metric: opts[:metric], num_iters: opts[:num_iters])
   end
 
   defn expand(graph, tensor, opts) do
@@ -140,17 +153,20 @@ defmodule Scholar.Neighbors.LargeVis do
         {tensor, iter = 0}
       },
       iter < num_iters do
-        {expansion_iter(graph, tensor), {tensor, iter + 1}}
+        {expansion_iter(graph, tensor, metric: opts[:metric]), {tensor, iter + 1}}
       end
 
     result
   end
 
-  defnp expansion_iter(graph, tensor) do
+  defnp expansion_iter(graph, tensor, opts) do
     {size, k} = Nx.shape(graph)
     candidate_indices = Nx.take(graph, graph) |> Nx.reshape({size, k * k})
     candidate_indices = Nx.concatenate([graph, candidate_indices], axis: 1)
 
-    Utils.find_neighbors(tensor, tensor, candidate_indices, num_neighbors: k)
+    Utils.brute_force_search_with_candidates(tensor, tensor, candidate_indices,
+      num_neighbors: k,
+      metric: opts[:metric]
+    )
   end
 end
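The new `:metric` option threads from `fit/2` through `fit_n/3` and `expand/3` into the candidate search. A usage sketch mirroring the updated doctest; note that `:squared_euclidean` reproduces the previous behavior of `find_neighbors`, which summed squared differences without taking a square root:

    key = Nx.Random.key(12)
    tensor = Nx.iota({5, 2})

    {graph, distances} =
      Scholar.Neighbors.LargeVis.fit(tensor,
        num_neighbors: 2,
        metric: :squared_euclidean,
        min_leaf_size: 2,
        num_trees: 3,
        key: key
      )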
@@ -107,6 +113,12 @@ defmodule Scholar.Neighbors.RandomProjectionForest do
     num_neighbors = opts[:num_neighbors]
     min_leaf_size = opts[:min_leaf_size]
 
+    metric =
+      case opts[:metric] do
+        :euclidean -> &Scholar.Metrics.Distance.euclidean/2
+        :squared_euclidean -> &Scholar.Metrics.Distance.squared_euclidean/2
+      end
+
     min_leaf_size =
       cond do
         is_nil(min_leaf_size) ->
@@ -142,6 +154,7 @@ defmodule Scholar.Neighbors.RandomProjectionForest do
 
     %__MODULE__{
       num_neighbors: num_neighbors,
+      metric: metric,
       depth: depth,
       leaf_size: leaf_size,
       num_trees: num_trees,
@@ -283,7 +296,7 @@ defmodule Scholar.Neighbors.RandomProjectionForest do
 
       iex> key = Nx.Random.key(12)
       iex> tensor = Nx.iota({5, 2})
-      iex> forest = Scholar.Neighbors.RandomProjectionForest.fit(tensor, num_neighbors: 2, num_trees: 3, key: key)
+      iex> forest = Scholar.Neighbors.RandomProjectionForest.fit(tensor, num_neighbors: 2, metric: :squared_euclidean, num_trees: 3, key: key)
       iex> query = Nx.tensor([[3, 4]])
       iex> {neighbors, distances} = Scholar.Neighbors.RandomProjectionForest.predict(forest, query)
       iex> neighbors
@@ -323,9 +336,12 @@ defmodule Scholar.Neighbors.RandomProjectionForest do
   end
 
   defnp predict_n(forest, query) do
-    k = forest.num_neighbors
     candidate_indices = get_leaves(forest, query)
-    Utils.find_neighbors(query, forest.data, candidate_indices, num_neighbors: k)
+
+    Utils.brute_force_search_with_candidates(forest.data, query, candidate_indices,
+      num_neighbors: forest.num_neighbors,
+      metric: forest.metric
+    )
   end
 
   @doc false
diff --git a/lib/scholar/neighbors/utils.ex b/lib/scholar/neighbors/utils.ex
index f202348b..06acc8c4 100644
--- a/lib/scholar/neighbors/utils.ex
+++ b/lib/scholar/neighbors/utils.ex
@@ -3,16 +3,20 @@ defmodule Scholar.Neighbors.Utils do
   import Nx.Defn
   require Nx
 
-  defn find_neighbors(query, data, candidate_indices, opts) do
+  defn brute_force_search_with_candidates(data, query, candidate_indices, opts) do
     k = opts[:num_neighbors]
+    metric = opts[:metric]
+    dim = Nx.axis_size(data, 1)
     {size, length} = Nx.shape(candidate_indices)
 
-    distances =
+    x =
       query
       |> Nx.new_axis(1)
-      |> Nx.subtract(Nx.take(data, candidate_indices))
-      |> Nx.pow(2)
-      |> Nx.sum(axes: [2])
+      |> Nx.broadcast({size, length, dim})
+      |> Nx.vectorize([:query, :candidates])
+
+    y = Nx.take(data, candidate_indices) |> Nx.vectorize([:query, :candidates])
+
+    distances = metric.(x, y) |> Nx.devectorize() |> Nx.rename(nil)
 
     distances =
       if length > 1 do
@@ -55,11 +59,11 @@ defmodule Scholar.Neighbors.Utils do
     target = Nx.broadcast(Nx.u32(0), {size, length})
     samples = Nx.iota({size, length, 1}, axis: 0)
 
-    indices =
+    target_indices =
       Nx.concatenate([samples, Nx.new_axis(indices, 2)], axis: 2)
       |> Nx.reshape({size * length, 2})
 
     updates = Nx.iota({size, length}, axis: 1) |> Nx.reshape({size * length})
-    Nx.indexed_add(target, indices, updates)
+    Nx.indexed_add(target, target_indices, updates)
   end
 end
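The rewritten search applies the metric through vectorized `:query` and `:candidates` axes, so the metric function only ever sees a pair of rank-1 points. A standalone sketch of that pattern (shapes and data are arbitrary, chosen for illustration):

    data = Nx.iota({6, 3}, type: :f32)
    query = Nx.iota({2, 3}, type: :f32)
    candidate_indices = Nx.tensor([[0, 1, 2], [3, 4, 5]], type: :u32)
    {size, length} = Nx.shape(candidate_indices)
    dim = Nx.axis_size(data, 1)
    metric = &Scholar.Metrics.Distance.euclidean/2

    x =
      query
      |> Nx.new_axis(1)
      |> Nx.broadcast({size, length, dim})
      |> Nx.vectorize([:query, :candidates])

    y = Nx.take(data, candidate_indices) |> Nx.vectorize([:query, :candidates])

    # Each metric call receives two {dim} tensors; devectorizing restores
    # the {size, length} distance matrix.
    distances = metric.(x, y) |> Nx.devectorize() |> Nx.rename(nil)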
diff --git a/test/scholar/neighbors/knn_classifier_test.exs b/test/scholar/neighbors/knn_classifier_test.exs
index 5d6731b2..19819f4f 100644
--- a/test/scholar/neighbors/knn_classifier_test.exs
+++ b/test/scholar/neighbors/knn_classifier_test.exs
@@ -49,6 +49,30 @@ defmodule Scholar.Neighbors.KNNClassifierTest do
       assert model.labels == y_train()
       assert model.weights == :uniform
     end
+
+    test "fit with random projection forest" do
+      key = Nx.Random.key(12)
+
+      model =
+        KNNClassifier.fit(x_train(), y_train(),
+          algorithm: :random_projection_forest,
+          num_neighbors: 3,
+          num_classes: 2,
+          num_trees: 4,
+          key: key
+        )
+
+      assert model.algorithm ==
+               Scholar.Neighbors.RandomProjectionForest.fit(x_train(),
+                 num_neighbors: 3,
+                 num_trees: 4,
+                 key: key
+               )
+
+      assert model.num_classes == 2
+      assert model.labels == y_train()
+      assert model.weights == :uniform
+    end
   end
 
   describe "predict" do
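The excerpt ends as the "predict" describe block opens, and `predict_proba/2` is not exercised above. A sketch of a companion test in the same style, assuming this suite's `x_train/0` and `y_train/0` helpers plus a hypothetical `x_test/0` helper and `assert_all_close/2` from `Scholar.Case` (these names are assumptions, not part of the patch):

    test "predict_proba with default parameters" do
      model = KNNClassifier.fit(x_train(), y_train(), num_neighbors: 3, num_classes: 2)
      proba = KNNClassifier.predict_proba(model, x_test())

      # Each row is a distribution over the two classes and sums to one.
      assert Nx.shape(proba) == {Nx.axis_size(x_test(), 0), 2}
      assert_all_close(Nx.sum(proba, axes: [1]), Nx.broadcast(1.0, {Nx.axis_size(x_test(), 0)}))
    end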