Add metric to RandomProjectionForest and LargeVis, more unit-tests, etc
Krsto Proroković committed May 14, 2024
1 parent 6b9ddd0 commit ea158f1
Showing 6 changed files with 161 additions and 77 deletions.
2 changes: 1 addition & 1 deletion lib/scholar/neighbors/brute_knn.ex
@@ -25,7 +25,7 @@ defmodule Scholar.Neighbors.BruteKNN do
type: {:or, [{:custom, Scholar.Options, :metric, []}, {:fun, 2}]},
default: {:minkowski, 2},
doc: ~S"""
The function that measures distance between two points. Possible values:
The function that measures the distance between two points. Possible values:
* `{:minkowski, p}` - Minkowski metric. By changing the value of the `p` parameter (a positive number or `:infinity`)
we can select the Manhattan (`1`), Euclidean (`2`), Chebyshev (`:infinity`), or any other $L_p$ metric.
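For illustration, a minimal sketch of the `:metric` option in use (not part of the diff; it assumes the custom 2-arity variant receives two rank-2 tensors and returns their pairwise distance matrix):

```elixir
x = Nx.tensor([[1, 2], [2, 3], [3, 4]])

# Built-in Minkowski family: p = 1 is the Manhattan distance.
model = Scholar.Neighbors.BruteKNN.fit(x, num_neighbors: 2, metric: {:minkowski, 1})

# An equivalent custom metric as an anonymous 2-arity function.
pairwise_manhattan = fn a, b ->
  a
  |> Nx.new_axis(1)
  |> Nx.subtract(Nx.new_axis(b, 0))
  |> Nx.abs()
  |> Nx.sum(axes: [-1])
end

model = Scholar.Neighbors.BruteKNN.fit(x, num_neighbors: 2, metric: pairwise_manhattan)
{neighbors, distances} = Scholar.Neighbors.BruteKNN.predict(model, Nx.tensor([[2, 2]]))
```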
138 changes: 81 additions & 57 deletions lib/scholar/neighbors/knn_classifier.ex
@@ -2,50 +2,31 @@ defmodule Scholar.Neighbors.KNNClassifier do
@moduledoc """
K-Nearest Neighbors Classifier.
Performs classification by looking at the k-nearest neighbors of a point and using (weighted) majority voting.
Performs classification by computing the (weighted) majority vote among the k-nearest neighbors.
"""

import Nx.Defn
import Scholar.Shared
require Nx

@derive {Nx.Container, keep: [:algorithm, :num_classes, :weights], containers: [:labels]}
@derive {Nx.Container, keep: [:num_classes, :weights], containers: [:algorithm, :labels]}
defstruct [:algorithm, :num_classes, :weights, :labels]

opts = [
algorithm: [
type: {:or, [:atom, {:tuple, [:atom, :keyword_list]}]},
type: :atom,
default: :brute,
doc: """
k-NN algorithm to be used for finding the nearest neighbors. It can be provided as
an atom or a tuple containing an atom and algorithm specific options.
Possible values for the atom:
Algorithm used to compute the k-nearest neighbors. Possible values:
* `:brute` - Brute-force search. See `Scholar.Neighbors.BruteKNN` for more details.
* `:kd_tree` - k-d tree. See `Scholar.Neighbors.KDTree` for more details.
* `:random_projection_forest` - Random projection forest. See `Scholar.Neighbors.RandomProjectionForest` for more details.
* Module implementing fit/2 and predict/2.
"""
],
num_neighbors: [
required: true,
type: :pos_integer,
doc: "The number of nearest neighbors."
],
metric: [
type: {:or, [{:custom, Scholar.Options, :metric, []}, {:fun, 2}]},
default: {:minkowski, 2},
doc: """
The function that measures distance between two points. Possible values:
* `{:minkowski, p}` - Minkowski metric. By changing value of `p` parameter (a positive number or `:infinity`)
we can set Manhattan (`1`), Euclidean (`2`), Chebyshev (`:infinity`), or any arbitrary $L_p$ metric.
* `:cosine` - Cosine metric.
Keep in mind that different algorithms support different metrics. For more information have a look at the corresponding modules.
* Module implementing `fit(data, opts)` and `predict(model, query)`. predict/2 must return a tuple containing the indices
of the k-nearest neighbors of the query points as well as the distances between the query points and their k-nearest neighbors.
"""
],
num_classes: [
@@ -76,6 +57,8 @@ defmodule Scholar.Neighbors.KNNClassifier do
#{NimbleOptions.docs(@opts_schema)}
Algorithm-specific options (e.g. `:num_neighbors`, `:metric`) should be provided together with the classifier options.
## Examples
iex> x = Nx.tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
@@ -85,31 +68,54 @@
Scholar.Neighbors.BruteKNN.fit(x, num_neighbors: 3)
iex> model.labels
Nx.tensor([0, 0, 0, 1, 1])
iex> x = Nx.tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
iex> y = Nx.tensor([0, 0, 0, 1, 1])
iex> model = Scholar.Neighbors.KNNClassifier.fit(x, y, algorithm: :kd_tree, num_neighbors: 3, metric: {:minkowski, 1}, num_classes: 2)
iex> model.algorithm
Scholar.Neighbors.KDTree.fit(x, num_neighbors: 3, metric: {:minkowski, 1})
iex> model.labels
Nx.tensor([0, 0, 0, 1, 1])
iex> x = Nx.tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
iex> y = Nx.tensor([0, 0, 0, 1, 1])
iex> key = Nx.Random.key(12)
iex> model = Scholar.Neighbors.KNNClassifier.fit(x, y, algorithm: :random_projection_forest, num_neighbors: 2, num_classes: 2, num_trees: 4, key: key)
iex> model.algorithm
Scholar.Neighbors.RandomProjectionForest.fit(x, num_neighbors: 2, num_trees: 4, key: key)
iex> model.labels
Nx.tensor([0, 0, 0, 1, 1])
"""
deftransform fit(x, y, opts) do
if Nx.rank(x) != 2 do
raise ArgumentError,
"expected x to have shape {num_samples, num_features},
got tensor with shape: #{inspect(Nx.shape(x))}"
"""
expected x to have shape {num_samples, num_features}, \
got tensor with shape: #{inspect(Nx.shape(x))}
"""
end

if Nx.rank(y) != 1 and Nx.axis_size(x, 0) == Nx.axis_size(y, 0) do
if Nx.rank(y) != 1 do
raise ArgumentError,
"expected y to have shape {num_samples},
got tensor with shape: #{inspect(Nx.shape(y))}"
"""
expected y to have shape {num_samples}, \
got tensor with shape: #{inspect(Nx.shape(y))}
"""
end

opts = NimbleOptions.validate!(opts, @opts_schema)
if Nx.axis_size(x, 0) != Nx.axis_size(y, 0) do
raise ArgumentError,
"""
expected x and y to have the same first dimension, \
got #{Nx.axis_size(x, 0)} and #{Nx.axis_size(y, 0)}
"""
end

{algorithm_name, algorithm_opts} =
if is_atom(opts[:algorithm]) do
{opts[:algorithm], []}
else
opts[:algorithm]
end
{opts, algorithm_opts} = Keyword.split(opts, [:algorithm, :num_classes, :weights])
opts = NimbleOptions.validate!(opts, @opts_schema)

knn_module =
case algorithm_name do
algorithm_module =
case opts[:algorithm] do
:brute ->
Scholar.Neighbors.BruteKNN

@@ -119,22 +125,11 @@ defmodule Scholar.Neighbors.KNNClassifier do
:random_projection_forest ->
Scholar.Neighbors.RandomProjectionForest

knn_module when is_atom(knn_module) ->
knn_module

_ ->
raise ArgumentError,
"""
not supported
"""
module when is_atom(module) ->
module
end

# TODO: Maybe raise an error if :num_neighbors or :metric is already in algorithm_opts?

algorithm_opts = Keyword.put(algorithm_opts, :num_neighbors, opts[:num_neighbors])
algorithm_opts = Keyword.put(algorithm_opts, :metric, opts[:metric])

algorithm = knn_module.fit(x, algorithm_opts)
algorithm = algorithm_module.fit(x, algorithm_opts)

%__MODULE__{
algorithm: algorithm,
@@ -156,9 +151,8 @@
iex> Scholar.Neighbors.KNNClassifier.predict(model, x_test)
Nx.tensor([0, 0, 1])
"""
deftransform predict(model, x) do
knn_module = model.algorithm.__struct__
{neighbors, distances} = knn_module.predict(model.algorithm, x)
defn predict(model, x) do
{neighbors, distances} = compute_knn(model.algorithm, x)
labels_pred = Nx.take(model.labels, neighbors)

case model.weights do
Expand All @@ -167,6 +161,36 @@ defmodule Scholar.Neighbors.KNNClassifier do
end
end

defn predict_proba(model, x) do
num_samples = Nx.axis_size(x, 0)
{neighbors, distances} = compute_knn(model.algorithm, x)
labels_pred = Nx.take(model.labels, neighbors)
type = Nx.Type.merge(to_float_type(x), {:f, 32})
proba = Nx.broadcast(Nx.tensor(0.0, type: type), {num_samples, model.num_classes})

weights =
case model.weights do
:distance -> check_weights(distances)
:uniform -> Nx.broadcast(1.0, neighbors)
end

indices =
Nx.stack(
[Nx.iota(Nx.shape(labels_pred), axis: 0), Nx.take(model.labels, labels_pred)],
axis: -1 # TODO: Replace -1 here
)
|> Nx.flatten(axes: [0, 1])

proba = Nx.indexed_add(proba, indices, Nx.flatten(weights))
normalizer = Nx.sum(proba, axes: [1])
normalizer = Nx.select(normalizer == 0, 1, normalizer)
proba / Nx.new_axis(normalizer, -1) # TODO: Replace -1 here
end
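For orientation, a hypothetical call to the new `predict_proba/2` (not part of the diff; the fit options mirror the doctests above):

```elixir
x = Nx.tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
y = Nx.tensor([0, 0, 0, 1, 1])

model =
  Scholar.Neighbors.KNNClassifier.fit(x, y,
    algorithm: :brute,
    num_neighbors: 3,
    num_classes: 2
  )

# A {num_queries, num_classes} tensor; each row is normalized to sum to 1.
Scholar.Neighbors.KNNClassifier.predict_proba(model, Nx.tensor([[1, 3], [4, 6]]))
```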

deftransformp compute_knn(algorithm, x) do
algorithm.__struct__.predict(algorithm, x)
end

defnp check_weights(weights) do
zero_mask = weights == 0
zero_rows = zero_mask |> Nx.any(axes: [1], keep_axes: true) |> Nx.broadcast(weights)
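Since `:algorithm` now accepts any module implementing `fit/2` and `predict/2`, a hypothetical minimal backend could look like the sketch below (all names are illustrative, not part of this commit; the struct derives `Nx.Container` so the classifier can carry it through `defn predict/2`):

```elixir
defmodule MyApp.ExactKNN do
  @moduledoc "Toy exact k-NN backend satisfying the fit/2 + predict/2 contract."
  import Nx.Defn

  @derive {Nx.Container, keep: [:num_neighbors], containers: [:data]}
  defstruct [:num_neighbors, :data]

  def fit(data, opts) do
    %__MODULE__{num_neighbors: opts[:num_neighbors], data: data}
  end

  # Returns {neighbor_indices, distances}, as KNNClassifier expects.
  def predict(%__MODULE__{num_neighbors: k, data: data}, query) do
    compute(data, query, num_neighbors: k)
  end

  defnp compute(data, query, opts \\ []) do
    opts = keyword!(opts, [:num_neighbors])

    # Pairwise squared Euclidean distances: {num_queries, num_samples}.
    dist =
      query
      |> Nx.new_axis(1)
      |> Nx.subtract(Nx.new_axis(data, 0))
      |> Nx.pow(2)
      |> Nx.sum(axes: [-1])

    # Indices of the k smallest distances per query row, plus those distances.
    neighbors = dist |> Nx.argsort(axis: 1) |> Nx.slice_along_axis(0, opts[:num_neighbors], axis: 1)
    distances = Nx.take_along_axis(dist, neighbors, axis: 1)
    {neighbors, distances}
  end
end

# Plugged into the classifier:
# model = Scholar.Neighbors.KNNClassifier.fit(x, y,
#   algorithm: MyApp.ExactKNN, num_neighbors: 3, num_classes: 2)
```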
32 changes: 24 additions & 8 deletions lib/scholar/neighbors/large_vis.ex
@@ -14,7 +14,7 @@ defmodule Scholar.Neighbors.LargeVis do
import Nx.Defn
import Scholar.Shared
require Nx
alias Scholar.Neighbors.RandomProjectionForest, as: Forest
alias Scholar.Neighbors.RandomProjectionForest
alias Scholar.Neighbors.Utils

opts = [
@@ -23,6 +23,11 @@
type: :pos_integer,
doc: "The number of neighbors in the graph."
],
metric: [
type: {:in, [:squared_euclidean, :euclidean]},
default: :euclidean,
doc: "The function that measures distance between two points."
],
min_leaf_size: [
type: :pos_integer,
doc: """
@@ -63,7 +68,7 @@ defmodule Scholar.Neighbors.LargeVis do
iex> key = Nx.Random.key(12)
iex> tensor = Nx.iota({5, 2})
iex> {graph, distances} = Scholar.Neighbors.LargeVis.fit(tensor, num_neighbors: 2, min_leaf_size: 2, num_trees: 3, key: key)
iex> {graph, distances} = Scholar.Neighbors.LargeVis.fit(tensor, num_neighbors: 2, metric: :squared_euclidean, min_leaf_size: 2, num_trees: 3, key: key)
iex> graph
#Nx.Tensor<
u32[5][2]
@@ -98,6 +103,13 @@

opts = NimbleOptions.validate!(opts, @opts_schema)
k = opts[:num_neighbors]

metric =
case opts[:metric] do
:euclidean -> &Scholar.Metrics.Distance.euclidean/2
:squared_euclidean -> &Scholar.Metrics.Distance.squared_euclidean/2
end

min_leaf_size = opts[:min_leaf_size] || max(10, 2 * k)

size = Nx.axis_size(tensor, 0)
@@ -108,6 +120,7 @@
tensor,
key,
num_neighbors: k,
metric: metric,
min_leaf_size: min_leaf_size,
num_trees: num_trees,
num_iters: opts[:num_iters]
@@ -116,15 +129,15 @@

defnp fit_n(tensor, key, opts) do
forest =
Forest.fit(tensor,
RandomProjectionForest.fit(tensor,
num_neighbors: opts[:num_neighbors],
min_leaf_size: opts[:min_leaf_size],
num_trees: opts[:num_trees],
key: key
)

{graph, _} = Forest.predict(forest, tensor)
expand(graph, tensor, num_iters: opts[:num_iters])
{graph, _} = RandomProjectionForest.predict(forest, tensor)
expand(graph, tensor, metric: opts[:metric], num_iters: opts[:num_iters])
end

defn expand(graph, tensor, opts) do
@@ -140,17 +153,20 @@
{tensor, iter = 0}
},
iter < num_iters do
{expansion_iter(graph, tensor), {tensor, iter + 1}}
{expansion_iter(graph, tensor, metric: opts[:metric]), {tensor, iter + 1}}
end

result
end

defnp expansion_iter(graph, tensor) do
defnp expansion_iter(graph, tensor, opts) do
{size, k} = Nx.shape(graph)
candidate_indices = Nx.take(graph, graph) |> Nx.reshape({size, k * k})
candidate_indices = Nx.concatenate([graph, candidate_indices], axis: 1)

Utils.find_neighbors(tensor, tensor, candidate_indices, num_neighbors: k)
Utils.brute_force_search_with_candidates(tensor, tensor, candidate_indices,
num_neighbors: k,
metric: opts[:metric]
)
end
end
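The expansion step above is the neighbor-exploring idea from the LargeVis/NN-Descent line of work: each point's candidate set is its current approximate neighbors plus its neighbors' neighbors, re-ranked by brute force. A tiny illustration of how the candidate indices are assembled (hypothetical values):

```elixir
# graph[i] holds the current k = 2 approximate neighbors of point i.
graph = Nx.tensor([[1, 2], [0, 2], [0, 1]], type: :u32)

# Neighbors of neighbors: {size, k, k}, reshaped to {size, k * k}.
candidates = graph |> Nx.take(graph) |> Nx.reshape({3, 4})

# Final candidate set per point: {size, k + k * k} indices.
Nx.concatenate([graph, candidates], axis: 1)
```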
24 changes: 20 additions & 4 deletions lib/scholar/neighbors/random_projection_forest.ex
@@ -23,7 +23,7 @@ defmodule Scholar.Neighbors.RandomProjectionForest do
alias Scholar.Neighbors.Utils

@derive {Nx.Container,
keep: [:num_neighbors, :depth, :leaf_size, :num_trees],
keep: [:num_neighbors, :metric, :depth, :leaf_size, :num_trees],
containers: [:indices, :data, :hyperplanes, :medians]}
@enforce_keys [
:num_neighbors,
@@ -37,6 +37,7 @@
]
defstruct [
:num_neighbors,
:metric,
:depth,
:leaf_size,
:num_trees,
@@ -52,6 +53,11 @@
type: :pos_integer,
doc: "The number of nearest neighbors."
],
metric: [
type: {:in, [:squared_euclidean, :euclidean]},
default: :euclidean,
doc: "The function that measures the distance between two points."
],
min_leaf_size: [
type: :pos_integer,
doc: "The minumum number of points in the leaf."
@@ -107,6 +113,12 @@
num_neighbors = opts[:num_neighbors]
min_leaf_size = opts[:min_leaf_size]

metric =
case opts[:metric] do
:euclidean -> &Scholar.Metrics.Distance.euclidean/2
:squared_euclidean -> &Scholar.Metrics.Distance.squared_euclidean/2
end

min_leaf_size =
cond do
is_nil(min_leaf_size) ->
@@ -142,6 +154,7 @@

%__MODULE__{
num_neighbors: num_neighbors,
metric: metric,
depth: depth,
leaf_size: leaf_size,
num_trees: num_trees,
@@ -283,7 +296,7 @@
iex> key = Nx.Random.key(12)
iex> tensor = Nx.iota({5, 2})
iex> forest = Scholar.Neighbors.RandomProjectionForest.fit(tensor, num_neighbors: 2, num_trees: 3, key: key)
iex> forest = Scholar.Neighbors.RandomProjectionForest.fit(tensor, num_neighbors: 2, metric: :squared_euclidean, num_trees: 3, key: key)
iex> query = Nx.tensor([[3, 4]])
iex> {neighbors, distances} = Scholar.Neighbors.RandomProjectionForest.predict(forest, query)
iex> neighbors
Expand Down Expand Up @@ -323,9 +336,12 @@ defmodule Scholar.Neighbors.RandomProjectionForest do
end

defnp predict_n(forest, query) do
k = forest.num_neighbors
candidate_indices = get_leaves(forest, query)
Utils.find_neighbors(query, forest.data, candidate_indices, num_neighbors: k)

Utils.brute_force_search_with_candidates(forest.data, query, candidate_indices,
num_neighbors: forest.num_neighbors,
metric: forest.metric
)
end

@doc false
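A quick sketch of the new `:metric` option in use (mirroring the doctest above). Since squaring is monotone, switching between `:euclidean` and `:squared_euclidean` changes only the reported distances, not the neighbor indices:

```elixir
key = Nx.Random.key(12)
tensor = Nx.iota({5, 2})

forest =
  Scholar.Neighbors.RandomProjectionForest.fit(tensor,
    num_neighbors: 2,
    metric: :euclidean,
    num_trees: 3,
    key: key
  )

{neighbors, distances} =
  Scholar.Neighbors.RandomProjectionForest.predict(forest, Nx.tensor([[3, 4]]))
```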