From b55b1021b64ed3ca7f9e3522b6e9d69e563fd3a1 Mon Sep 17 00:00:00 2001 From: norm4nn Date: Fri, 9 Aug 2024 23:49:12 +0200 Subject: [PATCH 1/8] Added OPTICS clustering algorithm, description and doctests --- lib/scholar/cluster/optics.ex | 201 ++++++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 lib/scholar/cluster/optics.ex diff --git a/lib/scholar/cluster/optics.ex b/lib/scholar/cluster/optics.ex new file mode 100644 index 00000000..6b3e88ee --- /dev/null +++ b/lib/scholar/cluster/optics.ex @@ -0,0 +1,201 @@ +defmodule Scholar.Cluster.OPTICS do + @moduledoc """ + OPTICS (Ordering Points To Identify the Clustering Structure), closely related to DBSCAN, finds core sample of high density and expands clusters from them. Unlike DBSCAN, keeps cluster hierarchy for a variable neighborhood radius. Clusters are then extracted using a DBSCAN-like method. + """ + import Nx.Defn + require Nx + + opts = [ + min_samples: [ + default: 5, + type: :pos_integer, + doc: "The number of samples in a neighborhood for a point to be considered as a core point." + ], + max_eps: [ + default: Nx.Constants.infinity(), + type: {:custom, Scholar.Options, :beta, []}, + doc: + "The maximum distance between two samples for one to be considered as in the neighborhood of the other. Default value of Nx.Constants.infinity() will identify clusters across all scales " + ], + eps: [ + default: Nx.Constants.nan(), + type: {:custom, Scholar.Options, :beta, []}, + doc: + "The maximum distance between two samples for one to be considered as in the neighborhood of the other. By default it assumes the same value as max_eps." + ] + ] + + @opts_schema NimbleOptions.new!(opts) + + @doc """ + + Perform OPTICS clustering for `x` which is tensor of {n_samples, n_features} shape. + + ## Options + + #{NimbleOptions.docs(@opts_schema)} + + ## Return Values + + The function returns a labels tensor of shape {n_samples} + Cluster labels for each point in the dataset given to fit(). + Noisy samples are labeled as -1. + + ## Examples + + iex> x = Nx.tensor([[1, 2], [2, 5], [3, 6], [8, 7], [8, 8], [7, 3]]) + iex> Scholar.Cluster.OPTICS.fit(x, min_samples: 2) + #Nx.Tensor< + s64[6] + [-1, -1, -1, -1, -1, -1] + > + iex> Scholar.Cluster.OPTICS.fit(x, eps: 4.5, min_samples: 2) + #Nx.Tensor< + s64[6] + [0, 0, 0, 1, 1, 1] + > + iex> Scholar.Cluster.OPTICS.fit(x, eps: 2, min_samples: 2) + #Nx.Tensor< + s64[6] + [-1, 0, 0, 1, 1, -1] + > + iex> Scholar.Cluster.OPTICS.fit(x, eps: 1, min_samples: 2) + #Nx.Tensor< + s64[6] + [-1, -1, -1, 0, 0, -1] + > + iex> Scholar.Cluster.OPTICS.fit(x, eps: 4.5, min_samples: 3) + #Nx.Tensor< + s64[6] + [0, 0, 0, 1, 1, -1] + > + """ + + deftransform fit(x, opts \\ []) do + fit_p(x, NimbleOptions.validate!(opts, @opts_schema)) + end + + defnp fit_p(x, opts \\ []) do + {core_distances, reachability, _predecessor, ordering} = compute_optics_graph(x, opts) + eps = opts[:eps] + cluster_optics_dbscan(reachability, core_distances, ordering, eps: eps) + end + + defnp compute_optics_graph(x, opts \\ []) do + min_samples = opts[:min_samples] + max_eps = opts[:max_eps] + n_samples = Nx.axis_size(x, 0) + reachability = Nx.broadcast(Nx.Constants.max_finite({:f, 32}), {n_samples}) + predecessor = Nx.broadcast(-1, {n_samples}) + neighbors = Scholar.Neighbors.BruteKNN.fit(x, num_neighbors: min_samples) + core_distances = compute_core_distances(x, neighbors, min_samples: min_samples) + + core_distances = + Nx.select(Nx.greater(core_distances, max_eps), Nx.Constants.infinity(), core_distances) + + ordering = Nx.broadcast(0, {n_samples}) + processed = Nx.broadcast(0, {n_samples}) + + {_order_idx, core_distances, reachability, predecessor, _processed, ordering, _x, _max_eps} = + while {order_idx = 0, core_distances, reachability, predecessor, processed, ordering, x, + max_eps}, + order_idx < n_samples do + unprocessed_mask = Nx.equal(processed, 0) + point = Nx.argmin(Nx.select(unprocessed_mask, reachability, Nx.Constants.infinity())) + processed = Nx.put_slice(processed, [point], Nx.broadcast(1, {1})) + ordering = Nx.put_slice(ordering, [order_idx], Nx.broadcast(point, {1})) + + {reachability, predecessor} = + set_reach_dist(core_distances, reachability, predecessor, point, processed, x, + max_eps: max_eps + ) + + {order_idx + 1, core_distances, reachability, predecessor, processed, ordering, x, + max_eps} + end + + reachability = + Nx.select( + Nx.equal(reachability, Nx.Constants.max_finite({:f, 32})), + Nx.Constants.infinity(), + reachability + ) + + {core_distances, reachability, predecessor, ordering} + end + + defnp compute_core_distances(x, neighbors, opts \\ []) do + {_neighbors, distances} = Scholar.Neighbors.BruteKNN.predict(neighbors, x) + Nx.slice_along_axis(distances, opts[:min_samples] - 1, 1, axis: 1) + end + + defnp set_reach_dist( + core_distances, + reachability, + predecessor, + point_index, + processed, + x, + opts \\ [] + ) do + max_eps = opts[:max_eps] + n_features = Nx.axis_size(x, 1) + n_samples = Nx.axis_size(x, 0) + nbrs = Scholar.Neighbors.BruteKNN.fit(x, num_neighbors: n_samples) + t = Nx.take(x, point_index, axis: 0) + p = Nx.broadcast(t, {1, n_features}) + {neighbors, distances} = Scholar.Neighbors.BruteKNN.predict(nbrs, p) + + neighbors = Nx.flatten(neighbors) + distances = Nx.flatten(distances) + indices_ngbrs = Nx.argsort(neighbors) + neighbors = Nx.take(neighbors, indices_ngbrs) + distances = Nx.take(distances, indices_ngbrs) + are_neighbors_processed = Nx.take(processed, neighbors) + + filtered_neighbors = + Nx.select( + Nx.logical_or(are_neighbors_processed, Nx.greater(distances, max_eps)), + Nx.multiply(-1, neighbors), + neighbors + ) + + dists = Nx.flatten(Scholar.Metrics.Distance.pairwise_minkowski(p, x)) + core_distance = Nx.take(core_distances, point_index) + rdists = Nx.max(dists, core_distance) + improved = Nx.less(rdists, reachability) + improved = Nx.select(improved, filtered_neighbors, -1) + + improved = + Nx.select( + Nx.logical_and(Nx.equal(improved, -1), Nx.greater(filtered_neighbors, 0)), + Nx.multiply(filtered_neighbors, -1), + filtered_neighbors + ) + + rdists = Nx.select(Nx.greater_equal(improved, 0), rdists, 0) + reversed_improved = Nx.max(Nx.multiply(improved, -1), 0) + + reachability = + Nx.select(Nx.less_equal(improved, 0), Nx.take(reachability, reversed_improved), rdists) + + predecessor = + Nx.select(Nx.less_equal(improved, 0), Nx.take(predecessor, reversed_improved), point_index) + + {reachability, predecessor} + end + + defnp cluster_optics_dbscan(reachability, core_distances, ordering, opts \\ []) do + eps = opts[:eps] + far_reach = Nx.flatten(Nx.greater(reachability, eps)) + near_core = Nx.flatten(Nx.less_equal(core_distances, eps)) + far_and_not_near = Nx.multiply(far_reach, Nx.subtract(1, near_core)) + far_reach = Nx.take(far_reach, ordering) + near_core = Nx.take(near_core, ordering) + far_and_near = Nx.multiply(far_reach, near_core) + Nx.as_type(Nx.cumulative_sum(far_and_near), :s8) + labels = Nx.subtract(Nx.as_type(Nx.cumulative_sum(far_and_near), :s8), 1) + labels = Nx.take(labels, Nx.argsort(ordering)) + Nx.select(far_and_not_near, -1, labels) + end +end From 35e89e68a88b6c99c55b75a838b35c3a27fcefa1 Mon Sep 17 00:00:00 2001 From: norm4nn Date: Fri, 9 Aug 2024 23:57:32 +0200 Subject: [PATCH 2/8] Updated eps default value to match its desc --- lib/scholar/cluster/optics.ex | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/scholar/cluster/optics.ex b/lib/scholar/cluster/optics.ex index 6b3e88ee..b8bf7800 100644 --- a/lib/scholar/cluster/optics.ex +++ b/lib/scholar/cluster/optics.ex @@ -77,7 +77,14 @@ defmodule Scholar.Cluster.OPTICS do defnp fit_p(x, opts \\ []) do {core_distances, reachability, _predecessor, ordering} = compute_optics_graph(x, opts) - eps = opts[:eps] + + eps = + if Nx.equal(opts[:eps], Nx.Constants.nan()) do + opts[:max_eps] + else + opts[:eps] + end + cluster_optics_dbscan(reachability, core_distances, ordering, eps: eps) end From 43f88318f5b2e1afa2c0082e78f614fd4cd22f4c Mon Sep 17 00:00:00 2001 From: norm4nn Date: Sun, 11 Aug 2024 21:09:55 +0200 Subject: [PATCH 3/8] Applied suggested changes --- lib/scholar/cluster/optics.ex | 39 +++++++++++++++++------------------ 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/lib/scholar/cluster/optics.ex b/lib/scholar/cluster/optics.ex index b8bf7800..86c4608c 100644 --- a/lib/scholar/cluster/optics.ex +++ b/lib/scholar/cluster/optics.ex @@ -18,7 +18,7 @@ defmodule Scholar.Cluster.OPTICS do "The maximum distance between two samples for one to be considered as in the neighborhood of the other. Default value of Nx.Constants.infinity() will identify clusters across all scales " ], eps: [ - default: Nx.Constants.nan(), + default: Nx.Constants.infinity(), type: {:custom, Scholar.Options, :beta, []}, doc: "The maximum distance between two samples for one to be considered as in the neighborhood of the other. By default it assumes the same value as max_eps." @@ -79,7 +79,7 @@ defmodule Scholar.Cluster.OPTICS do {core_distances, reachability, _predecessor, ordering} = compute_optics_graph(x, opts) eps = - if Nx.equal(opts[:eps], Nx.Constants.nan()) do + if opts[:eps] == Nx.Constants.infinity() do opts[:max_eps] else opts[:eps] @@ -98,7 +98,7 @@ defmodule Scholar.Cluster.OPTICS do core_distances = compute_core_distances(x, neighbors, min_samples: min_samples) core_distances = - Nx.select(Nx.greater(core_distances, max_eps), Nx.Constants.infinity(), core_distances) + Nx.select(core_distances > max_eps, Nx.Constants.infinity(), core_distances) ordering = Nx.broadcast(0, {n_samples}) processed = Nx.broadcast(0, {n_samples}) @@ -107,10 +107,10 @@ defmodule Scholar.Cluster.OPTICS do while {order_idx = 0, core_distances, reachability, predecessor, processed, ordering, x, max_eps}, order_idx < n_samples do - unprocessed_mask = Nx.equal(processed, 0) + unprocessed_mask = processed == 0 point = Nx.argmin(Nx.select(unprocessed_mask, reachability, Nx.Constants.infinity())) - processed = Nx.put_slice(processed, [point], Nx.broadcast(1, {1})) - ordering = Nx.put_slice(ordering, [order_idx], Nx.broadcast(point, {1})) + processed = Nx.put_slice(processed, [point], Nx.new_axis(1, 0)) + ordering = Nx.put_slice(ordering, [order_idx], Nx.new_axis(point, 0)) {reachability, predecessor} = set_reach_dist(core_distances, reachability, predecessor, point, processed, x, @@ -123,7 +123,7 @@ defmodule Scholar.Cluster.OPTICS do reachability = Nx.select( - Nx.equal(reachability, Nx.Constants.max_finite({:f, 32})), + reachability == Nx.Constants.max_finite({:f, 32}), Nx.Constants.infinity(), reachability ) @@ -162,46 +162,45 @@ defmodule Scholar.Cluster.OPTICS do filtered_neighbors = Nx.select( - Nx.logical_or(are_neighbors_processed, Nx.greater(distances, max_eps)), - Nx.multiply(-1, neighbors), + are_neighbors_processed or distances > max_eps, + -1 * neighbors, neighbors ) dists = Nx.flatten(Scholar.Metrics.Distance.pairwise_minkowski(p, x)) core_distance = Nx.take(core_distances, point_index) rdists = Nx.max(dists, core_distance) - improved = Nx.less(rdists, reachability) + improved = rdists < reachability improved = Nx.select(improved, filtered_neighbors, -1) improved = Nx.select( - Nx.logical_and(Nx.equal(improved, -1), Nx.greater(filtered_neighbors, 0)), + improved == -1 and filtered_neighbors > 0, Nx.multiply(filtered_neighbors, -1), filtered_neighbors ) - rdists = Nx.select(Nx.greater_equal(improved, 0), rdists, 0) + rdists = Nx.select(improved >= 0, rdists, 0) reversed_improved = Nx.max(Nx.multiply(improved, -1), 0) reachability = - Nx.select(Nx.less_equal(improved, 0), Nx.take(reachability, reversed_improved), rdists) + Nx.select(improved <= 0, Nx.take(reachability, reversed_improved), rdists) predecessor = - Nx.select(Nx.less_equal(improved, 0), Nx.take(predecessor, reversed_improved), point_index) + Nx.select(improved <= 0, Nx.take(predecessor, reversed_improved), point_index) {reachability, predecessor} end defnp cluster_optics_dbscan(reachability, core_distances, ordering, opts \\ []) do eps = opts[:eps] - far_reach = Nx.flatten(Nx.greater(reachability, eps)) - near_core = Nx.flatten(Nx.less_equal(core_distances, eps)) - far_and_not_near = Nx.multiply(far_reach, Nx.subtract(1, near_core)) + far_reach = Nx.flatten(reachability > eps) + near_core = Nx.flatten(core_distances <= eps) + far_and_not_near = Nx.multiply(far_reach, 1 - near_core) far_reach = Nx.take(far_reach, ordering) near_core = Nx.take(near_core, ordering) - far_and_near = Nx.multiply(far_reach, near_core) - Nx.as_type(Nx.cumulative_sum(far_and_near), :s8) - labels = Nx.subtract(Nx.as_type(Nx.cumulative_sum(far_and_near), :s8), 1) + far_and_near = far_reach * near_core + labels = Nx.as_type(Nx.cumulative_sum(far_and_near), :s8) - 1 labels = Nx.take(labels, Nx.argsort(ordering)) Nx.select(far_and_not_near, -1, labels) end From 5c7812e1a64ef4fe5250722b383a21dcd042eee4 Mon Sep 17 00:00:00 2001 From: norm4nn Date: Thu, 15 Aug 2024 11:58:27 +0200 Subject: [PATCH 4/8] Formatted description, added choice of KNN algorithm, Small fixes --- lib/scholar/cluster/optics.ex | 98 ++++++++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 24 deletions(-) diff --git a/lib/scholar/cluster/optics.ex b/lib/scholar/cluster/optics.ex index 86c4608c..15e1827c 100644 --- a/lib/scholar/cluster/optics.ex +++ b/lib/scholar/cluster/optics.ex @@ -1,6 +1,10 @@ defmodule Scholar.Cluster.OPTICS do @moduledoc """ - OPTICS (Ordering Points To Identify the Clustering Structure), closely related to DBSCAN, finds core sample of high density and expands clusters from them. Unlike DBSCAN, keeps cluster hierarchy for a variable neighborhood radius. Clusters are then extracted using a DBSCAN-like method. + OPTICS (Ordering Points To Identify the Clustering Structure) is an algorithm + for finding density-based clusters in spatial data. It is closely related + to DBSCAN, finds core sample of high density and expands clusters from them. + Unlike DBSCAN, keeps cluster hierarchy for a variable neighborhood radius. + Clusters are then extracted using a DBSCAN-like method. """ import Nx.Defn require Nx @@ -9,27 +13,49 @@ defmodule Scholar.Cluster.OPTICS do min_samples: [ default: 5, type: :pos_integer, - doc: "The number of samples in a neighborhood for a point to be considered as a core point." + doc: """ + The number of samples in a neighborhood for a point to be considered as a core point. + """ ], max_eps: [ default: Nx.Constants.infinity(), type: {:custom, Scholar.Options, :beta, []}, - doc: - "The maximum distance between two samples for one to be considered as in the neighborhood of the other. Default value of Nx.Constants.infinity() will identify clusters across all scales " + doc: """ + The maximum distance between two samples for one to be considered as in the neighborhood of the other. + Default value of Nx.Constants.infinity() will identify clusters across all scales. + """ ], eps: [ default: Nx.Constants.infinity(), type: {:custom, Scholar.Options, :beta, []}, - doc: - "The maximum distance between two samples for one to be considered as in the neighborhood of the other. By default it assumes the same value as max_eps." + doc: """ + The maximum distance between two samples for one to be considered as in the neighborhood of the other. + By default it assumes the same value as max_eps. + """ + ], + algorithm: [ + default: :brute, + type: :atom, + doc: """ + Algorithm used to compute the k-nearest neighbors. Possible values: + + * `:brute` - Brute-force search. See `Scholar.Neighbors.BruteKNN` for more details. + + * `:kd_tree` - k-d tree. See `Scholar.Neighbors.KDTree` for more details. + + * `:random_projection_forest` - Random projection forest. See `Scholar.Neighbors.RandomProjectionForest` for more details. + + * Module implementing `fit(data, opts)` and `predict(model, query)`. predict/2 must return a tuple containing indices + of k-nearest neighbors of query points as well as distances between query points and their k-nearest neighbors. + Also has to take num_neighbors as argument. + """ ] ] @opts_schema NimbleOptions.new!(opts) @doc """ - - Perform OPTICS clustering for `x` which is tensor of {n_samples, n_features} shape. + Perform OPTICS clustering for `x` which is tensor of `{n_samples, n_features} shape. ## Options @@ -37,9 +63,9 @@ defmodule Scholar.Cluster.OPTICS do ## Return Values - The function returns a labels tensor of shape {n_samples} - Cluster labels for each point in the dataset given to fit(). - Noisy samples are labeled as -1. + The function returns a labels tensor of shape `{n_samples}`. + Cluster labels for each point in the dataset given to fit(). + Noisy samples are labeled as -1. ## Examples @@ -69,14 +95,46 @@ defmodule Scholar.Cluster.OPTICS do s64[6] [0, 0, 0, 1, 1, -1] > + iex> Scholar.Cluster.OPTICS.fit(x, max_eps: 2, min_samples: 1, algorithm: :kd_tree, metric: {:minkowski, 1}) + #Nx.Tensor< + s64[6] + [0, 1, 1, 2, 2, 3] + > """ deftransform fit(x, opts \\ []) do - fit_p(x, NimbleOptions.validate!(opts, @opts_schema)) + if Nx.rank(x) != 2 do + raise ArgumentError, + """ + expected x to have shape {num_samples, num_features}, \ + got tensor with shape: #{inspect(Nx.shape(x))} + """ + end + {opts, algorithm_opts} = Keyword.split(opts, [:min_samples, :max_eps, :eps, :algorithm]) + opts = NimbleOptions.validate!(opts, @opts_schema) + algorithm_opts = Keyword.put(algorithm_opts, :num_neighbors, opts[:min_samples]) + + algorithm_module = + case opts[:algorithm] do + :brute -> + Scholar.Neighbors.BruteKNN + + :kd_tree -> + Scholar.Neighbors.KDTree + + :random_projection_forest -> + Scholar.Neighbors.RandomProjectionForest + + module when is_atom(module) -> + module + end + model = algorithm_module.fit(x, algorithm_opts) + {_neighbors, distances} = algorithm_module.predict(model, x) + fit_p(x, distances, opts) end - defnp fit_p(x, opts \\ []) do - {core_distances, reachability, _predecessor, ordering} = compute_optics_graph(x, opts) + defnp fit_p(x, core_distances, opts \\ []) do + {core_distances, reachability, _predecessor, ordering} = compute_optics_graph(x, core_distances, opts) eps = if opts[:eps] == Nx.Constants.infinity() do @@ -88,15 +146,12 @@ defmodule Scholar.Cluster.OPTICS do cluster_optics_dbscan(reachability, core_distances, ordering, eps: eps) end - defnp compute_optics_graph(x, opts \\ []) do - min_samples = opts[:min_samples] + defnp compute_optics_graph(x, distances, opts \\ []) do max_eps = opts[:max_eps] n_samples = Nx.axis_size(x, 0) reachability = Nx.broadcast(Nx.Constants.max_finite({:f, 32}), {n_samples}) predecessor = Nx.broadcast(-1, {n_samples}) - neighbors = Scholar.Neighbors.BruteKNN.fit(x, num_neighbors: min_samples) - core_distances = compute_core_distances(x, neighbors, min_samples: min_samples) - + core_distances = Nx.slice_along_axis(distances, opts[:min_samples] - 1, 1, axis: 1) core_distances = Nx.select(core_distances > max_eps, Nx.Constants.infinity(), core_distances) @@ -131,11 +186,6 @@ defmodule Scholar.Cluster.OPTICS do {core_distances, reachability, predecessor, ordering} end - defnp compute_core_distances(x, neighbors, opts \\ []) do - {_neighbors, distances} = Scholar.Neighbors.BruteKNN.predict(neighbors, x) - Nx.slice_along_axis(distances, opts[:min_samples] - 1, 1, axis: 1) - end - defnp set_reach_dist( core_distances, reachability, From a3c30b8a933672eb9af9e5111363d0f65b62a815 Mon Sep 17 00:00:00 2001 From: Szymon Date: Thu, 15 Aug 2024 19:48:44 +0200 Subject: [PATCH 5/8] Update lib/scholar/cluster/optics.ex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: José Valim --- lib/scholar/cluster/optics.ex | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/scholar/cluster/optics.ex b/lib/scholar/cluster/optics.ex index 15e1827c..be0a5d56 100644 --- a/lib/scholar/cluster/optics.ex +++ b/lib/scholar/cluster/optics.ex @@ -1,10 +1,12 @@ defmodule Scholar.Cluster.OPTICS do @moduledoc """ OPTICS (Ordering Points To Identify the Clustering Structure) is an algorithm - for finding density-based clusters in spatial data. It is closely related - to DBSCAN, finds core sample of high density and expands clusters from them. - Unlike DBSCAN, keeps cluster hierarchy for a variable neighborhood radius. - Clusters are then extracted using a DBSCAN-like method. + for finding density-based clusters in spatial data. + + It is closely related to DBSCAN, finds core sample of high density and expands + clusters from them. Unlike DBSCAN, keeps cluster hierarchy for a variable + neighborhood radius. Clusters are then extracted using a DBSCAN-like + method. """ import Nx.Defn require Nx From 78e37bce330400bc4bb9bf467a2c63e40cbc16da Mon Sep 17 00:00:00 2001 From: norm4nn Date: Fri, 16 Aug 2024 13:51:53 +0200 Subject: [PATCH 6/8] Added struct to module, moved knn algorithm, small fixes and additional validation --- lib/scholar/cluster/optics.ex | 125 +++++++++++++++++++++++----------- 1 file changed, 84 insertions(+), 41 deletions(-) diff --git a/lib/scholar/cluster/optics.ex b/lib/scholar/cluster/optics.ex index be0a5d56..cd0a8e0e 100644 --- a/lib/scholar/cluster/optics.ex +++ b/lib/scholar/cluster/optics.ex @@ -1,8 +1,7 @@ defmodule Scholar.Cluster.OPTICS do @moduledoc """ - OPTICS (Ordering Points To Identify the Clustering Structure) is an algorithm + OPTICS (Ordering Points To Identify the Clustering Structure) is an algorithm for finding density-based clusters in spatial data. - It is closely related to DBSCAN, finds core sample of high density and expands clusters from them. Unlike DBSCAN, keeps cluster hierarchy for a variable neighborhood radius. Clusters are then extracted using a DBSCAN-like @@ -11,6 +10,9 @@ defmodule Scholar.Cluster.OPTICS do import Nx.Defn require Nx + @derive {Nx.Container, containers: [:labels, :min_samples, :max_eps, :eps, :algorithm]} + defstruct [:labels, :min_samples, :max_eps, :eps, :algorithm] + opts = [ min_samples: [ default: 5, @@ -20,7 +22,6 @@ defmodule Scholar.Cluster.OPTICS do """ ], max_eps: [ - default: Nx.Constants.infinity(), type: {:custom, Scholar.Options, :beta, []}, doc: """ The maximum distance between two samples for one to be considered as in the neighborhood of the other. @@ -28,7 +29,6 @@ defmodule Scholar.Cluster.OPTICS do """ ], eps: [ - default: Nx.Constants.infinity(), type: {:custom, Scholar.Options, :beta, []}, doc: """ The maximum distance between two samples for one to be considered as in the neighborhood of the other. @@ -87,6 +87,11 @@ defmodule Scholar.Cluster.OPTICS do s64[6] [-1, 0, 0, 1, 1, -1] > + iex> Scholar.Cluster.OPTICS.fit(x, eps: 2, min_samples: 2, algorithm: :kd_tree, metric: {:minkowski, 1}) + #Nx.Tensor< + s64[6] + [-1, 0, 0, 1, 1, -1] + > iex> Scholar.Cluster.OPTICS.fit(x, eps: 1, min_samples: 2) #Nx.Tensor< s64[6] @@ -97,14 +102,9 @@ defmodule Scholar.Cluster.OPTICS do s64[6] [0, 0, 0, 1, 1, -1] > - iex> Scholar.Cluster.OPTICS.fit(x, max_eps: 2, min_samples: 1, algorithm: :kd_tree, metric: {:minkowski, 1}) - #Nx.Tensor< - s64[6] - [0, 1, 1, 2, 2, 3] - > """ - deftransform fit(x, opts \\ []) do + defn fit(x, opts \\ []) do if Nx.rank(x) != 2 do raise ArgumentError, """ @@ -112,9 +112,29 @@ defmodule Scholar.Cluster.OPTICS do got tensor with shape: #{inspect(Nx.shape(x))} """ end + + x = Scholar.Shared.to_float(x) + module = validate_options(x, opts) + + %__MODULE__{ + module + | labels: fit_p(x, module) + } + end + + deftransformp validate_options(x, opts \\ []) do {opts, algorithm_opts} = Keyword.split(opts, [:min_samples, :max_eps, :eps, :algorithm]) opts = NimbleOptions.validate!(opts, @opts_schema) - algorithm_opts = Keyword.put(algorithm_opts, :num_neighbors, opts[:min_samples]) + min_samples = opts[:min_samples] + + if min_samples < 2 do + raise ArgumentError, + """ + min_samples must be an int in the range [2, inf), got min_samples = #{inspect(min_samples)} + """ + end + + algorithm_opts = Keyword.put(algorithm_opts, :num_neighbors, min_samples) algorithm_module = case opts[:algorithm] do @@ -130,39 +150,59 @@ defmodule Scholar.Cluster.OPTICS do module when is_atom(module) -> module end + model = algorithm_module.fit(x, algorithm_opts) - {_neighbors, distances} = algorithm_module.predict(model, x) - fit_p(x, distances, opts) - end - defnp fit_p(x, core_distances, opts \\ []) do - {core_distances, reachability, _predecessor, ordering} = compute_optics_graph(x, core_distances, opts) + max_eps = + case opts[:max_eps] do + nil -> Nx.Constants.infinity(Nx.type(x)) + any -> any + end eps = - if opts[:eps] == Nx.Constants.infinity() do - opts[:max_eps] - else - opts[:eps] + case opts[:eps] do + nil -> max_eps + any -> any end - cluster_optics_dbscan(reachability, core_distances, ordering, eps: eps) + if eps > max_eps do + raise ArgumentError, + """ + eps can't be greater than max_eps, got eps = #{inspect(eps)} and max_eps = #{inspect(max_eps)} + """ + end + + %__MODULE__{ + labels: Nx.broadcast(-1, {Nx.axis_size(x, 0)}), + min_samples: min_samples, + max_eps: max_eps, + eps: eps, + algorithm: model + } + end + + defnp fit_p(x, module) do + {core_distances, reachability, _predecessor, ordering} = compute_optics_graph(x, module) + + cluster_optics_dbscan(reachability, core_distances, ordering, module) end - defnp compute_optics_graph(x, distances, opts \\ []) do - max_eps = opts[:max_eps] + defnp compute_optics_graph(x, %__MODULE__{max_eps: max_eps, min_samples: min_samples} = module) do n_samples = Nx.axis_size(x, 0) - reachability = Nx.broadcast(Nx.Constants.max_finite({:f, 32}), {n_samples}) + reachability = Nx.broadcast(Nx.Constants.max_finite(Nx.type(x)), {n_samples}) predecessor = Nx.broadcast(-1, {n_samples}) - core_distances = Nx.slice_along_axis(distances, opts[:min_samples] - 1, 1, axis: 1) + {_neighbors, distances} = run_knn(x, x, module) + core_distances = Nx.slice_along_axis(distances, min_samples - 1, 1, axis: 1) + core_distances = Nx.select(core_distances > max_eps, Nx.Constants.infinity(), core_distances) ordering = Nx.broadcast(0, {n_samples}) processed = Nx.broadcast(0, {n_samples}) - {_order_idx, core_distances, reachability, predecessor, _processed, ordering, _x, _max_eps} = + {_order_idx, core_distances, reachability, predecessor, _processed, ordering, _x, _module} = while {order_idx = 0, core_distances, reachability, predecessor, processed, ordering, x, - max_eps}, + module}, order_idx < n_samples do unprocessed_mask = processed == 0 point = Nx.argmin(Nx.select(unprocessed_mask, reachability, Nx.Constants.infinity())) @@ -170,17 +210,14 @@ defmodule Scholar.Cluster.OPTICS do ordering = Nx.put_slice(ordering, [order_idx], Nx.new_axis(point, 0)) {reachability, predecessor} = - set_reach_dist(core_distances, reachability, predecessor, point, processed, x, - max_eps: max_eps - ) + set_reach_dist(core_distances, reachability, predecessor, point, processed, x, module) - {order_idx + 1, core_distances, reachability, predecessor, processed, ordering, x, - max_eps} + {order_idx + 1, core_distances, reachability, predecessor, processed, ordering, x, module} end reachability = Nx.select( - reachability == Nx.Constants.max_finite({:f, 32}), + reachability == Nx.Constants.max_finite(Nx.type(x)), Nx.Constants.infinity(), reachability ) @@ -195,16 +232,13 @@ defmodule Scholar.Cluster.OPTICS do point_index, processed, x, - opts \\ [] + %__MODULE__{max_eps: max_eps} = module ) do - max_eps = opts[:max_eps] n_features = Nx.axis_size(x, 1) n_samples = Nx.axis_size(x, 0) - nbrs = Scholar.Neighbors.BruteKNN.fit(x, num_neighbors: n_samples) t = Nx.take(x, point_index, axis: 0) p = Nx.broadcast(t, {1, n_features}) - {neighbors, distances} = Scholar.Neighbors.BruteKNN.predict(nbrs, p) - + {neighbors, distances} = run_knn(x, p, %__MODULE__{module | min_samples: n_samples}) neighbors = Nx.flatten(neighbors) distances = Nx.flatten(distances) indices_ngbrs = Nx.argsort(neighbors) @@ -233,7 +267,7 @@ defmodule Scholar.Cluster.OPTICS do ) rdists = Nx.select(improved >= 0, rdists, 0) - reversed_improved = Nx.max(Nx.multiply(improved, -1), 0) + reversed_improved = Nx.max(improved * -1, 0) reachability = Nx.select(improved <= 0, Nx.take(reachability, reversed_improved), rdists) @@ -244,8 +278,17 @@ defmodule Scholar.Cluster.OPTICS do {reachability, predecessor} end - defnp cluster_optics_dbscan(reachability, core_distances, ordering, opts \\ []) do - eps = opts[:eps] + deftransformp run_knn(x, p, %__MODULE__{algorithm: algorithm_module, min_samples: k} = _module) do + nbrs = algorithm_module.__struct__.fit(x, num_neighbors: k) + algorithm_module.__struct__.predict(nbrs, p) + end + + defnp cluster_optics_dbscan( + reachability, + core_distances, + ordering, + %__MODULE__{eps: eps} = _module + ) do far_reach = Nx.flatten(reachability > eps) near_core = Nx.flatten(core_distances <= eps) far_and_not_near = Nx.multiply(far_reach, 1 - near_core) From 990ba968cf91bee986a56cb852ac438b4aecf324 Mon Sep 17 00:00:00 2001 From: norm4nn Date: Sat, 17 Aug 2024 18:30:38 +0200 Subject: [PATCH 7/8] Updated doctests --- lib/scholar/cluster/optics.ex | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/scholar/cluster/optics.ex b/lib/scholar/cluster/optics.ex index cd0a8e0e..c5de109a 100644 --- a/lib/scholar/cluster/optics.ex +++ b/lib/scholar/cluster/optics.ex @@ -72,32 +72,32 @@ defmodule Scholar.Cluster.OPTICS do ## Examples iex> x = Nx.tensor([[1, 2], [2, 5], [3, 6], [8, 7], [8, 8], [7, 3]]) - iex> Scholar.Cluster.OPTICS.fit(x, min_samples: 2) + iex> Scholar.Cluster.OPTICS.fit(x, min_samples: 2).labels #Nx.Tensor< s64[6] [-1, -1, -1, -1, -1, -1] > - iex> Scholar.Cluster.OPTICS.fit(x, eps: 4.5, min_samples: 2) + iex> Scholar.Cluster.OPTICS.fit(x, eps: 4.5, min_samples: 2).labels #Nx.Tensor< s64[6] [0, 0, 0, 1, 1, 1] > - iex> Scholar.Cluster.OPTICS.fit(x, eps: 2, min_samples: 2) + iex> Scholar.Cluster.OPTICS.fit(x, eps: 2, min_samples: 2).labels #Nx.Tensor< s64[6] [-1, 0, 0, 1, 1, -1] > - iex> Scholar.Cluster.OPTICS.fit(x, eps: 2, min_samples: 2, algorithm: :kd_tree, metric: {:minkowski, 1}) + iex> Scholar.Cluster.OPTICS.fit(x, eps: 2, min_samples: 2, algorithm: :kd_tree, metric: {:minkowski, 1}).labels #Nx.Tensor< s64[6] [-1, 0, 0, 1, 1, -1] > - iex> Scholar.Cluster.OPTICS.fit(x, eps: 1, min_samples: 2) + iex> Scholar.Cluster.OPTICS.fit(x, eps: 1, min_samples: 2).labels #Nx.Tensor< s64[6] [-1, -1, -1, 0, 0, -1] > - iex> Scholar.Cluster.OPTICS.fit(x, eps: 4.5, min_samples: 3) + iex> Scholar.Cluster.OPTICS.fit(x, eps: 4.5, min_samples: 3).labels #Nx.Tensor< s64[6] [0, 0, 0, 1, 1, -1] @@ -134,7 +134,7 @@ defmodule Scholar.Cluster.OPTICS do """ end - algorithm_opts = Keyword.put(algorithm_opts, :num_neighbors, min_samples) + algorithm_opts = Keyword.put(algorithm_opts, :num_neighbors, 1) algorithm_module = case opts[:algorithm] do From f1e863aa5a1f3d9a2d44bc71ce03c493d75843b5 Mon Sep 17 00:00:00 2001 From: Szymon Date: Sat, 17 Aug 2024 18:34:58 +0200 Subject: [PATCH 8/8] Update lib/scholar/cluster/optics.ex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: José Valim --- lib/scholar/cluster/optics.ex | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/scholar/cluster/optics.ex b/lib/scholar/cluster/optics.ex index c5de109a..c6548b78 100644 --- a/lib/scholar/cluster/optics.ex +++ b/lib/scholar/cluster/optics.ex @@ -1,7 +1,8 @@ defmodule Scholar.Cluster.OPTICS do @moduledoc """ - OPTICS (Ordering Points To Identify the Clustering Structure) is an algorithm + OPTICS (Ordering Points To Identify the Clustering Structure) is an algorithm for finding density-based clusters in spatial data. + It is closely related to DBSCAN, finds core sample of high density and expands clusters from them. Unlike DBSCAN, keeps cluster hierarchy for a variable neighborhood radius. Clusters are then extracted using a DBSCAN-like