From b55b1021b64ed3ca7f9e3522b6e9d69e563fd3a1 Mon Sep 17 00:00:00 2001
From: norm4nn <szymon.jan.mamon@gmail.com>
Date: Fri, 9 Aug 2024 23:49:12 +0200
Subject: [PATCH 1/8] Added OPTICS clustering algorithm, description and
 doctests

---
 lib/scholar/cluster/optics.ex | 201 ++++++++++++++++++++++++++++++++++
 1 file changed, 201 insertions(+)
 create mode 100644 lib/scholar/cluster/optics.ex

diff --git a/lib/scholar/cluster/optics.ex b/lib/scholar/cluster/optics.ex
new file mode 100644
index 00000000..6b3e88ee
--- /dev/null
+++ b/lib/scholar/cluster/optics.ex
@@ -0,0 +1,201 @@
+defmodule Scholar.Cluster.OPTICS do
+  @moduledoc """
+  OPTICS (Ordering Points To Identify the Clustering Structure), closely related to DBSCAN, finds core sample of high density and expands clusters from them. Unlike DBSCAN, keeps cluster hierarchy for a variable neighborhood radius. Clusters are then extracted using a DBSCAN-like method.
+  """
+  import Nx.Defn
+  require Nx
+
+  opts = [
+    min_samples: [
+      default: 5,
+      type: :pos_integer,
+      doc: "The number of samples in a neighborhood for a point to be considered as a core point."
+    ],
+    max_eps: [
+      default: Nx.Constants.infinity(),
+      type: {:custom, Scholar.Options, :beta, []},
+      doc:
+        "The maximum distance between two samples for one to be considered as in the neighborhood of the other. Default value of Nx.Constants.infinity() will identify clusters across all scales "
+    ],
+    eps: [
+      default: Nx.Constants.nan(),
+      type: {:custom, Scholar.Options, :beta, []},
+      doc:
+        "The maximum distance between two samples for one to be considered as in the neighborhood of the other. By default it assumes the same value as max_eps."
+    ]
+  ]
+
+  @opts_schema NimbleOptions.new!(opts)
+
+  @doc """
+
+  Perform OPTICS clustering for `x` which is tensor of {n_samples, n_features} shape.
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Return Values
+
+  The function returns a labels tensor of shape {n_samples}
+        Cluster labels for each point in the dataset given to fit().
+        Noisy samples are labeled as -1.
+
+  ## Examples
+
+      iex> x = Nx.tensor([[1, 2], [2, 5], [3, 6], [8, 7], [8, 8], [7, 3]])
+      iex> Scholar.Cluster.OPTICS.fit(x, min_samples: 2)
+      #Nx.Tensor<
+        s64[6]
+        [-1, -1, -1, -1, -1, -1]
+      >
+      iex> Scholar.Cluster.OPTICS.fit(x, eps: 4.5, min_samples: 2)
+      #Nx.Tensor<
+        s64[6]
+        [0, 0, 0, 1, 1, 1]
+      >
+      iex> Scholar.Cluster.OPTICS.fit(x, eps: 2, min_samples: 2)
+      #Nx.Tensor<
+        s64[6]
+        [-1, 0, 0, 1, 1, -1]
+      >
+      iex> Scholar.Cluster.OPTICS.fit(x, eps: 1, min_samples: 2)
+      #Nx.Tensor<
+        s64[6]
+        [-1, -1, -1, 0, 0, -1]
+      >
+      iex> Scholar.Cluster.OPTICS.fit(x, eps: 4.5, min_samples: 3)
+      #Nx.Tensor<
+        s64[6]
+        [0, 0, 0, 1, 1, -1]
+      >
+  """
+
+  deftransform fit(x, opts \\ []) do
+    fit_p(x, NimbleOptions.validate!(opts, @opts_schema))
+  end
+
+  defnp fit_p(x, opts \\ []) do
+    {core_distances, reachability, _predecessor, ordering} = compute_optics_graph(x, opts)
+    eps = opts[:eps]
+    cluster_optics_dbscan(reachability, core_distances, ordering, eps: eps)
+  end
+
+  defnp compute_optics_graph(x, opts \\ []) do
+    min_samples = opts[:min_samples]
+    max_eps = opts[:max_eps]
+    n_samples = Nx.axis_size(x, 0)
+    reachability = Nx.broadcast(Nx.Constants.max_finite({:f, 32}), {n_samples})
+    predecessor = Nx.broadcast(-1, {n_samples})
+    neighbors = Scholar.Neighbors.BruteKNN.fit(x, num_neighbors: min_samples)
+    core_distances = compute_core_distances(x, neighbors, min_samples: min_samples)
+
+    core_distances =
+      Nx.select(Nx.greater(core_distances, max_eps), Nx.Constants.infinity(), core_distances)
+
+    ordering = Nx.broadcast(0, {n_samples})
+    processed = Nx.broadcast(0, {n_samples})
+
+    {_order_idx, core_distances, reachability, predecessor, _processed, ordering, _x, _max_eps} =
+      while {order_idx = 0, core_distances, reachability, predecessor, processed, ordering, x,
+             max_eps},
+            order_idx < n_samples do
+        unprocessed_mask = Nx.equal(processed, 0)
+        point = Nx.argmin(Nx.select(unprocessed_mask, reachability, Nx.Constants.infinity()))
+        processed = Nx.put_slice(processed, [point], Nx.broadcast(1, {1}))
+        ordering = Nx.put_slice(ordering, [order_idx], Nx.broadcast(point, {1}))
+
+        {reachability, predecessor} =
+          set_reach_dist(core_distances, reachability, predecessor, point, processed, x,
+            max_eps: max_eps
+          )
+
+        {order_idx + 1, core_distances, reachability, predecessor, processed, ordering, x,
+         max_eps}
+      end
+
+    reachability =
+      Nx.select(
+        Nx.equal(reachability, Nx.Constants.max_finite({:f, 32})),
+        Nx.Constants.infinity(),
+        reachability
+      )
+
+    {core_distances, reachability, predecessor, ordering}
+  end
+
+  defnp compute_core_distances(x, neighbors, opts \\ []) do
+    {_neighbors, distances} = Scholar.Neighbors.BruteKNN.predict(neighbors, x)
+    Nx.slice_along_axis(distances, opts[:min_samples] - 1, 1, axis: 1)
+  end
+
+  defnp set_reach_dist(
+          core_distances,
+          reachability,
+          predecessor,
+          point_index,
+          processed,
+          x,
+          opts \\ []
+        ) do
+    max_eps = opts[:max_eps]
+    n_features = Nx.axis_size(x, 1)
+    n_samples = Nx.axis_size(x, 0)
+    nbrs = Scholar.Neighbors.BruteKNN.fit(x, num_neighbors: n_samples)
+    t = Nx.take(x, point_index, axis: 0)
+    p = Nx.broadcast(t, {1, n_features})
+    {neighbors, distances} = Scholar.Neighbors.BruteKNN.predict(nbrs, p)
+
+    neighbors = Nx.flatten(neighbors)
+    distances = Nx.flatten(distances)
+    indices_ngbrs = Nx.argsort(neighbors)
+    neighbors = Nx.take(neighbors, indices_ngbrs)
+    distances = Nx.take(distances, indices_ngbrs)
+    are_neighbors_processed = Nx.take(processed, neighbors)
+
+    filtered_neighbors =
+      Nx.select(
+        Nx.logical_or(are_neighbors_processed, Nx.greater(distances, max_eps)),
+        Nx.multiply(-1, neighbors),
+        neighbors
+      )
+
+    dists = Nx.flatten(Scholar.Metrics.Distance.pairwise_minkowski(p, x))
+    core_distance = Nx.take(core_distances, point_index)
+    rdists = Nx.max(dists, core_distance)
+    improved = Nx.less(rdists, reachability)
+    improved = Nx.select(improved, filtered_neighbors, -1)
+
+    improved =
+      Nx.select(
+        Nx.logical_and(Nx.equal(improved, -1), Nx.greater(filtered_neighbors, 0)),
+        Nx.multiply(filtered_neighbors, -1),
+        filtered_neighbors
+      )
+
+    rdists = Nx.select(Nx.greater_equal(improved, 0), rdists, 0)
+    reversed_improved = Nx.max(Nx.multiply(improved, -1), 0)
+
+    reachability =
+      Nx.select(Nx.less_equal(improved, 0), Nx.take(reachability, reversed_improved), rdists)
+
+    predecessor =
+      Nx.select(Nx.less_equal(improved, 0), Nx.take(predecessor, reversed_improved), point_index)
+
+    {reachability, predecessor}
+  end
+
+  defnp cluster_optics_dbscan(reachability, core_distances, ordering, opts \\ []) do
+    eps = opts[:eps]
+    far_reach = Nx.flatten(Nx.greater(reachability, eps))
+    near_core = Nx.flatten(Nx.less_equal(core_distances, eps))
+    far_and_not_near = Nx.multiply(far_reach, Nx.subtract(1, near_core))
+    far_reach = Nx.take(far_reach, ordering)
+    near_core = Nx.take(near_core, ordering)
+    far_and_near = Nx.multiply(far_reach, near_core)
+    Nx.as_type(Nx.cumulative_sum(far_and_near), :s8)
+    labels = Nx.subtract(Nx.as_type(Nx.cumulative_sum(far_and_near), :s8), 1)
+    labels = Nx.take(labels, Nx.argsort(ordering))
+    Nx.select(far_and_not_near, -1, labels)
+  end
+end

From 35e89e68a88b6c99c55b75a838b35c3a27fcefa1 Mon Sep 17 00:00:00 2001
From: norm4nn <szymon.jan.mamon@gmail.com>
Date: Fri, 9 Aug 2024 23:57:32 +0200
Subject: [PATCH 2/8] Updated eps default value to match its desc

---
 lib/scholar/cluster/optics.ex | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/lib/scholar/cluster/optics.ex b/lib/scholar/cluster/optics.ex
index 6b3e88ee..b8bf7800 100644
--- a/lib/scholar/cluster/optics.ex
+++ b/lib/scholar/cluster/optics.ex
@@ -77,7 +77,14 @@ defmodule Scholar.Cluster.OPTICS do
 
   defnp fit_p(x, opts \\ []) do
     {core_distances, reachability, _predecessor, ordering} = compute_optics_graph(x, opts)
-    eps = opts[:eps]
+
+    eps =
+      if Nx.equal(opts[:eps], Nx.Constants.nan()) do
+        opts[:max_eps]
+      else
+        opts[:eps]
+      end
+
     cluster_optics_dbscan(reachability, core_distances, ordering, eps: eps)
   end
 

From 43f88318f5b2e1afa2c0082e78f614fd4cd22f4c Mon Sep 17 00:00:00 2001
From: norm4nn <szymon.jan.mamon@gmail.com>
Date: Sun, 11 Aug 2024 21:09:55 +0200
Subject: [PATCH 3/8] Applied suggested changes

---
 lib/scholar/cluster/optics.ex | 39 +++++++++++++++++------------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/lib/scholar/cluster/optics.ex b/lib/scholar/cluster/optics.ex
index b8bf7800..86c4608c 100644
--- a/lib/scholar/cluster/optics.ex
+++ b/lib/scholar/cluster/optics.ex
@@ -18,7 +18,7 @@ defmodule Scholar.Cluster.OPTICS do
         "The maximum distance between two samples for one to be considered as in the neighborhood of the other. Default value of Nx.Constants.infinity() will identify clusters across all scales "
     ],
     eps: [
-      default: Nx.Constants.nan(),
+      default: Nx.Constants.infinity(),
       type: {:custom, Scholar.Options, :beta, []},
       doc:
         "The maximum distance between two samples for one to be considered as in the neighborhood of the other. By default it assumes the same value as max_eps."
@@ -79,7 +79,7 @@ defmodule Scholar.Cluster.OPTICS do
     {core_distances, reachability, _predecessor, ordering} = compute_optics_graph(x, opts)
 
     eps =
-      if Nx.equal(opts[:eps], Nx.Constants.nan()) do
+      if opts[:eps] == Nx.Constants.infinity() do
         opts[:max_eps]
       else
         opts[:eps]
@@ -98,7 +98,7 @@ defmodule Scholar.Cluster.OPTICS do
     core_distances = compute_core_distances(x, neighbors, min_samples: min_samples)
 
     core_distances =
-      Nx.select(Nx.greater(core_distances, max_eps), Nx.Constants.infinity(), core_distances)
+      Nx.select(core_distances > max_eps, Nx.Constants.infinity(), core_distances)
 
     ordering = Nx.broadcast(0, {n_samples})
     processed = Nx.broadcast(0, {n_samples})
@@ -107,10 +107,10 @@ defmodule Scholar.Cluster.OPTICS do
       while {order_idx = 0, core_distances, reachability, predecessor, processed, ordering, x,
              max_eps},
             order_idx < n_samples do
-        unprocessed_mask = Nx.equal(processed, 0)
+        unprocessed_mask = processed == 0
         point = Nx.argmin(Nx.select(unprocessed_mask, reachability, Nx.Constants.infinity()))
-        processed = Nx.put_slice(processed, [point], Nx.broadcast(1, {1}))
-        ordering = Nx.put_slice(ordering, [order_idx], Nx.broadcast(point, {1}))
+        processed = Nx.put_slice(processed, [point], Nx.new_axis(1, 0))
+        ordering = Nx.put_slice(ordering, [order_idx], Nx.new_axis(point, 0))
 
         {reachability, predecessor} =
           set_reach_dist(core_distances, reachability, predecessor, point, processed, x,
@@ -123,7 +123,7 @@ defmodule Scholar.Cluster.OPTICS do
 
     reachability =
       Nx.select(
-        Nx.equal(reachability, Nx.Constants.max_finite({:f, 32})),
+        reachability == Nx.Constants.max_finite({:f, 32}),
         Nx.Constants.infinity(),
         reachability
       )
@@ -162,46 +162,45 @@ defmodule Scholar.Cluster.OPTICS do
 
     filtered_neighbors =
       Nx.select(
-        Nx.logical_or(are_neighbors_processed, Nx.greater(distances, max_eps)),
-        Nx.multiply(-1, neighbors),
+        are_neighbors_processed or distances > max_eps,
+        -1 * neighbors,
         neighbors
       )
 
     dists = Nx.flatten(Scholar.Metrics.Distance.pairwise_minkowski(p, x))
     core_distance = Nx.take(core_distances, point_index)
     rdists = Nx.max(dists, core_distance)
-    improved = Nx.less(rdists, reachability)
+    improved = rdists < reachability
     improved = Nx.select(improved, filtered_neighbors, -1)
 
     improved =
       Nx.select(
-        Nx.logical_and(Nx.equal(improved, -1), Nx.greater(filtered_neighbors, 0)),
+        improved == -1 and filtered_neighbors > 0,
         Nx.multiply(filtered_neighbors, -1),
         filtered_neighbors
       )
 
-    rdists = Nx.select(Nx.greater_equal(improved, 0), rdists, 0)
+    rdists = Nx.select(improved >= 0, rdists, 0)
     reversed_improved = Nx.max(Nx.multiply(improved, -1), 0)
 
     reachability =
-      Nx.select(Nx.less_equal(improved, 0), Nx.take(reachability, reversed_improved), rdists)
+      Nx.select(improved <= 0, Nx.take(reachability, reversed_improved), rdists)
 
     predecessor =
-      Nx.select(Nx.less_equal(improved, 0), Nx.take(predecessor, reversed_improved), point_index)
+      Nx.select(improved <= 0, Nx.take(predecessor, reversed_improved), point_index)
 
     {reachability, predecessor}
   end
 
   defnp cluster_optics_dbscan(reachability, core_distances, ordering, opts \\ []) do
     eps = opts[:eps]
-    far_reach = Nx.flatten(Nx.greater(reachability, eps))
-    near_core = Nx.flatten(Nx.less_equal(core_distances, eps))
-    far_and_not_near = Nx.multiply(far_reach, Nx.subtract(1, near_core))
+    far_reach = Nx.flatten(reachability > eps)
+    near_core = Nx.flatten(core_distances <= eps)
+    far_and_not_near = Nx.multiply(far_reach, 1 - near_core)
     far_reach = Nx.take(far_reach, ordering)
     near_core = Nx.take(near_core, ordering)
-    far_and_near = Nx.multiply(far_reach, near_core)
-    Nx.as_type(Nx.cumulative_sum(far_and_near), :s8)
-    labels = Nx.subtract(Nx.as_type(Nx.cumulative_sum(far_and_near), :s8), 1)
+    far_and_near = far_reach * near_core
+    labels = Nx.as_type(Nx.cumulative_sum(far_and_near), :s8) - 1
     labels = Nx.take(labels, Nx.argsort(ordering))
     Nx.select(far_and_not_near, -1, labels)
   end

From 5c7812e1a64ef4fe5250722b383a21dcd042eee4 Mon Sep 17 00:00:00 2001
From: norm4nn <szymon.jan.mamon@gmail.com>
Date: Thu, 15 Aug 2024 11:58:27 +0200
Subject: [PATCH 4/8] Formatted description, added choice of KNN algorithm,
 Small fixes

---
 lib/scholar/cluster/optics.ex | 98 ++++++++++++++++++++++++++---------
 1 file changed, 74 insertions(+), 24 deletions(-)

diff --git a/lib/scholar/cluster/optics.ex b/lib/scholar/cluster/optics.ex
index 86c4608c..15e1827c 100644
--- a/lib/scholar/cluster/optics.ex
+++ b/lib/scholar/cluster/optics.ex
@@ -1,6 +1,10 @@
 defmodule Scholar.Cluster.OPTICS do
   @moduledoc """
-  OPTICS (Ordering Points To Identify the Clustering Structure), closely related to DBSCAN, finds core sample of high density and expands clusters from them. Unlike DBSCAN, keeps cluster hierarchy for a variable neighborhood radius. Clusters are then extracted using a DBSCAN-like method.
+  OPTICS (Ordering Points To Identify the Clustering Structure) is an algorithm
+  for finding density-based clusters in spatial data. It is closely related
+  to DBSCAN, finds core sample of high density and expands clusters from them. 
+  Unlike DBSCAN, keeps cluster hierarchy for a variable neighborhood radius. 
+  Clusters are then extracted using a DBSCAN-like method.
   """
   import Nx.Defn
   require Nx
@@ -9,27 +13,49 @@ defmodule Scholar.Cluster.OPTICS do
     min_samples: [
       default: 5,
       type: :pos_integer,
-      doc: "The number of samples in a neighborhood for a point to be considered as a core point."
+      doc: """
+      The number of samples in a neighborhood for a point to be considered as a core point.
+      """
     ],
     max_eps: [
       default: Nx.Constants.infinity(),
       type: {:custom, Scholar.Options, :beta, []},
-      doc:
-        "The maximum distance between two samples for one to be considered as in the neighborhood of the other. Default value of Nx.Constants.infinity() will identify clusters across all scales "
+      doc: """
+        The maximum distance between two samples for one to be considered as in the neighborhood of the other. 
+        Default value of Nx.Constants.infinity() will identify clusters across all scales.
+      """
     ],
     eps: [
       default: Nx.Constants.infinity(),
       type: {:custom, Scholar.Options, :beta, []},
-      doc:
-        "The maximum distance between two samples for one to be considered as in the neighborhood of the other. By default it assumes the same value as max_eps."
+      doc: """
+        The maximum distance between two samples for one to be considered as in the neighborhood of the other. 
+        By default it assumes the same value as max_eps.
+      """
+    ],
+    algorithm: [
+      default: :brute,
+      type: :atom,
+      doc: """
+        Algorithm used to compute the k-nearest neighbors. Possible values:
+
+          * `:brute` - Brute-force search. See `Scholar.Neighbors.BruteKNN` for more details.
+
+          * `:kd_tree` - k-d tree. See `Scholar.Neighbors.KDTree` for more details.
+
+          * `:random_projection_forest` - Random projection forest. See `Scholar.Neighbors.RandomProjectionForest` for more details.
+
+          * Module implementing `fit(data, opts)` and `predict(model, query)`. predict/2 must return a tuple containing indices
+          of k-nearest neighbors of query points as well as distances between query points and their k-nearest neighbors.
+          Also has to take num_neighbors as argument.
+      """
     ]
   ]
 
   @opts_schema NimbleOptions.new!(opts)
 
   @doc """
-
-  Perform OPTICS clustering for `x` which is tensor of {n_samples, n_features} shape.
+  Perform OPTICS clustering for `x` which is tensor of `{n_samples, n_features} shape.
 
   ## Options
 
@@ -37,9 +63,9 @@ defmodule Scholar.Cluster.OPTICS do
 
   ## Return Values
 
-  The function returns a labels tensor of shape {n_samples}
-        Cluster labels for each point in the dataset given to fit().
-        Noisy samples are labeled as -1.
+  The function returns a labels tensor of shape `{n_samples}`.
+  Cluster labels for each point in the dataset given to fit().
+  Noisy samples are labeled as -1.
 
   ## Examples
 
@@ -69,14 +95,46 @@ defmodule Scholar.Cluster.OPTICS do
         s64[6]
         [0, 0, 0, 1, 1, -1]
       >
+      iex> Scholar.Cluster.OPTICS.fit(x, max_eps: 2, min_samples: 1, algorithm: :kd_tree, metric: {:minkowski, 1})
+      #Nx.Tensor<
+        s64[6]
+        [0, 1, 1, 2, 2, 3]
+      >
   """
 
   deftransform fit(x, opts \\ []) do
-    fit_p(x, NimbleOptions.validate!(opts, @opts_schema))
+    if Nx.rank(x) != 2 do
+      raise ArgumentError,
+            """
+            expected x to have shape {num_samples, num_features}, \
+            got tensor with shape: #{inspect(Nx.shape(x))}
+            """
+    end
+    {opts, algorithm_opts} = Keyword.split(opts, [:min_samples, :max_eps, :eps, :algorithm])
+    opts = NimbleOptions.validate!(opts, @opts_schema)
+    algorithm_opts = Keyword.put(algorithm_opts, :num_neighbors, opts[:min_samples])
+
+    algorithm_module =
+      case opts[:algorithm] do
+        :brute ->
+          Scholar.Neighbors.BruteKNN
+
+        :kd_tree ->
+          Scholar.Neighbors.KDTree
+
+        :random_projection_forest ->
+          Scholar.Neighbors.RandomProjectionForest
+
+        module when is_atom(module) ->
+          module
+      end
+    model = algorithm_module.fit(x, algorithm_opts)
+    {_neighbors, distances} = algorithm_module.predict(model, x)
+    fit_p(x, distances,  opts)
   end
 
-  defnp fit_p(x, opts \\ []) do
-    {core_distances, reachability, _predecessor, ordering} = compute_optics_graph(x, opts)
+  defnp fit_p(x, core_distances, opts \\ []) do
+    {core_distances, reachability, _predecessor, ordering} = compute_optics_graph(x, core_distances, opts)
 
     eps =
       if opts[:eps] == Nx.Constants.infinity() do
@@ -88,15 +146,12 @@ defmodule Scholar.Cluster.OPTICS do
     cluster_optics_dbscan(reachability, core_distances, ordering, eps: eps)
   end
 
-  defnp compute_optics_graph(x, opts \\ []) do
-    min_samples = opts[:min_samples]
+  defnp compute_optics_graph(x, distances, opts \\ []) do
     max_eps = opts[:max_eps]
     n_samples = Nx.axis_size(x, 0)
     reachability = Nx.broadcast(Nx.Constants.max_finite({:f, 32}), {n_samples})
     predecessor = Nx.broadcast(-1, {n_samples})
-    neighbors = Scholar.Neighbors.BruteKNN.fit(x, num_neighbors: min_samples)
-    core_distances = compute_core_distances(x, neighbors, min_samples: min_samples)
-
+    core_distances = Nx.slice_along_axis(distances, opts[:min_samples] - 1, 1, axis: 1)
     core_distances =
       Nx.select(core_distances > max_eps, Nx.Constants.infinity(), core_distances)
 
@@ -131,11 +186,6 @@ defmodule Scholar.Cluster.OPTICS do
     {core_distances, reachability, predecessor, ordering}
   end
 
-  defnp compute_core_distances(x, neighbors, opts \\ []) do
-    {_neighbors, distances} = Scholar.Neighbors.BruteKNN.predict(neighbors, x)
-    Nx.slice_along_axis(distances, opts[:min_samples] - 1, 1, axis: 1)
-  end
-
   defnp set_reach_dist(
           core_distances,
           reachability,

From a3c30b8a933672eb9af9e5111363d0f65b62a815 Mon Sep 17 00:00:00 2001
From: Szymon <szymon.jan.mamon@gmail.com>
Date: Thu, 15 Aug 2024 19:48:44 +0200
Subject: [PATCH 5/8] Update lib/scholar/cluster/optics.ex
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: José Valim <jose.valim@gmail.com>
---
 lib/scholar/cluster/optics.ex | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/lib/scholar/cluster/optics.ex b/lib/scholar/cluster/optics.ex
index 15e1827c..be0a5d56 100644
--- a/lib/scholar/cluster/optics.ex
+++ b/lib/scholar/cluster/optics.ex
@@ -1,10 +1,12 @@
 defmodule Scholar.Cluster.OPTICS do
   @moduledoc """
   OPTICS (Ordering Points To Identify the Clustering Structure) is an algorithm
-  for finding density-based clusters in spatial data. It is closely related
-  to DBSCAN, finds core sample of high density and expands clusters from them. 
-  Unlike DBSCAN, keeps cluster hierarchy for a variable neighborhood radius. 
-  Clusters are then extracted using a DBSCAN-like method.
+  for finding density-based clusters in spatial data.
+
+  It is closely related to DBSCAN, finds core sample of high density and expands
+  clusters from them.  Unlike DBSCAN, keeps cluster hierarchy for a variable
+  neighborhood radius.  Clusters are then extracted using a DBSCAN-like
+  method.
   """
   import Nx.Defn
   require Nx

From 78e37bce330400bc4bb9bf467a2c63e40cbc16da Mon Sep 17 00:00:00 2001
From: norm4nn <szymon.jan.mamon@gmail.com>
Date: Fri, 16 Aug 2024 13:51:53 +0200
Subject: [PATCH 6/8] Added struct to module, moved knn algorithm, small fixes
 and additional validation

---
 lib/scholar/cluster/optics.ex | 125 +++++++++++++++++++++++-----------
 1 file changed, 84 insertions(+), 41 deletions(-)

diff --git a/lib/scholar/cluster/optics.ex b/lib/scholar/cluster/optics.ex
index be0a5d56..cd0a8e0e 100644
--- a/lib/scholar/cluster/optics.ex
+++ b/lib/scholar/cluster/optics.ex
@@ -1,8 +1,7 @@
 defmodule Scholar.Cluster.OPTICS do
   @moduledoc """
-  OPTICS (Ordering Points To Identify the Clustering Structure) is an algorithm
+    OPTICS (Ordering Points To Identify the Clustering Structure) is an algorithm
   for finding density-based clusters in spatial data.
-
   It is closely related to DBSCAN, finds core sample of high density and expands
   clusters from them.  Unlike DBSCAN, keeps cluster hierarchy for a variable
   neighborhood radius.  Clusters are then extracted using a DBSCAN-like
@@ -11,6 +10,9 @@ defmodule Scholar.Cluster.OPTICS do
   import Nx.Defn
   require Nx
 
+  @derive {Nx.Container, containers: [:labels, :min_samples, :max_eps, :eps, :algorithm]}
+  defstruct [:labels, :min_samples, :max_eps, :eps, :algorithm]
+
   opts = [
     min_samples: [
       default: 5,
@@ -20,7 +22,6 @@ defmodule Scholar.Cluster.OPTICS do
       """
     ],
     max_eps: [
-      default: Nx.Constants.infinity(),
       type: {:custom, Scholar.Options, :beta, []},
       doc: """
         The maximum distance between two samples for one to be considered as in the neighborhood of the other. 
@@ -28,7 +29,6 @@ defmodule Scholar.Cluster.OPTICS do
       """
     ],
     eps: [
-      default: Nx.Constants.infinity(),
       type: {:custom, Scholar.Options, :beta, []},
       doc: """
         The maximum distance between two samples for one to be considered as in the neighborhood of the other. 
@@ -87,6 +87,11 @@ defmodule Scholar.Cluster.OPTICS do
         s64[6]
         [-1, 0, 0, 1, 1, -1]
       >
+      iex> Scholar.Cluster.OPTICS.fit(x, eps: 2, min_samples: 2, algorithm: :kd_tree, metric: {:minkowski, 1})
+      #Nx.Tensor<
+        s64[6]
+        [-1, 0, 0, 1, 1, -1]
+      >
       iex> Scholar.Cluster.OPTICS.fit(x, eps: 1, min_samples: 2)
       #Nx.Tensor<
         s64[6]
@@ -97,14 +102,9 @@ defmodule Scholar.Cluster.OPTICS do
         s64[6]
         [0, 0, 0, 1, 1, -1]
       >
-      iex> Scholar.Cluster.OPTICS.fit(x, max_eps: 2, min_samples: 1, algorithm: :kd_tree, metric: {:minkowski, 1})
-      #Nx.Tensor<
-        s64[6]
-        [0, 1, 1, 2, 2, 3]
-      >
   """
 
-  deftransform fit(x, opts \\ []) do
+  defn fit(x, opts \\ []) do
     if Nx.rank(x) != 2 do
       raise ArgumentError,
             """
@@ -112,9 +112,29 @@ defmodule Scholar.Cluster.OPTICS do
             got tensor with shape: #{inspect(Nx.shape(x))}
             """
     end
+
+    x = Scholar.Shared.to_float(x)
+    module = validate_options(x, opts)
+
+    %__MODULE__{
+      module
+      | labels: fit_p(x, module)
+    }
+  end
+
+  deftransformp validate_options(x, opts \\ []) do
     {opts, algorithm_opts} = Keyword.split(opts, [:min_samples, :max_eps, :eps, :algorithm])
     opts = NimbleOptions.validate!(opts, @opts_schema)
-    algorithm_opts = Keyword.put(algorithm_opts, :num_neighbors, opts[:min_samples])
+    min_samples = opts[:min_samples]
+
+    if min_samples < 2 do
+      raise ArgumentError,
+            """
+            min_samples must be an int in the range [2, inf), got min_samples = #{inspect(min_samples)}
+            """
+    end
+
+    algorithm_opts = Keyword.put(algorithm_opts, :num_neighbors, min_samples)
 
     algorithm_module =
       case opts[:algorithm] do
@@ -130,39 +150,59 @@ defmodule Scholar.Cluster.OPTICS do
         module when is_atom(module) ->
           module
       end
+
     model = algorithm_module.fit(x, algorithm_opts)
-    {_neighbors, distances} = algorithm_module.predict(model, x)
-    fit_p(x, distances,  opts)
-  end
 
-  defnp fit_p(x, core_distances, opts \\ []) do
-    {core_distances, reachability, _predecessor, ordering} = compute_optics_graph(x, core_distances, opts)
+    max_eps =
+      case opts[:max_eps] do
+        nil -> Nx.Constants.infinity(Nx.type(x))
+        any -> any
+      end
 
     eps =
-      if opts[:eps] == Nx.Constants.infinity() do
-        opts[:max_eps]
-      else
-        opts[:eps]
+      case opts[:eps] do
+        nil -> max_eps
+        any -> any
       end
 
-    cluster_optics_dbscan(reachability, core_distances, ordering, eps: eps)
+    if eps > max_eps do
+      raise ArgumentError,
+            """
+            eps can't be greater than max_eps, got eps = #{inspect(eps)} and max_eps = #{inspect(max_eps)}
+            """
+    end
+
+    %__MODULE__{
+      labels: Nx.broadcast(-1, {Nx.axis_size(x, 0)}),
+      min_samples: min_samples,
+      max_eps: max_eps,
+      eps: eps,
+      algorithm: model
+    }
+  end
+
+  defnp fit_p(x, module) do
+    {core_distances, reachability, _predecessor, ordering} = compute_optics_graph(x, module)
+
+    cluster_optics_dbscan(reachability, core_distances, ordering, module)
   end
 
-  defnp compute_optics_graph(x, distances, opts \\ []) do
-    max_eps = opts[:max_eps]
+  defnp compute_optics_graph(x, %__MODULE__{max_eps: max_eps, min_samples: min_samples} = module) do
     n_samples = Nx.axis_size(x, 0)
-    reachability = Nx.broadcast(Nx.Constants.max_finite({:f, 32}), {n_samples})
+    reachability = Nx.broadcast(Nx.Constants.max_finite(Nx.type(x)), {n_samples})
     predecessor = Nx.broadcast(-1, {n_samples})
-    core_distances = Nx.slice_along_axis(distances, opts[:min_samples] - 1, 1, axis: 1)
+    {_neighbors, distances} = run_knn(x, x, module)
+    core_distances = Nx.slice_along_axis(distances, min_samples - 1, 1, axis: 1)
+
     core_distances =
       Nx.select(core_distances > max_eps, Nx.Constants.infinity(), core_distances)
 
     ordering = Nx.broadcast(0, {n_samples})
     processed = Nx.broadcast(0, {n_samples})
 
-    {_order_idx, core_distances, reachability, predecessor, _processed, ordering, _x, _max_eps} =
+    {_order_idx, core_distances, reachability, predecessor, _processed, ordering, _x, _module} =
       while {order_idx = 0, core_distances, reachability, predecessor, processed, ordering, x,
-             max_eps},
+             module},
             order_idx < n_samples do
         unprocessed_mask = processed == 0
         point = Nx.argmin(Nx.select(unprocessed_mask, reachability, Nx.Constants.infinity()))
@@ -170,17 +210,14 @@ defmodule Scholar.Cluster.OPTICS do
         ordering = Nx.put_slice(ordering, [order_idx], Nx.new_axis(point, 0))
 
         {reachability, predecessor} =
-          set_reach_dist(core_distances, reachability, predecessor, point, processed, x,
-            max_eps: max_eps
-          )
+          set_reach_dist(core_distances, reachability, predecessor, point, processed, x, module)
 
-        {order_idx + 1, core_distances, reachability, predecessor, processed, ordering, x,
-         max_eps}
+        {order_idx + 1, core_distances, reachability, predecessor, processed, ordering, x, module}
       end
 
     reachability =
       Nx.select(
-        reachability == Nx.Constants.max_finite({:f, 32}),
+        reachability == Nx.Constants.max_finite(Nx.type(x)),
         Nx.Constants.infinity(),
         reachability
       )
@@ -195,16 +232,13 @@ defmodule Scholar.Cluster.OPTICS do
           point_index,
           processed,
           x,
-          opts \\ []
+          %__MODULE__{max_eps: max_eps} = module
         ) do
-    max_eps = opts[:max_eps]
     n_features = Nx.axis_size(x, 1)
     n_samples = Nx.axis_size(x, 0)
-    nbrs = Scholar.Neighbors.BruteKNN.fit(x, num_neighbors: n_samples)
     t = Nx.take(x, point_index, axis: 0)
     p = Nx.broadcast(t, {1, n_features})
-    {neighbors, distances} = Scholar.Neighbors.BruteKNN.predict(nbrs, p)
-
+    {neighbors, distances} = run_knn(x, p, %__MODULE__{module | min_samples: n_samples})
     neighbors = Nx.flatten(neighbors)
     distances = Nx.flatten(distances)
     indices_ngbrs = Nx.argsort(neighbors)
@@ -233,7 +267,7 @@ defmodule Scholar.Cluster.OPTICS do
       )
 
     rdists = Nx.select(improved >= 0, rdists, 0)
-    reversed_improved = Nx.max(Nx.multiply(improved, -1), 0)
+    reversed_improved = Nx.max(improved * -1, 0)
 
     reachability =
       Nx.select(improved <= 0, Nx.take(reachability, reversed_improved), rdists)
@@ -244,8 +278,17 @@ defmodule Scholar.Cluster.OPTICS do
     {reachability, predecessor}
   end
 
-  defnp cluster_optics_dbscan(reachability, core_distances, ordering, opts \\ []) do
-    eps = opts[:eps]
+  deftransformp run_knn(x, p, %__MODULE__{algorithm: algorithm_module, min_samples: k} = _module) do
+    nbrs = algorithm_module.__struct__.fit(x, num_neighbors: k)
+    algorithm_module.__struct__.predict(nbrs, p)
+  end
+
+  defnp cluster_optics_dbscan(
+          reachability,
+          core_distances,
+          ordering,
+          %__MODULE__{eps: eps} = _module
+        ) do
     far_reach = Nx.flatten(reachability > eps)
     near_core = Nx.flatten(core_distances <= eps)
     far_and_not_near = Nx.multiply(far_reach, 1 - near_core)

From 990ba968cf91bee986a56cb852ac438b4aecf324 Mon Sep 17 00:00:00 2001
From: norm4nn <szymon.jan.mamon@gmail.com>
Date: Sat, 17 Aug 2024 18:30:38 +0200
Subject: [PATCH 7/8] Updated doctests

---
 lib/scholar/cluster/optics.ex | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/lib/scholar/cluster/optics.ex b/lib/scholar/cluster/optics.ex
index cd0a8e0e..c5de109a 100644
--- a/lib/scholar/cluster/optics.ex
+++ b/lib/scholar/cluster/optics.ex
@@ -72,32 +72,32 @@ defmodule Scholar.Cluster.OPTICS do
   ## Examples
 
       iex> x = Nx.tensor([[1, 2], [2, 5], [3, 6], [8, 7], [8, 8], [7, 3]])
-      iex> Scholar.Cluster.OPTICS.fit(x, min_samples: 2)
+      iex> Scholar.Cluster.OPTICS.fit(x, min_samples: 2).labels
       #Nx.Tensor<
         s64[6]
         [-1, -1, -1, -1, -1, -1]
       >
-      iex> Scholar.Cluster.OPTICS.fit(x, eps: 4.5, min_samples: 2)
+      iex> Scholar.Cluster.OPTICS.fit(x, eps: 4.5, min_samples: 2).labels
       #Nx.Tensor<
         s64[6]
         [0, 0, 0, 1, 1, 1]
       >
-      iex> Scholar.Cluster.OPTICS.fit(x, eps: 2, min_samples: 2)
+      iex> Scholar.Cluster.OPTICS.fit(x, eps: 2, min_samples: 2).labels
       #Nx.Tensor<
         s64[6]
         [-1, 0, 0, 1, 1, -1]
       >
-      iex> Scholar.Cluster.OPTICS.fit(x, eps: 2, min_samples: 2, algorithm: :kd_tree, metric: {:minkowski, 1})
+      iex> Scholar.Cluster.OPTICS.fit(x, eps: 2, min_samples: 2, algorithm: :kd_tree, metric: {:minkowski, 1}).labels
       #Nx.Tensor<
         s64[6]
         [-1, 0, 0, 1, 1, -1]
       >
-      iex> Scholar.Cluster.OPTICS.fit(x, eps: 1, min_samples: 2)
+      iex> Scholar.Cluster.OPTICS.fit(x, eps: 1, min_samples: 2).labels
       #Nx.Tensor<
         s64[6]
         [-1, -1, -1, 0, 0, -1]
       >
-      iex> Scholar.Cluster.OPTICS.fit(x, eps: 4.5, min_samples: 3)
+      iex> Scholar.Cluster.OPTICS.fit(x, eps: 4.5, min_samples: 3).labels
       #Nx.Tensor<
         s64[6]
         [0, 0, 0, 1, 1, -1]
@@ -134,7 +134,7 @@ defmodule Scholar.Cluster.OPTICS do
             """
     end
 
-    algorithm_opts = Keyword.put(algorithm_opts, :num_neighbors, min_samples)
+    algorithm_opts = Keyword.put(algorithm_opts, :num_neighbors, 1)
 
     algorithm_module =
       case opts[:algorithm] do

From f1e863aa5a1f3d9a2d44bc71ce03c493d75843b5 Mon Sep 17 00:00:00 2001
From: Szymon <szymon.jan.mamon@gmail.com>
Date: Sat, 17 Aug 2024 18:34:58 +0200
Subject: [PATCH 8/8] Update lib/scholar/cluster/optics.ex
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: José Valim <jose.valim@gmail.com>
---
 lib/scholar/cluster/optics.ex | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/scholar/cluster/optics.ex b/lib/scholar/cluster/optics.ex
index c5de109a..c6548b78 100644
--- a/lib/scholar/cluster/optics.ex
+++ b/lib/scholar/cluster/optics.ex
@@ -1,7 +1,8 @@
 defmodule Scholar.Cluster.OPTICS do
   @moduledoc """
-    OPTICS (Ordering Points To Identify the Clustering Structure) is an algorithm
+  OPTICS (Ordering Points To Identify the Clustering Structure) is an algorithm
   for finding density-based clusters in spatial data.
+
   It is closely related to DBSCAN, finds core sample of high density and expands
   clusters from them.  Unlike DBSCAN, keeps cluster hierarchy for a variable
   neighborhood radius.  Clusters are then extracted using a DBSCAN-like