From b3d8f0a4c7dbb8b4fd8be06043da121f2317e55e Mon Sep 17 00:00:00 2001
From: Mateusz
Date: Tue, 7 Nov 2023 20:37:21 +0100
Subject: [PATCH] Add tests, change default param, add new preprocessing to
 isotonic regression

---
 lib/scholar/linear/isotonic_regression.ex |  96 ++++++----
 lib/scholar/manifold/mds.ex               |  59 +++---
 test/scholar/manifold/mds_test.exs        | 207 ++++++++++++++++++++++
 3 files changed, 295 insertions(+), 67 deletions(-)

diff --git a/lib/scholar/linear/isotonic_regression.ex b/lib/scholar/linear/isotonic_regression.ex
index da76b825..2198200a 100644
--- a/lib/scholar/linear/isotonic_regression.ex
+++ b/lib/scholar/linear/isotonic_regression.ex
@@ -4,6 +4,9 @@ defmodule Scholar.Linear.IsotonicRegression do
   observations by solving a convex optimization problem. It is a form of
   regression analysis that can be used as an alternative to polynomial
   regression to fit nonlinear data.
+
+  Time complexity of isotonic regression is $O(N^2)$ where $N$ is the
+  number of points.
   """
   require Nx
   import Nx.Defn, except: [transform: 2]
@@ -38,7 +41,7 @@ defmodule Scholar.Linear.IsotonicRegression do
           y_thresholds: Nx.Tensor.t(),
           increasing: Nx.Tensor.t(),
           cutoff_index: Nx.Tensor.t(),
-          preprocess: Tuple.t() | Scholar.Interpolation.Linear.t()
+          preprocess: tuple() | Scholar.Interpolation.Linear.t()
         }
 
   opts = [
@@ -174,8 +177,6 @@ defmodule Scholar.Linear.IsotonicRegression do
         Nx.u8(0)
       end
 
-    # increasing = Nx.u8(1)
-
     fit_n(x, y, sample_weights, increasing, opts)
   end
 
@@ -206,12 +207,12 @@ defmodule Scholar.Linear.IsotonicRegression do
       iex> Scholar.Linear.IsotonicRegression.predict(model, to_predict)
       #Nx.Tensor<
         f32[10]
-        [1.0, 1.6666667461395264, 2.3333334922790527, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
+        [1.0, 1.6666667461395264, 2.3333332538604736, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
       >
   """
   defn predict(model, x) do
     check_input_shape(x)
-    # check_preprocess(model)
+    check_preprocess(model)
 
     x = Nx.flatten(x)
     x = Nx.clip(x, model.x_min, model.x_max)
@@ -261,43 +262,61 @@ defmodule Scholar.Linear.IsotonicRegression do
             ]
           ),
           x: Nx.tensor(
-            [1.0, 4.0, 7.0, 9.0, 10.0]
+            [1.0, 4.0, 7.0, 9.0, 10.0, 11.0]
           )
         }
       }
   """
-  defn preprocess(model) do
-    # cutoff = Nx.to_number(model.cutoff_index)
-    # x = model.x_thresholds[0..cutoff]
-    # y = model.y_thresholds[0..cutoff]
-
-    # {x, y} =
-    #   if trim_duplicates do
-    #     keep_mask =
-    #       Nx.logical_or(
-    #         Nx.not_equal(y[1..-2//1], y[0..-3//1]),
-    #         Nx.not_equal(y[1..-2//1], y[2..-1//1])
-    #       )
-
-    #     keep_mask = Nx.concatenate([Nx.tensor([1]), keep_mask, Nx.tensor([1])])
-
-    #     indices =
-    #       Nx.iota({Nx.axis_size(y, 0)})
-    #       |> Nx.add(1)
-    #       |> Nx.multiply(keep_mask)
-    #       |> Nx.to_flat_list()
-
-    #     indices = Enum.filter(indices, fn x -> x != 0 end) |> Nx.tensor() |> Nx.subtract(1)
-    #     x = Nx.take(x, indices)
-    #     y = Nx.take(y, indices)
-    #     {x, y}
-    #   else
-    #     {x, y}
-    #   end
-
-    # model = %__MODULE__{model | x_thresholds: x}
-    # model = %__MODULE__{model | y_thresholds: y}
+  def preprocess(model, trim_duplicates \\ true) do
+    cutoff = Nx.to_number(model.cutoff_index)
+    x = model.x_thresholds[0..cutoff]
+    y = model.y_thresholds[0..cutoff]
+
+    {x, y} =
+      if trim_duplicates do
+        keep_mask =
+          Nx.logical_or(
+            Nx.not_equal(y[1..-2//1], y[0..-3//1]),
+            Nx.not_equal(y[1..-2//1], y[2..-1//1])
+          )
+
+        keep_mask = Nx.concatenate([Nx.tensor([1]), keep_mask, Nx.tensor([1])])
+
+        indices =
+          Nx.iota({Nx.axis_size(y, 0)})
+          |> Nx.add(1)
+          |> Nx.multiply(keep_mask)
+          |> Nx.to_flat_list()
+
+        indices = Enum.filter(indices, fn x -> x != 0 end) |> Nx.tensor() |> Nx.subtract(1)
+        x = Nx.take(x, indices)
+        y = Nx.take(y, indices)
+        {x, y}
+      else
+        {x, y}
+      end
+
+    model = %__MODULE__{model | x_thresholds: x}
+    model = %__MODULE__{model | y_thresholds: y}
+
+    %__MODULE__{
+      model
+      | preprocess:
+          Scholar.Interpolation.Linear.fit(
+            model.x_thresholds,
+            model.y_thresholds
+          )
+    }
+  end
+
+  @doc """
+  Preprocesses the `model` for prediction.
+
+  Returns an updated `model`. This is a special version of `preprocess/1` that
+  does not trim duplicates so it can be used in defns. It is not recommended
+  to use this function directly.
+  """
+  defn special_preprocess(model) do
     %__MODULE__{
       model
       | preprocess:
           Scholar.Interpolation.Linear.fit(
             model.x_thresholds,
             model.y_thresholds
           )
     }
   end
@@ -517,7 +536,8 @@ defmodule Scholar.Linear.IsotonicRegression do
 
   defnp check_increasing(x, y) do
     x = Nx.new_axis(x, -1)
+    y = Nx.new_axis(y, -1)
     model = Scholar.Linear.LinearRegression.fit(x, y)
-    Nx.squeeze(model.coefficients[0] >= 0)
+    model.coefficients[0][0] >= 0
   end
 end
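With this change, prediction becomes a three-step flow: fit, an explicit preprocessing step that builds the stored interpolation, then predict (which now enforces the step via check_preprocess). A minimal sketch of that flow with made-up monotone data; the fit/2, preprocess/2, and predict/2 signatures are taken from the code above, and exact outputs depend on the backend:

    x = Nx.tensor([1.0, 4.0, 7.0, 9.0, 10.0, 11.0])
    y = Nx.tensor([1.0, 3.0, 6.0, 8.0, 9.0, 10.0])

    model = Scholar.Linear.IsotonicRegression.fit(x, y)
    # preprocess/2 trims duplicate thresholds (trim_duplicates defaults to true)
    # and stores a Scholar.Interpolation.Linear model under :preprocess.
    model = Scholar.Linear.IsotonicRegression.preprocess(model)
    Scholar.Linear.IsotonicRegression.predict(model, Nx.tensor([2.0, 5.5, 8.0]))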
diff --git a/lib/scholar/manifold/mds.ex b/lib/scholar/manifold/mds.ex
index 09281e02..4d86d7e4 100644
--- a/lib/scholar/manifold/mds.ex
+++ b/lib/scholar/manifold/mds.ex
@@ -23,7 +23,7 @@ defmodule Scholar.Manifold.MDS do
     ],
     metric: [
       type: :boolean,
-      default: false,
+      default: true,
       doc: ~S"""
       If `true`, use dissimilarities as metric distances in the embedding space.
       """
@@ -33,6 +33,7 @@ defmodule Scholar.Manifold.MDS do
       default: false,
       doc: ~S"""
       If `true`, normalize the stress by the sum of squared dissimilarities.
+      Only valid if `metric` is `false`.
       """
     ],
     eps: [
@@ -78,6 +79,7 @@ defmodule Scholar.Manifold.MDS do
     metric = if opts[:metric], do: 1, else: 0
     normalized_stress = if opts[:normalized_stress], do: 1, else: 0
     eps = opts[:eps]
+    n = Nx.axis_size(dissimilarities, 0)
 
     {{x, stress, i}, _} =
       while {{x, _stress = Nx.Constants.infinity(Nx.type(dissimilarities)), i = 0},
@@ -86,7 +88,6 @@ defmodule Scholar.Manifold.MDS do
              metric, normalized_stress, eps, stop_value = 0}},
             i < max_iter and not stop_value do
         dis = Distance.pairwise_euclidean(x)
-        n = Nx.axis_size(dissimilarities, 0)
 
         disparities =
           if metric do
@@ -96,14 +97,14 @@ defmodule Scholar.Manifold.MDS do
 
           dis_flat_indices = lower_triangle_indices(dis)
 
-          n = Nx.axis_size(dis, 0)
-
           dis_flat_w = Nx.take(dis_flat, dis_flat_indices)
 
           disparities_flat_model =
-            Scholar.Linear.IsotonicRegression.fit(similarities_flat_w, dis_flat_w)
+            Scholar.Linear.IsotonicRegression.fit(similarities_flat_w, dis_flat_w,
+              increasing: true
+            )
 
-          model = Scholar.Linear.IsotonicRegression.preprocess(disparities_flat_model)
+          model = Scholar.Linear.IsotonicRegression.special_preprocess(disparities_flat_model)
 
           disparities_flat =
             Scholar.Linear.IsotonicRegression.predict(model, similarities_flat_w)
@@ -133,7 +134,7 @@ defmodule Scholar.Manifold.MDS do
         ratio = disparities / dis
         b = -ratio
         b = Nx.put_diagonal(b, Nx.take_diagonal(b) + Nx.sum(ratio, axes: [1]))
-        x = 1.0 / n * Nx.dot(b, x)
+        x = Nx.dot(b, x) * (1.0 / n)
 
         dis = Nx.sum(Nx.sqrt(Nx.sum(x ** 2, axes: [1])))
 
@@ -209,7 +210,7 @@ defmodule Scholar.Manifold.MDS do
     {best, best_stress, best_iter}
   end
 
-  defn lower_triangle_indices(tensor) do
+  defnp lower_triangle_indices(tensor) do
     n = Nx.axis_size(tensor, 0)
 
     temp = Nx.broadcast(Nx.s64(0), {div(n * (n - 1), 2)})
@@ -249,17 +250,17 @@ defmodule Scholar.Manifold.MDS do
       %Scholar.Manifold.MDS{
         embedding: Nx.tensor(
           [
-            [0.040477119386196136, -0.4997042417526245],
-            [-0.35801631212234497, -0.09504470974206924],
-            [-0.08517580479383469, 0.35293734073638916],
-            [0.42080432176589966, 0.23617777228355408]
+            [16.3013916015625, -3.444634437561035],
+            [5.866805553436279, 1.6378790140151978],
+            [-5.487184524536133, 0.5837264657020569],
+            [-16.681013107299805, 1.2230290174484253]
           ]
         ),
         stress: Nx.tensor(
-          0.0016479993937537074
+          0.3993147909641266
         ),
         n_iter: Nx.tensor(
-          19
+          23
         )
       }
   """
@@ -288,17 +289,17 @@ defmodule Scholar.Manifold.MDS do
       %Scholar.Manifold.MDS{
         embedding: Nx.tensor(
           [
-            [0.040477119386196136, -0.4997042417526245],
-            [-0.35801631212234497, -0.09504470974206924],
-            [-0.08517580479383469, 0.35293734073638916],
-            [0.42080432176589966, 0.23617777228355408]
+            [16.3013916015625, -3.444634437561035],
+            [5.866805553436279, 1.6378790140151978],
+            [-5.487184524536133, 0.5837264657020569],
+            [-16.681013107299805, 1.2230290174484253]
           ]
         ),
         stress: Nx.tensor(
-          0.0016479993937537074
+          0.3993147909641266
         ),
         n_iter: Nx.tensor(
-          19
+          23
         )
       }
   """
@@ -333,10 +334,10 @@ defmodule Scholar.Manifold.MDS do
       %Scholar.Manifold.MDS{
         embedding: Nx.tensor(
           [
-            [0.41079193353652954, 0.41079193353652954],
-            [0.1369306445121765, 0.1369306445121765],
-            [-0.1369306445121765, -0.1369306445121765],
-            [-0.41079193353652954, -0.41079193353652954]
+            [11.858541488647461, 11.858541488647461],
+            [3.9528470039367676, 3.9528470039367676],
+            [-3.9528470039367676, -3.9528470039367676],
+            [-11.858541488647461, -11.858541488647461]
           ]
         ),
         stress: Nx.tensor(
@@ -373,14 +374,14 @@ defmodule Scholar.Manifold.MDS do
       %Scholar.Manifold.MDS{
         embedding: Nx.tensor(
           [
-            [0.3354101777076721, 0.3354101777076721, 0.3354101777076721],
-            [0.11180339753627777, 0.11180339753627777, 0.11180339753627777],
-            [-0.11180339753627777, -0.11180340498685837, -0.11180339753627777],
-            [-0.3354102075099945, -0.3354102075099945, -0.3354102075099945]
+            [9.682458877563477, 9.682458877563477, 9.682458877563477],
+            [3.2274858951568604, 3.2274858951568604, 3.2274858951568604],
+            [-3.2274863719940186, -3.2274863719940186, -3.2274863719940186],
+            [-9.682458877563477, -9.682458877563477, -9.682458877563477]
          ]
         ),
         stress: Nx.tensor(
-          2.6645352591003757e-15
+          9.094947017729282e-12
         ),
         n_iter: Nx.tensor(
           3
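Two behavioral consequences of this diff: metric MDS is now the default, so callers that relied on the old non-metric default must pass metric: false explicitly, and normalized_stress remains meaningful only in that non-metric mode. A sketch of the non-metric call, mirroring the tests below (using EXLA as the defn compiler is an assumption):

    key = Nx.Random.key(42)
    x = Nx.iota({10, 50})

    # Non-metric MDS with normalized stress; drop both options to get the
    # new default, metric MDS.
    model =
      EXLA.jit_apply(
        &Scholar.Manifold.MDS.fit(&1, metric: false, normalized_stress: true, key: &2),
        [x, key]
      )

    model.embedding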
diff --git a/test/scholar/manifold/mds_test.exs b/test/scholar/manifold/mds_test.exs
index 0658eff5..87d5cd93 100644
--- a/test/scholar/manifold/mds_test.exs
+++ b/test/scholar/manifold/mds_test.exs
@@ -2,4 +2,211 @@ defmodule Scholar.Manifold.MDSTest do
   use Scholar.Case, async: true
   alias Scholar.Manifold.MDS
   doctest MDS
+
+  def x() do
+    Nx.iota({10, 50})
+  end
+
+  def key() do
+    Nx.Random.key(42)
+  end
+
+  test "non-default num_components" do
+    key = key()
+    x = x()
+    model = EXLA.jit_apply(&MDS.fit(&1, num_components: 5, key: &2), [x, key])
+
+    assert_all_close(
+      model.embedding,
+      Nx.tensor([
+        [
+          57.28269577026367,
+          -678.6760864257812,
+          811.1503295898438,
+          -251.1714324951172,
+          1156.7987060546875
+        ],
+        [
+          7.623606204986572,
+          -544.2373046875,
+          604.0946655273438,
+          -225.99559020996094,
+          903.2800903320312
+        ],
+        [
+          -7.334737300872803,
+          -429.81671142578125,
+          402.1512145996094,
+          -163.3682861328125,
+          639.9016723632812
+        ],
+        [
+          13.86670207977295,
+          -296.5096435546875,
+          223.15061950683594,
+          -84.07274627685547,
+          374.4827575683594
+        ],
+        [
+          38.73623275756836,
+          -134.54620361328125,
+          50.4241943359375,
+          -38.010799407958984,
+          113.90003967285156
+        ],
+        [
+          18.940887451171875,
+          30.962879180908203,
+          -127.7795639038086,
+          45.001678466796875,
+          -131.29234313964844
+        ],
+        [
+          18.05344581604004,
+          222.0098114013672,
+          -292.34197998046875,
+          86.87554168701172,
+          -378.58544921875
+        ],
+        [
+          -3.060556173324585,
+          429.6268005371094,
+          -436.2151794433594,
+          146.84103393554688,
+          -621.5556640625
+        ],
+        [
+          -55.395423889160156,
+          613.6642456054688,
+          -565.1470947265625,
+          225.3615264892578,
+          -882.4739379882812
+        ],
+        [
+          -88.7128677368164,
+          787.5221557617188,
+          -669.4872436523438,
+          258.53912353515625,
+          -1174.455810546875
+        ]
+      ])
+    )
+
+    assert_all_close(model.stress, 698.4426879882812)
+    assert_all_close(model.n_iter, Nx.tensor(152))
+  end
+
+  test "non-default metric" do
+    key = key()
+    x = x()
+    model = EXLA.jit_apply(&MDS.fit(&1, metric: false, key: &2), [x, key])
+
+    assert_all_close(
+      model.embedding,
+      Nx.tensor([
+        [-0.23465712368488312, 0.6921732425689697],
+        [-0.3380763530731201, 0.4378605782985687],
+        [-0.15237200260162354, 0.26230522990226746],
+        [0.09990488737821579, 0.2603200674057007],
+        [0.15598554909229279, 0.03315458819270134],
+        [0.41043558716773987, 0.13559512794017792],
+        [0.24686546623706818, -0.24366283416748047],
+        [0.1395486444234848, -0.4151153564453125],
+        [-0.07875102013349533, -0.530768096446991],
+        [-0.21976199746131897, -0.6417303681373596]
+      ])
+    )
+
+    assert_all_close(model.stress, 0.1966342180967331)
+    assert_all_close(model.n_iter, Nx.tensor(38))
+  end
+
+  test "option normalized_stress with metric set to false" do
+    key = key()
+    x = x()
+
+    model =
+      EXLA.jit_apply(&MDS.fit(&1, metric: false, key: &2, normalized_stress: true), [x, key])
+
+    assert_all_close(
+      model.embedding,
+      Nx.tensor([
+        [-0.17997372150421143, 0.7225074768066406],
+        [-0.3138044774532318, 0.3934117257595062],
+        [-0.0900932177901268, 0.19507794082164764],
+        [0.2092301845550537, 0.295993834733963],
+        [0.24611115455627441, 0.0019988759886473417],
+        [0.4951189458370209, 0.08028026670217514],
+        [0.12963972985744476, -0.3193856179714203],
+        [0.19291982054710388, -0.44776636362075806],
+        [-0.2770233750343323, -0.4146113097667694],
+        [-0.3582141101360321, -0.5444929003715515]
+      ])
+    )
+
+    assert_all_close(model.stress, 0.13638167083263397)
+    assert_all_close(model.n_iter, Nx.tensor(20))
+  end
+
+  test "epsilon set to a smaller value than the default" do
+    key = key()
+    x = x()
+
+    model =
+      EXLA.jit_apply(&MDS.fit(&1, metric: false, key: &2, normalized_stress: true, eps: 1.0e-4), [
+        x,
+        key
+      ])
+
+    assert_all_close(
+      model.embedding,
+      Nx.tensor([
+        [-0.35130712389945984, 0.6258886456489563],
+        [-0.4270354211330414, 0.4396686255931854],
+        [-0.30671024322509766, 0.2688262462615967],
+        [-0.12758131325244904, 0.18020282685756683],
+        [-0.05403336510062218, 0.01867777667939663],
+        [0.17203716933727264, 0.044468216598033905],
+        [0.2791652977466583, -0.09437420219182968],
+        [0.2869844138622284, -0.3071449398994446],
+        [0.2768166959285736, -0.49931082129478455],
+        [0.2563020884990692, -0.678329348564148]
+      ])
+    )
+
+    # as expected, smaller value of stress (loss) and bigger number of iterations
+    assert_all_close(model.stress, 0.03167537972331047)
+    assert_all_close(model.n_iter, Nx.tensor(116))
+  end
+
+  test "smaller max_iter value (100)" do
+    key = key()
+    x = x()
+
+    model =
+      EXLA.jit_apply(
+        &MDS.fit(&1, metric: false, key: &2, normalized_stress: true, eps: 1.0e-4, max_iter: 100),
+        [x, key]
+      )
+
+    assert_all_close(
+      model.embedding,
+      Nx.tensor([
+        [-0.34521010518074036, 0.6345276236534119],
+        [-0.4247266352176666, 0.43899187445640564],
+        [-0.2903931438922882, 0.2677172124385834],
+        [-0.09941618889570236, 0.19031266868114471],
+        [-0.03261081129312515, 0.019261524081230164],
+        [0.2049849033355713, 0.07233452051877975],
+        [0.29381951689720154, -0.09455471485853195],
+        [0.27441200613975525, -0.320201575756073],
+        [0.2368578165769577, -0.5156480669975281],
+        [0.19262047111988068, -0.6936381459236145]
+      ])
+    )
+
+    # same params as in the previous test, but smaller number of iterations, capped at 100
+    assert_all_close(model.stress, 0.040396787226200104)
+    assert_all_close(model.n_iter, Nx.tensor(100))
+  end
 end
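A closing note on why MDS calls special_preprocess/1 rather than preprocess/2: the SMACOF loop runs inside defn, while preprocess/2 relies on Nx.to_number/1 and Enum.filter/2, which only work outside defn. A hypothetical module sketching the jittable pattern (the module name and helper are illustrative, not part of this patch):

    defmodule MonotoneSketch do
      import Nx.Defn

      # Mirrors how Scholar.Manifold.MDS uses isotonic regression inside
      # its defn while loop.
      defn fit_predict(x, y, to_predict) do
        model = Scholar.Linear.IsotonicRegression.fit(x, y, increasing: true)
        # special_preprocess/1 skips duplicate trimming, so the whole
        # pipeline stays jittable.
        model = Scholar.Linear.IsotonicRegression.special_preprocess(model)
        Scholar.Linear.IsotonicRegression.predict(model, to_predict)
      end
    end

The new tests can be run with `mix test test/scholar/manifold/mds_test.exs`.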