From b3d8f0a4c7dbb8b4fd8be06043da121f2317e55e Mon Sep 17 00:00:00 2001
From: Mateusz
Date: Tue, 7 Nov 2023 20:37:21 +0100
Subject: [PATCH] Add tests, change default param, add new preprocessing to
 isotonic regression

---
 lib/scholar/linear/isotonic_regression.ex |  96 ++++++----
 lib/scholar/manifold/mds.ex               |  59 +++---
 test/scholar/manifold/mds_test.exs        | 207 ++++++++++++++++++++++
 3 files changed, 295 insertions(+), 67 deletions(-)

diff --git a/lib/scholar/linear/isotonic_regression.ex b/lib/scholar/linear/isotonic_regression.ex
index da76b825..2198200a 100644
--- a/lib/scholar/linear/isotonic_regression.ex
+++ b/lib/scholar/linear/isotonic_regression.ex
@@ -4,6 +4,9 @@ defmodule Scholar.Linear.IsotonicRegression do
   observations by solving a convex optimization problem. It is a form of
   regression analysis that can be used as an alternative to polynomial
   regression to fit nonlinear data.
+
+  Time complexity of isotonic regression is $O(N^2)$ where $N$ is the
+  number of points.
   """
   require Nx
   import Nx.Defn, except: [transform: 2]
@@ -38,7 +41,7 @@ defmodule Scholar.Linear.IsotonicRegression do
           y_thresholds: Nx.Tensor.t(),
           increasing: Nx.Tensor.t(),
           cutoff_index: Nx.Tensor.t(),
-          preprocess: Tuple.t() | Scholar.Interpolation.Linear.t()
+          preprocess: tuple() | Scholar.Interpolation.Linear.t()
         }
 
   opts = [
@@ -174,8 +177,6 @@ defmodule Scholar.Linear.IsotonicRegression do
         Nx.u8(0)
       end
 
-    # increasing = Nx.u8(1)
-
     fit_n(x, y, sample_weights, increasing, opts)
   end
 
@@ -206,12 +207,12 @@ defmodule Scholar.Linear.IsotonicRegression do
       iex> Scholar.Linear.IsotonicRegression.predict(model, to_predict)
       #Nx.Tensor<
         f32[10]
-        [1.0, 1.6666667461395264, 2.3333334922790527, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
+        [1.0, 1.6666667461395264, 2.3333332538604736, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
       >
   """
   defn predict(model, x) do
     check_input_shape(x)
-    # check_preprocess(model)
+    check_preprocess(model)
 
     x = Nx.flatten(x)
     x = Nx.clip(x, model.x_min, model.x_max)
@@ -261,43 +262,61 @@ defmodule Scholar.Linear.IsotonicRegression do
             ]
           ),
           x: Nx.tensor(
-            [1.0, 4.0, 7.0, 9.0, 10.0]
+            [1.0, 4.0, 7.0, 9.0, 10.0, 11.0]
           )
         }
       }
   """
-  defn preprocess(model) do
-    # cutoff = Nx.to_number(model.cutoff_index)
-    # x = model.x_thresholds[0..cutoff]
-    # y = model.y_thresholds[0..cutoff]
-
-    # {x, y} =
-    #   if trim_duplicates do
-    #     keep_mask =
-    #       Nx.logical_or(
-    #         Nx.not_equal(y[1..-2//1], y[0..-3//1]),
-    #         Nx.not_equal(y[1..-2//1], y[2..-1//1])
-    #       )
-
-    #     keep_mask = Nx.concatenate([Nx.tensor([1]), keep_mask, Nx.tensor([1])])
-
-    #     indices =
-    #       Nx.iota({Nx.axis_size(y, 0)})
-    #       |> Nx.add(1)
-    #       |> Nx.multiply(keep_mask)
-    #       |> Nx.to_flat_list()
-
-    #     indices = Enum.filter(indices, fn x -> x != 0 end) |> Nx.tensor() |> Nx.subtract(1)
-    #     x = Nx.take(x, indices)
-    #     y = Nx.take(y, indices)
-    #     {x, y}
-    #   else
-    #     {x, y}
-    #   end
-
-    # model = %__MODULE__{model | x_thresholds: x}
-    # model = %__MODULE__{model | y_thresholds: y}
+  def preprocess(model, trim_duplicates \\ true) do
+    cutoff = Nx.to_number(model.cutoff_index)
+    x = model.x_thresholds[0..cutoff]
+    y = model.y_thresholds[0..cutoff]
+
+    {x, y} =
+      if trim_duplicates do
+        keep_mask =
+          Nx.logical_or(
+            Nx.not_equal(y[1..-2//1], y[0..-3//1]),
+            Nx.not_equal(y[1..-2//1], y[2..-1//1])
+          )
+
+        keep_mask = Nx.concatenate([Nx.tensor([1]), keep_mask, Nx.tensor([1])])
+
+        indices =
+          Nx.iota({Nx.axis_size(y, 0)})
+          |> Nx.add(1)
+          |> Nx.multiply(keep_mask)
+          |> Nx.to_flat_list()
+
+        indices = Enum.filter(indices, fn x -> x != 0 end) |> Nx.tensor() |> Nx.subtract(1)
+        x = Nx.take(x, indices)
+        y = Nx.take(y, indices)
+        {x, y}
+      else
+        {x, y}
+      end
+
+    model = %__MODULE__{model | x_thresholds: x}
+    model = %__MODULE__{model | y_thresholds: y}
+
+    %__MODULE__{
+      model
+      | preprocess:
+          Scholar.Interpolation.Linear.fit(
+            model.x_thresholds,
+            model.y_thresholds
+          )
+    }
+  end
+
+  @doc """
+  Preprocesses the `model` for prediction.
+
+  Returns an updated `model`. This is a special version of `preprocess/1` that
+  does not trim duplicates so it can be used in defns. It is not recommended
+  to use this function directly.
+  """
+  defn special_preprocess(model) do
     %__MODULE__{
       model
       | preprocess:
           Scholar.Interpolation.Linear.fit(
             model.x_thresholds,
             model.y_thresholds
           )
     }
   end
@@ -517,7 +536,8 @@ defmodule Scholar.Linear.IsotonicRegression do
 
   defnp check_increasing(x, y) do
     x = Nx.new_axis(x, -1)
+    y = Nx.new_axis(y, -1)
     model = Scholar.Linear.LinearRegression.fit(x, y)
-    Nx.squeeze(model.coefficients[0] >= 0)
+    model.coefficients[0][0] >= 0
   end
 end
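With this change, prediction becomes a three-step flow: fit, an explicit preprocessing step that builds the stored interpolation, then predict (which now enforces the step via check_preprocess). A minimal sketch of that flow with made-up monotone data; the fit/2, preprocess/2, and predict/2 signatures are taken from the code above, and exact outputs depend on the backend:

    x = Nx.tensor([1.0, 4.0, 7.0, 9.0, 10.0, 11.0])
    y = Nx.tensor([1.0, 3.0, 6.0, 8.0, 9.0, 10.0])

    model = Scholar.Linear.IsotonicRegression.fit(x, y)
    # preprocess/2 trims duplicate thresholds (trim_duplicates defaults to true)
    # and stores a Scholar.Interpolation.Linear model under :preprocess.
    model = Scholar.Linear.IsotonicRegression.preprocess(model)
    Scholar.Linear.IsotonicRegression.predict(model, Nx.tensor([2.0, 5.5, 8.0]))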
diff --git a/lib/scholar/manifold/mds.ex b/lib/scholar/manifold/mds.ex
index 09281e02..4d86d7e4 100644
--- a/lib/scholar/manifold/mds.ex
+++ b/lib/scholar/manifold/mds.ex
@@ -23,7 +23,7 @@ defmodule Scholar.Manifold.MDS do
     ],
     metric: [
       type: :boolean,
-      default: false,
+      default: true,
       doc: ~S"""
       If `true`, use dissimilarities as metric distances in the embedding space.
       """
@@ -33,6 +33,7 @@ defmodule Scholar.Manifold.MDS do
       default: false,
       doc: ~S"""
       If `true`, normalize the stress by the sum of squared dissimilarities.
+      Only valid if `metric` is `false`.
       """
     ],
     eps: [
@@ -78,6 +79,7 @@ defmodule Scholar.Manifold.MDS do
     metric = if opts[:metric], do: 1, else: 0
     normalized_stress = if opts[:normalized_stress], do: 1, else: 0
     eps = opts[:eps]
+    n = Nx.axis_size(dissimilarities, 0)
 
     {{x, stress, i}, _} =
       while {{x, _stress = Nx.Constants.infinity(Nx.type(dissimilarities)), i = 0},
@@ -86,7 +88,6 @@ defmodule Scholar.Manifold.MDS do
              metric, normalized_stress, eps, stop_value = 0}},
             i < max_iter and not stop_value do
         dis = Distance.pairwise_euclidean(x)
-        n = Nx.axis_size(dissimilarities, 0)
 
         disparities =
           if metric do
@@ -96,14 +97,14 @@ defmodule Scholar.Manifold.MDS do
 
           dis_flat_indices = lower_triangle_indices(dis)
 
-          n = Nx.axis_size(dis, 0)
-
           dis_flat_w = Nx.take(dis_flat, dis_flat_indices)
 
           disparities_flat_model =
-            Scholar.Linear.IsotonicRegression.fit(similarities_flat_w, dis_flat_w)
+            Scholar.Linear.IsotonicRegression.fit(similarities_flat_w, dis_flat_w,
+              increasing: true
+            )
 
-          model = Scholar.Linear.IsotonicRegression.preprocess(disparities_flat_model)
+          model = Scholar.Linear.IsotonicRegression.special_preprocess(disparities_flat_model)
 
           disparities_flat =
             Scholar.Linear.IsotonicRegression.predict(model, similarities_flat_w)
@@ -133,7 +134,7 @@ defmodule Scholar.Manifold.MDS do
         ratio = disparities / dis
         b = -ratio
         b = Nx.put_diagonal(b, Nx.take_diagonal(b) + Nx.sum(ratio, axes: [1]))
-        x = 1.0 / n * Nx.dot(b, x)
+        x = Nx.dot(b, x) * (1.0 / n)
 
         dis = Nx.sum(Nx.sqrt(Nx.sum(x ** 2, axes: [1])))
 
@@ -209,7 +210,7 @@ defmodule Scholar.Manifold.MDS do
     {best, best_stress, best_iter}
   end
 
-  defn lower_triangle_indices(tensor) do
+  defnp lower_triangle_indices(tensor) do
     n = Nx.axis_size(tensor, 0)
 
     temp = Nx.broadcast(Nx.s64(0), {div(n * (n - 1), 2)})
@@ -249,17 +250,17 @@ defmodule Scholar.Manifold.MDS do
       %Scholar.Manifold.MDS{
         embedding: Nx.tensor(
           [
-            [0.040477119386196136, -0.4997042417526245],
-            [-0.35801631212234497, -0.09504470974206924],
-            [-0.08517580479383469, 0.35293734073638916],
-            [0.42080432176589966, 0.23617777228355408]
+            [16.3013916015625, -3.444634437561035],
+            [5.866805553436279, 1.6378790140151978],
+            [-5.487184524536133, 0.5837264657020569],
+            [-16.681013107299805, 1.2230290174484253]
           ]
         ),
         stress: Nx.tensor(
-          0.0016479993937537074
+          0.3993147909641266
         ),
         n_iter: Nx.tensor(
-          19
+          23
         )
       }
   """
@@ -288,17 +289,17 @@ defmodule Scholar.Manifold.MDS do
       %Scholar.Manifold.MDS{
         embedding: Nx.tensor(
           [
-            [0.040477119386196136, -0.4997042417526245],
-            [-0.35801631212234497, -0.09504470974206924],
-            [-0.08517580479383469, 0.35293734073638916],
-            [0.42080432176589966, 0.23617777228355408]
+            [16.3013916015625, -3.444634437561035],
+            [5.866805553436279, 1.6378790140151978],
+            [-5.487184524536133, 0.5837264657020569],
+            [-16.681013107299805, 1.2230290174484253]
           ]
         ),
         stress: Nx.tensor(
-          0.0016479993937537074
+          0.3993147909641266
         ),
         n_iter: Nx.tensor(
-          19
+          23
         )
       }
   """
@@ -333,10 +334,10 @@ defmodule Scholar.Manifold.MDS do
       %Scholar.Manifold.MDS{
         embedding: Nx.tensor(
           [
-            [0.41079193353652954, 0.41079193353652954],
-            [0.1369306445121765, 0.1369306445121765],
-            [-0.1369306445121765, -0.1369306445121765],
-            [-0.41079193353652954, -0.41079193353652954]
+            [11.858541488647461, 11.858541488647461],
+            [3.9528470039367676, 3.9528470039367676],
+            [-3.9528470039367676, -3.9528470039367676],
+            [-11.858541488647461, -11.858541488647461]
           ]
         ),
         stress: Nx.tensor(
@@ -373,14 +374,14 @@ defmodule Scholar.Manifold.MDS do
       %Scholar.Manifold.MDS{
         embedding: Nx.tensor(
           [
-            [0.3354101777076721, 0.3354101777076721, 0.3354101777076721],
-            [0.11180339753627777, 0.11180339753627777, 0.11180339753627777],
-            [-0.11180339753627777, -0.11180340498685837, -0.11180339753627777],
-            [-0.3354102075099945, -0.3354102075099945, -0.3354102075099945]
+            [9.682458877563477, 9.682458877563477, 9.682458877563477],
+            [3.2274858951568604, 3.2274858951568604, 3.2274858951568604],
+            [-3.2274863719940186, -3.2274863719940186, -3.2274863719940186],
+            [-9.682458877563477, -9.682458877563477, -9.682458877563477]
          ]
         ),
         stress: Nx.tensor(
-          2.6645352591003757e-15
+          9.094947017729282e-12
         ),
         n_iter: Nx.tensor(
           3
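Two behavioral consequences of this diff: metric MDS is now the default, so callers that relied on the old non-metric default must pass metric: false explicitly, and normalized_stress remains meaningful only in that non-metric mode. A sketch of the non-metric call, mirroring the tests below (using EXLA as the defn compiler is an assumption):

    key = Nx.Random.key(42)
    x = Nx.iota({10, 50})

    # Non-metric MDS with normalized stress; drop both options to get the
    # new default, metric MDS.
    model =
      EXLA.jit_apply(
        &Scholar.Manifold.MDS.fit(&1, metric: false, normalized_stress: true, key: &2),
        [x, key]
      )

    model.embedding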
diff --git a/test/scholar/manifold/mds_test.exs b/test/scholar/manifold/mds_test.exs
index 0658eff5..87d5cd93 100644
--- a/test/scholar/manifold/mds_test.exs
+++ b/test/scholar/manifold/mds_test.exs
@@ -2,4 +2,211 @@ defmodule Scholar.Manifold.MDSTest do
   use Scholar.Case, async: true
   alias Scholar.Manifold.MDS
   doctest MDS
+
+  def x() do
+    Nx.iota({10, 50})
+  end
+
+  def key() do
+    Nx.Random.key(42)
+  end
+
+  test "non-default num_components" do
+    key = key()
+    x = x()
+    model = EXLA.jit_apply(&MDS.fit(&1, num_components: 5, key: &2), [x, key])
+
+    assert_all_close(
+      model.embedding,
+      Nx.tensor([
+        [
+          57.28269577026367,
+          -678.6760864257812,
+          811.1503295898438,
+          -251.1714324951172,
+          1156.7987060546875
+        ],
+        [
+          7.623606204986572,
+          -544.2373046875,
+          604.0946655273438,
+          -225.99559020996094,
+          903.2800903320312
+        ],
+        [
+          -7.334737300872803,
+          -429.81671142578125,
+          402.1512145996094,
+          -163.3682861328125,
+          639.9016723632812
+        ],
+        [
+          13.86670207977295,
+          -296.5096435546875,
+          223.15061950683594,
+          -84.07274627685547,
+          374.4827575683594
+        ],
+        [
+          38.73623275756836,
+          -134.54620361328125,
+          50.4241943359375,
+          -38.010799407958984,
+          113.90003967285156
+        ],
+        [
+          18.940887451171875,
+          30.962879180908203,
+          -127.7795639038086,
+          45.001678466796875,
+          -131.29234313964844
+        ],
+        [
+          18.05344581604004,
+          222.0098114013672,
+          -292.34197998046875,
+          86.87554168701172,
+          -378.58544921875
+        ],
+        [
+          -3.060556173324585,
+          429.6268005371094,
+          -436.2151794433594,
+          146.84103393554688,
+          -621.5556640625
+        ],
+        [
+          -55.395423889160156,
+          613.6642456054688,
+          -565.1470947265625,
+          225.3615264892578,
+          -882.4739379882812
+        ],
+        [
+          -88.7128677368164,
+          787.5221557617188,
+          -669.4872436523438,
+          258.53912353515625,
+          -1174.455810546875
+        ]
+      ])
+    )
+
+    assert_all_close(model.stress, 698.4426879882812)
+    assert_all_close(model.n_iter, Nx.tensor(152))
+  end
+
+  test "non-default metric" do
+    key = key()
+    x = x()
+    model = EXLA.jit_apply(&MDS.fit(&1, metric: false, key: &2), [x, key])
+
+    assert_all_close(
+      model.embedding,
+      Nx.tensor([
+        [-0.23465712368488312, 0.6921732425689697],
+        [-0.3380763530731201, 0.4378605782985687],
+        [-0.15237200260162354, 0.26230522990226746],
+        [0.09990488737821579, 0.2603200674057007],
+        [0.15598554909229279, 0.03315458819270134],
+        [0.41043558716773987, 0.13559512794017792],
+        [0.24686546623706818, -0.24366283416748047],
+        [0.1395486444234848, -0.4151153564453125],
+        [-0.07875102013349533, -0.530768096446991],
+        [-0.21976199746131897, -0.6417303681373596]
+      ])
+    )
+
+    assert_all_close(model.stress, 0.1966342180967331)
+    assert_all_close(model.n_iter, Nx.tensor(38))
+  end
+
+  test "option normalized_stress with metric set to false" do
+    key = key()
+    x = x()
+
+    model =
+      EXLA.jit_apply(&MDS.fit(&1, metric: false, key: &2, normalized_stress: true), [x, key])
+
+    assert_all_close(
+      model.embedding,
+      Nx.tensor([
+        [-0.17997372150421143, 0.7225074768066406],
+        [-0.3138044774532318, 0.3934117257595062],
+        [-0.0900932177901268, 0.19507794082164764],
+        [0.2092301845550537, 0.295993834733963],
+        [0.24611115455627441, 0.0019988759886473417],
+        [0.4951189458370209, 0.08028026670217514],
+        [0.12963972985744476, -0.3193856179714203],
+        [0.19291982054710388, -0.44776636362075806],
+        [-0.2770233750343323, -0.4146113097667694],
+        [-0.3582141101360321, -0.5444929003715515]
+      ])
+    )
+
+    assert_all_close(model.stress, 0.13638167083263397)
+    assert_all_close(model.n_iter, Nx.tensor(20))
+  end
+
+  test "epsilon set to a smaller value than the default" do
+    key = key()
+    x = x()
+
+    model =
+      EXLA.jit_apply(&MDS.fit(&1, metric: false, key: &2, normalized_stress: true, eps: 1.0e-4), [
+        x,
+        key
+      ])
+
+    assert_all_close(
+      model.embedding,
+      Nx.tensor([
+        [-0.35130712389945984, 0.6258886456489563],
+        [-0.4270354211330414, 0.4396686255931854],
+        [-0.30671024322509766, 0.2688262462615967],
+        [-0.12758131325244904, 0.18020282685756683],
+        [-0.05403336510062218, 0.01867777667939663],
+        [0.17203716933727264, 0.044468216598033905],
+        [0.2791652977466583, -0.09437420219182968],
+        [0.2869844138622284, -0.3071449398994446],
+        [0.2768166959285736, -0.49931082129478455],
+        [0.2563020884990692, -0.678329348564148]
+      ])
+    )
+
+    # as expected, smaller value of stress (loss) and bigger number of iterations
+    assert_all_close(model.stress, 0.03167537972331047)
+    assert_all_close(model.n_iter, Nx.tensor(116))
+  end
+
+  test "smaller max_iter value (100)" do
+    key = key()
+    x = x()
+
+    model =
+      EXLA.jit_apply(
+        &MDS.fit(&1, metric: false, key: &2, normalized_stress: true, eps: 1.0e-4, max_iter: 100),
+        [x, key]
+      )
+
+    assert_all_close(
+      model.embedding,
+      Nx.tensor([
+        [-0.34521010518074036, 0.6345276236534119],
+        [-0.4247266352176666, 0.43899187445640564],
+        [-0.2903931438922882, 0.2677172124385834],
+        [-0.09941618889570236, 0.19031266868114471],
+        [-0.03261081129312515, 0.019261524081230164],
+        [0.2049849033355713, 0.07233452051877975],
+        [0.29381951689720154, -0.09455471485853195],
+        [0.27441200613975525, -0.320201575756073],
+        [0.2368578165769577, -0.5156480669975281],
+        [0.19262047111988068, -0.6936381459236145]
+      ])
+    )
+
+    # same params as in the previous test, but smaller number of iterations, capped at 100
+    assert_all_close(model.stress, 0.040396787226200104)
+    assert_all_close(model.n_iter, Nx.tensor(100))
+  end
 end
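A closing note on why MDS calls special_preprocess/1 rather than preprocess/2: the SMACOF loop runs inside defn, while preprocess/2 relies on Nx.to_number/1 and Enum.filter/2, which only work outside defn. A hypothetical module sketching the jittable pattern (the module name and helper are illustrative, not part of this patch):

    defmodule MonotoneSketch do
      import Nx.Defn

      # Mirrors how Scholar.Manifold.MDS uses isotonic regression inside
      # its defn while loop.
      defn fit_predict(x, y, to_predict) do
        model = Scholar.Linear.IsotonicRegression.fit(x, y, increasing: true)
        # special_preprocess/1 skips duplicate trimming, so the whole
        # pipeline stays jittable.
        model = Scholar.Linear.IsotonicRegression.special_preprocess(model)
        Scholar.Linear.IsotonicRegression.predict(model, to_predict)
      end
    end

The new tests can be run with `mix test test/scholar/manifold/mds_test.exs`.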