diff --git a/benchmarks/kd_tree.exs b/benchmarks/kd_tree.exs
new file mode 100644
index 00000000..18dda797
--- /dev/null
+++ b/benchmarks/kd_tree.exs
@@ -0,0 +1,15 @@
+# mix run benchmarks/kd_tree.exs
+Nx.global_default_backend(EXLA.Backend)
+Nx.Defn.global_default_options(compiler: EXLA)
+
+key = Nx.Random.key(System.os_time())
+{uniform, _new_key} = Nx.Random.uniform(key, shape: {1000, 3})
+
+Benchee.run(
+  %{
+    "unbounded" => fn -> Scholar.Neighbors.KDTree.unbounded(uniform) end,
+    "bounded" => fn -> Scholar.Neighbors.KDTree.bounded(uniform, 2) end
+  },
+  time: 10,
+  memory_time: 2
+)
diff --git a/lib/scholar/cluster/affinity_propagation.ex b/lib/scholar/cluster/affinity_propagation.ex
index b2523829..028710d2 100644
--- a/lib/scholar/cluster/affinity_propagation.ex
+++ b/lib/scholar/cluster/affinity_propagation.ex
@@ -4,6 +4,10 @@ defmodule Scholar.Cluster.AffinityPropagation do
   of `:clusters_centers` is set to the number of samples in the dataset.
   The artificial centers are filled with `:infinity` values. To filter them
   out use the `prune` function.
+
+  The algorithm has a time complexity of the order $O(N^2T)$, where $N$ is
+  the number of samples and $T$ is the number of iterations until convergence.
+  Further, the memory complexity is of the order $O(N^2)$.
   """
   import Nx.Defn
 
@@ -13,16 +17,16 @@ defmodule Scholar.Cluster.AffinityPropagation do
            containers: [
              :labels,
              :cluster_centers_indices,
-             :affinity_matrix,
              :cluster_centers,
-             :num_clusters
+             :num_clusters,
+             :iterations
            ]}
   defstruct [
     :labels,
     :cluster_centers_indices,
-    :affinity_matrix,
     :cluster_centers,
-    :num_clusters
+    :num_clusters,
+    :iterations
   ]
 
   @opts_schema [
@@ -39,9 +43,18 @@ defmodule Scholar.Cluster.AffinityPropagation do
       current value is maintained relative to incoming values (weighted 1 - damping).
       """
     ],
-    self_preference: [
-      type: {:or, [:float, :boolean, :integer]},
-      doc: "Self preference."
+    preference: [
+      type: {:or, [:float, :atom]},
+      default: :reduce_min,
+      doc: """
+      How to compute the preference for each point - points with larger preference
+      values are more likely to be chosen as exemplars. The number of clusters is
+      influenced by this option.
+
+      The preference is either an atom, naming an `Nx` reduction function to
+      apply to the input similarities (such as `:reduce_min`, `:median`, `:mean`,
+      etc.), or a float.
+      """
     ],
     key: [
       type: {:custom, Scholar.Options, :key, []},
@@ -56,6 +69,14 @@ defmodule Scholar.Cluster.AffinityPropagation do
       doc: ~S"""
       If `true`, the learning loop is unrolled.
       """
+    ],
+    converge_after: [
+      type: :pos_integer,
+      default: 15,
+      doc: ~S"""
+      Number of consecutive iterations with no change in the number of estimated
+      clusters after which the algorithm is considered to have converged.
+      """
     ]
   ]
 
@@ -70,8 +91,6 @@ defmodule Scholar.Cluster.AffinityPropagation do
   The function returns a struct with the following parameters:
 
-  * `:affinity_matrix` - Affinity matrix. It is a negated squared euclidean
-    distance of each pair of points.
-
   * `:clusters_centers` - Cluster centers from the initial data.
 
   * `:cluster_centers_indices` - Indices of cluster centers.
@@ -81,32 +100,25 @@ defmodule Scholar.Cluster.AffinityPropagation do ## Examples iex> key = Nx.Random.key(42) - iex> x = Nx.tensor([[12,5,78,2], [1,-5,7,32], [-1,3,6,1], [1,-2,5,2]]) + iex> x = Nx.tensor([[12,5,78,2], [9,3,81,-2], [-1,3,6,1], [1,-2,5,2]]) iex> Scholar.Cluster.AffinityPropagation.fit(x, key: key) %Scholar.Cluster.AffinityPropagation{ - labels: Nx.tensor([0, 3, 3, 3]), - cluster_centers_indices: Nx.tensor([0, -1, -1, 3]), - affinity_matrix: Nx.tensor( - [ - [-0.0, -6162.0, -5358.0, -5499.0], - [-6162.0, -0.0, -1030.0, -913.0], - [-5358.0, -1030.0, -0.0, -31.0], - [-5499.0, -913.0, -31.0, -0.0] - ]), + labels: Nx.tensor([0, 0, 2, 2]), + cluster_centers_indices: Nx.tensor([0, -1, 2, -1]), cluster_centers: Nx.tensor( [ [12.0, 5.0, 78.0, 2.0], [:infinity, :infinity, :infinity, :infinity], - [:infinity, :infinity, :infinity, :infinity], - [1.0, -2.0, 5.0, 2.0] + [-1.0, 3.0, 6.0, 1.0], + [:infinity, :infinity, :infinity, :infinity] ] ), - num_clusters: Nx.tensor(2, type: :u64) + num_clusters: Nx.tensor(2, type: :u64), + iterations: Nx.tensor(22, type: :s64) } """ deftransform fit(data, opts \\ []) do opts = NimbleOptions.validate!(opts, @opts_schema) - opts = Keyword.update(opts, :self_preference, false, fn x -> x end) key = Keyword.get_lazy(opts, :key, fn -> Nx.Random.key(System.system_time()) end) fit_n(data, key, NimbleOptions.validate!(opts, @opts_schema)) end @@ -115,13 +127,11 @@ defmodule Scholar.Cluster.AffinityPropagation do data = to_float(data) iterations = opts[:iterations] damping_factor = opts[:damping_factor] - self_preference = opts[:self_preference] - data = to_float(data) - - {initial_a, initial_r, s, affinity_matrix} = - initialize_matrices(data, self_preference: self_preference) + converge_after = opts[:converge_after] + n = Nx.axis_size(data, 0) + s = initialize_similarity(data, opts) - {n, _} = Nx.shape(initial_a) + zero_n = Nx.tensor(0, type: Nx.type(s)) |> Nx.broadcast({n, n}) {normal, _new_key} = Nx.Random.normal(key, 0, 1, shape: {n, n}, type: Nx.type(s)) s = @@ -132,9 +142,12 @@ defmodule Scholar.Cluster.AffinityPropagation do range = Nx.iota({n}) - {{a, r}, _} = - while {{a = initial_a, r = initial_r}, {s, range, i = 0}}, - i < iterations do + e = Nx.broadcast(Nx.s64(0), {n, converge_after}) + stop = Nx.u8(0) + + {{a, r, it}, _} = + while {{a = zero_n, r = zero_n, i = 0}, {s, range, stop, e}}, + i < iterations and not stop do temp = a + s indices = Nx.argmax(temp, axis: 1) y = Nx.reduce_max(temp, axes: [1]) @@ -160,7 +173,24 @@ defmodule Scholar.Cluster.AffinityPropagation do temp = temp * (1 - damping_factor) a = a * damping_factor - temp - {{a, r}, {s, range, i + 1}} + curr_e = Nx.take_diagonal(a) + Nx.take_diagonal(r) > 0 + curr_e_slice = Nx.reshape(curr_e, {:auto, 1}) + e = Nx.put_slice(e, [0, Nx.remainder(i, converge_after)], curr_e_slice) + k = Nx.sum(curr_e, axes: [0]) + + stop = + if i >= converge_after do + se = Nx.sum(e, axes: [1]) + unconverged = Nx.sum((se == 0) + (se == converge_after)) != n + + if (not unconverged and k > 0) or i == iterations do + Nx.u8(1) + else + stop + end + end + + {{a, r, i + 1}, {s, range, stop, e}} end diagonals = Nx.take_diagonal(a) + Nx.take_diagonal(r) > 0 @@ -198,14 +228,28 @@ defmodule Scholar.Cluster.AffinityPropagation do end %__MODULE__{ - affinity_matrix: affinity_matrix, cluster_centers_indices: cluster_centers_indices, cluster_centers: cluster_centers, labels: labels, - num_clusters: k + num_clusters: k, + iterations: it } end + defnp initialize_similarity(data, opts \\ []) do + n = Nx.axis_size(data, 
0) + dist = -Scholar.Metrics.Distance.pairwise_squared_euclidean(data) + preference = initialize_preference(dist, opts[:preference]) + Nx.put_diagonal(dist, Nx.broadcast(preference, {n})) + end + + deftransformp initialize_preference(dist, preference) do + cond do + is_atom(preference) -> apply(Nx, preference, [dist]) + is_float(preference) -> preference + end + end + @doc """ Optionally prune clusters, indices, and labels to only valid entries. @@ -214,26 +258,20 @@ defmodule Scholar.Cluster.AffinityPropagation do ## Examples iex> key = Nx.Random.key(42) - iex> x = Nx.tensor([[12,5,78,2], [1,-5,7,32], [-1,3,6,1], [1,-2,5,2]]) + iex> x = Nx.tensor([[12,5,78,2], [9,3,81,-2], [-1,3,6,1], [1,-2,5,2]]) iex> model = Scholar.Cluster.AffinityPropagation.fit(x, key: key) iex> Scholar.Cluster.AffinityPropagation.prune(model) %Scholar.Cluster.AffinityPropagation{ - labels: Nx.tensor([0, 1, 1, 1]), - cluster_centers_indices: Nx.tensor([0, 3]), - affinity_matrix: Nx.tensor( - [ - [-0.0, -6162.0, -5358.0, -5499.0], - [-6162.0, -0.0, -1030.0, -913.0], - [-5358.0, -1030.0, -0.0, -31.0], - [-5499.0, -913.0, -31.0, -0.0] - ]), + labels: Nx.tensor([0, 0, 1, 1]), + cluster_centers_indices: Nx.tensor([0, 2]), cluster_centers: Nx.tensor( [ [12.0, 5.0, 78.0, 2.0], - [1.0, -2.0, 5.0, 2.0] + [-1.0, 3.0, 6.0, 1.0] ] ), - num_clusters: Nx.tensor(2, type: :u64) + num_clusters: Nx.tensor(2, type: :u64), + iterations: Nx.tensor(22, type: :s64) } """ def prune( @@ -271,64 +309,19 @@ defmodule Scholar.Cluster.AffinityPropagation do ## Examples iex> key = Nx.Random.key(42) - iex> x = Nx.tensor([[12,5,78,2], [1,5,7,32], [1,3,6,1], [1,2,5,2]]) + iex> x = Nx.tensor([[12,5,78,2], [9,3,81,-2], [-1,3,6,1], [1,-2,5,2]]) iex> model = Scholar.Cluster.AffinityPropagation.fit(x, key: key) iex> model = Scholar.Cluster.AffinityPropagation.prune(model) - iex> Scholar.Cluster.AffinityPropagation.predict(model, Nx.tensor([[1,6,2,6], [8,3,8,2]])) + iex> Scholar.Cluster.AffinityPropagation.predict(model, Nx.tensor([[10,3,50,6], [8,3,8,2]])) #Nx.Tensor< s64[2] - [1, 1] + [0, 1] > """ defn predict(%__MODULE__{cluster_centers: cluster_centers} = _model, x) do - {num_clusters, num_features} = Nx.shape(cluster_centers) - {num_samples, _} = Nx.shape(x) - broadcast_shape = {num_samples, num_clusters, num_features} - - Scholar.Metrics.Distance.euclidean( - Nx.new_axis(x, 1) |> Nx.broadcast(broadcast_shape), - Nx.new_axis(cluster_centers, 0) |> Nx.broadcast(broadcast_shape), - axes: [-1] - ) - dist = Scholar.Metrics.Distance.pairwise_euclidean(x, cluster_centers) Nx.select(Nx.is_nan(dist), Nx.Constants.infinity(Nx.type(dist)), dist) |> Nx.argmin(axis: 1) end - - defnp initialize_matrices(data, opts \\ []) do - {n, _} = Nx.shape(data) - self_preference = opts[:self_preference] - - {similarity_matrix, affinity_matrix} = - initialize_similarities(data, self_preference: self_preference) - - zero = Nx.tensor(0, type: Nx.type(similarity_matrix)) - availability_matrix = Nx.broadcast(zero, {n, n}) - responsibility_matrix = Nx.broadcast(zero, {n, n}) - - {availability_matrix, responsibility_matrix, similarity_matrix, affinity_matrix} - end - - defnp initialize_similarities(data, opts \\ []) do - n = Nx.axis_size(data, 0) - self_preference = opts[:self_preference] - - dist = -Scholar.Metrics.Distance.pairwise_squared_euclidean(data) - - fill_in = - cond do - self_preference == false -> - Nx.broadcast(Nx.median(dist), {n}) - - true -> - if Nx.size(self_preference) == 1, - do: Nx.broadcast(self_preference, {n}), - else: self_preference - end - - s_modified = 
dist |> Nx.put_diagonal(fill_in)
-
-    {s_modified, dist}
-  end
 end
diff --git a/lib/scholar/cluster/dbscan.ex b/lib/scholar/cluster/dbscan.ex
index 4ec5516d..dcd637c1 100644
--- a/lib/scholar/cluster/dbscan.ex
+++ b/lib/scholar/cluster/dbscan.ex
@@ -5,6 +5,9 @@ defmodule Scholar.Cluster.DBSCAN do
   DBSCAN - Density-Based Spatial Clustering of Applications with Noise.
   Finds core samples of high density and expands clusters from them.
   Good for data which contains clusters of similar density.
+
+  The time complexity is $O(N^2)$ for $N$ samples.
+  The space complexity is $O(N^2)$.
   """
   import Nx.Defn
   import Scholar.Shared
diff --git a/lib/scholar/cluster/gmm.ex b/lib/scholar/cluster/gmm.ex
index f0d08e78..3f95149c 100644
--- a/lib/scholar/cluster/gmm.ex
+++ b/lib/scholar/cluster/gmm.ex
@@ -12,6 +12,8 @@ defmodule Scholar.Cluster.GaussianMixture do
   the parameters. Thus the procedure consists of repeating the algorithm several times
   and taking the best obtained result.
 
+  Time complexity is $O(NKD^3)$ for $N$ data points, $K$ Gaussian components and $D$ dimensions.
+
   References:
 
   * [1] - Mixtures of Gaussians and the EM algorithm https://cs229.stanford.edu/notes2020spring/cs229-notes7b.pdf
diff --git a/lib/scholar/cluster/k_means.ex b/lib/scholar/cluster/k_means.ex
index 20f42827..2443bac0 100644
--- a/lib/scholar/cluster/k_means.ex
+++ b/lib/scholar/cluster/k_means.ex
@@ -11,6 +11,10 @@ defmodule Scholar.Cluster.KMeans do
   converges. Since some initializations are unfortunate and converge to sub-optimal results
   we need to repeat the whole procedure a few times and take the best result.
 
+  Average time complexity is $O(CKNI)$, where $C$ is the number of clusters, $N$ is the number of samples,
+  $I$ is the number of iterations until convergence, and $K$ is the number of features. Space
+  complexity is $O(K*(N+C))$.
+
   Reference:
 
   * [1] - [K-Means Algorithm](https://cs.nyu.edu/~roweis/csc2515-2006/readings/lloyd57.pdf)
diff --git a/lib/scholar/covariance.ex b/lib/scholar/covariance.ex
index f1ffc37c..57fefa51 100644
--- a/lib/scholar/covariance.ex
+++ b/lib/scholar/covariance.ex
@@ -1,6 +1,9 @@
 defmodule Scholar.Covariance do
   @moduledoc ~S"""
   Algorithms to estimate the covariance of features given a set of points.
+
+  Time complexity of covariance estimation is $O(N * K^2)$ where $N$ is the number of samples
+  and $K$ is the number of features.
   """
   import Nx.Defn
 
diff --git a/lib/scholar/decomposition/pca.ex b/lib/scholar/decomposition/pca.ex
index 67e09a86..40624b54 100644
--- a/lib/scholar/decomposition/pca.ex
+++ b/lib/scholar/decomposition/pca.ex
@@ -6,6 +6,8 @@ defmodule Scholar.Decomposition.PCA do
   of data set [1]. The sample data is decomposed using linear combination of
   vectors that lie on the directions of those components.
 
+  The time complexity is $O(NP^2 + P^3)$ where $N$ is the number of samples and $P$ is the number of features.
+  Space complexity is $O(P * (P+N))$.
 
   Reference:
 
diff --git a/lib/scholar/interpolation/bezier_spline.ex b/lib/scholar/interpolation/bezier_spline.ex
index 94311f81..c039de25 100644
--- a/lib/scholar/interpolation/bezier_spline.ex
+++ b/lib/scholar/interpolation/bezier_spline.ex
@@ -15,6 +15,8 @@ defmodule Scholar.Interpolation.BezierSpline do
   or `Scholar.Interpolation.Linear` algorithms, will only affect the
   segments right next to it, instead of affecting the curve as a whole.
 
+  Computing a Bezier curve is $O(N^2)$ where $N$ is the number of points.
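+
+  A minimal usage sketch (assuming the `fit/2` and `predict/2` functions
+  defined later in this module):
+
+      x = Nx.iota({4})
+      y = Nx.tensor([2.0, 0.0, 1.0, 3.0])
+      model = Scholar.Interpolation.BezierSpline.fit(x, y)
+      Scholar.Interpolation.BezierSpline.predict(model, Nx.tensor([1.5, 2.5]))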
+ Reference: * [1] - [Bezier theory](https://en.wikipedia.org/wiki/B%C3%A9zier_curve) diff --git a/lib/scholar/interpolation/cubic_spline.ex b/lib/scholar/interpolation/cubic_spline.ex index 925fbe78..00901f00 100644 --- a/lib/scholar/interpolation/cubic_spline.ex +++ b/lib/scholar/interpolation/cubic_spline.ex @@ -10,6 +10,8 @@ defmodule Scholar.Interpolation.CubicSpline do could appear if a single n-th degree polynomial were to be fitted over all of the points. + Cubic spline interpolation is $O(N)$ where $N$ is the number of points. + Reference: * [1] - [Cubic Spline Interpolation theory](https://en.wikiversity.org/wiki/Cubic_Spline_Interpolation) @@ -50,8 +52,8 @@ defmodule Scholar.Interpolation.CubicSpline do %Scholar.Interpolation.CubicSpline{ coefficients: Nx.tensor( [ - [0.0, 1.500000238418579, -3.500000238418579, 2.0], - [0.0, 1.5, -0.4999999403953552, 0.0] + [0.0, 1.5, -3.5, 2.0], + [0.0, 1.5, -0.5, 0.0] ] ), x: Nx.tensor( @@ -108,7 +110,7 @@ defmodule Scholar.Interpolation.CubicSpline do b = Nx.stack([2 * slope[0], 3 * (dx[0] * slope[1] + dx[1] * slope[0]), 2 * slope[1]]) - Nx.LinAlg.solve(a, b) + tridiagonal_solve(a, b) {_, :not_a_knot} -> up_diag = @@ -149,7 +151,7 @@ defmodule Scholar.Interpolation.CubicSpline do Nx.new_axis(b_n, 0) ]) - Nx.LinAlg.solve(a, b) + tridiagonal_solve(a, b) _ -> up_diag = @@ -188,7 +190,7 @@ defmodule Scholar.Interpolation.CubicSpline do Nx.new_axis(b_n, 0) ]) - Nx.LinAlg.solve(a, b) + tridiagonal_solve(a, b) end t = (s[0..-2//1] + s[1..-1//1] - 2 * slope) / dx @@ -289,4 +291,46 @@ defmodule Scholar.Interpolation.CubicSpline do Nx.reshape(result, original_shape) end + + defnp tridiagonal_solve(a, b) do + n = Nx.size(b) + w = Nx.broadcast(0, {n - 1}) + p = g = Nx.broadcast(0, {n}) + i = Nx.take_diagonal(a, offset: -1) + j = Nx.take_diagonal(a) + k = Nx.take_diagonal(a, offset: 1) + + w_0 = k[0] / j[0] + g_0 = b[0] / j[0] + w = Nx.indexed_put(w, Nx.new_axis(0, 0), w_0) + g = Nx.indexed_put(g, Nx.new_axis(0, 0), g_0) + + {{w, g}, _} = + while {{w, g}, {index = 1, i, j, k, b}}, index < n do + w = + if index < n - 1 do + w_i = k[index] / (j[index] - i[index - 1] * w[index - 1]) + Nx.indexed_put(w, Nx.new_axis(index, 0), w_i) + else + w + end + + g_i = (b[index] - i[index - 1] * g[index - 1]) / (j[index] - i[index - 1] * w[index - 1]) + g = Nx.indexed_put(g, Nx.new_axis(index, 0), g_i) + + {{w, g}, {index + 1, i, j, k, b}} + end + + p = Nx.indexed_put(p, Nx.new_axis(n - 1, 0), g[n - 1]) + + {p, _} = + while {p, {index = n - 1, g, w}}, index > 0 do + p_i = g[index - 1] - w[index - 1] * p[index] + p = Nx.indexed_put(p, Nx.new_axis(index - 1, 0), p_i) + + {p, {index - 1, g, w}} + end + + p + end end diff --git a/lib/scholar/interpolation/linear.ex b/lib/scholar/interpolation/linear.ex index 297627b9..d981eeb8 100644 --- a/lib/scholar/interpolation/linear.ex +++ b/lib/scholar/interpolation/linear.ex @@ -14,6 +14,8 @@ defmodule Scholar.Interpolation.Linear do b = y_1 - ax_1 = y_0 - ax_0 \end{cases} $$ + + Linear interpolation has $O(N)$ time and space complexity where $N$ is the number of points. """ import Nx.Defn import Scholar.Shared diff --git a/lib/scholar/linear/isotonic_regression.ex b/lib/scholar/linear/isotonic_regression.ex index 22996eb0..2198200a 100644 --- a/lib/scholar/linear/isotonic_regression.ex +++ b/lib/scholar/linear/isotonic_regression.ex @@ -4,6 +4,9 @@ defmodule Scholar.Linear.IsotonicRegression do observations by solving a convex optimization problem. 
It is a form of regression analysis that can be used as an alternative to polynomial regression to fit nonlinear data. + + Time complexity of isotonic regression is $O(N^2)$ where $N$ is the + number of points. """ require Nx import Nx.Defn, except: [transform: 2] @@ -306,6 +309,24 @@ defmodule Scholar.Linear.IsotonicRegression do } end + @doc """ + Preprocesses the `model` for prediction. + + Returns an updated `model`. This is a special version of `preprocess/1` that + does not trim duplicates so it can be used in defns. It is not recommended + to use this function directly. + """ + defn special_preprocess(model) do + %__MODULE__{ + model + | preprocess: + Scholar.Interpolation.Linear.fit( + model.x_thresholds, + model.y_thresholds + ) + } + end + deftransform check_preprocess(model) do if model.preprocess == {} do raise ArgumentError, diff --git a/lib/scholar/linear/linear_regression.ex b/lib/scholar/linear/linear_regression.ex index ddf0b7cd..c0033fc1 100644 --- a/lib/scholar/linear/linear_regression.ex +++ b/lib/scholar/linear/linear_regression.ex @@ -1,6 +1,9 @@ defmodule Scholar.Linear.LinearRegression do @moduledoc """ Ordinary least squares linear regression. + + Time complexity of linear regression is $O((K^2) * (K+N))$ where $N$ is the number of samples + and $K$ is the number of features. """ require Nx import Nx.Defn diff --git a/lib/scholar/linear/logistic_regression.ex b/lib/scholar/linear/logistic_regression.ex index c7dea72c..3c62c9fa 100644 --- a/lib/scholar/linear/logistic_regression.ex +++ b/lib/scholar/linear/logistic_regression.ex @@ -1,6 +1,8 @@ defmodule Scholar.Linear.LogisticRegression do @moduledoc """ Logistic regression in both binary and multinomial variants. + + Time complexity is $O(N * K * I)$ where $N$ is the number of samples, $K$ is the number of features, and $I$ is the number of iterations. """ import Nx.Defn import Scholar.Shared diff --git a/lib/scholar/linear/polynomial_regression.ex b/lib/scholar/linear/polynomial_regression.ex index 1a500d68..241d1a9f 100644 --- a/lib/scholar/linear/polynomial_regression.ex +++ b/lib/scholar/linear/polynomial_regression.ex @@ -1,6 +1,8 @@ defmodule Scholar.Linear.PolynomialRegression do @moduledoc """ Least squares polynomial regression. + + Time complexity of polynomial regression is $O((K^2) * (K+N))$ where $N$ is the number of samples and $K$ is the number of features. """ import Nx.Defn diff --git a/lib/scholar/linear/ridge_regression.ex b/lib/scholar/linear/ridge_regression.ex index ea00d5c1..c702b81d 100644 --- a/lib/scholar/linear/ridge_regression.ex +++ b/lib/scholar/linear/ridge_regression.ex @@ -15,6 +15,9 @@ defmodule Scholar.Linear.RidgeRegression do * $w$ is the model weights matrix * $\alpha$ is the parameter that controls level of regularization + + Time complexity is $O(N^2)$ for `:cholesky` solver and $O((N^2) * (K + N))$ for `:svd` solver, + where $N$ is the number of observations and $K$ is the number of features. """ require Nx import Nx.Defn diff --git a/lib/scholar/linear/svm.ex b/lib/scholar/linear/svm.ex index ad9ebd85..688c034b 100644 --- a/lib/scholar/linear/svm.ex +++ b/lib/scholar/linear/svm.ex @@ -1,13 +1,17 @@ defmodule Scholar.Linear.SVM do @moduledoc """ - SVM classifier + Support Vector Machine linear classifier. - It uses the OvR strategy to handle both binary and multinomial classification. + It uses the One-vs-Rest strategy to handle both binary and multinomial classification. 
This implementation uses stochastic gradient descent by default or any other optimizer
   available in `Polaris`. This makes it similar to a sklearn SGDClassifier [1].
-  It means that on average it will work slower than algorithms that use QP and kernel trick (LIBSVM [2]) or
-  Coordinate Descent Algorithm (LIBLINEAR [3]). It also cannot use different kernels like in LIBSVM,
-  but you can use any type of optimizer available in `Polaris`.
+
+  On average it is slower than algorithms that use QP and kernel trick (LIBSVM [2]) or
+  Coordinate Descent Algorithm (LIBLINEAR [3]). It also cannot use different kernels
+  like in LIBSVM, but you can use any type of optimizer available in `Polaris`.
+
+  Time complexity is $O(N * K * I * C)$ where $N$ is the number of samples, $K$ is the
+  number of features, $I$ is the number of iterations, and $C$ is the number of classes.
 
   [1] - https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
   [2] - https://www.csie.ntu.edu.tw/~cjlin/libsvm/
diff --git a/lib/scholar/manifold/mds.ex b/lib/scholar/manifold/mds.ex
new file mode 100644
index 00000000..4effb024
--- /dev/null
+++ b/lib/scholar/manifold/mds.ex
@@ -0,0 +1,412 @@
+defmodule Scholar.Manifold.MDS do
+  @moduledoc """
+  MDS (Multidimensional Scaling) is a dimensionality reduction technique that
+  embeds samples in a lower-dimensional space while preserving their pairwise
+  dissimilarities. This implementation uses the SMACOF algorithm.
+
+  ## References
+
+  * [Multidimensional scaling](https://en.wikipedia.org/wiki/Multidimensional_scaling)
+  """
+  import Nx.Defn
+  import Scholar.Shared
+  alias Scholar.Metrics.Distance
+
+  @derive {Nx.Container, containers: [:embedding, :stress, :n_iter]}
+  defstruct [:embedding, :stress, :n_iter]
+
+  opts_schema = [
+    num_components: [
+      type: :pos_integer,
+      default: 2,
+      doc: ~S"""
+      Dimension of the embedded space.
+      """
+    ],
+    metric: [
+      type: :boolean,
+      default: true,
+      doc: ~S"""
+      If `true`, use dissimilarities as metric distances in the embedding space.
+      """
+    ],
+    normalized_stress: [
+      type: :boolean,
+      default: false,
+      doc: ~S"""
+      If `true`, normalize the stress by the sum of squared dissimilarities.
+      Only valid if `metric` is `false`.
+      """
+    ],
+    eps: [
+      type: :float,
+      default: 1.0e-3,
+      doc: ~S"""
+      Tolerance for stopping criterion.
+      """
+    ],
+    max_iter: [
+      type: :pos_integer,
+      default: 300,
+      doc: ~S"""
+      Maximum number of iterations for the optimization.
+      """
+    ],
+    key: [
+      type: {:custom, Scholar.Options, :key, []},
+      doc: """
+      Determines random number generation for the embedding initialization.
+      If the key is not provided, it is set to `Nx.Random.key(System.system_time())`.
+      """
+    ],
+    n_init: [
+      type: :pos_integer,
+      default: 8,
+      doc: ~S"""
+      Number of times the embedding will be computed, each with a different
+      random initialization. The final embedding is the one with the lowest stress.
+ """ + ] + ] + + @opts_schema NimbleOptions.new!(opts_schema) + + # initialize x randomly or pass the init x earlier + defnp smacof(dissimilarities, x, max_iter, opts) do + similarities_flat = Nx.flatten(dissimilarities) + similarities_flat_indices = lower_triangle_indices(dissimilarities) + + similarities_flat_w = Nx.take(similarities_flat, similarities_flat_indices) + + metric = if opts[:metric], do: 1, else: 0 + normalized_stress = if opts[:normalized_stress], do: 1, else: 0 + eps = opts[:eps] + n = Nx.axis_size(dissimilarities, 0) + + {{x, stress, i}, _} = + while {{x, _stress = Nx.Constants.infinity(Nx.type(dissimilarities)), i = 0}, + {dissimilarities, max_iter, similarities_flat_indices, similarities_flat, + similarities_flat_w, old_stress = Nx.Constants.infinity(Nx.type(dissimilarities)), + metric, normalized_stress, eps, stop_value = 0}}, + i < max_iter and not stop_value do + dis = Distance.pairwise_euclidean(x) + + disparities = + if metric do + dissimilarities + else + dis_flat = Nx.flatten(dis) + + dis_flat_indices = lower_triangle_indices(dis) + + dis_flat_w = Nx.take(dis_flat, dis_flat_indices) + + disparities_flat_model = + Scholar.Linear.IsotonicRegression.fit(similarities_flat_w, dis_flat_w, + increasing: true + ) + + model = special_preprocess(disparities_flat_model) + + disparities_flat = + Scholar.Linear.IsotonicRegression.predict(model, similarities_flat_w) + + disparities = + Nx.indexed_put( + dis_flat, + Nx.new_axis(similarities_flat_indices, -1), + disparities_flat + ) + + disparities = Nx.reshape(disparities, {n, n}) + + disparities * Nx.sqrt(n * (n - 1) / 2 / Nx.sum(disparities ** 2)) + end + + stress = Nx.sum((Nx.flatten(dis) - Nx.flatten(disparities)) ** 2) / 2 + + stress = + if normalized_stress do + Nx.sqrt(stress / (Nx.sum(Nx.flatten(disparities) ** 2) / 2)) + else + stress + end + + dis = Nx.select(dis == 0, 1.0e-5, dis) + ratio = disparities / dis + b = -ratio + b = Nx.put_diagonal(b, Nx.take_diagonal(b) + Nx.sum(ratio, axes: [1])) + x = Nx.dot(b, x) * (1.0 / n) + + dis = Nx.sum(Nx.sqrt(Nx.sum(x ** 2, axes: [1]))) + + stop_value = if old_stress - stress / dis < eps, do: 1, else: 0 + + old_stress = stress / dis + + {{x, stress, i + 1}, + {dissimilarities, max_iter, similarities_flat_indices, similarities_flat, + similarities_flat_w, old_stress, metric, normalized_stress, eps, stop_value}} + end + + {x, stress, i} + end + + defnp mds_main_loop(dissimilarities, x, _key, opts) do + n_init = opts[:n_init] + + type = Nx.Type.merge(to_float_type(x), to_float_type(dissimilarities)) + dissimilarities = Nx.as_type(dissimilarities, type) + x = Nx.as_type(x, type) + + dissimilarities = Distance.pairwise_euclidean(dissimilarities) + + {{best, best_stress, best_iter}, _} = + while {{best = x, best_stress = Nx.Constants.infinity(type), best_iter = 0}, + {n_init, dissimilarities, x, i = 0}}, + i < n_init do + {temp, stress, iter} = smacof(dissimilarities, x, opts[:max_iter], opts) + + {best, best_stress, best_iter} = + if stress < best_stress, do: {temp, stress, iter}, else: {best, best_stress, best_iter} + + {{best, best_stress, best_iter}, {n_init, dissimilarities, x, i + 1}} + end + + {best, best_stress, best_iter} + end + + defnp mds_main_loop(dissimilarities, key, opts) do + n_init = opts[:n_init] + max_iter = opts[:max_iter] + num_samples = Nx.axis_size(dissimilarities, 0) + + type = to_float_type(dissimilarities) + dissimilarities = Nx.as_type(dissimilarities, type) + + {dummy, key} = + Nx.Random.uniform(key, + shape: {num_samples, opts[:num_components]}, + type: 
type
+      )
+
+    dissimilarities = Distance.pairwise_euclidean(dissimilarities)
+
+    {{best, best_stress, best_iter}, _} =
+      while {{best = dummy, best_stress = Nx.Constants.infinity(type), best_iter = 0},
+             {n_init, key, max_iter, dissimilarities, i = 0}},
+            i < n_init do
+        num_samples = Nx.axis_size(dissimilarities, 0)
+
+        {x, key} =
+          Nx.Random.uniform(key, shape: {num_samples, opts[:num_components]}, type: type)
+
+        {temp, stress, iter} = smacof(dissimilarities, x, max_iter, opts)
+
+        {best, best_stress, best_iter} =
+          if stress < best_stress, do: {temp, stress, iter}, else: {best, best_stress, best_iter}
+
+        {{best, best_stress, best_iter}, {n_init, key, max_iter, dissimilarities, i + 1}}
+      end
+
+    {best, best_stress, best_iter}
+  end
+
+  defnp lower_triangle_indices(tensor) do
+    n = Nx.axis_size(tensor, 0)
+
+    temp = Nx.broadcast(Nx.s64(0), {div(n * (n - 1), 2)})
+
+    {temp, _} =
+      while {temp, {i = 0, j = 0}}, i < n ** 2 do
+        {temp, j} =
+          if Nx.remainder(i, n) < Nx.quotient(i, n) do
+            temp = Nx.indexed_put(temp, Nx.new_axis(j, -1), i)
+            {temp, j + 1}
+          else
+            {temp, j}
+          end
+
+        {temp, {i + 1, j}}
+      end
+
+    temp
+  end
+
+  @doc """
+  Fits MDS for sample inputs `x`. It is a simplified version of the `fit/3` function.
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Return Values
+
+  Returns a struct with embedded data, stress value, and number of iterations for the best run.
+
+  ## Examples
+
+      iex> x = Nx.iota({4,5})
+      iex> key = Nx.Random.key(42)
+      iex> Scholar.Manifold.MDS.fit(x, key: key)
+      %Scholar.Manifold.MDS{
+        embedding: Nx.tensor(
+          [
+            [13.072145462036133, -10.424199104309082],
+            [5.13038969039917, -2.341259479522705],
+            [-5.651908874511719, 1.7662434577941895],
+            [-12.550626754760742, 10.999215126037598]
+          ]
+        ),
+        stress: Nx.tensor(
+          0.36994707584381104
+        ),
+        n_iter: Nx.tensor(
+          20
+        )
+      }
+  """
+  deftransform fit(x) do
+    opts = NimbleOptions.validate!([], @opts_schema)
+    key = Keyword.get_lazy(opts, :key, fn -> Nx.Random.key(System.system_time()) end)
+    fit_n(x, key, opts)
+  end
+
+  @doc """
+  Fits MDS for sample inputs `x`. It is a simplified version of the `fit/3` function.
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Return Values
+
+  Returns a struct with embedded data, stress value, and number of iterations for the best run.
+
+  ## Examples
+
+      iex> x = Nx.iota({4,5})
+      iex> key = Nx.Random.key(42)
+      iex> Scholar.Manifold.MDS.fit(x, num_components: 2, key: key)
+      %Scholar.Manifold.MDS{
+        embedding: Nx.tensor(
+          [
+            [13.072145462036133, -10.424199104309082],
+            [5.13038969039917, -2.341259479522705],
+            [-5.651908874511719, 1.7662434577941895],
+            [-12.550626754760742, 10.999215126037598]
+          ]
+        ),
+        stress: Nx.tensor(
+          0.36994707584381104
+        ),
+        n_iter: Nx.tensor(
+          20
+        )
+      }
+  """
+  deftransform fit(x, opts) when is_list(opts) do
+    opts = NimbleOptions.validate!(opts, @opts_schema)
+    key = Keyword.get_lazy(opts, :key, fn -> Nx.Random.key(System.system_time()) end)
+    fit_n(x, key, opts)
+  end
+
+  defnp fit_n(x, key, opts) do
+    {best, best_stress, best_iter} = mds_main_loop(x, key, opts)
+    %__MODULE__{embedding: best, stress: best_stress, n_iter: best_iter}
+  end
+
+  @doc """
+  Fits MDS for sample inputs `x` and initial embedding `init`. It is a
+  simplified version of the `fit/3` function.
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Return Values
+
+  Returns a struct with embedded data, stress value, and number of iterations for the best run.
+
+  ## Examples
+
+      iex> x = Nx.iota({4,5})
+      iex> key = Nx.Random.key(42)
+      iex> init = Nx.reverse(Nx.iota({4,2}))
+      iex> Scholar.Manifold.MDS.fit(x, init)
+      %Scholar.Manifold.MDS{
+        embedding: Nx.tensor(
+          [
+            [11.858541488647461, 11.858541488647461],
+            [3.9528470039367676, 3.9528470039367676],
+            [-3.9528470039367676, -3.9528470039367676],
+            [-11.858541488647461, -11.858541488647461]
+          ]
+        ),
+        stress: Nx.tensor(
+          0.0
+        ),
+        n_iter: Nx.tensor(
+          3
+        )
+      }
+  """
+  deftransform fit(x, init) do
+    opts = NimbleOptions.validate!([], @opts_schema)
+    key = Keyword.get_lazy(opts, :key, fn -> Nx.Random.key(System.system_time()) end)
+    fit_n(x, init, key, opts)
+  end
+
+  @doc """
+  Fits MDS for sample inputs `x` and initial embedding `init` with configurable
+  options.
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Return Values
+
+  Returns a struct with embedded data, stress value, and number of iterations for the best run.
+
+  ## Examples
+
+      iex> x = Nx.iota({4,5})
+      iex> key = Nx.Random.key(42)
+      iex> init = Nx.reverse(Nx.iota({4,3}))
+      iex> Scholar.Manifold.MDS.fit(x, init, num_components: 3, key: key)
+      %Scholar.Manifold.MDS{
+        embedding: Nx.tensor(
+          [
+            [9.682458877563477, 9.682458877563477, 9.682458877563477],
+            [3.2274858951568604, 3.2274858951568604, 3.2274858951568604],
+            [-3.2274863719940186, -3.2274863719940186, -3.2274863719940186],
+            [-9.682458877563477, -9.682458877563477, -9.682458877563477]
+          ]
+        ),
+        stress: Nx.tensor(
+          9.094947017729282e-12
+        ),
+        n_iter: Nx.tensor(
+          3
+        )
+      }
+  """
+  deftransform fit(x, init, opts) when is_list(opts) do
+    opts = NimbleOptions.validate!(opts, @opts_schema)
+    key = Keyword.get_lazy(opts, :key, fn -> Nx.Random.key(System.system_time()) end)
+    fit_n(x, init, key, opts)
+  end
+
+  defnp fit_n(x, init, key, opts) do
+    {best, best_stress, best_iter} = mds_main_loop(x, init, key, opts)
+    %__MODULE__{embedding: best, stress: best_stress, n_iter: best_iter}
+  end
+
+  defnp special_preprocess(model) do
+    %Scholar.Linear.IsotonicRegression{
+      model
+      | preprocess:
+          Scholar.Interpolation.Linear.fit(
+            model.x_thresholds,
+            model.y_thresholds
+          )
+    }
+  end
+end
diff --git a/lib/scholar/manifold/tsne.ex b/lib/scholar/manifold/tsne.ex
index 50245689..fbebf650 100644
--- a/lib/scholar/manifold/tsne.ex
+++ b/lib/scholar/manifold/tsne.ex
@@ -1,6 +1,8 @@
 defmodule Scholar.Manifold.TSNE do
   @moduledoc """
-  TSNE (t-Distributed Stochastic Neighbor Embedding) is a nonlinear dimensionality reduction technique.
+  t-SNE (t-Distributed Stochastic Neighbor Embedding) is a nonlinear dimensionality reduction technique.
+
+  This is an exact implementation of t-SNE, and therefore its time complexity is $O(N^2)$ for $N$ samples.
 
   ## Reference
 
diff --git a/lib/scholar/metrics/classification.ex b/lib/scholar/metrics/classification.ex
index 797d1cd6..a1e1d124 100644
--- a/lib/scholar/metrics/classification.ex
+++ b/lib/scholar/metrics/classification.ex
@@ -13,6 +13,7 @@ defmodule Scholar.Metrics.Classification do
 
   import Nx.Defn, except: [assert_shape: 2, assert_shape_pattern: 2]
   import Scholar.Shared
+  import Scholar.Preprocessing
   alias Scholar.Integrate
 
   general_schema = [
@@ -161,6 +162,26 @@ defmodule Scholar.Metrics.Classification do
       ]
     ]
 
+  log_loss_schema =
+    general_schema ++
+      [
+        normalize: [
+          type: :boolean,
+          default: true,
+          doc: """
+          If `true`, return the mean loss over the samples.
+          Otherwise, return the sum of losses over the samples.
+ """ + ], + sample_weights: [ + type: {:custom, Scholar.Options, :weights, []}, + default: 1.0, + doc: """ + Sample weights of the observations. + """ + ] + ] + top_k_accuracy_score_schema = general_schema ++ [ @@ -203,6 +224,7 @@ defmodule Scholar.Metrics.Classification do ) @brier_score_loss_schema NimbleOptions.new!(brier_score_loss_schema) @accuracy_schema NimbleOptions.new!(accuracy_schema) + @log_loss_schema NimbleOptions.new!(log_loss_schema) @top_k_accuracy_score_schema NimbleOptions.new!(top_k_accuracy_score_schema) @zero_one_loss_schema NimbleOptions.new!(zero_one_loss_schema) @@ -1233,6 +1255,91 @@ defmodule Scholar.Metrics.Classification do 1 - Nx.sum(weights_matrix * cm) / Nx.sum(weights_matrix * expected) end + @doc """ + Computes the log loss, aka logistic loss or cross-entropy loss. + + The log-loss is a measure of how well a forecaster performs, with smaller + values being better. For each sample, a forecaster outputs a probability for + each class, from which the log loss is computed by averaging the negative log + of the probability forecasted for the true class over a number of samples. + + `y_true` should contain `num_classes` unique values, and the sum of `y_prob` + along axis 1 should be 1 to respect the law of total probability. + + ## Options + + #{NimbleOptions.docs(@log_loss_schema)} + + ## Examples + + iex> y_true = Nx.tensor([0, 0, 1, 1]) + iex> y_prob = Nx.tensor([[0.9, 0.1], [0.8, 0.2], [0.3, 0.7], [0.01, 0.99]]) + iex> Scholar.Metrics.Classification.log_loss(y_true, y_prob, num_classes: 2) + #Nx.Tensor< + f32 + 0.17380733788013458 + > + iex> Scholar.Metrics.Classification.log_loss(y_true, y_prob, num_classes: 2, normalize: false) + #Nx.Tensor< + f32 + 0.6952293515205383 + > + iex> weights = Nx.tensor([0.7, 2.3, 1.3, 0.34]) + iex(361)> Scholar.Metrics.Classification.log_loss(y_true, y_prob, num_classes: 2, sample_weights: weights) + #Nx.Tensor< + f32 + 0.22717177867889404 + > + """ + deftransform log_loss(y_true, y_prob, opts \\ []) do + log_loss_n( + y_true, + y_prob, + NimbleOptions.validate!(opts, @log_loss_schema) + ) + end + + defnp log_loss_n(y_true, y_prob, opts) do + assert_rank!(y_true, 1) + assert_rank!(y_prob, 2) + + if Nx.axis_size(y_true, 0) != Nx.axis_size(y_prob, 0) do + raise ArgumentError, "y_true and y_prob must have the same size along axis 0" + end + + num_classes = opts[:num_classes] + + if Nx.axis_size(y_prob, 1) != num_classes do + raise ArgumentError, "y_prob must have a size of num_classes along axis 1" + end + + weights = + validate_weights( + opts[:sample_weights], + Nx.axis_size(y_true, 0), + type: to_float_type(y_prob) + ) + + y_true_onehot = + ordinal_encode(y_true, num_classes: num_classes) + |> one_hot_encode(num_classes: num_classes) + + y_prob = Nx.clip(y_prob, 0, 1) + + sample_loss = + Nx.multiply(y_true_onehot, y_prob) + |> Nx.sum(axes: [-1]) + |> Nx.log() + |> Nx.negate() + + if opts[:normalize] do + Nx.weighted_mean(sample_loss, weights) + else + Nx.multiply(sample_loss, weights) + |> Nx.sum() + end + end + @doc """ Top-k Accuracy classification score. @@ -1341,4 +1448,42 @@ defmodule Scholar.Metrics.Classification do assert_rank!(y_true, 1) assert_same_shape!(y_true, y_pred) end + + @doc """ + Matthews Correlation Coefficient (MCC) provides a measure of the quality of binary classifications. + + It returns a value between -1 and 1 where 1 represents a perfect prediction, 0 represents no better + than random prediction, and -1 indicates total disagreement between prediction and observation. 
+  """
+  defn mcc(y_true, y_pred) do
+    true_positives = binary_true_positives(y_true, y_pred)
+    true_negatives = binary_true_negatives(y_true, y_pred)
+    false_positives = binary_false_positives(y_true, y_pred)
+    false_negatives = binary_false_negatives(y_true, y_pred)
+
+    mcc_numerator = true_positives * true_negatives - false_positives * false_negatives
+
+    mcc_denominator =
+      Nx.sqrt(
+        (true_positives + false_positives) *
+          (true_positives + false_negatives) *
+          (true_negatives + false_positives) *
+          (true_negatives + false_negatives)
+      )
+
+    zero_tensor = Nx.tensor([0.0], type: :f32)
+
+    if Nx.all(
+         true_positives == zero_tensor and
+           true_negatives == zero_tensor
+       ) do
+      Nx.tensor([-1.0], type: :f32)
+    else
+      Nx.select(
+        mcc_denominator == zero_tensor,
+        zero_tensor,
+        mcc_numerator / mcc_denominator
+      )
+    end
+  end
 end
diff --git a/lib/scholar/metrics/clustering.ex b/lib/scholar/metrics/clustering.ex
index 7150bd96..d716faca 100644
--- a/lib/scholar/metrics/clustering.ex
+++ b/lib/scholar/metrics/clustering.ex
@@ -26,6 +26,8 @@ defmodule Scholar.Metrics.Clustering do
   clustering configuration is appropriate. If many points have a low or negative
   value, then the clustering configuration may have too many or too few clusters.
 
+  Time complexity of silhouette score is $O(N^2)$ where $N$ is the number of samples.
+
   ## Options
 
   #{NimbleOptions.docs(@opts_schema)}
diff --git a/lib/scholar/metrics/ranking.ex b/lib/scholar/metrics/ranking.ex
new file mode 100644
index 00000000..45a08447
--- /dev/null
+++ b/lib/scholar/metrics/ranking.ex
@@ -0,0 +1,97 @@
+defmodule Scholar.Metrics.Ranking do
+  @moduledoc """
+  Provides metrics and calculations related to ranking quality.
+
+  Ranking metrics evaluate the quality of ordered lists of items,
+  often used in information retrieval and recommendation systems.
+
+  This module currently supports the following ranking metrics:
+
+  * Discounted Cumulative Gain (DCG)
+  """
+
+  import Nx.Defn
+  import Scholar.Shared
+  require Nx
+
+  @dcg_opts [
+    k: [
+      type: {:custom, Scholar.Options, :positive_number, []},
+      doc: "Truncation parameter to consider only the top-k elements."
+    ]
+  ]
+
+  @dcg_opts_schema NimbleOptions.new!(@dcg_opts)
+
+  deftransform dcg(y_true, y_score, opts \\ []) do
+    dcg_n(y_true, y_score, NimbleOptions.validate!(opts, @dcg_opts_schema))
+  end
+
+  @doc """
+  Computes the DCG based on true relevance scores (`y_true`) and their respective predicted scores (`y_score`).
+
+  ## Options
+
+  #{NimbleOptions.docs(@dcg_opts_schema)}
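+
+  A usage sketch (not a doctest; the returned value is omitted):
+
+      y_true = Nx.tensor([3, 2, 3, 0, 1])
+      y_score = Nx.tensor([3.2, 2.4, 3.1, 0.2, 1.3])
+      Scholar.Metrics.Ranking.dcg(y_true, y_score, k: 3)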
+ """ + defn dcg_n(y_true, y_score, opts) do + y_true_shape = Nx.shape(y_true) + y_score_shape = Nx.shape(y_score) + + check_shape(y_true_shape, y_score_shape) + + {adjusted_y_true, adjusted_y_score} = handle_ties(y_true, y_score) + + sorted_indices = Nx.argsort(adjusted_y_score, axis: 0, direction: :desc) + sorted_y_true = Nx.take(adjusted_y_true, sorted_indices) + + truncated_y_true = truncate_at_k(sorted_y_true, opts) + dcg_value(truncated_y_true) + end + + defnp check_shape(y_true, y_pred) do + assert_same_shape!(y_true, y_pred) + end + + defnp handle_ties(y_true, y_score) do + sorted_indices = Nx.argsort(y_score, axis: 0, direction: :desc) + + sorted_y_true = Nx.take(y_true, sorted_indices) + sorted_y_score = Nx.take(y_score, sorted_indices) + + tie_sorted_indices = Nx.argsort(sorted_y_true, axis: 0, direction: :desc) + adjusted_y_true = Nx.take(sorted_y_true, tie_sorted_indices) + adjusted_y_score = Nx.take(sorted_y_score, tie_sorted_indices) + + {adjusted_y_true, adjusted_y_score} + end + + defnp dcg_value(y_true) do + float_y_true = Nx.as_type(y_true, :f32) + + log_tensor = + y_true + |> Nx.shape() + |> Nx.iota() + |> Nx.as_type(:f32) + |> Nx.add(2.0) + |> Nx.log2() + + div_result = Nx.divide(float_y_true, log_tensor) + + Nx.sum(div_result) + end + + defnp truncate_at_k(tensor, opts) do + case opts[:k] do + nil -> + tensor + + _ -> + if opts[:k] > Nx.axis_size(tensor, 0) do + tensor + else + {top_k, _rest} = Nx.split(tensor, opts[:k], axis: 0) + top_k + end + end + end +end diff --git a/lib/scholar/metrics/regression.ex b/lib/scholar/metrics/regression.ex index 725f7b99..aa2385ad 100644 --- a/lib/scholar/metrics/regression.ex +++ b/lib/scholar/metrics/regression.ex @@ -71,8 +71,7 @@ defmodule Scholar.Metrics.Regression do > """ defn mean_square_error(y_true, y_pred) do - diff = y_true - y_pred - (diff * diff) |> Nx.mean() + mean_tweedie_deviance_n(y_true, y_pred, 0) end @doc ~S""" @@ -133,6 +132,170 @@ defmodule Scholar.Metrics.Regression do |> Nx.mean() end + @doc """ + Calculates the mean Tweedie deviance of predictions + with respect to targets. Includes the Gaussian, Poisson, + Gamma and inverse-Gaussian families as special cases. + + #{~S''' + $$d(y,\mu) = + \begin{cases} + (y-\mu)^2, & \text{for }p=0\\\\ + 2(y \log(y/\mu) + \mu - y), & \text{for }p=1\\\\ + 2(\log(\mu/y) + y/\mu - 1), & \text{for }p=2\\\\ + 2\left(\frac{\max(y,0)^{2-p}}{(1-p)(2-p)}-\frac{y\mu^{1-p}}{1-p}+\frac{\mu^{2-p}}{2-p}\right), & \text{for }p<0 \vee p>2 + \end{cases}$$ + '''} + + ## Examples + + iex> y_true = Nx.tensor([1, 1, 1, 1, 1, 2, 2, 1, 3, 1], type: :u32) + iex> y_pred = Nx.tensor([2, 2, 1, 1, 2, 2, 2, 1, 3, 1], type: :u32) + iex> Scholar.Metrics.Regression.mean_tweedie_deviance(y_true, y_pred, 1) + #Nx.Tensor< + f32 + 0.18411168456077576 + > + """ + defn mean_tweedie_deviance(y_true, y_pred, power) do + mean_tweedie_deviance_n(y_true, y_pred, power) + end + + @doc """ + Similar to `mean_tweedie_deviance/3` but raises `RuntimeError` if the + inputs cannot be used with the given power argument. + + Note: This function cannot be used in `defn`. 
+ + ## Examples + + iex> y_true = Nx.tensor([1, 1, 1, 1, 1, 2, 2, 1, 3, 1], type: :u32) + iex> y_pred = Nx.tensor([2, 2, 1, 1, 2, 2, 2, 1, 3, 1], type: :u32) + iex> Scholar.Metrics.Regression.mean_tweedie_deviance!(y_true, y_pred, 1) + #Nx.Tensor< + f32 + 0.18411168456077576 + > + """ + def mean_tweedie_deviance!(y_true, y_pred, power) do + message = "mean Tweedie deviance with power=#{power} can only be used on " + + case check_tweedie_deviance_power(y_true, y_pred, power) |> Nx.to_number() do + 1 -> :ok + 2 -> raise message <> "strictly positive y_pred" + 4 -> raise message <> "non-negative y_true and strictly positive y_pred" + 5 -> raise message <> "strictly positive y_true and strictly positive y_pred" + 100 -> raise "something went wrong, branch should never appear" + end + + mean_tweedie_deviance_n(y_true, y_pred, power) + end + + defnp mean_tweedie_deviance_n(y_true, y_pred, power) do + deviance = + cond do + power < 0 -> + 2 * + ( + Nx.pow(max(y_true, 0), 2 - power) / ((1 - power) * (2 - power)) + -y_true * Nx.pow(y_pred, 1 - power) / (1 - power) + +Nx.pow(y_pred, 2 - power) / (2 - power) + ) + + # Normal distribution + power == 0 -> + Nx.pow(y_true - y_pred, 2) + + # Poisson distribution + power == 1 -> + 2 * (y_true * Nx.log(y_true / y_pred) + y_pred - y_true) + + # Gamma distribution + power == 2 -> + 2 * (Nx.log(y_pred / y_true) + y_true / y_pred - 1) + + # 1 < power < 2 -> Compound Poisson distribution, non-negative with mass at zero + # power == 3 -> Inverse-Gaussian distribution + # power > 2 -> Stable distribution, with support on the positive reals + true -> + 2 * + ( + Nx.pow(y_true, 2 - power) / ((1 - power) * (2 - power)) + -y_true * Nx.pow(y_pred, 1 - power) / (1 - power) + +Nx.pow(y_pred, 2 - power) / (2 - power) + ) + end + + Nx.mean(deviance) + end + + defnp check_tweedie_deviance_power(y_true, y_pred, power) do + cond do + power < 0 -> + if Nx.all(y_pred > 0) do + Nx.u8(1) + else + Nx.u8(2) + end + + power == 0 -> + Nx.u8(1) + + power >= 1 and power < 2 -> + if Nx.all(y_true >= 0) and Nx.all(y_pred > 0) do + Nx.u8(1) + else + Nx.u8(4) + end + + power >= 2 -> + if Nx.all(y_true > 0) and Nx.all(y_pred > 0) do + Nx.u8(1) + else + Nx.u8(5) + end + + true -> + Nx.u8(100) + end + end + + @doc """ + Calculates the mean Poisson deviance of predictions + with respect to targets. + + ## Examples + + iex> y_true = Nx.tensor([1, 1, 1, 1, 1, 2, 2, 1, 3, 1], type: :u32) + iex> y_pred = Nx.tensor([2, 2, 1, 1, 2, 2, 2, 1, 3, 1], type: :u32) + iex> Scholar.Metrics.Regression.mean_poisson_deviance(y_true, y_pred) + #Nx.Tensor< + f32 + 0.18411168456077576 + > + """ + defn mean_poisson_deviance(y_true, y_pred) do + mean_tweedie_deviance_n(y_true, y_pred, 1) + end + + @doc """ + Calculates the mean Gamma deviance of predictions + with respect to targets. + + ## Examples + + iex> y_true = Nx.tensor([1, 1, 1, 1, 1, 2, 2, 1, 3, 1], type: :u32) + iex> y_pred = Nx.tensor([2, 2, 1, 1, 2, 2, 2, 1, 3, 1], type: :u32) + iex> Scholar.Metrics.Regression.mean_gamma_deviance(y_true, y_pred) + #Nx.Tensor< + f32 + 0.115888312458992 + > + """ + defn mean_gamma_deviance(y_true, y_pred) do + mean_tweedie_deviance_n(y_true, y_pred, 2) + end + @doc """ Calculates the $R^2$ score of predictions with respect to targets. @@ -292,6 +455,46 @@ defmodule Scholar.Metrics.Regression do end end + @doc """ + $D^2$ regression score function, fraction of Tweedie + deviance explained. + + Best possible score is 1.0, lower values are worse and it + can also be negative. 
+ + Since it uses the mean Tweedie deviance, it also includes + the Gaussian, Poisson, Gamma and inverse-Gaussian + distribution families as special cases. + + ## Examples + + iex> y_true = Nx.tensor([1, 1, 1, 1, 1, 2, 2, 1, 3, 1], type: :u32) + iex> y_pred = Nx.tensor([2, 2, 1, 1, 2, 2, 2, 1, 3, 1], type: :u32) + iex> Scholar.Metrics.Regression.d2_tweedie_score(y_true, y_pred, 1) + #Nx.Tensor< + f32 + 0.32202935218811035 + > + """ + defn d2_tweedie_score(y_true, y_pred, power) do + if Nx.size(y_pred) < 2 do + Nx.Constants.nan() + else + d2_tweedie_score_n(y_true, y_pred, power) + end + end + + defnp d2_tweedie_score_n(y_true, y_pred, power) do + y_true = Nx.squeeze(y_true) + y_pred = Nx.squeeze(y_pred) + + numerator = mean_tweedie_deviance_n(y_true, y_pred, power) + y_avg = Nx.mean(y_true) + denominator = mean_tweedie_deviance_n(y_true, y_avg, power) + + 1 - numerator / denominator + end + @doc ~S""" Calculates the maximum residual error. diff --git a/lib/scholar/naive_bayes/complement.ex b/lib/scholar/naive_bayes/complement.ex index b81145c2..fce02c9c 100644 --- a/lib/scholar/naive_bayes/complement.ex +++ b/lib/scholar/naive_bayes/complement.ex @@ -6,6 +6,9 @@ defmodule Scholar.NaiveBayes.Complement do that each class has roughly the same representation. It is particularly suited for imbalanced data sets. + Time complexity is $O(K * N * C)$ where $N$ is the number of samples and $K$ is the number of features, + and $C$ is the number of classes. + Reference: * [1] - [Paper about Complement Naive Bayes Algorithm](https://cdn.aaai.org/ICML/2003/ICML03-081.pdf) diff --git a/lib/scholar/naive_bayes/gaussian.ex b/lib/scholar/naive_bayes/gaussian.ex index 452a42dd..315061b5 100644 --- a/lib/scholar/naive_bayes/gaussian.ex +++ b/lib/scholar/naive_bayes/gaussian.ex @@ -7,6 +7,9 @@ defmodule Scholar.NaiveBayes.Gaussian do The parameters $\sigma\_{y}$ and $\mu\_{y}$ are estimated using maximum likelihood. + Time complexity is $O(K * N * C)$ where $N$ is the number of samples and $K$ is the number of features, + and $C$ is the number of classes. + Reference: * [1] - [Detailed explanation of algorithm used to update feature means and variance online by Chan, Golub, and LaVeque](http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf) diff --git a/lib/scholar/naive_bayes/multinomial.ex b/lib/scholar/naive_bayes/multinomial.ex index 06d5ba0f..33e4c31a 100644 --- a/lib/scholar/naive_bayes/multinomial.ex +++ b/lib/scholar/naive_bayes/multinomial.ex @@ -1,6 +1,9 @@ defmodule Scholar.NaiveBayes.Multinomial do @moduledoc """ Naive Bayes classifier for multinomial models. + + Time complexity is $O(K * N * C)$ where $N$ is the number of samples and $K$ is the number of features, + and $C$ is the number of classes. """ import Nx.Defn import Scholar.Shared diff --git a/lib/scholar/neighbors/k_nearest_neighbors.ex b/lib/scholar/neighbors/k_nearest_neighbors.ex index 0223a3b6..a41ee5c8 100644 --- a/lib/scholar/neighbors/k_nearest_neighbors.ex +++ b/lib/scholar/neighbors/k_nearest_neighbors.ex @@ -1,6 +1,9 @@ defmodule Scholar.Neighbors.KNearestNeighbors do @moduledoc """ - The K-Nearest Neighbors. It implements both classification and regression. + The K-Nearest Neighbors. + + It implements both classification and regression. This implements the linear + version of kNN and therefore it has time complexity $O(N^2)$ for $N$ samples. 
""" import Nx.Defn import Scholar.Shared @@ -162,7 +165,7 @@ defmodule Scholar.Neighbors.KNearestNeighbors do ## Return Values - It returns a tensor with predicted class labels + It returns a tensor with predicted class labels. ## Examples @@ -210,7 +213,7 @@ defmodule Scholar.Neighbors.KNearestNeighbors do ## Return Values - It returns a tensor with probabilities of classes. They are arranged in lexicographic order. + It returns a tensor with probabilities of classes. They are arranged in lexicographic order. ## Examples diff --git a/lib/scholar/neighbors/kd_tree.ex b/lib/scholar/neighbors/kd_tree.ex new file mode 100644 index 00000000..c80077d8 --- /dev/null +++ b/lib/scholar/neighbors/kd_tree.ex @@ -0,0 +1,350 @@ +defmodule Scholar.Neighbors.KDTree do + @moduledoc """ + Implements a kd-tree, a space-partitioning data structure for organizing points + in a k-dimensional space. + + This is implemented as one-dimensional tensor with indices pointed to highest + dimension of the given tensor. Traversal starts by calling `root/0` and then + accessing the `left_child/1` and `right_child/1`. The tree is left-balanced. + + Two construction modes are available: + + * `bounded/2` - the tensor has min and max values with an amplitude given by `max - min`. + It is also guaranteed that the `amplitude * levels(tensor) + 1` does not overflow + the tensor. See `amplitude/1` to verify if this holds. This implementation happens + fully within `defn`. This version is orders of magnitude faster than the `unbounded/2` + one. + + * `unbounded/2` - there are no known bounds (min and max values) to the tensor. + This implementation is recursive and goes in and out of the `defn`, therefore + it cannot be called inside `defn`. + + Each level traverses over the last axis of tensor, the index for a level can be + computed as: `rem(level, Nx.axis_size(tensor, -1))`. + + ## References + + * [GPU-friendly, Parallel, and (Almost-)In-Place Construction of Left-Balanced k-d Trees](https://arxiv.org/pdf/2211.00120.pdf). + """ + + import Nx.Defn + + @derive {Nx.Container, keep: [:levels], containers: [:indexes, :data]} + @enforce_keys [:levels, :indexes, :data] + defstruct [:levels, :indexes, :data] + + @doc """ + Builds a KDTree without known min-max bounds. + + If your tensor has known bounds (for example, -1 and 1), + consider using the `bounded/2` version which is often orders of + magnitude more efficient. 
+ + ## Options + + * `:compiler` - the default compiler to use for internal defn operations + + ## Examples + + iex> Scholar.Neighbors.KDTree.unbounded(Nx.iota({5, 2}), compiler: EXLA) + %Scholar.Neighbors.KDTree{ + data: Nx.iota({5, 2}), + levels: 3, + indexes: Nx.u32([3, 1, 4, 0, 2]) + } + + """ + def unbounded(tensor, opts \\ []) do + levels = levels(tensor) + {size, _dims} = Nx.shape(tensor) + + indexes = + if size > 2 do + subtree_size = unbounded_subtree_size(1, levels, size) + {left, mid, right} = Nx.Defn.jit_apply(&root_slice(&1, subtree_size), [tensor], opts) + + acc = <> + acc = recur([{1, left}, {2, right}], [], acc, tensor, 1, levels, opts) + Nx.from_binary(acc, :u32) + else + Nx.argsort(tensor[[.., 0]], direction: :desc, type: :u32) + end + + %__MODULE__{levels: levels, indexes: indexes, data: tensor} + end + + defp recur([{_i, %Nx.Tensor{shape: {1}} = leaf} | rest], next, acc, tensor, level, levels, opts) do + [leaf] = Nx.to_flat_list(leaf) + acc = <> + recur(rest, next, acc, tensor, level, levels, opts) + end + + defp recur([{i, %Nx.Tensor{shape: {2}} = node} | rest], next, acc, tensor, level, levels, opts) do + acc = <> + next = [{left_child(i), Nx.slice(node, [0], [1])} | next] + recur(rest, next, acc, tensor, level, levels, opts) + end + + defp recur([{i, indexes} | rest], next, acc, tensor, level, levels, opts) do + %Nx.Tensor{shape: {size, dims}} = tensor + k = rem(level, dims) + subtree_size = unbounded_subtree_size(left_child(i), levels, size) + + {left, mid, right} = + Nx.Defn.jit_apply(&recur_slice(&1, &2, &3, subtree_size), [tensor, indexes, k], opts) + + next = [{right_child(i), right}, {left_child(i), left} | next] + acc = <> + recur(rest, next, acc, tensor, level, levels, opts) + end + + defp recur([], [], acc, _tensor, _level, _levels, _opts) do + acc + end + + defp recur([], next, acc, tensor, level, levels, opts) do + recur(Enum.reverse(next), [], acc, tensor, level + 1, levels, opts) + end + + defp root_slice(tensor, subtree_size) do + indexes = Nx.argsort(tensor[[.., 0]], type: :u32) + + {Nx.slice(indexes, [0], [subtree_size]), indexes[subtree_size], + Nx.slice(indexes, [subtree_size + 1], [Nx.size(indexes) - subtree_size - 1])} + end + + defp recur_slice(tensor, indexes, k, subtree_size) do + sorted = Nx.argsort(Nx.take(tensor, indexes)[[.., k]], type: :u32) + indexes = Nx.take(indexes, sorted) + + {Nx.slice(indexes, [0], [subtree_size]), indexes[subtree_size], + Nx.slice(indexes, [subtree_size + 1], [Nx.size(indexes) - subtree_size - 1])} + end + + defp unbounded_subtree_size(i, levels, size) do + import Bitwise + diff = levels - unbounded_level(i) - 1 + shifted = 1 <<< diff + fllc_s = (i <<< diff) + shifted - 1 + shifted - 1 + min(max(0, size - fllc_s), shifted) + end + + defp unbounded_level(i) when is_integer(i), do: floor(:math.log2(i + 1)) + + @doc """ + Builds a KDTree with known min-max bounds entirely within `defn`. + + This requires the amplitude `|max - min|` of the tensor to be given + such that `max + (amplitude + 1) * (size - 1)` does not overflow the + maximum tensor type. + + For example, a tensor where all values are between 0 and 1 has amplitude + 1. Values between -1 and 1 has amplitude 2. If your tensor is normalized + to floating points, then it is most likely bounded (given their high + precision). You can use `amplitude/1` to check your assumptions. 
+ + ## Examples + + iex> Scholar.Neighbors.KDTree.bounded(Nx.iota({5, 2}), 10) + %Scholar.Neighbors.KDTree{ + data: Nx.iota({5, 2}), + levels: 3, + indexes: Nx.u32([3, 1, 4, 0, 2]) + } + """ + deftransform bounded(tensor, amplitude) do + %__MODULE__{levels: levels(tensor), indexes: bounded_n(tensor, amplitude), data: tensor} + end + + defnp bounded_n(tensor, amplitude) do + levels = levels(tensor) + {size, dims} = Nx.shape(tensor) + band = amplitude + 1 + tags = Nx.broadcast(Nx.u32(0), {size}) + + {level, tags, _tensor, _band} = + while {level = Nx.u32(0), tags, tensor, band}, level < levels - 1 do + k = rem(level, dims) + indexes = Nx.argsort(tensor[[.., k]] + band * tags, type: :u32) + tags = update_tags(tags, indexes, level, levels, size) + {level + 1, tags, tensor, band} + end + + k = rem(level, dims) + Nx.argsort(tensor[[.., k]] + band * tags, type: :u32) + end + + defnp update_tags(tags, indexes, level, levels, size) do + pos = Nx.argsort(indexes, type: :u32) + + pivot = + bounded_segment_begin(tags, levels, size) + + bounded_subtree_size(left_child(tags), levels, size) + + Nx.select( + pos < (1 <<< level) - 1, + tags, + Nx.select( + pos < pivot, + left_child(tags), + Nx.select( + pos > pivot, + right_child(tags), + tags + ) + ) + ) + end + + defnp bounded_subtree_size(i, levels, size) do + diff = levels - bounded_level(i) - 1 + shifted = 1 <<< diff + first_lowest_level = (i <<< diff) + shifted - 1 + # Use select instead of max to deal with overflows + lowest_level = Nx.select(first_lowest_level > size, Nx.u32(0), size - first_lowest_level) + shifted - 1 + min(lowest_level, shifted) + end + + defnp bounded_segment_begin(i, levels, size) do + level = bounded_level(i) + top = (1 <<< level) - 1 + diff = levels - level - 1 + shifted = 1 <<< diff + left_siblings = i - top + + top + left_siblings * (shifted - 1) + + min(left_siblings * shifted, size - (1 <<< (levels - 1)) + 1) + end + + # Since this property relies on u32, let's check the tensor type. + deftransformp bounded_level(%Nx.Tensor{type: {:u, 32}} = i) do + Nx.subtract(31, Nx.count_leading_zeros(Nx.add(i, 1))) + end + + @doc """ + Returns the amplitude of a bounded tensor. + + If -1 is returned, it means the tensor cannot use the `bounded` algorithm + to generate a KDTree and `unbounded/2` must be used instead. + + This cannot be invoked inside a `defn`. + + ## Examples + + iex> Scholar.Neighbors.KDTree.amplitude(Nx.iota({10, 2})) + 19 + iex> Scholar.Neighbors.KDTree.amplitude(Nx.iota({20, 2}, type: :f32)) + 39.0 + iex> Scholar.Neighbors.KDTree.amplitude(Nx.iota({20, 2}, type: :u8)) + -1 + iex> Scholar.Neighbors.KDTree.amplitude(Nx.negate(Nx.iota({10, 2}))) + 19 + + """ + def amplitude(tensor) do + max = tensor |> Nx.reduce_max() |> Nx.to_number() + min = tensor |> Nx.reduce_min() |> Nx.to_number() + amplitude = abs(max - min) + limit = tensor.type |> Nx.Constants.max_finite() |> Nx.to_number() + + if max + (amplitude + 1) * (Nx.axis_size(tensor, 0) - 1) > limit do + -1 + else + amplitude + end + end + + @doc """ + Returns the number of resulting levels in a KDTree for `tensor`. + + ## Examples + + iex> Scholar.Neighbors.KDTree.levels(Nx.iota({10, 3})) + 4 + """ + deftransform levels(%Nx.Tensor{} = tensor) do + case Nx.shape(tensor) do + {size, _dims} -> ceil(:math.log2(size + 1)) + _ -> raise ArgumentError, "KDTrees requires a tensor of rank 2" + end + end + + @doc """ + Returns the root index. 
+
+  ## Examples
+
+      iex> Scholar.Neighbors.KDTree.root()
+      0
+
+  """
+  deftransform root, do: 0
+
+  @doc """
+  Returns the parent of child `i`.
+
+  It is your responsibility to guarantee the result is positive.
+
+  ## Examples
+
+      iex> Scholar.Neighbors.KDTree.parent(1)
+      0
+      iex> Scholar.Neighbors.KDTree.parent(2)
+      0
+
+      iex> Scholar.Neighbors.KDTree.parent(Nx.u32(3))
+      #Nx.Tensor<
+        u32
+        1
+      >
+
+  """
+  deftransform parent(i) when is_integer(i), do: div(i - 1, 2)
+  deftransform parent(%Nx.Tensor{} = t), do: Nx.quotient(Nx.subtract(t, 1), 2)
+
+  @doc """
+  Returns the index of the left child of `i`.
+
+  It is your responsibility to guarantee the result
+  is not greater than the size of the leading axis of the tensor.
+
+  ## Examples
+
+      iex> Scholar.Neighbors.KDTree.left_child(0)
+      1
+      iex> Scholar.Neighbors.KDTree.left_child(1)
+      3
+
+      iex> Scholar.Neighbors.KDTree.left_child(Nx.u32(3))
+      #Nx.Tensor<
+        u32
+        7
+      >
+
+  """
+  deftransform left_child(i) when is_integer(i), do: 2 * i + 1
+  deftransform left_child(%Nx.Tensor{} = t), do: Nx.add(Nx.multiply(2, t), 1)
+
+  @doc """
+  Returns the index of the right child of `i`.
+
+  It is your responsibility to guarantee the result
+  is not greater than the size of the leading axis of the tensor.
+
+  ## Examples
+
+      iex> Scholar.Neighbors.KDTree.right_child(0)
+      2
+      iex> Scholar.Neighbors.KDTree.right_child(1)
+      4
+
+      iex> Scholar.Neighbors.KDTree.right_child(Nx.u32(3))
+      #Nx.Tensor<
+        u32
+        8
+      >
+
+  """
+  deftransform right_child(i) when is_integer(i), do: 2 * i + 2
+  deftransform right_child(%Nx.Tensor{} = t), do: Nx.add(Nx.multiply(2, t), 2)
+end
diff --git a/lib/scholar/neighbors/radius_nearest_neighbors.ex b/lib/scholar/neighbors/radius_nearest_neighbors.ex
index 66c378b1..b1eca0e3 100644
--- a/lib/scholar/neighbors/radius_nearest_neighbors.ex
+++ b/lib/scholar/neighbors/radius_nearest_neighbors.ex
@@ -1,6 +1,8 @@
 defmodule Scholar.Neighbors.RadiusNearestNeighbors do
   @moduledoc """
-  The Radius Nearest Neighbors. It implements both classification and regression.
+  The Radius Nearest Neighbors.
+
+  It implements both classification and regression.
   """
   import Nx.Defn
   import Scholar.Shared
diff --git a/lib/scholar/stats.ex b/lib/scholar/stats.ex
index 188a0df4..e3cf20a8 100644
--- a/lib/scholar/stats.ex
+++ b/lib/scholar/stats.ex
@@ -181,6 +181,9 @@ defmodule Scholar.Stats do
     * $Cov(X\_i, X\_j)$ is covariance between features $X_i$ and $X_j$
   '''}
+
+  The time complexity of correlation estimation is $O(NK^2)$, where $N$ is the
+  number of samples and $K$ is the number of features.
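+
+  For example, estimating the correlation matrix for $N = 10000$ samples with
+  $K = 10$ features takes on the order of $10000 \cdot 10^2 = 10^6$ operations.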
+ ## Example iex> Scholar.Stats.correlation_matrix(Nx.tensor([[3, 6, 5], [26, 75, 3], [23, 4, 1]])) diff --git a/mix.exs b/mix.exs index c439392e..02acd3bb 100644 --- a/mix.exs +++ b/mix.exs @@ -31,10 +31,11 @@ defmodule Scholar.MixProject do [ {:ex_doc, "~> 0.30", only: :docs}, # {:nx, "~> 0.6", override: true}, - {:nx, github: "elixir-nx/nx", sparse: "nx", override: true}, + {:nx, github: "elixir-nx/nx", sparse: "nx", override: true, branch: "v0.6"}, {:nimble_options, "~> 0.5.2 or ~> 1.0"}, {:exla, "~> 0.6", optional: true}, - {:polaris, "~> 0.1"} + {:polaris, "~> 0.1"}, + {:benchee, "~> 1.0", only: :dev} ] end diff --git a/mix.lock b/mix.lock index cb415b64..c17916d2 100644 --- a/mix.lock +++ b/mix.lock @@ -1,5 +1,7 @@ %{ + "benchee": {:hex, :benchee, "1.1.0", "f3a43817209a92a1fade36ef36b86e1052627fd8934a8b937ac9ab3a76c43062", [:mix], [{:deep_merge, "~> 1.0", [hex: :deep_merge, repo: "hexpm", optional: false]}, {:statistex, "~> 1.0", [hex: :statistex, repo: "hexpm", optional: false]}], "hexpm", "7da57d545003165a012b587077f6ba90b89210fd88074ce3c60ce239eb5e6d93"}, "complex": {:hex, :complex, "0.5.0", "af2d2331ff6170b61bb738695e481b27a66780e18763e066ee2cd863d0b1dd92", [:mix], [], "hexpm", "2683bd3c184466cfb94fad74cbfddfaa94b860e27ad4ca1bffe3bff169d91ef1"}, + "deep_merge": {:hex, :deep_merge, "1.0.0", "b4aa1a0d1acac393bdf38b2291af38cb1d4a52806cf7a4906f718e1feb5ee961", [:mix], [], "hexpm", "ce708e5f094b9cd4e8f2be4f00d2f4250c4095be93f8cd6d018c753894885430"}, "earmark_parser": {:hex, :earmark_parser, "1.4.37", "2ad73550e27c8946648b06905a57e4d454e4d7229c2dafa72a0348c99d8be5f7", [:mix], [], "hexpm", "6b19783f2802f039806f375610faa22da130b8edc21209d0bff47918bb48360e"}, "elixir_make": {:hex, :elixir_make, "0.7.7", "7128c60c2476019ed978210c245badf08b03dbec4f24d05790ef791da11aa17c", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}], "hexpm", "5bc19fff950fad52bbe5f211b12db9ec82c6b34a9647da0c2224b8b8464c7e6c"}, "ex_doc": {:hex, :ex_doc, "0.30.6", "5f8b54854b240a2b55c9734c4b1d0dd7bdd41f71a095d42a70445c03cf05a281", [:mix], [{:earmark_parser, "~> 1.4.31", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "bd48f2ddacf4e482c727f9293d9498e0881597eae6ddc3d9562bd7923375109f"}, @@ -9,8 +11,9 @@ "makeup_erlang": {:hex, :makeup_erlang, "0.1.2", "ad87296a092a46e03b7e9b0be7631ddcf64c790fa68a9ef5323b6cbb36affc72", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "f3f5a1ca93ce6e092d92b6d9c049bcda58a3b617a8d888f8e7231c85630e8108"}, "nimble_options": {:hex, :nimble_options, "0.5.2", "42703307b924880f8c08d97719da7472673391905f528259915782bb346e0a1b", [:mix], [], "hexpm", "4da7f904b915fd71db549bcdc25f8d56f378ef7ae07dc1d372cbe72ba950dce0"}, "nimble_parsec": {:hex, :nimble_parsec, "1.3.1", "2c54013ecf170e249e9291ed0a62e5832f70a476c61da16f6aac6dca0189f2af", [:mix], [], "hexpm", "2682e3c0b2eb58d90c6375fc0cc30bc7be06f365bf72608804fb9cffa5e1b167"}, - "nx": {:git, "https://github.com/elixir-nx/nx.git", "a0b7e2e5cc7a62a55cd2e7bbc3e44ba2ac1c996b", [sparse: "nx"]}, + "nx": {:git, "https://github.com/elixir-nx/nx.git", "ef464cfd0935eb4c2c1fa9a40f099b098a0b95bf", [sparse: "nx", branch: "v0.6"]}, "polaris": {:hex, :polaris, "0.1.0", "dca61b18e3e801ecdae6ac9f0eca5f19792b44a5cb4b8d63db50fc40fc038d22", [:mix], [{:nx, "~> 0.5", [hex: :nx, repo: "hexpm", optional: false]}], 
"hexpm", "13ef2b166650e533cb24b10e2f3b8ab4f2f449ba4d63156e8c569527f206e2c2"}, + "statistex": {:hex, :statistex, "1.0.0", "f3dc93f3c0c6c92e5f291704cf62b99b553253d7969e9a5fa713e5481cd858a5", [:mix], [], "hexpm", "ff9d8bee7035028ab4742ff52fc80a2aa35cece833cf5319009b52f1b5a86c27"}, "telemetry": {:hex, :telemetry, "1.2.1", "68fdfe8d8f05a8428483a97d7aab2f268aaff24b49e0f599faa091f1d4e7f61c", [:rebar3], [], "hexpm", "dad9ce9d8effc621708f99eac538ef1cbe05d6a874dd741de2e689c47feafed5"}, "xla": {:hex, :xla, "0.5.0", "fb8a02c02e5a4f4531fbf18a90c325e471037f983f0115d23f510e7dd9a6aa65", [:make, :mix], [{:elixir_make, "~> 0.4", [hex: :elixir_make, repo: "hexpm", optional: false]}], "hexpm", "571ac797a4244b8ba8552ed0295a54397bd896708be51e4da6cbb784f6678061"}, } diff --git a/notebooks/linear_regression.livemd b/notebooks/linear_regression.livemd index dbfe1083..2ddec874 100644 --- a/notebooks/linear_regression.livemd +++ b/notebooks/linear_regression.livemd @@ -780,15 +780,28 @@ Tucan.heatmap(corr_to_plot, "x", "y", "corr_val", annotate: true, text_color: [{nil, 0, "white"}, {0, nil, "black"}] ) +<<<<<<< HEAD |> Tucan.Scale.set_color_scheme(:viridis) |> Tucan.set_size(630, 630) |> Tucan.set_title("Correlation Matrix for California Housing", offset: 20) +======= +|> Data.heatmap(corr_to_plot, + x: "x", + y: "y", + color: [field: "corr_val", type: :quantitative, scale: [scheme: :viridis]], + text: "corr_val" +) +>>>>>>> 77ac659... Update mix.installs ``` ```vega-lite +<<<<<<< HEAD {"$schema":"https://vega.github.io/schema/vega-lite/v5.json","__tucan__":{"multilayer":true},"data":{"values":[{"corr_val":1.0,"x":"Bedrooms per rooms","y":"Bedrooms per rooms"},{"corr_val":0.06,"x":"Households","y":"Bedrooms per rooms"},{"corr_val":0.13,"x":"Housing median age","y":"Bedrooms per rooms"},{"corr_val":-0.12,"x":"Latitude","y":"Bedrooms per rooms"},{"corr_val":0.1,"x":"Longitude","y":"Bedrooms per rooms"},{"corr_val":-0.61,"x":"Median income","y":"Bedrooms per rooms"},{"corr_val":0.04,"x":"Population","y":"Bedrooms per rooms"},{"corr_val":0.0,"x":"Population per family","y":"Bedrooms per rooms"},{"corr_val":-0.41,"x":"Rooms per family","y":"Bedrooms per rooms"},{"corr_val":0.08,"x":"Total bedrooms","y":"Bedrooms per rooms"},{"corr_val":-0.19,"x":"Total rooms","y":"Bedrooms per rooms"},{"corr_val":-0.12,"x":"Ocean proximity","y":"Bedrooms per rooms"},{"corr_val":-0.25,"x":"Median house value","y":"Bedrooms per rooms"},{"corr_val":0.06,"x":"Bedrooms per rooms","y":"Households"},{"corr_val":1.0,"x":"Households","y":"Households"},{"corr_val":-0.31,"x":"Housing median age","y":"Households"},{"corr_val":-0.08,"x":"Latitude","y":"Households"},{"corr_val":0.06,"x":"Longitude","y":"Households"},{"corr_val":0.01,"x":"Median income","y":"Households"},{"corr_val":0.91,"x":"Population","y":"Households"},{"corr_val":-0.03,"x":"Population per family","y":"Households"},{"corr_val":-0.07,"x":"Rooms per family","y":"Households"},{"corr_val":0.97,"x":"Total bedrooms","y":"Households"},{"corr_val":0.92,"x":"Total rooms","y":"Households"},{"corr_val":-0.05,"x":"Ocean proximity","y":"Households"},{"corr_val":0.06,"x":"Median house value","y":"Households"},{"corr_val":0.13,"x":"Bedrooms per rooms","y":"Housing median age"},{"corr_val":-0.31,"x":"Households","y":"Housing median age"},{"corr_val":1.0,"x":"Housing median age","y":"Housing median age"},{"corr_val":0.02,"x":"Latitude","y":"Housing median age"},{"corr_val":-0.11,"x":"Longitude","y":"Housing median age"},{"corr_val":-0.13,"x":"Median income","y":"Housing median 
age"},{"corr_val":-0.3,"x":"Population","y":"Housing median age"},{"corr_val":0.01,"x":"Population per family","y":"Housing median age"},{"corr_val":-0.15,"x":"Rooms per family","y":"Housing median age"},{"corr_val":-0.32,"x":"Total bedrooms","y":"Housing median age"},{"corr_val":-0.36,"x":"Total rooms","y":"Housing median age"},{"corr_val":-0.12,"x":"Ocean proximity","y":"Housing median age"},{"corr_val":0.1,"x":"Median house value","y":"Housing median age"},{"corr_val":-0.12,"x":"Bedrooms per rooms","y":"Latitude"},{"corr_val":-0.08,"x":"Households","y":"Latitude"},{"corr_val":0.02,"x":"Housing median age","y":"Latitude"},{"corr_val":1.0,"x":"Latitude","y":"Latitude"},{"corr_val":-0.93,"x":"Longitude","y":"Latitude"},{"corr_val":-0.07,"x":"Median income","y":"Latitude"},{"corr_val":-0.11,"x":"Population","y":"Latitude"},{"corr_val":0.0,"x":"Population per family","y":"Latitude"},{"corr_val":0.11,"x":"Rooms per family","y":"Latitude"},{"corr_val":-0.07,"x":"Total bedrooms","y":"Latitude"},{"corr_val":-0.04,"x":"Total rooms","y":"Latitude"},{"corr_val":0.5,"x":"Ocean proximity","y":"Latitude"},{"corr_val":-0.14,"x":"Median house value","y":"Latitude"},{"corr_val":0.1,"x":"Bedrooms per rooms","y":"Longitude"},{"corr_val":0.06,"x":"Households","y":"Longitude"},{"corr_val":-0.11,"x":"Housing median age","y":"Longitude"},{"corr_val":-0.93,"x":"Latitude","y":"Longitude"},{"corr_val":1.0,"x":"Longitude","y":"Longitude"},{"corr_val":-0.02,"x":"Median income","y":"Longitude"},{"corr_val":0.11,"x":"Population","y":"Longitude"},{"corr_val":0.0,"x":"Population per family","y":"Longitude"},{"corr_val":-0.03,"x":"Rooms per family","y":"Longitude"},{"corr_val":0.08,"x":"Total bedrooms","y":"Longitude"},{"corr_val":0.05,"x":"Total rooms","y":"Longitude"},{"corr_val":-0.28,"x":"Ocean proximity","y":"Longitude"},{"corr_val":-0.05,"x":"Median house value","y":"Longitude"},{"corr_val":-0.61,"x":"Bedrooms per rooms","y":"Median income"},{"corr_val":0.01,"x":"Households","y":"Median income"},{"corr_val":-0.13,"x":"Housing median age","y":"Median income"},{"corr_val":-0.07,"x":"Latitude","y":"Median income"},{"corr_val":-0.02,"x":"Longitude","y":"Median income"},{"corr_val":1.0,"x":"Median income","y":"Median income"},{"corr_val":0.0,"x":"Population","y":"Median income"},{"corr_val":0.02,"x":"Population per family","y":"Median income"},{"corr_val":0.32,"x":"Rooms per family","y":"Median income"},{"corr_val":-0.01,"x":"Total bedrooms","y":"Median income"},{"corr_val":0.19,"x":"Total rooms","y":"Median income"},{"corr_val":-0.21,"x":"Ocean proximity","y":"Median income"},{"corr_val":0.69,"x":"Median house value","y":"Median income"},{"corr_val":0.04,"x":"Bedrooms per rooms","y":"Population"},{"corr_val":0.91,"x":"Households","y":"Population"},{"corr_val":-0.3,"x":"Housing median age","y":"Population"},{"corr_val":-0.11,"x":"Latitude","y":"Population"},{"corr_val":0.11,"x":"Longitude","y":"Population"},{"corr_val":0.0,"x":"Median income","y":"Population"},{"corr_val":1.0,"x":"Population","y":"Population"},{"corr_val":0.07,"x":"Population per family","y":"Population"},{"corr_val":-0.07,"x":"Rooms per family","y":"Population"},{"corr_val":0.87,"x":"Total bedrooms","y":"Population"},{"corr_val":0.86,"x":"Total rooms","y":"Population"},{"corr_val":-0.06,"x":"Ocean proximity","y":"Population"},{"corr_val":-0.03,"x":"Median house value","y":"Population"},{"corr_val":0.0,"x":"Bedrooms per rooms","y":"Population per family"},{"corr_val":-0.03,"x":"Households","y":"Population per family"},{"corr_val":0.01,"x":"Housing 
median age","y":"Population per family"},{"corr_val":0.0,"x":"Latitude","y":"Population per family"},{"corr_val":0.0,"x":"Longitude","y":"Population per family"},{"corr_val":0.02,"x":"Median income","y":"Population per family"},{"corr_val":0.07,"x":"Population","y":"Population per family"},{"corr_val":1.0,"x":"Population per family","y":"Population per family"},{"corr_val":-0.01,"x":"Rooms per family","y":"Population per family"},{"corr_val":-0.03,"x":"Total bedrooms","y":"Population per family"},{"corr_val":-0.02,"x":"Total rooms","y":"Population per family"},{"corr_val":0.01,"x":"Ocean proximity","y":"Population per family"},{"corr_val":-0.02,"x":"Median house value","y":"Population per family"},{"corr_val":-0.41,"x":"Bedrooms per rooms","y":"Rooms per family"},{"corr_val":-0.07,"x":"Households","y":"Rooms per family"},{"corr_val":-0.15,"x":"Housing median age","y":"Rooms per family"},{"corr_val":0.11,"x":"Latitude","y":"Rooms per family"},{"corr_val":-0.03,"x":"Longitude","y":"Rooms per family"},{"corr_val":0.32,"x":"Median income","y":"Rooms per family"},{"corr_val":-0.07,"x":"Population","y":"Rooms per family"},{"corr_val":-0.01,"x":"Population per family","y":"Rooms per family"},{"corr_val":1.0,"x":"Rooms per family","y":"Rooms per family"},{"corr_val":0.01,"x":"Total bedrooms","y":"Rooms per family"},{"corr_val":0.14,"x":"Total rooms","y":"Rooms per family"},{"corr_val":0.14,"x":"Ocean proximity","y":"Rooms per family"},{"corr_val":0.15,"x":"Median house value","y":"Rooms per family"},{"corr_val":0.08,"x":"Bedrooms per rooms","y":"Total bedrooms"},{"corr_val":0.97,"x":"Households","y":"Total bedrooms"},{"corr_val":-0.32,"x":"Housing median age","y":"Total bedrooms"},{"corr_val":-0.07,"x":"Latitude","y":"Total bedrooms"},{"corr_val":0.08,"x":"Longitude","y":"Total bedrooms"},{"corr_val":-0.01,"x":"Median income","y":"Total bedrooms"},{"corr_val":0.87,"x":"Population","y":"Total bedrooms"},{"corr_val":-0.03,"x":"Population per family","y":"Total bedrooms"},{"corr_val":0.01,"x":"Rooms per family","y":"Total bedrooms"},{"corr_val":1.0,"x":"Total bedrooms","y":"Total bedrooms"},{"corr_val":0.93,"x":"Total rooms","y":"Total bedrooms"},{"corr_val":-0.02,"x":"Ocean proximity","y":"Total bedrooms"},{"corr_val":0.05,"x":"Median house value","y":"Total bedrooms"},{"corr_val":-0.19,"x":"Bedrooms per rooms","y":"Total rooms"},{"corr_val":0.92,"x":"Households","y":"Total rooms"},{"corr_val":-0.36,"x":"Housing median age","y":"Total rooms"},{"corr_val":-0.04,"x":"Latitude","y":"Total rooms"},{"corr_val":0.05,"x":"Longitude","y":"Total rooms"},{"corr_val":0.19,"x":"Median income","y":"Total rooms"},{"corr_val":0.86,"x":"Population","y":"Total rooms"},{"corr_val":-0.02,"x":"Population per family","y":"Total rooms"},{"corr_val":0.14,"x":"Rooms per family","y":"Total rooms"},{"corr_val":0.93,"x":"Total bedrooms","y":"Total rooms"},{"corr_val":1.0,"x":"Total rooms","y":"Total rooms"},{"corr_val":0.02,"x":"Ocean proximity","y":"Total rooms"},{"corr_val":0.13,"x":"Median house value","y":"Total rooms"},{"corr_val":-0.12,"x":"Bedrooms per rooms","y":"Ocean proximity"},{"corr_val":-0.05,"x":"Households","y":"Ocean proximity"},{"corr_val":-0.12,"x":"Housing median age","y":"Ocean proximity"},{"corr_val":0.5,"x":"Latitude","y":"Ocean proximity"},{"corr_val":-0.28,"x":"Longitude","y":"Ocean proximity"},{"corr_val":-0.21,"x":"Median income","y":"Ocean proximity"},{"corr_val":-0.06,"x":"Population","y":"Ocean proximity"},{"corr_val":0.01,"x":"Population per family","y":"Ocean 
proximity"},{"corr_val":0.14,"x":"Rooms per family","y":"Ocean proximity"},{"corr_val":-0.02,"x":"Total bedrooms","y":"Ocean proximity"},{"corr_val":0.02,"x":"Total rooms","y":"Ocean proximity"},{"corr_val":1.0,"x":"Ocean proximity","y":"Ocean proximity"},{"corr_val":-0.4,"x":"Median house value","y":"Ocean proximity"},{"corr_val":-0.25,"x":"Bedrooms per rooms","y":"Median house value"},{"corr_val":0.06,"x":"Households","y":"Median house value"},{"corr_val":0.1,"x":"Housing median age","y":"Median house value"},{"corr_val":-0.14,"x":"Latitude","y":"Median house value"},{"corr_val":-0.05,"x":"Longitude","y":"Median house value"},{"corr_val":0.69,"x":"Median income","y":"Median house value"},{"corr_val":-0.03,"x":"Population","y":"Median house value"},{"corr_val":-0.02,"x":"Population per family","y":"Median house value"},{"corr_val":0.15,"x":"Rooms per family","y":"Median house value"},{"corr_val":0.05,"x":"Total bedrooms","y":"Median house value"},{"corr_val":0.13,"x":"Total rooms","y":"Median house value"},{"corr_val":-0.4,"x":"Ocean proximity","y":"Median house value"},{"corr_val":1.0,"x":"Median house value","y":"Median house value"}]},"height":630,"layer":[{"encoding":{"color":{"aggregate":"mean","field":"corr_val","scale":{"reverse":false,"scheme":"viridis"},"type":"quantitative"},"x":{"field":"x","type":"nominal"},"y":{"field":"y","type":"nominal"}},"mark":{"fillOpacity":1,"type":"rect"}},{"encoding":{"color":{"aggregate":"mean","condition":[{"test":"datum['mean_corr_val'] < 0","value":"white"},{"test":"datum['mean_corr_val'] >= 0","value":"black"},{"test":"true","value":"black"}],"field":"corr_val","scale":{"reverse":false,"scheme":"viridis"},"type":"quantitative"},"text":{"aggregate":"mean","field":"corr_val","type":"quantitative"},"x":{"field":"x","type":"nominal"},"y":{"field":"y","type":"nominal"}},"mark":"text"}],"title":{"offset":20,"text":"Correlation Matrix for California Housing"},"width":630} +======= +{"$schema":"https://vega.github.io/schema/vega-lite/v5.json","data":{"values":[{"corr_val":1.0,"x":"Bedrooms per rooms","y":"Bedrooms per rooms"},{"corr_val":0.06,"x":"Households","y":"Bedrooms per rooms"},{"corr_val":0.13,"x":"Housing median age","y":"Bedrooms per rooms"},{"corr_val":-0.12,"x":"Latitude","y":"Bedrooms per rooms"},{"corr_val":0.1,"x":"Longitude","y":"Bedrooms per rooms"},{"corr_val":-0.61,"x":"Median income","y":"Bedrooms per rooms"},{"corr_val":0.04,"x":"Population","y":"Bedrooms per rooms"},{"corr_val":0.0,"x":"Population per family","y":"Bedrooms per rooms"},{"corr_val":-0.41,"x":"Rooms per family","y":"Bedrooms per rooms"},{"corr_val":0.08,"x":"Total bedrooms","y":"Bedrooms per rooms"},{"corr_val":-0.19,"x":"Total rooms","y":"Bedrooms per rooms"},{"corr_val":-0.12,"x":"Ocean proximity","y":"Bedrooms per rooms"},{"corr_val":-0.25,"x":"Median house value","y":"Bedrooms per rooms"},{"corr_val":0.06,"x":"Bedrooms per rooms","y":"Households"},{"corr_val":1.0,"x":"Households","y":"Households"},{"corr_val":-0.31,"x":"Housing median age","y":"Households"},{"corr_val":-0.08,"x":"Latitude","y":"Households"},{"corr_val":0.06,"x":"Longitude","y":"Households"},{"corr_val":0.01,"x":"Median income","y":"Households"},{"corr_val":0.91,"x":"Population","y":"Households"},{"corr_val":-0.03,"x":"Population per family","y":"Households"},{"corr_val":-0.07,"x":"Rooms per family","y":"Households"},{"corr_val":0.97,"x":"Total bedrooms","y":"Households"},{"corr_val":0.92,"x":"Total rooms","y":"Households"},{"corr_val":-0.05,"x":"Ocean 
proximity","y":"Households"},{"corr_val":0.06,"x":"Median house value","y":"Households"},{"corr_val":0.13,"x":"Bedrooms per rooms","y":"Housing median age"},{"corr_val":-0.31,"x":"Households","y":"Housing median age"},{"corr_val":1.0,"x":"Housing median age","y":"Housing median age"},{"corr_val":0.02,"x":"Latitude","y":"Housing median age"},{"corr_val":-0.11,"x":"Longitude","y":"Housing median age"},{"corr_val":-0.13,"x":"Median income","y":"Housing median age"},{"corr_val":-0.3,"x":"Population","y":"Housing median age"},{"corr_val":0.01,"x":"Population per family","y":"Housing median age"},{"corr_val":-0.15,"x":"Rooms per family","y":"Housing median age"},{"corr_val":-0.32,"x":"Total bedrooms","y":"Housing median age"},{"corr_val":-0.36,"x":"Total rooms","y":"Housing median age"},{"corr_val":-0.12,"x":"Ocean proximity","y":"Housing median age"},{"corr_val":0.1,"x":"Median house value","y":"Housing median age"},{"corr_val":-0.12,"x":"Bedrooms per rooms","y":"Latitude"},{"corr_val":-0.08,"x":"Households","y":"Latitude"},{"corr_val":0.02,"x":"Housing median age","y":"Latitude"},{"corr_val":1.0,"x":"Latitude","y":"Latitude"},{"corr_val":-0.93,"x":"Longitude","y":"Latitude"},{"corr_val":-0.07,"x":"Median income","y":"Latitude"},{"corr_val":-0.11,"x":"Population","y":"Latitude"},{"corr_val":0.0,"x":"Population per family","y":"Latitude"},{"corr_val":0.11,"x":"Rooms per family","y":"Latitude"},{"corr_val":-0.07,"x":"Total bedrooms","y":"Latitude"},{"corr_val":-0.04,"x":"Total rooms","y":"Latitude"},{"corr_val":0.5,"x":"Ocean proximity","y":"Latitude"},{"corr_val":-0.14,"x":"Median house value","y":"Latitude"},{"corr_val":0.1,"x":"Bedrooms per rooms","y":"Longitude"},{"corr_val":0.06,"x":"Households","y":"Longitude"},{"corr_val":-0.11,"x":"Housing median age","y":"Longitude"},{"corr_val":-0.93,"x":"Latitude","y":"Longitude"},{"corr_val":1.0,"x":"Longitude","y":"Longitude"},{"corr_val":-0.02,"x":"Median income","y":"Longitude"},{"corr_val":0.11,"x":"Population","y":"Longitude"},{"corr_val":0.0,"x":"Population per family","y":"Longitude"},{"corr_val":-0.03,"x":"Rooms per family","y":"Longitude"},{"corr_val":0.08,"x":"Total bedrooms","y":"Longitude"},{"corr_val":0.05,"x":"Total rooms","y":"Longitude"},{"corr_val":-0.28,"x":"Ocean proximity","y":"Longitude"},{"corr_val":-0.05,"x":"Median house value","y":"Longitude"},{"corr_val":-0.61,"x":"Bedrooms per rooms","y":"Median income"},{"corr_val":0.01,"x":"Households","y":"Median income"},{"corr_val":-0.13,"x":"Housing median age","y":"Median income"},{"corr_val":-0.07,"x":"Latitude","y":"Median income"},{"corr_val":-0.02,"x":"Longitude","y":"Median income"},{"corr_val":1.0,"x":"Median income","y":"Median income"},{"corr_val":0.0,"x":"Population","y":"Median income"},{"corr_val":0.02,"x":"Population per family","y":"Median income"},{"corr_val":0.32,"x":"Rooms per family","y":"Median income"},{"corr_val":-0.01,"x":"Total bedrooms","y":"Median income"},{"corr_val":0.19,"x":"Total rooms","y":"Median income"},{"corr_val":-0.21,"x":"Ocean proximity","y":"Median income"},{"corr_val":0.69,"x":"Median house value","y":"Median income"},{"corr_val":0.04,"x":"Bedrooms per rooms","y":"Population"},{"corr_val":0.91,"x":"Households","y":"Population"},{"corr_val":-0.3,"x":"Housing median age","y":"Population"},{"corr_val":-0.11,"x":"Latitude","y":"Population"},{"corr_val":0.11,"x":"Longitude","y":"Population"},{"corr_val":0.0,"x":"Median income","y":"Population"},{"corr_val":1.0,"x":"Population","y":"Population"},{"corr_val":0.07,"x":"Population per 
family","y":"Population"},{"corr_val":-0.07,"x":"Rooms per family","y":"Population"},{"corr_val":0.87,"x":"Total bedrooms","y":"Population"},{"corr_val":0.86,"x":"Total rooms","y":"Population"},{"corr_val":-0.06,"x":"Ocean proximity","y":"Population"},{"corr_val":-0.03,"x":"Median house value","y":"Population"},{"corr_val":0.0,"x":"Bedrooms per rooms","y":"Population per family"},{"corr_val":-0.03,"x":"Households","y":"Population per family"},{"corr_val":0.01,"x":"Housing median age","y":"Population per family"},{"corr_val":0.0,"x":"Latitude","y":"Population per family"},{"corr_val":0.0,"x":"Longitude","y":"Population per family"},{"corr_val":0.02,"x":"Median income","y":"Population per family"},{"corr_val":0.07,"x":"Population","y":"Population per family"},{"corr_val":1.0,"x":"Population per family","y":"Population per family"},{"corr_val":-0.01,"x":"Rooms per family","y":"Population per family"},{"corr_val":-0.03,"x":"Total bedrooms","y":"Population per family"},{"corr_val":-0.02,"x":"Total rooms","y":"Population per family"},{"corr_val":0.01,"x":"Ocean proximity","y":"Population per family"},{"corr_val":-0.02,"x":"Median house value","y":"Population per family"},{"corr_val":-0.41,"x":"Bedrooms per rooms","y":"Rooms per family"},{"corr_val":-0.07,"x":"Households","y":"Rooms per family"},{"corr_val":-0.15,"x":"Housing median age","y":"Rooms per family"},{"corr_val":0.11,"x":"Latitude","y":"Rooms per family"},{"corr_val":-0.03,"x":"Longitude","y":"Rooms per family"},{"corr_val":0.32,"x":"Median income","y":"Rooms per family"},{"corr_val":-0.07,"x":"Population","y":"Rooms per family"},{"corr_val":-0.01,"x":"Population per family","y":"Rooms per family"},{"corr_val":1.0,"x":"Rooms per family","y":"Rooms per family"},{"corr_val":0.01,"x":"Total bedrooms","y":"Rooms per family"},{"corr_val":0.14,"x":"Total rooms","y":"Rooms per family"},{"corr_val":0.14,"x":"Ocean proximity","y":"Rooms per family"},{"corr_val":0.15,"x":"Median house value","y":"Rooms per family"},{"corr_val":0.08,"x":"Bedrooms per rooms","y":"Total bedrooms"},{"corr_val":0.97,"x":"Households","y":"Total bedrooms"},{"corr_val":-0.32,"x":"Housing median age","y":"Total bedrooms"},{"corr_val":-0.07,"x":"Latitude","y":"Total bedrooms"},{"corr_val":0.08,"x":"Longitude","y":"Total bedrooms"},{"corr_val":-0.01,"x":"Median income","y":"Total bedrooms"},{"corr_val":0.87,"x":"Population","y":"Total bedrooms"},{"corr_val":-0.03,"x":"Population per family","y":"Total bedrooms"},{"corr_val":0.01,"x":"Rooms per family","y":"Total bedrooms"},{"corr_val":1.0,"x":"Total bedrooms","y":"Total bedrooms"},{"corr_val":0.93,"x":"Total rooms","y":"Total bedrooms"},{"corr_val":-0.02,"x":"Ocean proximity","y":"Total bedrooms"},{"corr_val":0.05,"x":"Median house value","y":"Total bedrooms"},{"corr_val":-0.19,"x":"Bedrooms per rooms","y":"Total rooms"},{"corr_val":0.92,"x":"Households","y":"Total rooms"},{"corr_val":-0.36,"x":"Housing median age","y":"Total rooms"},{"corr_val":-0.04,"x":"Latitude","y":"Total rooms"},{"corr_val":0.05,"x":"Longitude","y":"Total rooms"},{"corr_val":0.19,"x":"Median income","y":"Total rooms"},{"corr_val":0.86,"x":"Population","y":"Total rooms"},{"corr_val":-0.02,"x":"Population per family","y":"Total rooms"},{"corr_val":0.14,"x":"Rooms per family","y":"Total rooms"},{"corr_val":0.93,"x":"Total bedrooms","y":"Total rooms"},{"corr_val":1.0,"x":"Total rooms","y":"Total rooms"},{"corr_val":0.02,"x":"Ocean proximity","y":"Total rooms"},{"corr_val":0.13,"x":"Median house value","y":"Total rooms"},{"corr_val":-0.12,"x":"Bedrooms 
per rooms","y":"Ocean proximity"},{"corr_val":-0.05,"x":"Households","y":"Ocean proximity"},{"corr_val":-0.12,"x":"Housing median age","y":"Ocean proximity"},{"corr_val":0.5,"x":"Latitude","y":"Ocean proximity"},{"corr_val":-0.28,"x":"Longitude","y":"Ocean proximity"},{"corr_val":-0.21,"x":"Median income","y":"Ocean proximity"},{"corr_val":-0.06,"x":"Population","y":"Ocean proximity"},{"corr_val":0.01,"x":"Population per family","y":"Ocean proximity"},{"corr_val":0.14,"x":"Rooms per family","y":"Ocean proximity"},{"corr_val":-0.02,"x":"Total bedrooms","y":"Ocean proximity"},{"corr_val":0.02,"x":"Total rooms","y":"Ocean proximity"},{"corr_val":1.0,"x":"Ocean proximity","y":"Ocean proximity"},{"corr_val":-0.4,"x":"Median house value","y":"Ocean proximity"},{"corr_val":-0.25,"x":"Bedrooms per rooms","y":"Median house value"},{"corr_val":0.06,"x":"Households","y":"Median house value"},{"corr_val":0.1,"x":"Housing median age","y":"Median house value"},{"corr_val":-0.14,"x":"Latitude","y":"Median house value"},{"corr_val":-0.05,"x":"Longitude","y":"Median house value"},{"corr_val":0.69,"x":"Median income","y":"Median house value"},{"corr_val":-0.03,"x":"Population","y":"Median house value"},{"corr_val":-0.02,"x":"Population per family","y":"Median house value"},{"corr_val":0.15,"x":"Rooms per family","y":"Median house value"},{"corr_val":0.05,"x":"Total bedrooms","y":"Median house value"},{"corr_val":0.13,"x":"Total rooms","y":"Median house value"},{"corr_val":-0.4,"x":"Ocean proximity","y":"Median house value"},{"corr_val":1.0,"x":"Median house value","y":"Median house value"}]},"height":630,"layer":[{"encoding":{"color":{"field":"corr_val","scale":{"scheme":"viridis"},"type":"quantitative"},"x":{"field":"x","type":"nominal"},"y":{"field":"y","type":"nominal"}},"mark":"rect"},{"encoding":{"text":{"field":"corr_val","type":"quantitative"},"x":{"field":"x","type":"nominal"},"y":{"field":"y","type":"nominal"}},"mark":"text"}],"title":{"offset":20,"text":"Correlation Matrix for California Housing"},"width":630} +>>>>>>> 77ac659... Update mix.installs ``` We can spot that _median_house_value_ is strongly correlated with _median_income_. It's pretty straightforward, the more money you have, the more expensive house you can buy. Non-obvious is a negative correlation with _bedrooms_per_rooms_. But it also can be explained. Bedrooms are the most crucial rooms in the house. Firstly, you need to guarantee that you have a house with enough bedrooms. If this condition is satisfied, then you can focus on "additional rooms" like a chill room, cabinets and so on. So if you buy a house with more additional rooms, then you decrease the ratio. 
diff --git a/test/scholar/cluster/affinity_propagation_test.exs b/test/scholar/cluster/affinity_propagation_test.exs index 320e4ced..f63e4160 100644 --- a/test/scholar/cluster/affinity_propagation_test.exs +++ b/test/scholar/cluster/affinity_propagation_test.exs @@ -68,7 +68,7 @@ defmodule Scholar.Cluster.AffinityPropagationTest do end test "fit and compute_values" do - model = AffinityPropagation.fit(x(), key: key()) + model = AffinityPropagation.fit(x(), key: key(), preference: :median) model = AffinityPropagation.prune(model) @@ -94,14 +94,14 @@ defmodule Scholar.Cluster.AffinityPropagationTest do end test "predict with pruning" do - model = AffinityPropagation.fit(x(), key: key()) + model = AffinityPropagation.fit(x(), key: key(), preference: :median) model = AffinityPropagation.prune(model) preds = AffinityPropagation.predict(model, x_test()) assert preds == Nx.tensor([0, 2, 0, 5, 5, 5, 2, 2, 5, 2]) end test "predict without pruning" do - model = AffinityPropagation.fit(x(), key: key()) + model = AffinityPropagation.fit(x(), key: key(), preference: :median) preds = AffinityPropagation.predict(model, x_test()) assert preds == Nx.tensor([2, 9, 2, 34, 34, 34, 9, 9, 34, 9]) end diff --git a/test/scholar/manifold/mds_test.exs b/test/scholar/manifold/mds_test.exs new file mode 100644 index 00000000..87d5cd93 --- /dev/null +++ b/test/scholar/manifold/mds_test.exs @@ -0,0 +1,212 @@ +defmodule Scholar.Manifold.MDSTest do + use Scholar.Case, async: true + alias Scholar.Manifold.MDS + doctest MDS + + def x() do + Nx.iota({10, 50}) + end + + def key() do + Nx.Random.key(42) + end + + test "non-default num_components" do + key = key() + x = x() + model = EXLA.jit_apply(&MDS.fit(&1, num_components: 5, key: &2), [x, key]) + + assert_all_close( + model.embedding, + Nx.tensor([ + [ + 57.28269577026367, + -678.6760864257812, + 811.1503295898438, + -251.1714324951172, + 1156.7987060546875 + ], + [ + 7.623606204986572, + -544.2373046875, + 604.0946655273438, + -225.99559020996094, + 903.2800903320312 + ], + [ + -7.334737300872803, + -429.81671142578125, + 402.1512145996094, + -163.3682861328125, + 639.9016723632812 + ], + [ + 13.86670207977295, + -296.5096435546875, + 223.15061950683594, + -84.07274627685547, + 374.4827575683594 + ], + [ + 38.73623275756836, + -134.54620361328125, + 50.4241943359375, + -38.010799407958984, + 113.90003967285156 + ], + [ + 18.940887451171875, + 30.962879180908203, + -127.7795639038086, + 45.001678466796875, + -131.29234313964844 + ], + [ + 18.05344581604004, + 222.0098114013672, + -292.34197998046875, + 86.87554168701172, + -378.58544921875 + ], + [ + -3.060556173324585, + 429.6268005371094, + -436.2151794433594, + 146.84103393554688, + -621.5556640625 + ], + [ + -55.395423889160156, + 613.6642456054688, + -565.1470947265625, + 225.3615264892578, + -882.4739379882812 + ], + [ + -88.7128677368164, + 787.5221557617188, + -669.4872436523438, + 258.53912353515625, + -1174.455810546875 + ] + ]) + ) + + assert_all_close(model.stress, 698.4426879882812) + assert_all_close(model.n_iter, Nx.tensor(152)) + end + + test "non-default metric" do + key = key() + x = x() + model = EXLA.jit_apply(&MDS.fit(&1, metric: false, key: &2), [x, key]) + + assert_all_close( + model.embedding, + Nx.tensor([ + [-0.23465712368488312, 0.6921732425689697], + [-0.3380763530731201, 0.4378605782985687], + [-0.15237200260162354, 0.26230522990226746], + [0.09990488737821579, 0.2603200674057007], + [0.15598554909229279, 0.03315458819270134], + [0.41043558716773987, 0.13559512794017792], + 
+        [0.24686546623706818, -0.24366283416748047],
+        [0.1395486444234848, -0.4151153564453125],
+        [-0.07875102013349533, -0.530768096446991],
+        [-0.21976199746131897, -0.6417303681373596]
+      ])
+    )
+
+    assert_all_close(model.stress, 0.1966342180967331)
+    assert_all_close(model.n_iter, Nx.tensor(38))
+  end
+
+  test "option normalized_stress with metric set to false" do
+    key = key()
+    x = x()
+
+    model =
+      EXLA.jit_apply(&MDS.fit(&1, metric: false, key: &2, normalized_stress: true), [x, key])
+
+    assert_all_close(
+      model.embedding,
+      Nx.tensor([
+        [-0.17997372150421143, 0.7225074768066406],
+        [-0.3138044774532318, 0.3934117257595062],
+        [-0.0900932177901268, 0.19507794082164764],
+        [0.2092301845550537, 0.295993834733963],
+        [0.24611115455627441, 0.0019988759886473417],
+        [0.4951189458370209, 0.08028026670217514],
+        [0.12963972985744476, -0.3193856179714203],
+        [0.19291982054710388, -0.44776636362075806],
+        [-0.2770233750343323, -0.4146113097667694],
+        [-0.3582141101360321, -0.5444929003715515]
+      ])
+    )
+
+    assert_all_close(model.stress, 0.13638167083263397)
+    assert_all_close(model.n_iter, Nx.tensor(20))
+  end
+
+  test "epsilon set to a value smaller than the default" do
+    key = key()
+    x = x()
+
+    model =
+      EXLA.jit_apply(&MDS.fit(&1, metric: false, key: &2, normalized_stress: true, eps: 1.0e-4), [
+        x,
+        key
+      ])
+
+    assert_all_close(
+      model.embedding,
+      Nx.tensor([
+        [-0.35130712389945984, 0.6258886456489563],
+        [-0.4270354211330414, 0.4396686255931854],
+        [-0.30671024322509766, 0.2688262462615967],
+        [-0.12758131325244904, 0.18020282685756683],
+        [-0.05403336510062218, 0.01867777667939663],
+        [0.17203716933727264, 0.044468216598033905],
+        [0.2791652977466583, -0.09437420219182968],
+        [0.2869844138622284, -0.3071449398994446],
+        [0.2768166959285736, -0.49931082129478455],
+        [0.2563020884990692, -0.678329348564148]
+      ])
+    )
+
+    # As expected, a smaller stress (loss) at the cost of more iterations.
+    assert_all_close(model.stress, 0.03167537972331047)
+    assert_all_close(model.n_iter, Nx.tensor(116))
+  end
+
+  test "smaller max_iter value (100)" do
+    key = key()
+    x = x()
+
+    model =
+      EXLA.jit_apply(
+        &MDS.fit(&1, metric: false, key: &2, normalized_stress: true, eps: 1.0e-4, max_iter: 100),
+        [x, key]
+      )
+
+    assert_all_close(
+      model.embedding,
+      Nx.tensor([
+        [-0.34521010518074036, 0.6345276236534119],
+        [-0.4247266352176666, 0.43899187445640564],
+        [-0.2903931438922882, 0.2677172124385834],
+        [-0.09941618889570236, 0.19031266868114471],
+        [-0.03261081129312515, 0.019261524081230164],
+        [0.2049849033355713, 0.07233452051877975],
+        [0.29381951689720154, -0.09455471485853195],
+        [0.27441200613975525, -0.320201575756073],
+        [0.2368578165769577, -0.5156480669975281],
+        [0.19262047111988068, -0.6936381459236145]
+      ])
+    )
+
+    # Same params as the previous test, but the number of iterations is capped at 100.
+    assert_all_close(model.stress, 0.040396787226200104)
+    assert_all_close(model.n_iter, Nx.tensor(100))
+  end
+end
diff --git a/test/scholar/metrics/classification_test.exs b/test/scholar/metrics/classification_test.exs
index 7667e2df..975abed0 100644
--- a/test/scholar/metrics/classification_test.exs
+++ b/test/scholar/metrics/classification_test.exs
@@ -34,4 +34,42 @@ defmodule Scholar.Metrics.ClassificationTest do
       assert_all_close(fbeta_scores, Classification.precision(y_true, y_pred, num_classes: 2))
     end
   end
+
+  describe "mcc/2" do
+    test "returns 1 for perfect predictions" do
+      y_true = Nx.tensor([1, 0, 1, 0, 1])
+      y_pred = Nx.tensor([1, 0, 1, 0, 1])
+      assert Classification.mcc(y_true,
y_pred) == Nx.tensor([1.0], type: :f32) + end + + test "returns -1 for completely wrong predictions" do + y_true = Nx.tensor([1, 0, 1, 0, 1]) + y_pred = Nx.tensor([0, 1, 0, 1, 0]) + assert Classification.mcc(y_true, y_pred) == Nx.tensor([-1.0], type: :f32) + end + + test "returns 0 when all predictions are positive" do + y_true = Nx.tensor([1, 0, 1, 0, 1]) + y_pred = Nx.tensor([1, 1, 1, 1, 1]) + assert Classification.mcc(y_true, y_pred) == Nx.tensor([0.0], type: :f32) + end + + test "returns 0 when all predictions are negative" do + y_true = Nx.tensor([1, 0, 1, 0, 1]) + y_pred = Nx.tensor([0, 0, 0, 0, 0]) + assert Classification.mcc(y_true, y_pred) == Nx.tensor([0.0], type: :f32) + end + + test "computes MCC for generic case" do + y_true = Nx.tensor([1, 0, 1, 0, 1]) + y_pred = Nx.tensor([1, 0, 1, 1, 1]) + assert Classification.mcc(y_true, y_pred) == Nx.tensor([0.6123723983764648], type: :f32) + end + + test "returns 0 when TP, TN, FP, and FN are all 0" do + y_true = Nx.tensor([0, 0, 0, 0, 0]) + y_pred = Nx.tensor([0, 0, 0, 0, 0]) + assert Classification.mcc(y_true, y_pred) == Nx.tensor([0.0], type: :f32) + end + end end diff --git a/test/scholar/metrics/ranking_test.exs b/test/scholar/metrics/ranking_test.exs new file mode 100644 index 00000000..a189ae52 --- /dev/null +++ b/test/scholar/metrics/ranking_test.exs @@ -0,0 +1,47 @@ +defmodule Scholar.Metrics.RankingTest do + use Scholar.Case, async: true + alias Scholar.Metrics.Ranking + + describe "dcg/3" do + test "computes DCG when there are no ties" do + y_true = Nx.tensor([3, 2, 3, 0, 1, 2]) + y_score = Nx.tensor([3.0, 2.2, 3.5, 0.5, 1.0, 2.1]) + + result = Ranking.dcg(y_true, y_score) + + x = Nx.tensor([7.140995025634766]) + assert x == Nx.broadcast(result, {1}) + end + + test "computes DCG with ties" do + y_true = Nx.tensor([3, 3, 3]) + y_score = Nx.tensor([2.0, 2.0, 3.5]) + + result = Ranking.dcg(y_true, y_score) + + x = Nx.tensor([6.3927892607143715]) + assert x == Nx.broadcast(result, {1}) + end + + test "raises error when shapes mismatch" do + y_true = Nx.tensor([3, 2, 3]) + y_score = Nx.tensor([3.0, 2.2, 3.5, 0.5]) + + assert_raise ArgumentError, + "expected tensor to have shape {3}, got tensor with shape {4}", + fn -> + Ranking.dcg(y_true, y_score) + end + end + + test "computes DCG for top-k values" do + y_true = Nx.tensor([3, 2, 3, 0, 1, 2]) + y_score = Nx.tensor([3.0, 2.2, 3.5, 0.5, 1.0, 2.1]) + + result = Ranking.dcg(y_true, y_score, k: 3) + + x = Nx.tensor([5.892789363861084]) + assert x == Nx.broadcast(result, {1}) + end + end +end diff --git a/test/scholar/metrics/regression_test.exs b/test/scholar/metrics/regression_test.exs index 68258382..2522da25 100644 --- a/test/scholar/metrics/regression_test.exs +++ b/test/scholar/metrics/regression_test.exs @@ -3,4 +3,67 @@ defmodule Scholar.Metrics.RegressionTest do alias Scholar.Metrics.Regression doctest Regression + + describe "mean_tweedie_deviance!/3" do + test "raise when y_pred <= 0 and power < 0" do + power = -1 + y_true = Nx.tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1], type: :u32) + y_pred = Nx.tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0], type: :u32) + + assert_raise RuntimeError, ~r/mean Tweedie deviance/, fn -> + Regression.mean_tweedie_deviance!(y_true, y_pred, power) + end + end + + test "raise when y_pred <= 0 and 1 <= power < 2" do + power = 1 + y_true = Nx.tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1], type: :u32) + y_pred = Nx.tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0], type: :u32) + + assert_raise RuntimeError, ~r/mean Tweedie deviance/, fn -> + Regression.mean_tweedie_deviance!(y_true, 
y_pred, power)
+      end
+    end
+
+    test "raise when y_pred <= 0 and power >= 2" do
+      power = 2
+      y_true = Nx.tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1], type: :u32)
+      y_pred = Nx.tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0], type: :u32)
+
+      assert_raise RuntimeError, ~r/mean Tweedie deviance/, fn ->
+        Regression.mean_tweedie_deviance!(y_true, y_pred, power)
+      end
+    end
+
+    test "raise when y_true < 0 and 1 <= power < 2" do
+      power = 1
+      y_true = Nx.tensor([-1, 0, 0, 0, 0, 1, 1, 1, 1, 1], type: :s32)
+      y_pred = Nx.tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], type: :s32)
+
+      assert_raise RuntimeError, ~r/mean Tweedie deviance/, fn ->
+        Regression.mean_tweedie_deviance!(y_true, y_pred, power)
+      end
+    end
+
+    test "raise when y_true <= 0 and power >= 2" do
+      power = 2
+      y_true = Nx.tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1], type: :s32)
+      y_pred = Nx.tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], type: :s32)
+
+      assert_raise RuntimeError, ~r/mean Tweedie deviance/, fn ->
+        Regression.mean_tweedie_deviance!(y_true, y_pred, power)
+      end
+    end
+  end
+
+  describe "d2_tweedie_score/3" do
+    test "equals R^2 when power is 0" do
+      y_true = Nx.tensor([1, 1, 1, 1, 1, 2, 2, 1, 3, 1], type: :u32)
+      y_pred = Nx.tensor([2, 2, 1, 1, 2, 2, 2, 1, 3, 1], type: :u32)
+      d2 = Regression.d2_tweedie_score(y_true, y_pred, 0)
+      r2 = Regression.r2_score(y_true, y_pred)
+
+      # `Nx.equal/2` returns a (truthy) tensor, so asserting on it directly
+      # would always pass; compare the values instead.
+      assert_all_close(d2, r2)
+    end
+  end
+end
diff --git a/test/scholar/neighbors/kd_tree_test.exs b/test/scholar/neighbors/kd_tree_test.exs
new file mode 100644
index 00000000..3df22fbe
--- /dev/null
+++ b/test/scholar/neighbors/kd_tree_test.exs
@@ -0,0 +1,72 @@
+defmodule Scholar.Neighbors.KDTreeTest do
+  use ExUnit.Case, async: true
+  doctest Scholar.Neighbors.KDTree
+
+  defp example do
+    Nx.tensor([
+      [10, 15],
+      [46, 63],
+      [68, 21],
+      [40, 33],
+      [25, 54],
+      [15, 43],
+      [44, 58],
+      [45, 40],
+      [62, 69],
+      [53, 67]
+    ])
+  end
+
+  describe "unbounded" do
+    test "sample" do
+      assert %Scholar.Neighbors.KDTree{levels: 4, indexes: indexes} =
+               Scholar.Neighbors.KDTree.unbounded(example(), compiler: EXLA)
+
+      assert Nx.to_flat_list(indexes) == [1, 5, 9, 3, 6, 2, 8, 0, 7, 4]
+    end
+
+    test "float" do
+      assert %Scholar.Neighbors.KDTree{levels: 4, indexes: indexes} =
+               Scholar.Neighbors.KDTree.unbounded(example() |> Nx.as_type(:f32),
+                 compiler: EXLA
+               )
+
+      assert Nx.to_flat_list(indexes) == [1, 5, 9, 3, 6, 2, 8, 0, 7, 4]
+    end
+
+    test "corner cases" do
+      assert %Scholar.Neighbors.KDTree{levels: 1, indexes: indexes} =
+               Scholar.Neighbors.KDTree.unbounded(Nx.iota({1, 2}), compiler: EXLA)
+
+      assert indexes == Nx.u32([0])
+
+      assert %Scholar.Neighbors.KDTree{levels: 2, indexes: indexes} =
+               Scholar.Neighbors.KDTree.unbounded(Nx.iota({2, 2}), compiler: EXLA)
+
+      assert indexes == Nx.u32([1, 0])
+    end
+  end
+
+  describe "bounded" do
+    test "iota" do
+      assert %Scholar.Neighbors.KDTree{levels: 3, indexes: indexes} =
+               Scholar.Neighbors.KDTree.bounded(Nx.iota({5, 2}), 10)
+
+      assert indexes == Nx.u32([3, 1, 4, 0, 2])
+    end
+
+    test "float" do
+      assert %Scholar.Neighbors.KDTree{levels: 4, indexes: indexes} =
+               Scholar.Neighbors.KDTree.bounded(example() |> Nx.as_type(:f32), 100)
+
+      assert Nx.to_flat_list(indexes) == [1, 5, 9, 3, 6, 2, 8, 0, 7, 4]
+    end
+
+    test "sample" do
+      assert %Scholar.Neighbors.KDTree{levels: 4, indexes: indexes} =
+               Scholar.Neighbors.KDTree.bounded(example(), 100)
+
+      assert Nx.to_flat_list(indexes) == [1, 5, 9, 3, 6, 2, 8, 0, 7, 4]
+    end
+  end
+end
diff --git a/test/test_helper.exs b/test/test_helper.exs
index e69de29b..5e96292b 100644
--- a/test/test_helper.exs
+++
b/test/test_helper.exs @@ -0,0 +1 @@ +Application.ensure_all_started(:exla)