diff --git a/.travis.yml b/.travis.yml
index 55b44fc..3312542 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,8 +1,7 @@
 language: python
 python:
-  - "2.7"
-  - "3.5"
   - "3.6"
+  - "3.7"
 script:
   - "python setup.py test"
 before_install:
diff --git a/VERSION b/VERSION
index 1d71ef9..e6adf3f 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.3
\ No newline at end of file
+0.4
\ No newline at end of file
diff --git a/dcor/__init__.py b/dcor/__init__.py
index 9c4669b..e398e9e 100644
--- a/dcor/__init__.py
+++ b/dcor/__init__.py
@@ -19,7 +19,8 @@
                     u_distance_correlation_sqr, u_distance_stats_sqr,
                     distance_correlation_af_inv_sqr,
-                    distance_correlation_af_inv)
+                    distance_correlation_af_inv,
+                    DistanceCovarianceMethod)
 from ._dcor_internals import (double_centered, u_centered,  # noqa
                               mean_product, u_product, u_projection,
diff --git a/dcor/_dcor.py b/dcor/_dcor.py
index c2199f9..6cf2c1d 100644
--- a/dcor/_dcor.py
+++ b/dcor/_dcor.py
@@ -17,18 +17,104 @@
 from __future__ import unicode_literals
 
 import collections
-import math
-
 from dcor._dcor_internals import _af_inv_scaled
+from enum import Enum
+
 import numpy as np
 
 from ._dcor_internals import _distance_matrix, _u_distance_matrix
 from ._dcor_internals import mean_product, u_product
-from ._utils import _sqrt, _jit
+from ._fast_dcov_avl import _distance_covariance_sqr_avl_generic
+from ._fast_dcov_mergesort import _distance_covariance_sqr_mergesort_generic
+from ._utils import _sqrt
+
+
+class _DcovAlgorithmInternals():
+
+    def __init__(self, *,
+                 dcov_sqr=None, u_dcov_sqr=None,
+                 dcor_sqr=None, u_dcor_sqr=None,
+                 stats_sqr=None, u_stats_sqr=None,
+                 dcov_generic=None,
+                 stats_generic=None):
+
+        # Dcov and U-Dcov
+        if dcov_generic is not None:
+            self.dcov_sqr = (
+                lambda *args, **kwargs: dcov_generic(
+                    *args, **kwargs, unbiased=False))
+            self.u_dcov_sqr = (
+                lambda *args, **kwargs: dcov_generic(
+                    *args, **kwargs, unbiased=True))
+        else:
+            self.dcov_sqr = dcov_sqr
+            self.u_dcov_sqr = u_dcov_sqr
+
+        # Stats
+        if stats_sqr is not None:
+            self.stats_sqr = stats_sqr
+        else:
+            if stats_generic is None:
+                self.stats_sqr = (
+                    lambda *args, **kwargs: _distance_stats_sqr_generic(
+                        *args, **kwargs, dcov_function=self.dcov_sqr))
+            else:
+                self.stats_sqr = (
+                    lambda *args, **kwargs: stats_generic(
+                        *args, **kwargs,
+                        matrix_centered=_distance_matrix,
+                        product=mean_product))
+
+        # U-Stats
+        if u_stats_sqr is not None:
+            self.u_stats_sqr = u_stats_sqr
+        else:
+            if stats_generic is None:
+                self.u_stats_sqr = (
+                    lambda *args, **kwargs: _distance_stats_sqr_generic(
+                        *args, **kwargs, dcov_function=self.u_dcov_sqr))
+            else:
+                self.u_stats_sqr = (
+                    lambda *args, **kwargs: stats_generic(
+                        *args, **kwargs,
+                        matrix_centered=_u_distance_matrix,
+                        product=u_product))
+
+        # Dcor
+        if dcor_sqr is not None:
+            self.dcor_sqr = dcor_sqr
+        else:
+            self.dcor_sqr = lambda *args, **kwargs: self.stats_sqr(
+                *args, **kwargs).correlation_xy
+
+        # U-Dcor
+        if u_dcor_sqr is not None:
+            self.u_dcor_sqr = u_dcor_sqr
+        else:
+            self.u_dcor_sqr = lambda *args, **kwargs: self.u_stats_sqr(
+                *args, **kwargs).correlation_xy
+
+
+class _DcovAlgorithmInternalsAuto():
+    def _dispatch(self, x, y, *, method, **kwargs):
+        if _can_use_fast_algorithm(x, y, **kwargs):
+            return getattr(DistanceCovarianceMethod.AVL.value, method)(x, y)
+        else:
+            return getattr(
+                DistanceCovarianceMethod.NAIVE.value, method)(
+                    x, y, **kwargs)
+
+    def __getattr__(self, method):
+        if method[0] != '_':
+            return lambda *args, **kwargs: self._dispatch(
+                *args, **kwargs, method=method)
+        else:
+            raise AttributeError("%r object has no attribute %r" %
+                                 (self.__class__.__name__, method))
 
 
 Stats = collections.namedtuple('Stats', ['covariance_xy', 'correlation_xy',
-                                        'variance_x', 'variance_y'])
+                                         'variance_x', 'variance_y'])
 
 
 def _distance_covariance_sqr_naive(x, y, exponent=1):
@@ -127,194 +213,11 @@ def _can_use_fast_algorithm(x, y, exponent=1):
             x.shape[0] > 3 and y.shape[0] > 3 and exponent == 1)
 
 
-@_jit
-def _dyad_update(y, c):  # pylint:disable=too-many-locals
-    # This function has many locals so it can be compared
-    # with the original algorithm.
-    """
-    Inner function of the fast distance covariance.
-
-    This function is compiled because otherwise it would become
-    a bottleneck.
-
-    """
-    n = y.shape[0]
-    gamma = np.zeros(n, dtype=c.dtype)
-
-    # Step 1: get the smallest l such that n <= 2^l
-    l_max = int(math.ceil(np.log2(n)))
-
-    # Step 2: assign s(l, k) = 0
-    s_len = 2 ** (l_max + 1)
-    s = np.zeros(s_len, dtype=c.dtype)
-
-    pos_sums = np.arange(l_max)
-    pos_sums[:] = 2 ** (l_max - pos_sums)
-    pos_sums = np.cumsum(pos_sums)
-
-    # Step 3: iteration
-    for i in range(1, n):
-
-        # Step 3.a: update s(l, k)
-        for l in range(l_max):
-            k = int(math.ceil(y[i - 1] / 2 ** l))
-            pos = k - 1
-
-            if l > 0:
-                pos += pos_sums[l - 1]
-
-            s[pos] += c[i - 1]
-
-        # Steps 3.b and 3.c
-        for l in range(l_max):
-            k = int(math.floor((y[i] - 1) / 2 ** l))
-            if k / 2 > math.floor(k / 2):
-                pos = k - 1
-                if l > 0:
-                    pos += pos_sums[l - 1]
-
-                gamma[i] = gamma[i] + s[pos]
-
-    return gamma
-
-
-def _partial_sum_2d(x, y, c):  # pylint:disable=too-many-locals
-    # This function has many locals so it can be compared
-    # with the original algorithm.
-    x = np.asarray(x)
-    y = np.asarray(y)
-    c = np.asarray(c)
-
-    n = x.shape[0]
-
-    # Step 1: rearrange x, y and c so x is in ascending order
-    temp = range(n)
-
-    ix0 = np.argsort(x)
-    ix = np.zeros(n, dtype=int)
-    ix[ix0] = temp
-
-    x = x[ix0]
-    y = y[ix0]
-    c = c[ix0]
-
-    # Step 2
-    iy0 = np.argsort(y)
-    iy = np.zeros(n, dtype=int)
-    iy[iy0] = temp
-
-    y = iy + 1
-
-    # Step 3
-    sy = np.cumsum(c[iy0]) - c[iy0]
-
-    # Step 4
-    sx = np.cumsum(c) - c
-
-    # Step 5
-    c_dot = np.sum(c)
-
-    # Step 6
-    y = np.asarray(y)
-    c = np.asarray(c)
-    gamma1 = _dyad_update(y, c)
-
-    # Step 7
-    gamma = c_dot - c - 2 * sy[iy] - 2 * sx + 4 * gamma1
-    gamma = gamma[ix]
-
-    return gamma
-
-
-def _distance_covariance_sqr_fast_generic(
-        x, y, unbiased=False):  # pylint:disable=too-many-locals
-    # This function has many locals so it can be compared
-    # with the original algorithm.
- """Fast algorithm for the squared distance covariance.""" - x = np.asarray(x) - y = np.asarray(y) - - x = np.ravel(x) - y = np.ravel(y) - - n = x.shape[0] - assert n > 3 - assert n == y.shape[0] - temp = range(n) - - # Step 1 - ix0 = np.argsort(x) - vx = x[ix0] - - ix = np.zeros(n, dtype=int) - ix[ix0] = temp - - iy0 = np.argsort(y) - vy = y[iy0] - - iy = np.zeros(n, dtype=int) - iy[iy0] = temp - - # Step 2 - sx = np.cumsum(vx) - sy = np.cumsum(vy) - - # Step 3 - alpha_x = ix - alpha_y = iy - - beta_x = sx[ix] - vx[ix] - beta_y = sy[iy] - vy[iy] - - # Step 4 - x_dot = np.sum(x) - y_dot = np.sum(y) - - # Step 5 - a_i_dot = x_dot + (2 * alpha_x - n) * x - 2 * beta_x - b_i_dot = y_dot + (2 * alpha_y - n) * y - 2 * beta_y - - sum_ab = np.sum(a_i_dot * b_i_dot) - - # Step 6 - a_dot_dot = 2 * np.sum(alpha_x * x) - 2 * np.sum(beta_x) - b_dot_dot = 2 * np.sum(alpha_y * y) - 2 * np.sum(beta_y) - - # Step 7 - gamma_1 = _partial_sum_2d(x, y, np.ones(n, dtype=x.dtype)) - gamma_x = _partial_sum_2d(x, y, x) - gamma_y = _partial_sum_2d(x, y, y) - gamma_xy = _partial_sum_2d(x, y, x * y) - - # Step 8 - aijbij = np.sum(x * y * gamma_1 + gamma_xy - x * gamma_y - y * gamma_x) - - if unbiased: - d3 = (n - 3) - d2 = (n - 2) - d1 = (n - 1) - else: - d3 = d2 = d1 = n - - # Step 9 - d_cov = (aijbij / n / d3 - 2 * sum_ab / n / d2 / d3 + - a_dot_dot / n * b_dot_dot / d1 / d2 / d3) - - return d_cov - - -def _distance_covariance_sqr_fast(x, y): - """Fast algorithm for the biased squared distance covariance""" - return _distance_covariance_sqr_fast_generic(x, y, unbiased=False) - - -def _u_distance_covariance_sqr_fast(x, y): - """Fast algorithm for the unbiased squared distance covariance""" - return _distance_covariance_sqr_fast_generic(x, y, unbiased=True) - +def _distance_stats_sqr_generic(x, y, *, exponent=1, dcov_function): + """Compute the distance stats using a dcov algorithm.""" + if exponent != 1: + raise ValueError(f"Exponent should be 1 but is {exponent} instead.") -def _distance_stats_sqr_fast_generic(x, y, dcov_function): - """Compute the distance stats using the fast algorithm.""" covariance_xy_sqr = dcov_function(x, y) variance_x_sqr = dcov_function(x, x) variance_y_sqr = dcov_function(y, y) @@ -335,32 +238,57 @@ def _distance_stats_sqr_fast_generic(x, y, dcov_function): variance_y=variance_y_sqr) -def _distance_stats_sqr_fast(x, y): - """Compute the biased distance stats using the fast algorithm.""" - return _distance_stats_sqr_fast_generic(x, y, - _distance_covariance_sqr_fast) - - -def _u_distance_stats_sqr_fast(x, y): - """Compute the bias-corrected distance stats using the fast algorithm.""" - return _distance_stats_sqr_fast_generic(x, y, - _u_distance_covariance_sqr_fast) +class DistanceCovarianceMethod(Enum): + """ + Method used for computing the distance covariance. + """ + AUTO = _DcovAlgorithmInternalsAuto() + """ + Try to select the best algorithm. It will try to use a fast + algorithm if possible. Otherwise it will use the naive + implementation. 
+ """ + NAIVE = _DcovAlgorithmInternals( + dcov_sqr=_distance_covariance_sqr_naive, + u_dcov_sqr=_u_distance_covariance_sqr_naive, + dcor_sqr=_distance_correlation_sqr_naive, + u_dcor_sqr=_u_distance_correlation_sqr_naive, + stats_generic=_distance_sqr_stats_naive_generic) + """ + Use the usual estimator of the distance covariance, which is + :math:`O(n^2)` + """ + AVL = _DcovAlgorithmInternals( + dcov_generic=_distance_covariance_sqr_avl_generic) + """ + Use the fast implementation from + :cite:`b-fast_distance_correlation_avl` which is + :math:`O(n\log n)` + """ + MERGESORT = _DcovAlgorithmInternals( + dcov_generic=_distance_covariance_sqr_mergesort_generic) + """ + Use the fast implementation from + :cite:`b-fast_distance_correlation_mergesort` which is + :math:`O(n\log n)` + """ -def _distance_correlation_sqr_fast(x, y): - """Fast algorithm for bias-corrected squared distance correlation.""" - return _distance_stats_sqr_fast(x, y).correlation_xy + def __repr__(self): + return '%s.%s' % (self.__class__.__name__, self.name) -def _u_distance_correlation_sqr_fast(x, y): - """Fast algorithm for bias-corrected squared distance correlation.""" - return _u_distance_stats_sqr_fast(x, y).correlation_xy +def _to_algorithm(algorithm): + """Convert to algorithm if string""" + if isinstance(algorithm, DistanceCovarianceMethod): + return algorithm + else: + return DistanceCovarianceMethod[algorithm.upper()] -def distance_covariance_sqr(x, y, **kwargs): +def distance_covariance_sqr(x, y, *, exponent=1, + method=DistanceCovarianceMethod.AUTO): """ - distance_covariance_sqr(x, y, *, exponent=1) - Computes the usual (biased) estimator for the squared distance covariance between two random vectors. @@ -376,6 +304,8 @@ def distance_covariance_sqr(x, y, **kwargs): Exponent of the Euclidean distance, in the range :math:`(0, 2)`. Equivalently, it is twice the Hurst parameter of fractional Brownian motion. + method: DistanceCovarianceMethod + Method to use internally to compute the distance covariance. Returns ------- @@ -387,11 +317,6 @@ def distance_covariance_sqr(x, y, **kwargs): distance_covariance u_distance_covariance_sqr - Notes - ----- - The algorithm uses the fast distance covariance algorithm proposed in - :cite:`b-fast_distance_correlation` when possible. - Examples -------- >>> import numpy as np @@ -411,16 +336,14 @@ def distance_covariance_sqr(x, y, **kwargs): 0.3705904... """ - if _can_use_fast_algorithm(x, y, **kwargs): - return _distance_covariance_sqr_fast(x, y) - else: - return _distance_covariance_sqr_naive(x, y, **kwargs) + method = _to_algorithm(method) + return method.value.dcov_sqr(x, y, exponent=exponent) -def u_distance_covariance_sqr(x, y, **kwargs): - """ - u_distance_covariance_sqr(x, y, *, exponent=1) +def u_distance_covariance_sqr(x, y, *, exponent=1, + method=DistanceCovarianceMethod.AUTO): + """ Computes the unbiased estimator for the squared distance covariance between two random vectors. @@ -436,6 +359,8 @@ def u_distance_covariance_sqr(x, y, **kwargs): Exponent of the Euclidean distance, in the range :math:`(0, 2)`. Equivalently, it is twice the Hurst parameter of fractional Brownian motion. + method: DistanceCovarianceMethod + Method to use internally to compute the distance covariance. Returns ------- @@ -447,11 +372,6 @@ def u_distance_covariance_sqr(x, y, **kwargs): distance_covariance distance_covariance_sqr - Notes - ----- - The algorithm uses the fast distance covariance algorithm proposed in - :cite:`b-fast_distance_correlation` when possible. 
-
     Examples
     --------
     >>> import numpy as np
@@ -471,16 +391,14 @@
     -0.2996598...
 
     """
-    if _can_use_fast_algorithm(x, y, **kwargs):
-        return _u_distance_covariance_sqr_fast(x, y)
-    else:
-        return _u_distance_covariance_sqr_naive(x, y, **kwargs)
+    method = _to_algorithm(method)
+    return method.value.u_dcov_sqr(x, y, exponent=exponent)
 
 
-def distance_covariance(x, y, **kwargs):
-    """
-    distance_covariance(x, y, *, exponent=1)
+def distance_covariance(x, y, *, exponent=1,
+                        method=DistanceCovarianceMethod.AUTO):
+    """
     Computes the usual (biased) estimator for the distance
     covariance between two random vectors.
@@ -496,6 +414,8 @@
         Exponent of the Euclidean distance, in the range :math:`(0, 2)`.
         Equivalently, it is twice the Hurst parameter of fractional
         Brownian motion.
+    method: DistanceCovarianceMethod
+        Method to use internally to compute the distance covariance.
 
     Returns
     -------
@@ -507,11 +427,6 @@
     distance_covariance_sqr
     u_distance_covariance_sqr
 
-    Notes
-    -----
-    The algorithm uses the fast distance covariance algorithm proposed in
-    :cite:`b-fast_distance_correlation` when possible.
-
     Examples
     --------
     >>> import numpy as np
@@ -531,13 +446,13 @@
     0.6087614...
 
     """
-    return _sqrt(distance_covariance_sqr(x, y, **kwargs))
+    return _sqrt(distance_covariance_sqr(
+        x, y, exponent=exponent, method=method))
 
 
-def distance_stats_sqr(x, y, **kwargs):
+def distance_stats_sqr(x, y, *, exponent=1,
+                       method=DistanceCovarianceMethod.AUTO):
     """
-    distance_stats_sqr(x, y, *, exponent=1)
-
     Computes the usual (biased) estimators for the squared distance
     covariance and squared distance correlation between two random
     vectors, and the individual squared distance variances.
@@ -554,6 +469,8 @@
         Exponent of the Euclidean distance, in the range :math:`(0, 2)`.
         Equivalently, it is twice the Hurst parameter of fractional
         Brownian motion.
+    method: DistanceCovarianceMethod
+        Method to use internally to compute the distance covariance.
 
     Returns
     -------
@@ -572,9 +489,6 @@
     It is less efficient to compute the statistics separately, rather
     than using this function, because some computations can be shared.
 
-    The algorithm uses the fast distance covariance algorithm proposed in
-    :cite:`b-fast_distance_correlation` when possible.
-
     Examples
     --------
     >>> import numpy as np
@@ -599,20 +513,14 @@
     variance_x=2.7209220..., variance_y=0.25)
 
     """
-    if _can_use_fast_algorithm(x, y, **kwargs):
-        return _distance_stats_sqr_fast(x, y)
-    else:
-        return _distance_sqr_stats_naive_generic(
-            x, y,
-            matrix_centered=_distance_matrix,
-            product=mean_product,
-            **kwargs)
+    method = _to_algorithm(method)
+    return method.value.stats_sqr(x, y, exponent=exponent)
 
 
-def u_distance_stats_sqr(x, y, **kwargs):
-    """
-    u_distance_stats_sqr(x, y, *, exponent=1)
+def u_distance_stats_sqr(x, y, *, exponent=1,
+                         method=DistanceCovarianceMethod.AUTO):
+    """
     Computes the unbiased estimators for the squared distance
     covariance and squared distance correlation between two random
     vectors, and the individual squared distance variances.
@@ -629,6 +537,8 @@
         Exponent of the Euclidean distance, in the range :math:`(0, 2)`.
         Equivalently, it is twice the Hurst parameter of fractional
         Brownian motion.
+    method: DistanceCovarianceMethod
+        Method to use internally to compute the distance covariance.
 
     Returns
     -------
@@ -647,9 +557,6 @@
     It is less efficient to compute the statistics separately, rather
     than using this function, because some computations can be shared.
 
-    The algorithm uses the fast distance covariance algorithm proposed in
-    :cite:`b-fast_distance_correlation` when possible.
-
     Examples
     --------
     >>> import numpy as np
@@ -677,20 +584,14 @@
     variance_x=0.8209855..., variance_y=0.6666666...)
 
     """
-    if _can_use_fast_algorithm(x, y, **kwargs):
-        return _u_distance_stats_sqr_fast(x, y)
-    else:
-        return _distance_sqr_stats_naive_generic(
-            x, y,
-            matrix_centered=_u_distance_matrix,
-            product=u_product,
-            **kwargs)
+    method = _to_algorithm(method)
+    return method.value.u_stats_sqr(x, y, exponent=exponent)
 
 
-def distance_stats(x, y, **kwargs):
-    """
-    distance_stats(x, y, *, exponent=1)
+def distance_stats(x, y, *, exponent=1,
+                   method=DistanceCovarianceMethod.AUTO):
+    """
     Computes the usual (biased) estimators for the distance
     covariance and distance correlation between two random
     vectors, and the individual distance variances.
@@ -707,6 +608,8 @@
         Exponent of the Euclidean distance, in the range :math:`(0, 2)`.
         Equivalently, it is twice the Hurst parameter of fractional
         Brownian motion.
+    method: DistanceCovarianceMethod
+        Method to use internally to compute the distance covariance.
 
     Returns
     -------
@@ -725,9 +628,6 @@
     It is less efficient to compute the statistics separately, rather
     than using this function, because some computations can be shared.
 
-    The algorithm uses the fast distance covariance algorithm proposed in
-    :cite:`b-fast_distance_correlation` when possible.
-
     Examples
     --------
     >>> import numpy as np
@@ -752,13 +652,13 @@
     variance_x=1.6495217..., variance_y=0.5)
 
     """
-    return Stats(*[_sqrt(s) for s in distance_stats_sqr(x, y, **kwargs)])
+    return Stats(*[_sqrt(s) for s in distance_stats_sqr(
+        x, y, exponent=exponent, method=method)])
 
 
-def distance_correlation_sqr(x, y, **kwargs):
+def distance_correlation_sqr(x, y, *, exponent=1,
+                             method=DistanceCovarianceMethod.AUTO):
     """
-    distance_correlation_sqr(x, y, *, exponent=1)
-
     Computes the usual (biased) estimator for the squared distance
     correlation between two random vectors.
@@ -774,6 +674,8 @@
         Exponent of the Euclidean distance, in the range :math:`(0, 2)`.
         Equivalently, it is twice the Hurst parameter of fractional
         Brownian motion.
+    method: DistanceCovarianceMethod
+        Method to use internally to compute the distance covariance.
 
     Returns
     -------
@@ -785,11 +687,6 @@
     distance_correlation
     u_distance_correlation_sqr
 
-    Notes
-    -----
-    The algorithm uses the fast distance covariance algorithm proposed in
-    :cite:`b-fast_distance_correlation` when possible.
-
     Examples
     --------
     >>> import numpy as np
@@ -809,16 +706,14 @@
     0.4493308...
""" - if _can_use_fast_algorithm(x, y, **kwargs): - return _distance_correlation_sqr_fast(x, y) - else: - return _distance_correlation_sqr_naive(x, y, **kwargs) + method = _to_algorithm(method) + return method.value.dcor_sqr(x, y, exponent=exponent) -def u_distance_correlation_sqr(x, y, **kwargs): - """ - u_distance_correlation_sqr(x, y, *, exponent=1) +def u_distance_correlation_sqr(x, y, *, exponent=1, + method=DistanceCovarianceMethod.AUTO): + """ Computes the bias-corrected estimator for the squared distance correlation between two random vectors. @@ -834,6 +729,8 @@ def u_distance_correlation_sqr(x, y, **kwargs): Exponent of the Euclidean distance, in the range :math:`(0, 2)`. Equivalently, it is twice the Hurst parameter of fractional Brownian motion. + method: DistanceCovarianceMethod + Method to use internally to compute the distance covariance. Returns ------- @@ -846,11 +743,6 @@ def u_distance_correlation_sqr(x, y, **kwargs): distance_correlation distance_correlation_sqr - Notes - ----- - The algorithm uses the fast distance covariance algorithm proposed in - :cite:`b-fast_distance_correlation` when possible. - Examples -------- >>> import numpy as np @@ -871,16 +763,14 @@ def u_distance_correlation_sqr(x, y, **kwargs): -0.4050479... """ - if _can_use_fast_algorithm(x, y, **kwargs): - return _u_distance_correlation_sqr_fast(x, y) - else: - return _u_distance_correlation_sqr_naive(x, y, **kwargs) + method = _to_algorithm(method) + return method.value.u_dcor_sqr(x, y, exponent=exponent) -def distance_correlation(x, y, **kwargs): - """ - distance_correlation(x, y, *, exponent=1) +def distance_correlation(x, y, *, exponent=1, + method=DistanceCovarianceMethod.AUTO): + """ Computes the usual (biased) estimator for the distance correlation between two random vectors. @@ -896,6 +786,8 @@ def distance_correlation(x, y, **kwargs): Exponent of the Euclidean distance, in the range :math:`(0, 2)`. Equivalently, it is twice the Hurst parameter of fractional Brownian motion. + method: DistanceCovarianceMethod + Method to use internally to compute the distance covariance. Returns ------- @@ -907,11 +799,6 @@ def distance_correlation(x, y, **kwargs): distance_correlation_sqr u_distance_correlation_sqr - Notes - ----- - The algorithm uses the fast distance covariance algorithm proposed in - :cite:`b-fast_distance_correlation` when possible. - Examples -------- >>> import numpy as np @@ -931,10 +818,12 @@ def distance_correlation(x, y, **kwargs): 0.6703214... """ - return distance_stats(x, y, **kwargs).correlation_xy + return distance_stats( + x, y, exponent=exponent, method=method).correlation_xy -def distance_correlation_af_inv_sqr(x, y): +def distance_correlation_af_inv_sqr(x, y, + method=DistanceCovarianceMethod.AUTO): """ Square of the affinely invariant distance correlation. @@ -952,6 +841,8 @@ def distance_correlation_af_inv_sqr(x, y): y: array_like Second random vector. The columns correspond with the individual random variables while the rows are individual instances of the random vector. + method: DistanceCovarianceMethod + Method to use internally to compute the distance covariance. 
 
     Returns
     -------
@@ -984,11 +875,12 @@
     x = _af_inv_scaled(x)
     y = _af_inv_scaled(y)
 
-    correlation = distance_correlation_sqr(x, y)
+    correlation = distance_correlation_sqr(x, y, method=method)
 
     return 0 if np.isnan(correlation) else correlation
 
 
-def distance_correlation_af_inv(x, y):
+def distance_correlation_af_inv(x, y,
+                                method=DistanceCovarianceMethod.AUTO):
     """
     Affinely invariant distance correlation.
@@ -1006,6 +898,8 @@
     y: array_like
         Second random vector. The columns correspond with the individual
         random variables while the rows are individual instances of the
         random vector.
+    method: DistanceCovarianceMethod
+        Method to use internally to compute the distance covariance.
 
     Returns
     -------
@@ -1035,4 +929,4 @@
     1.0
 
     """
-    return _sqrt(distance_correlation_af_inv_sqr(x, y))
+    return _sqrt(distance_correlation_af_inv_sqr(x, y, method=method))
diff --git a/dcor/_fast_dcov_avl.py b/dcor/_fast_dcov_avl.py
new file mode 100644
index 0000000..fec3732
--- /dev/null
+++ b/dcor/_fast_dcov_avl.py
@@ -0,0 +1,188 @@
+'''
+Functions to compute fast distance covariance using AVL.
+'''
+import math
+
+import numpy as np
+
+from ._utils import _jit
+
+
+@_jit
+def _dyad_update(y, c):  # pylint:disable=too-many-locals
+    # This function has many locals so it can be compared
+    # with the original algorithm.
+    """
+    Inner function of the fast distance covariance.
+
+    This function is compiled because otherwise it would become
+    a bottleneck.
+
+    """
+    n = y.shape[0]
+    gamma = np.zeros(n, dtype=c.dtype)
+
+    # Step 1: get the smallest l such that n <= 2^l
+    l_max = int(math.ceil(np.log2(n)))
+
+    # Step 2: assign s(l, k) = 0
+    s_len = 2 ** (l_max + 1)
+    s = np.zeros(s_len, dtype=c.dtype)
+
+    pos_sums = np.arange(l_max)
+    pos_sums[:] = 2 ** (l_max - pos_sums)
+    pos_sums = np.cumsum(pos_sums)
+
+    # Step 3: iteration
+    for i in range(1, n):
+
+        # Step 3.a: update s(l, k)
+        for l in range(l_max):
+            k = int(math.ceil(y[i - 1] / 2 ** l))
+            pos = k - 1
+
+            if l > 0:
+                pos += pos_sums[l - 1]
+
+            s[pos] += c[i - 1]
+
+        # Steps 3.b and 3.c
+        for l in range(l_max):
+            k = int(math.floor((y[i] - 1) / 2 ** l))
+            if k / 2 > math.floor(k / 2):
+                pos = k - 1
+                if l > 0:
+                    pos += pos_sums[l - 1]
+
+                gamma[i] = gamma[i] + s[pos]
+
+    return gamma
+
+
+def _partial_sum_2d(x, y, c):  # pylint:disable=too-many-locals
+    # This function has many locals so it can be compared
+    # with the original algorithm.
+    x = np.asarray(x)
+    y = np.asarray(y)
+    c = np.asarray(c)
+
+    n = x.shape[0]
+
+    # Step 1: rearrange x, y and c so x is in ascending order
+    temp = range(n)
+
+    ix0 = np.argsort(x)
+    ix = np.zeros(n, dtype=int)
+    ix[ix0] = temp
+
+    x = x[ix0]
+    y = y[ix0]
+    c = c[ix0]
+
+    # Step 2
+    iy0 = np.argsort(y)
+    iy = np.zeros(n, dtype=int)
+    iy[iy0] = temp
+
+    y = iy + 1
+
+    # Step 3
+    sy = np.cumsum(c[iy0]) - c[iy0]
+
+    # Step 4
+    sx = np.cumsum(c) - c
+
+    # Step 5
+    c_dot = np.sum(c)
+
+    # Step 6
+    y = np.asarray(y)
+    c = np.asarray(c)
+    gamma1 = _dyad_update(y, c)
+
+    # Step 7
+    gamma = c_dot - c - 2 * sy[iy] - 2 * sx + 4 * gamma1
+    gamma = gamma[ix]
+
+    return gamma
+
+
+def _distance_covariance_sqr_avl_generic(
+        x, y, *, exponent=1, unbiased=False):  # pylint:disable=too-many-locals
+    # This function has many locals so it can be compared
+    # with the original algorithm.
+ """Fast algorithm for the squared distance covariance.""" + + if exponent != 1: + raise ValueError(f"Exponent should be 1 but is {exponent} instead.") + + x = np.asarray(x) + y = np.asarray(y) + + x = np.ravel(x) + y = np.ravel(y) + + n = x.shape[0] + assert n > 3 + assert n == y.shape[0] + temp = range(n) + + # Step 1 + ix0 = np.argsort(x) + vx = x[ix0] + + ix = np.zeros(n, dtype=int) + ix[ix0] = temp + + iy0 = np.argsort(y) + vy = y[iy0] + + iy = np.zeros(n, dtype=int) + iy[iy0] = temp + + # Step 2 + sx = np.cumsum(vx) + sy = np.cumsum(vy) + + # Step 3 + alpha_x = ix + alpha_y = iy + + beta_x = sx[ix] - vx[ix] + beta_y = sy[iy] - vy[iy] + + # Step 4 + x_dot = np.sum(x) + y_dot = np.sum(y) + + # Step 5 + a_i_dot = x_dot + (2 * alpha_x - n) * x - 2 * beta_x + b_i_dot = y_dot + (2 * alpha_y - n) * y - 2 * beta_y + + sum_ab = a_i_dot @ b_i_dot + + # Step 6 + a_dot_dot = 2 * np.sum(alpha_x * x) - 2 * np.sum(beta_x) + b_dot_dot = 2 * np.sum(alpha_y * y) - 2 * np.sum(beta_y) + + # Step 7 + gamma_1 = _partial_sum_2d(x, y, np.ones(n, dtype=x.dtype)) + gamma_x = _partial_sum_2d(x, y, x) + gamma_y = _partial_sum_2d(x, y, y) + gamma_xy = _partial_sum_2d(x, y, x * y) + + # Step 8 + aijbij = np.sum(x * y * gamma_1 + gamma_xy - x * gamma_y - y * gamma_x) + + if unbiased: + d3 = (n - 3) + d2 = (n - 2) + d1 = (n - 1) + else: + d3 = d2 = d1 = n + + # Step 9 + d_cov = (aijbij / n / d3 - 2 * sum_ab / n / d2 / d3 + + a_dot_dot / n * b_dot_dot / d1 / d2 / d3) + + return d_cov diff --git a/dcor/_fast_dcov_mergesort.py b/dcor/_fast_dcov_mergesort.py new file mode 100644 index 0000000..5b1bd0d --- /dev/null +++ b/dcor/_fast_dcov_mergesort.py @@ -0,0 +1,168 @@ +''' +Functions to compute fast distance covariance using mergesort. +''' + +import numpy as np + +from ._utils import _jit, _transform_to_2d + + +@_jit +def _compute_weight_sums(y, weights): + + n_samples = len(y) + + weight_sums = np.zeros((n_samples,) + weights.shape[1:], dtype=y.dtype) + + # Buffer that contains the indexes of the current and + # last iterations + indexes = np.arange(2 * n_samples).reshape((2, n_samples)) + indexes[1] = 0 # Remove this + + previous_indexes = indexes[0] + current_indexes = indexes[1] + + weights_cumsum = np.zeros( + (n_samples + 1,) + weights.shape[1:], dtype=weights.dtype) + + merged_subarray_len = 1 + + # For all lengths that are a power of two + while merged_subarray_len < n_samples: + gap = 2 * merged_subarray_len + indexes_idx = 0 + + # Numba does not support axis, nor out parameter. 
+        for var in range(weights.shape[1]):
+            weights_cumsum[1:, var] = np.cumsum(
+                weights[previous_indexes, var])
+
+        # Select the subarrays in pairs
+        for subarray_pair_idx in range(0, n_samples, gap):
+            subarray_1_idx = subarray_pair_idx
+            subarray_2_idx = subarray_pair_idx + merged_subarray_len
+            subarray_1_idx_last = min(
+                subarray_1_idx + merged_subarray_len - 1, n_samples - 1)
+            subarray_2_idx_last = min(
+                subarray_2_idx + merged_subarray_len - 1, n_samples - 1)
+
+            # Merge the subarrays
+            while (subarray_1_idx <= subarray_1_idx_last and
+                   subarray_2_idx <= subarray_2_idx_last):
+                previous_index_1 = previous_indexes[subarray_1_idx]
+                previous_index_2 = previous_indexes[subarray_2_idx]
+
+                if y[previous_index_1].item() >= y[previous_index_2].item():
+                    current_indexes[indexes_idx] = previous_index_1
+                    subarray_1_idx += 1
+                else:
+                    current_indexes[indexes_idx] = previous_index_2
+                    subarray_2_idx += 1
+
+                    weight_sums[previous_index_2] += (
+                        weights_cumsum[subarray_1_idx_last + 1] -
+                        weights_cumsum[subarray_1_idx])
+                indexes_idx += 1
+
+            # Join the remaining elements of one of the arrays (already sorted)
+            if subarray_1_idx <= subarray_1_idx_last:
+                n_remaining = subarray_1_idx_last - subarray_1_idx + 1
+                indexes_idx_next = indexes_idx + n_remaining
+                current_indexes[indexes_idx:indexes_idx_next] = (
+                    previous_indexes[subarray_1_idx:subarray_1_idx_last + 1])
+                indexes_idx = indexes_idx_next
+            elif subarray_2_idx <= subarray_2_idx_last:
+                n_remaining = subarray_2_idx_last - subarray_2_idx + 1
+                indexes_idx_next = indexes_idx + n_remaining
+                current_indexes[indexes_idx:indexes_idx_next] = (
+                    previous_indexes[subarray_2_idx:subarray_2_idx_last + 1])
+                indexes_idx = indexes_idx_next
+
+        merged_subarray_len = gap
+
+        # Swap buffer
+        previous_indexes, current_indexes = (current_indexes, previous_indexes)
+
+    return weight_sums
+
+
+def _compute_aijbij_term(x, y):
+    # x must be sorted
+    n = len(x)
+
+    weights = np.hstack((np.ones_like(y), y, x, x * y))
+    weight_sums = _compute_weight_sums(y, weights)
+
+    term_1 = (x * y).T @ weight_sums[:, 0]
+    term_2 = x.T @ weight_sums[:, 1]
+    term_3 = y.T @ weight_sums[:, 2]
+    term_4 = np.sum(weight_sums[:, 3])
+
+    # First term in the equation
+    sums_term = term_1 - term_2 - term_3 + term_4
+
+    # Second term in the equation
+    sum_x = np.sum(x)
+    sum_y = np.sum(y)
+    cov_term = n * x.T @ y - np.sum(sum_x * y + sum_y * x) + sum_x * sum_y
+
+    d = 4 * sums_term - 2 * cov_term
+
+    return d.item()
+
+
+def _compute_row_sums(x):
+    # x must be sorted
+
+    x = x.ravel()
+    n_samples = len(x)
+
+    term_1 = (2 * np.arange(1, n_samples + 1) - n_samples) * x
+
+    sums = np.cumsum(x)
+
+    term_2 = sums[-1] - 2 * sums
+
+    return term_1 + term_2
+
+
+def _distance_covariance_sqr_mergesort_generic(x, y,
+                                               *, exponent=1, unbiased=False):
+
+    if exponent != 1:
+        raise ValueError(f"Exponent should be 1 but is {exponent} instead.")
+
+    n = len(x)
+
+    x = _transform_to_2d(x)
+    y = _transform_to_2d(y)
+
+    # Sort x in ascending order
+    ordered_indexes = np.argsort(x, axis=0).ravel()
+    x = x[ordered_indexes]
+    y = y[ordered_indexes]
+
+    aijbij = _compute_aijbij_term(x, y)
+    a_i = _compute_row_sums(x.ravel())
+
+    ordered_indexes_y = np.argsort(y.ravel())
+    b_i_perm = _compute_row_sums(y.ravel()[ordered_indexes_y])
+    b_i = np.empty_like(b_i_perm)
+    b_i[ordered_indexes_y] = b_i_perm
+
+    a_dot_dot = np.sum(a_i)
+    b_dot_dot = np.sum(b_i)
+
+    sum_ab = a_i.T @ b_i
+
+    if unbiased:
+        d3 = (n - 3)
+        d2 = (n - 2)
+        d1 = (n - 1)
+    else:
+        d3 = d2 = d1 = n
+
+    d_cov = (aijbij / n / d3 - 2 * sum_ab / n / d2 / d3 +
+             a_dot_dot / n * b_dot_dot / d1 / d2 / d3)
+
+    return d_cov
diff --git a/dcor/_pairwise.py b/dcor/_pairwise.py
index 5c6e639..8f45249 100644
--- a/dcor/_pairwise.py
+++ b/dcor/_pairwise.py
@@ -73,8 +73,8 @@ def pairwise(function, x, y=None, **kwargs):
     A pool object can be used to improve performance for a large
     number of computations:
 
-    >>> import multiprocessing
-    >>> pool = multiprocessing.Pool()
+    >>> import multiprocessing.pool
+    >>> pool = multiprocessing.pool.ThreadPool()
     >>> dcor.pairwise(dcor.distance_correlation, a, b, pool=pool)
     array([[0.98182263, 0.99901855],
            [0.99989466, 0.98320103]])
diff --git a/dcor/_utils.py b/dcor/_utils.py
index 16fa93f..2cffb3e 100644
--- a/dcor/_utils.py
+++ b/dcor/_utils.py
@@ -39,7 +39,7 @@
 
     try:
         return np.sqrt(x)
-    except AttributeError:
+    except (AttributeError, TypeError):
         exponent = 0.5
 
         try:
@@ -73,9 +73,9 @@
 
     """
    return ((np.issubdtype(x.dtype, np.floating) and
-            x.dtype.itemsize <= np.dtype(float).itemsize) or
+             x.dtype.itemsize <= np.dtype(float).itemsize) or
            (np.issubdtype(x.dtype, np.signedinteger) and
-            np.can_cast(x, float)))
+             np.can_cast(x, float)))
diff --git a/dcor/tests/test_dcor.py b/dcor/tests/test_dcor.py
index a7dbf9c..d8e855c 100644
--- a/dcor/tests/test_dcor.py
+++ b/dcor/tests/test_dcor.py
@@ -5,7 +5,7 @@
 import unittest
 
 import dcor
-import dcor._dcor as dcor_internals
+import dcor._fast_dcov_avl
 
 import numpy as np
 
@@ -40,7 +40,7 @@ def test_dyad_update(self):  # pylint:disable=no-self-use
         y = np.array([1, 2, 3])
         c = np.array([4, 5, 6])
 
-        gamma = dcor_internals._dyad_update(y, c)
+        gamma = dcor._fast_dcov_avl._dyad_update(y, c)
 
         expected_gamma = [0., 4., 9.]
 
@@ -51,12 +51,12 @@ def test_partial_sum_2d(self):  # pylint:disable=no-self-use
         y = [4, 5, 6]
         c = [7, 8, 9]
 
-        gamma = dcor_internals._partial_sum_2d(x, y, c)
+        gamma = dcor._fast_dcov_avl._partial_sum_2d(x, y, c)
 
         expected_gamma = [17., 16., 15.]
 
         np.testing.assert_allclose(gamma, expected_gamma)
 
-    def test_distance_correlation_naive(self):
+    def test_distance_correlation_multivariate(self):
         """Compare distance correlation with the energy package."""
         matrix1 = np.array(((1, 2, 3), (4, 5, 6), (7, 8, 9)))
         matrix2 = np.array(((7, 3, 6), (2, 1, 4), (3, 8, 1)))
@@ -79,32 +79,35 @@
             matrix1, matrix3)
         self.assertAlmostEqual(correlation, 0.31623, places=5)
 
-    def test_distance_correlation_fast(self):
-        """Compare fast distance correlation with the energy package."""
+    def test_distance_correlation_comparison(self):
+        """Compare all implementations of the distance correlation."""
         arr1 = np.array(((1,), (2,), (3,), (4,), (5,), (6,)))
         arr2 = np.array(((1,), (7,), (5,), (5,), (6,), (2,)))
 
-        covariance = dcor_internals._u_distance_covariance_sqr_fast(
-            arr1, arr2)
-        self.assertAlmostEqual(covariance, -0.88889, places=5)
+        for method in dcor.DistanceCovarianceMethod:
+            with self.subTest(method=method):
+                covariance = dcor.u_distance_covariance_sqr(
+                    arr1, arr2, method=method)
+                self.assertAlmostEqual(covariance, -0.88889, places=5)
 
-        correlation = dcor_internals._u_distance_correlation_sqr_fast(
-            arr1, arr2)
-        self.assertAlmostEqual(correlation, -0.41613, places=5)
+                correlation = dcor.u_distance_correlation_sqr(
+                    arr1, arr2, method=method)
+                self.assertAlmostEqual(correlation, -0.41613, places=5)
 
-        covariance = dcor_internals._u_distance_covariance_sqr_fast(
-            arr1, arr1)
-        self.assertAlmostEqual(covariance, 1.5556, places=4)
+                covariance = dcor.u_distance_covariance_sqr(
+                    arr1, arr1, method=method)
+                self.assertAlmostEqual(covariance, 1.5556, places=4)
 
-        correlation = dcor_internals._u_distance_correlation_sqr_fast(
-            arr1, arr1)
-        self.assertAlmostEqual(correlation, 1, places=5)
+                correlation = dcor.u_distance_correlation_sqr(
+                    arr1, arr1, method=method)
+                self.assertAlmostEqual(correlation, 1, places=5)
 
-    def test_u_distance_covariance_fast_overflow(self):
+    def test_u_distance_covariance_avl_overflow(self):
         """Test potential overflow in fast distance correlation"""
         arr1 = np.concatenate((np.zeros(500, dtype=int),
                                np.ones(500, dtype=int)))
-        covariance = dcor_internals._u_distance_covariance_sqr_fast(arr1, arr1)
+        covariance = dcor.u_distance_covariance_sqr(arr1, arr1,
+                                                    method='avl')
         self.assertAlmostEqual(covariance, 0.25050, places=5)
 
     def _test_u_distance_correlation_vector_generic(self,
@@ -267,12 +270,15 @@ def test_statistic(self):
             arr1 = random_state.rand(i, 1)
             arr2 = random_state.rand(i, 1)
 
-            stat = dcor_internals._distance_correlation_sqr_naive(
-                arr1, arr2)
-            stat_fast = dcor_internals._distance_correlation_sqr_fast(
-                arr1, arr2)
+            stat = dcor.distance_correlation_sqr(
+                arr1, arr2, method='naive')
+
+            for method in dcor.DistanceCovarianceMethod:
+                with self.subTest(method=method):
+                    stat2 = dcor.distance_correlation_sqr(
+                        arr1, arr2, method=method)
 
-            self.assertAlmostEqual(stat, stat_fast)
+                    self.assertAlmostEqual(stat, stat2)
 
     def test_u_statistic(self):
         """Test that the fast and naive algorithms for unbiased dcor match"""
@@ -284,12 +290,14 @@
             arr1 = random_state.rand(i, 1)
             arr2 = random_state.rand(i, 1)
 
-            u_stat = dcor_internals._u_distance_correlation_sqr_naive(
-                arr1, arr2)
-            u_stat_fast = dcor_internals._u_distance_correlation_sqr_fast(
-                arr1, arr2)
+            u_stat = dcor.u_distance_correlation_sqr(
+                arr1, arr2, method='naive')
+            for method in dcor.DistanceCovarianceMethod:
+                with self.subTest(method=method):
+                    u_stat2 = dcor.u_distance_correlation_sqr(
+                        arr1, arr2, method=method)
 
-            self.assertAlmostEqual(u_stat, u_stat_fast)
+                    self.assertAlmostEqual(u_stat, u_stat2)
 
 
 if __name__ == "__main__":
diff --git a/docs/apilist.rst b/docs/apilist.rst
index 53baead..cfd8fbb 100644
--- a/docs/apilist.rst
+++ b/docs/apilist.rst
@@ -110,4 +110,18 @@
 These functions are used for computing distance matrices.
 
 .. autosummary::
    :toctree: functions
 
-   dcor.distances.pairwise_distances
\ No newline at end of file
+   dcor.distances.pairwise_distances
+
+List of classes
+---------------
+A complete list of all classes provided by dcor.
+
+Methods for computing distance covariance
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+The following enum lists the methods that can be used for computing distance
+covariance.
+
+.. autosummary::
+   :toctree: functions
+
+   dcor.DistanceCovarianceMethod
\ No newline at end of file
diff --git a/docs/refs.bib b/docs/refs.bib
index 3e270c6..b4fdc25 100644
--- a/docs/refs.bib
+++ b/docs/refs.bib
@@ -28,8 +28,8 @@ @article{distance_correlation
   year = "2007"
 }
 
-% Fast distance correlation
-@article{fast_distance_correlation,
+% Fast distance correlation (avl method)
+@article{fast_distance_correlation_avl,
   author = "Xiaoming Huo and Gábor J. Székely",
   title = "Fast Computing for Distance Covariance",
   journal = "Technometrics",
@@ -42,6 +42,20 @@ @article{fast_distance_correlation
   eprint = "http://dx.doi.org/10.1080/00401706.2015.1054435"
 }
 
+% Fast distance correlation (mergesort method)
+@article{fast_distance_correlation_mergesort,
+  title = {A Fast Algorithm for Computing Distance Correlation},
+  author = {Chaudhuri, Arin and Hu, Wenhao},
+  year = {2019},
+  month = jul,
+  volume = {135},
+  pages = {15-24},
+  issn = {0167-9473},
+  doi = {10.1016/j.csda.2019.01.016},
+  journal = {Computational Statistics \& Data Analysis},
+  keywords = {Dependency measure,Distance correlation,Fast algorithm,Merge sort}
+}
+
 % Affinely invariant distance correlation
 @article{affinely_invariant_distance_correlation,
   title={The affinely invariant distance correlation},
diff --git a/docs/theory.rst b/docs/theory.rst
index 792284a..0b04a09 100644
--- a/docs/theory.rst
+++ b/docs/theory.rst
@@ -128,8 +128,10 @@ Then, :math:`\Omega_n(x, y)` is defined as
 We can also obtain an estimator of :math:`\mathcal{R}^2(X, Y)` using
 :math:`\Omega_n(x, y)`, as we did with :math:`\mathcal{V}_n^2(x, y)`.
 :math:`\Omega_n(x, y)` does not satisfy :math:`\Omega_n(x, y) \geq 0`,
 because it can sometimes take negative values near :math:`0`.
-There is an algorithm that can compute :math:`\Omega_n(x, y)` for random variables
-with :math:`O(n\log n)` complexity :cite:`c-fast_distance_correlation`. Since
+
+There are algorithms that can compute :math:`\mathcal{V}_n^2(x, y)` and :math:`\Omega_n(x, y)`
+for random variables with :math:`O(n\log n)` complexity
+:cite:`c-fast_distance_correlation_avl,c-fast_distance_correlation_mergesort`. Since
 the estimator formulas explained above have complexity :math:`O(n^2)`, this
 improvement is significant, especially for larger samples.
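
[Editor's note, not part of the patch] The docstrings above describe the new ``method`` parameter only abstractly. A minimal usage sketch, assuming dcor 0.4 as modified by this patch (``DistanceCovarianceMethod`` exported in ``dcor/__init__.py`` above; the data and variable names are illustrative):

```python
import numpy as np

import dcor

x = np.random.normal(size=1000)
y = x ** 2 + np.random.normal(size=1000)

# AUTO (the default) dispatches to the fast AVL implementation when
# _can_use_fast_algorithm holds (more than 3 one-dimensional
# observations, exponent == 1); otherwise it uses the naive estimator.
r_auto = dcor.distance_correlation_sqr(x, y)

# A method can also be forced, either with the enum...
r_mergesort = dcor.distance_correlation_sqr(
    x, y, method=dcor.DistanceCovarianceMethod.MERGESORT)

# ...or with a case-insensitive string, converted by _to_algorithm.
r_naive = dcor.distance_correlation_sqr(x, y, method='naive')

# All three estimates should agree up to floating-point error,
# as test_statistic in test_dcor.py above asserts.
print(r_auto, r_mergesort, r_naive)
```
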
diff --git a/setup.py b/setup.py
index 363180d..f81214a 100644
--- a/setup.py
+++ b/setup.py
@@ -67,21 +67,19 @@
       platforms=['any'],
       license='MIT',
       packages=find_packages(),
-      python_requires='>=2.7, <4',
+      python_requires='>=3.6, <4',
       classifiers=[
-        'Development Status :: 4 - Beta',
-        'Intended Audience :: Developers',
-        'Intended Audience :: Science/Research',
-        'License :: OSI Approved :: MIT License',
-        'Natural Language :: English',
-        'Operating System :: OS Independent',
-        'Programming Language :: Python :: 2',
-        'Programming Language :: Python :: 2.7',
-        'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.5',
-        'Programming Language :: Python :: 3.6',
-        'Topic :: Scientific/Engineering :: Mathematics',
-        'Topic :: Software Development :: Libraries :: Python Modules',
+          'Development Status :: 4 - Beta',
+          'Intended Audience :: Developers',
+          'Intended Audience :: Science/Research',
+          'License :: OSI Approved :: MIT License',
+          'Natural Language :: English',
+          'Operating System :: OS Independent',
+          'Programming Language :: Python :: 3',
+          'Programming Language :: Python :: 3.6',
+          'Programming Language :: Python :: 3.7',
+          'Topic :: Scientific/Engineering :: Mathematics',
+          'Topic :: Software Development :: Libraries :: Python Modules',
       ],
       keywords=['distance correlation', 'distance covariance',
                 'energy distance', 'e-statistic',
@@ -92,7 +90,7 @@
                 'setuptools'],
       setup_requires=pytest_runner,
       tests_require=['pytest-cov',
-                     'numpy>=1.14'  # The printing format for numpy changes
+                     'numpy>=1.17'  # Requires matmul on objects
                      ],
       test_suite='dcor.tests',
      zip_safe=False)
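
[Editor's note, not part of the patch] docs/theory.rst above claims :math:`O(n\log n)` cost for the AVL and MERGESORT methods versus :math:`O(n^2)` for NAIVE. A rough benchmark sketch of that difference (the sample size and use of ``timeit`` are assumptions of this illustration; absolute timings are machine-dependent):

```python
import timeit

import numpy as np

import dcor

x = np.random.normal(size=2000)
y = np.random.normal(size=2000)

for method in dcor.DistanceCovarianceMethod:
    # One evaluation per method. NAIVE builds full n x n distance
    # matrices, while AVL and MERGESORT avoid them; the first call to
    # a fast method may also include JIT compilation time from _jit.
    t = timeit.timeit(
        lambda: dcor.distance_covariance_sqr(x, y, method=method),
        number=1)
    print(method, t)
```
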