Merge branch 'release/0.3'

Add distance correlation t test for independence in high dimension
vnmabus · Jun 6, 2019 · 2c765a6 · 2c765a6
2 parents b0ff127 + 552205a
commit 2c765a6
Show file tree

Hide file tree

Showing 8 changed files with 152 additions and 8 deletions.
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-0.2
+0.3
diff --git a/dcor/__init__.py b/dcor/__init__.py
@@ -9,6 +9,7 @@
 import errno as _errno
 import os as _os
 
+from . import distances  # noqa
 from . import homogeneity  # noqa
 from . import independence  # noqa
 from ._dcor import (distance_covariance_sqr, distance_covariance,  # noqa
@@ -28,7 +29,6 @@
 from ._partial_dcor import (partial_distance_covariance,  # noqa
                             partial_distance_correlation)
 
-
 try:
     with open(_os.path.join(_os.path.dirname(__file__),
                             '..', 'VERSION'), 'r') as version_file:

diff --git a/dcor/independence.py b/dcor/independence.py
@@ -10,6 +10,9 @@
 from . import _dcor_internals
 from . import _hypothesis
 from ._utils import _random_state_init, _transform_to_2d
+from ._dcor import u_distance_correlation_sqr
+import numpy as np
+import scipy.stats
 
 
 def _distance_covariance_test_imp(x, y,
@@ -241,3 +244,117 @@ def partial_distance_covariance_test(x, y, z, **kwargs):
 
     """
     return _partial_distance_covariance_test_imp(x, y, z, **kwargs)
+
+
+def distance_correlation_t_statistic(x, y):
+    """
+    T statistic ``sqrt(v - 1) * R / sqrt(1 - R**2)`` of the bias corrected
+    distance correlation ``R``, used in :func:`distance_correlation_t_test`.
+
+    Parameters
+    ----------
+    x: array_like
+        First random vector. The columns correspond with the individual random
+        variables while the rows are individual instances of the random vector.
+    y: array_like
+        Second random vector. The columns correspond with the individual random
+        variables while the rows are individual instances of the random vector.
+
+    Returns
+    -------
+    numpy scalar
+        T statistic, with ``v = n * (n - 3) / 2`` and ``n`` the number of rows.
+
+    See Also
+    --------
+    distance_correlation_t_test
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import dcor
+    >>> a = np.array([[1, 2, 3, 4],
+    ...               [5, 6, 7, 8],
+    ...               [9, 10, 11, 12],
+    ...               [13, 14, 15, 16]])
+    >>> b = np.array([[1, 0, 0, 1],
+    ...               [0, 1, 1, 1],
+    ...               [1, 1, 1, 1],
+    ...               [1, 1, 0, 1]])
+    >>> with np.errstate(divide='ignore'):
+    ...     dcor.independence.distance_correlation_t_statistic(a, a)
+    inf
+    >>> dcor.independence.distance_correlation_t_statistic(a, b)
+    ...                                      # doctest: +ELLIPSIS
+    -0.4430164...
+    >>> with np.errstate(divide='ignore'):
+    ...     dcor.independence.distance_correlation_t_statistic(b, b)
+    inf
+
+    """
+    bcdcor = u_distance_correlation_sqr(x, y)
+
+    # Inputs are documented as array_like, so do not assume a ``.shape``.
+    n = np.asarray(x).shape[0]
+    v = n * (n - 3) / 2
+    return np.sqrt(v - 1) * bcdcor / np.sqrt(1 - bcdcor**2)
+
+
+def distance_correlation_t_test(x, y):
+    """
+    Test of independence for high dimension based on convergence to a Student t
+    distribution. The null hypothesis is that the two random vectors are
+    independent; with ``n`` rows, ``df = n * (n - 3) / 2 - 1``.
+
+    Parameters
+    ----------
+    x: array_like
+        First random vector. The columns correspond with the individual random
+        variables while the rows are individual instances of the random vector.
+    y: array_like
+        Second random vector. The columns correspond with the individual random
+        variables while the rows are individual instances of the random vector.
+
+    Returns
+    -------
+    HypothesisTest
+        Upper-tail p-value and the value of the t statistic.
+
+    See Also
+    --------
+    distance_correlation_t_statistic
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import dcor
+    >>> a = np.array([[1, 2, 3, 4],
+    ...               [5, 6, 7, 8],
+    ...               [9, 10, 11, 12],
+    ...               [13, 14, 15, 16]])
+    >>> b = np.array([[1, 0, 0, 1],
+    ...               [0, 1, 1, 1],
+    ...               [1, 1, 1, 1],
+    ...               [1, 1, 0, 1]])
+    >>> with np.errstate(divide='ignore'):
+    ...     dcor.independence.distance_correlation_t_test(a, a)
+    ...                                      # doctest: +ELLIPSIS
+    HypothesisTest(p_value=0.0, statistic=inf)
+    >>> dcor.independence.distance_correlation_t_test(a, b)
+    ...                                      # doctest: +ELLIPSIS
+    HypothesisTest(p_value=0.6327451..., statistic=-0.4430164...)
+    >>> with np.errstate(divide='ignore'):
+    ...     dcor.independence.distance_correlation_t_test(b, b)
+    ...                                      # doctest: +ELLIPSIS
+    HypothesisTest(p_value=0.0, statistic=inf)
+
+    """
+    t_test = distance_correlation_t_statistic(x, y)
+
+    n = np.asarray(x).shape[0]  # rows = sample size, for any array_like
+    df = n * (n - 3) / 2 - 1
+
+    # ``sf`` computes the upper tail directly; ``1 - cdf`` loses precision
+    # through cancellation when the p-value is tiny.
+    p_value = scipy.stats.t.sf(t_test, df=df)
+    return _hypothesis.HypothesisTest(p_value=p_value, statistic=t_test)
diff --git a/docs/_static/css/wide.css b/docs/_static/css/wide.css
@@ -0,0 +1,3 @@
+/* Let .wy-nav-content span the full available width; !important so the
+   rule wins over any max-width set elsewhere (presumably by the theme). */
+.wy-nav-content {
+	max-width: 100% !important;
+}
diff --git a/docs/apilist.rst b/docs/apilist.rst
@@ -85,6 +85,8 @@ The following functions are used to test if two random vectors are independent.
    :toctree: functions
 
    dcor.independence.distance_covariance_test
+   dcor.independence.distance_correlation_t_statistic
+   dcor.independence.distance_correlation_t_test
 
 Internal computations
 ^^^^^^^^^^^^^^^^^^^^^

diff --git a/docs/conf.py b/docs/conf.py
@@ -108,6 +108,11 @@
 #
 html_theme = 'sphinx_rtd_theme'
 
+
+def setup(app):
+    """Register the ``css/wide.css`` stylesheet with the Sphinx application."""
+    # ``add_stylesheet`` was renamed to ``add_css_file`` in Sphinx 1.8 and
+    # removed in Sphinx 4.0; prefer the new name, fall back on old versions.
+    add_css = getattr(app, 'add_css_file', None) or app.add_stylesheet
+    add_css('css/wide.css')
+
+
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
 # documentation.

diff --git a/docs/energycomparison.rst b/docs/energycomparison.rst
@@ -195,7 +195,7 @@ Table of energy-dcor equivalents
             DX <- as.matrix(dx)
             DY <- as.matrix(dy)
 
-            dcovU_stats(x, y)
+            dcovU_stats(DX, DY)
 
       - .. code-block:: python
 
@@ -303,4 +303,21 @@ Table of energy-dcor equivalents
                                                         num_resamples=10)
 
       -
-
+    * - .. code-block:: R
+
+            dcor.t(x, y)
+
+      - .. code-block:: python
+
+             dcor.independence.distance_correlation_t_statistic(x, y)
+
+      -
+    * - .. code-block:: R
+
+            dcor.ttest(x, y)
+
+      - .. code-block:: python
+
+             dcor.independence.distance_correlation_t_test(x, y)
+
+      -
diff --git a/docs/theory.rst b/docs/theory.rst
@@ -77,9 +77,9 @@ and :math:`(B_{i, j})_{i,j=1}^n`
 
 .. math::
    A_{i, j} &= a_{i,j} - \frac{1}{n} \sum_{l=1}^n a_{il} - \frac{1}{n}
-   \sum_{k=1}^n a_{kj} + \frac{1}{n^2}\sum_{k=1}^n a_{kj}, \\
+   \sum_{k=1}^n a_{kj} + \frac{1}{n^2}\sum_{k,l=1}^n a_{kl}, \\
    B_{i, j} &= b_{i,j} - \frac{1}{n} \sum_{l=1}^n b_{il} - \frac{1}{n}
-   \sum_{k=1}^n b_{kj} + \frac{1}{n^2}\sum_{k=1}^n b_{kj}.
+   \sum_{k=1}^n b_{kj} + \frac{1}{n^2}\sum_{k,l=1}^n b_{kl}.
 
 Then
 
@@ -111,11 +111,11 @@ matrices :math:`(\widetilde{A}_{i, j})_{i,j=1}^n` and :math:`(\widetilde{B}_{i,
    :label: ucentering
    
    \widetilde{A}_{i, j} &= \begin{cases} a_{i,j} - \frac{1}{n-2} \sum_{l=1}^n a_{il} -
-   \frac{1}{n-2} \sum_{k=1}^n a_{kj} + \frac{1}{(n-1)(n-2)}\sum_{k=1}^n a_{kj}, &\text{if } i \neq j, \\
+   \frac{1}{n-2} \sum_{k=1}^n a_{kj} + \frac{1}{(n-1)(n-2)}\sum_{k,l=1}^n a_{kl}, &\text{if } i \neq j, \\
    0, &\text{if } i = j,
    \end{cases} \\
    \widetilde{B}_{i, j} &= \begin{cases} b_{i,j} - \frac{1}{n-2} \sum_{l=1}^n b_{il} -
-   \frac{1}{n-2} \sum_{k=1}^n b_{kj} + \frac{1}{(n-1)(n-2)}\sum_{k=1}^n b_{kj}, &\text{if } i \neq j, \\
+   \frac{1}{n-2} \sum_{k=1}^n b_{kj} + \frac{1}{(n-1)(n-2)}\sum_{k,l=1}^n b_{kl}, &\text{if } i \neq j, \\
    0, &\text{if } i = j.
    \end{cases}