Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve the coverage of the tests and fix some minor bugs #64

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Open
2 changes: 1 addition & 1 deletion hidimstat/desparsified_lasso.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def _compute_residuals(

else:

ValueError("The only regression method available is 'lasso'")
raise ValueError("The only regression method available is 'lasso'")

clf.fit(X_new, y)
z = y - clf.predict(X_new)
Expand Down
34 changes: 31 additions & 3 deletions hidimstat/scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ def multivariate_1D_simulation(
rho=0.0,
shuffle=True,
seed=0,
n_groups=0,
group_size=0,
):
"""Generate 1D data with Toeplitz design matrix

Expand All @@ -42,10 +44,17 @@ def multivariate_1D_simulation(
seed : int
Seed used for generating design matrix and noise.

n_groups : int
Number of groups.

group_size : int
Size of each group.

Returns
-------
X : ndarray, shape (n_samples, n_features)
Design matrix.
If there are groups, the first rows contain the group values.

y : ndarray, shape (n_samples,)
Target.
Expand All @@ -59,19 +68,38 @@ def multivariate_1D_simulation(

rng = np.random.default_rng(seed)

X = np.zeros((n_samples, n_features))
X[:, 0] = rng.standard_normal(n_samples)
if n_groups < 0 or group_size < 0:
raise ValueError("The number of groups and their size must be positive.")

n_individual_samples = n_samples - n_groups * group_size
if n_individual_samples <= 0:
raise ValueError(
"The number of samples is too small compared to the number "
"of groups and their size to generate the data."
)

n_generate_samples = n_groups + n_individual_samples

# generate random data for each samples
X = np.zeros((n_generate_samples, n_features))
X[:, 0] = rng.standard_normal(n_generate_samples)
for i in np.arange(1, n_features):
rand_vector = ((1 - rho**2) ** 0.5) * rng.standard_normal(n_samples)
rand_vector = ((1 - rho**2) ** 0.5) * rng.standard_normal(n_generate_samples)
X[:, i] = rho * X[:, i - 1] + rand_vector

if shuffle:
rng.shuffle(X.T)

# generate data for the groups based on one sample
if n_groups > 0:
groups = np.repeat(X[:n_groups], group_size, axis=0)
X = np.vstack((groups, X[n_groups:]))

# generate the vector of variable of importances
beta = np.zeros(n_features)
beta[0:support_size] = 1.0

# generate the simulated regression data
noise = sigma * rng.standard_normal(n_samples)
y = np.dot(X, beta) + noise

Expand Down
14 changes: 0 additions & 14 deletions hidimstat/setup.py

This file was deleted.

117 changes: 107 additions & 10 deletions hidimstat/test/test_clustered_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Test the clustered_inference module
"""

import pytest
import numpy as np
from numpy.testing import assert_almost_equal
from sklearn.cluster import FeatureAgglomeration
Expand All @@ -14,16 +15,17 @@
)


def test_clustered_inference():
"""Testing the procedure on two simulations with a 1D data structure and
with n << p: the first test has no temporal dimension, the second has a
temporal dimension. The support is connected and of size 10, it must be
recovered with a small spatial tolerance parametrized by `margin_size`.
# Scenario 1: data with no temporal dimension
def test_clustered_inference_no_temporal():
"""
Testing the procedure on one simulations with a 1D data structure and
with n << p: no temporal dimension. The support is connected and of
size 10, it must be recovered with a small spatial tolerance
parametrized by `margin_size`.
Computing one sided p-values, we want low p-values for the features of
the support and p-values close to 0.5 for the others."""
the support and p-values close to 0.5 for the others.
"""

# Scenario 1: data with no temporal dimension
# ###########################################
n_samples, n_features = 100, 2000
support_size = 10
sigma = 5.0
Expand Down Expand Up @@ -63,8 +65,17 @@ def test_clustered_inference():
pval_corr[extended_support:200], expected[extended_support:200], decimal=1
)

# Scenario 2: temporal data
# #########################

# Scenario 2: temporal data
def test_clustered_inference_temporal():
"""
Testing the procedure on two simulations with a 1D data structure and
with n << p: with a temporal dimension. The support is connected and
of size 10, it must be recovered with a small spatial tolerance
parametrized by `margin_size`.
Computing one sided p-values, we want low p-values for the features of
the support and p-values close to 0.5 for the others.
"""
n_samples, n_features, n_times = 200, 2000, 10
support_size = 10
sigma = 5.0
Expand Down Expand Up @@ -104,3 +115,89 @@ def test_clustered_inference():
assert_almost_equal(
pval_corr[extended_support:], expected[extended_support:], decimal=1
)


# Scenario 3: data with no temporal dimension and with groups
def test_clustered_inference_no_temporal_groups():
    """
    Testing the procedure on one simulation with a 1D data structure and
    with n << p: no temporal dimension. The support is connected and of
    size 10, it must be recovered with a small spatial tolerance
    parametrized by `margin_size`.
    The samples are grouped: 9 groups of 10 repeated samples plus 10
    individual samples, giving 10 group labels of 10 samples each.
    Computing one sided p-values, we want low p-values for the features of
    the support and p-values close to 0.5 for the others.
    """

    n_samples, n_features = 100, 2000
    support_size = 10
    n_groups = 9
    size = 10
    sigma = 5.0
    rho = 0.95
    n_clusters = 200
    margin_size = 5
    interior_support = support_size - margin_size
    extended_support = support_size + margin_size

    X_init, y, beta, epsilon = multivariate_1D_simulation(
        n_samples=n_samples,
        n_features=n_features,
        support_size=support_size,
        sigma=sigma,
        rho=rho,
        shuffle=False,
        seed=2,
        n_groups=n_groups,
        group_size=size,
    )
    # One label per sample: labels 0..n_groups, each repeated `size` times
    # (the trailing individual samples share the last label).
    groups = np.concatenate([[i] * size for i in range(n_groups + 1)])

    # Center target and design matrix before inference.
    y = y - np.mean(y)
    X_init = X_init - np.mean(X_init, axis=0)

    # 1D chain connectivity for Ward feature agglomeration.
    connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1)
    ward = FeatureAgglomeration(
        n_clusters=n_clusters, connectivity=connectivity, linkage="ward"
    )

    beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = (
        clustered_inference(X_init, y, ward, n_clusters, groups=groups)
    )

    expected = 0.5 * np.ones(n_features)
    expected[:support_size] = 0.0

    # Inside the interior support the corrected p-values must be ~0;
    # outside the extended support they must stay close to 0.5.
    assert_almost_equal(pval_corr[:interior_support], expected[:interior_support])
    assert_almost_equal(
        pval_corr[extended_support:200], expected[extended_support:200], decimal=1
    )


# Error handling: an unknown inference method must be rejected
def test_clustered_inference_exception_methods():
    """
    Testing the procedure on one simulation with a 1D data structure and
    checking that the procedure raises an exception when an unknown method is
    provided.
    """
    n_samples, n_features = 100, 2000
    n_clusters = 200

    X_init, y, beta, epsilon = multivariate_1D_simulation(
        n_samples=n_samples,
        n_features=n_features,
        shuffle=False,
        seed=2,
    )

    # Center the data, as in the nominal tests.
    y = y - np.mean(y)
    X_init = X_init - np.mean(X_init, axis=0)

    connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1)
    ward = FeatureAgglomeration(
        n_clusters=n_clusters, connectivity=connectivity, linkage="ward"
    )

    # NOTE(review): the `match` string mirrors the library's message,
    # which spells it "Unknow method" — keep in sync with the source.
    with pytest.raises(ValueError, match="Unknow method"):
        clustered_inference(X_init, y, ward, n_clusters, method="lll")
31 changes: 28 additions & 3 deletions hidimstat/test/test_dcrt.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def test_dcrt_lasso():
"""
X, y = make_regression(n_samples=100, n_features=10, noise=0.2, random_state=2024)
# Checking if a loss != 'least_square'
with pytest.raises(Exception):
with pytest.raises(ValueError, match="test loss is not supported."):
_ = dcrt_zero(
X,
y,
Expand All @@ -27,7 +27,7 @@ def test_dcrt_lasso():
)

# Checking for a different statistic
with pytest.raises(Exception):
with pytest.raises(ValueError, match="test statistic is not supported."):
_ = dcrt_zero(
X,
y,
Expand All @@ -37,6 +37,30 @@ def test_dcrt_lasso():
random_state=2024,
)

# Checking for bad selection of screening_threshold
result_th_screen_bad = dcrt_zero(
X,
y,
screening_threshold=0,
screening=True,
verbose=False,
)
assert result_th_screen_bad.size == 0

# Checking for bad selection of screening_threshold with verbose
result_th_screen_bad = dcrt_zero(
X,
y,
screening_threshold=0,
screening=True,
verbose=True,
)

assert len(result_th_screen_bad) == 3
assert result_th_screen_bad[0].size == 0
assert np.all(result_th_screen_bad[1] == np.ones(10))
assert np.all(result_th_screen_bad[2] == np.zeros(10))

# Checking with and without screening
results_no_screening = dcrt_zero(
X, y, screening=False, verbose=True, statistic="residual", random_state=2024
Expand Down Expand Up @@ -70,7 +94,8 @@ def test_dcrt_lasso():
X,
y,
refit=True,
screening=False,
screening=True,
screening_threshold=50,
verbose=True,
statistic="residual",
random_state=2024,
Expand Down
11 changes: 11 additions & 0 deletions hidimstat/test/test_desparsified_lasso.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
from numpy.testing import assert_almost_equal, assert_equal
from scipy.linalg import toeplitz
import pytest

from hidimstat.desparsified_lasso import desparsified_group_lasso, desparsified_lasso
from hidimstat.scenario import (
Expand Down Expand Up @@ -48,6 +49,16 @@ def test_desparsified_lasso():
assert_equal(cb_max > beta, True)


def test_desparsified_lasso_exception():
    """Check that an unsupported residual method raises a ValueError."""
    # Default simulation is enough: we only need valid-looking inputs.
    X, y, beta, noise = multivariate_1D_simulation()

    expected_msg = "The only regression method available is 'lasso'"
    with pytest.raises(ValueError, match=expected_msg):
        desparsified_lasso(X, y, residual_method="test")


def test_desparsified_group_lasso():
"""Testing the procedure on a simulation with no structure and
a support of size 2. Computing one-sided p-values, we want
Expand Down
52 changes: 52 additions & 0 deletions hidimstat/test/test_ensemble_clustered_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import numpy as np
import pytest
from numpy.testing import assert_almost_equal
from sklearn.cluster import FeatureAgglomeration
from sklearn.feature_extraction import image
Expand Down Expand Up @@ -125,3 +126,54 @@ def test_ensemble_clustered_inference():
assert_almost_equal(
pval_corr[extended_support:], expected[extended_support:], decimal=1
)

beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = (
ensemble_clustered_inference(
X,
Y,
ward,
n_clusters,
n_bootstraps=n_bootstraps,
inference_method=inference_method,
ensembling_method="medians",
)
)

expected = 0.5 * np.ones(n_features)
expected[:support_size] = 0.0

assert_almost_equal(
pval_corr[:interior_support], expected[:interior_support], decimal=3
)
assert_almost_equal(
pval_corr[extended_support:], expected[extended_support:], decimal=1
)


def test_ensemble_clustered_inference_exception():
    """Check that invalid ``ensembling_method`` and ``memory`` arguments
    raise a ValueError with the expected messages."""
    n_samples, n_features = 100, 2000
    n_clusters = 10

    # Minimal simulated data and clustering object to call the API with.
    X, Y, beta, epsilon = multivariate_1D_simulation(
        n_samples=n_samples,
        n_features=n_features,
    )
    connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1)
    ward = FeatureAgglomeration(
        n_clusters=n_clusters, connectivity=connectivity, linkage="ward"
    )

    # An unsupported ensembling method must be rejected.
    with pytest.raises(ValueError, match="Unknown ensembling method."):
        ensemble_clustered_inference(
            X, Y, ward, n_clusters, ensembling_method="wrong_method"
        )

    # `memory` must be None or a path string, not an arbitrary object.
    bad_memory_msg = (
        "'memory' must be None or a string corresponding "
        "to the path of the caching directory."
    )
    with pytest.raises(ValueError, match=bad_memory_msg):
        ensemble_clustered_inference(X, Y, ward, n_clusters, memory=[])
Loading
Loading