Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve the coverage of the tests and fix some minor bugs #64

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Open
2 changes: 1 addition & 1 deletion hidimstat/desparsified_lasso.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def _compute_residuals(

else:

ValueError("The only regression method available is 'lasso'")
raise ValueError("The only regression method available is 'lasso'")

clf.fit(X_new, y)
z = y - clf.predict(X_new)
Expand Down
34 changes: 31 additions & 3 deletions hidimstat/scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ def multivariate_1D_simulation(
rho=0.0,
shuffle=True,
seed=0,
n_groups=0,
group_size=0,
):
"""Generate 1D data with Toeplitz design matrix

Expand All @@ -42,10 +44,17 @@ def multivariate_1D_simulation(
seed : int
Seed used for generating design matrix and noise.

n_groups : int
Number of groups.

group_size : int
Size of each group.

Returns
-------
X : ndarray, shape (n_samples, n_features)
Design matrix.
If there are groups, the first rows contain the group values.

y : ndarray, shape (n_samples,)
Target.
Expand All @@ -59,19 +68,38 @@ def multivariate_1D_simulation(

rng = np.random.default_rng(seed)

X = np.zeros((n_samples, n_features))
X[:, 0] = rng.standard_normal(n_samples)
if n_groups < 0 or group_size < 0:
raise ValueError("The number of groups and their size must be positive.")

n_individual_samples = n_samples - n_groups * group_size
if n_individual_samples <= 0:
raise ValueError(
"The number of samples is too small compared to the number "
"of groups and their size to generate the data."
)

n_generate_samples = n_groups + n_individual_samples

# generate random data for each samples
X = np.zeros((n_generate_samples, n_features))
X[:, 0] = rng.standard_normal(n_generate_samples)
for i in np.arange(1, n_features):
rand_vector = ((1 - rho**2) ** 0.5) * rng.standard_normal(n_samples)
rand_vector = ((1 - rho**2) ** 0.5) * rng.standard_normal(n_generate_samples)
X[:, i] = rho * X[:, i - 1] + rand_vector

if shuffle:
rng.shuffle(X.T)

# generate data for the groups based on one sample
if n_groups > 0:
groups = np.repeat(X[:n_groups], group_size, axis=0)
X = np.vstack((groups, X[n_groups:]))

# generate the vector of variable of importances
beta = np.zeros(n_features)
beta[0:support_size] = 1.0

# generate the simulated regression data
noise = sigma * rng.standard_normal(n_samples)
y = np.dot(X, beta) + noise

Expand Down
14 changes: 0 additions & 14 deletions hidimstat/setup.py

This file was deleted.

117 changes: 107 additions & 10 deletions hidimstat/test/test_clustered_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Test the clustered_inference module
"""

import pytest
import numpy as np
from numpy.testing import assert_almost_equal
from sklearn.cluster import FeatureAgglomeration
Expand All @@ -14,16 +15,17 @@
)


def test_clustered_inference():
"""Testing the procedure on two simulations with a 1D data structure and
with n << p: the first test has no temporal dimension, the second has a
temporal dimension. The support is connected and of size 10, it must be
recovered with a small spatial tolerance parametrized by `margin_size`.
# Scenario 1: data with no temporal dimension
def test_clustered_inference_no_temporal():
"""
Testing the procedure on one simulations with a 1D data structure and
with n << p: no temporal dimension. The support is connected and of
size 10, it must be recovered with a small spatial tolerance
parametrized by `margin_size`.
Computing one sided p-values, we want low p-values for the features of
the support and p-values close to 0.5 for the others."""
the support and p-values close to 0.5 for the others.
"""

# Scenario 1: data with no temporal dimension
# ###########################################
n_samples, n_features = 100, 2000
support_size = 10
sigma = 5.0
Expand Down Expand Up @@ -63,8 +65,17 @@ def test_clustered_inference():
pval_corr[extended_support:200], expected[extended_support:200], decimal=1
)

# Scenario 2: temporal data
# #########################

# Scenario 2: temporal data
def test_clustered_inference_temporal():
"""
Testing the procedure on two simulations with a 1D data structure and
with n << p: with a temporal dimension. The support is connected and
of size 10, it must be recovered with a small spatial tolerance
parametrized by `margin_size`.
Computing one sided p-values, we want low p-values for the features of
the support and p-values close to 0.5 for the others.
"""
n_samples, n_features, n_times = 200, 2000, 10
support_size = 10
sigma = 5.0
Expand Down Expand Up @@ -104,3 +115,89 @@ def test_clustered_inference():
assert_almost_equal(
pval_corr[extended_support:], expected[extended_support:], decimal=1
)


# Scenario 3: data with no temporal dimension and with groups
def test_clustered_inference_no_temporal_groups():
    """
    Testing the procedure on one simulation with a 1D data structure and
    with n << p: no temporal dimension. The support is connected and of
    size 10, it must be recovered with a small spatial tolerance
    parametrized by `margin_size`.
    The samples are grouped: 9 groups of 10 repeated samples plus 10
    individual samples, giving 10 group labels of 10 samples each.
    Computing one sided p-values, we want low p-values for the features of
    the support and p-values close to 0.5 for the others.
    """

    n_samples, n_features = 100, 2000
    support_size = 10
    n_groups = 9
    size = 10
    sigma = 5.0
    rho = 0.95
    n_clusters = 200
    margin_size = 5
    interior_support = support_size - margin_size
    extended_support = support_size + margin_size

    X_init, y, beta, epsilon = multivariate_1D_simulation(
        n_samples=n_samples,
        n_features=n_features,
        support_size=support_size,
        sigma=sigma,
        rho=rho,
        shuffle=False,
        seed=2,
        n_groups=n_groups,
        group_size=size,
    )
    # One label per sample: labels 0..n_groups, each repeated `size` times
    # (the trailing individual samples share the last label).
    groups = np.concatenate([[i] * size for i in range(n_groups + 1)])

    # Center target and design matrix before inference.
    y = y - np.mean(y)
    X_init = X_init - np.mean(X_init, axis=0)

    # 1D chain connectivity for Ward feature agglomeration.
    connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1)
    ward = FeatureAgglomeration(
        n_clusters=n_clusters, connectivity=connectivity, linkage="ward"
    )

    beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = (
        clustered_inference(X_init, y, ward, n_clusters, groups=groups)
    )

    expected = 0.5 * np.ones(n_features)
    expected[:support_size] = 0.0

    # Inside the interior support the corrected p-values must be ~0;
    # outside the extended support they must stay close to 0.5.
    assert_almost_equal(pval_corr[:interior_support], expected[:interior_support])
    assert_almost_equal(
        pval_corr[extended_support:200], expected[extended_support:200], decimal=1
    )


# Error handling: an unknown inference method must be rejected
def test_clustered_inference_exception_methods():
    """
    Testing the procedure on one simulation with a 1D data structure and
    checking that the procedure raises an exception when an unknown method is
    provided.
    """
    n_samples, n_features = 100, 2000
    n_clusters = 200

    X_init, y, beta, epsilon = multivariate_1D_simulation(
        n_samples=n_samples,
        n_features=n_features,
        shuffle=False,
        seed=2,
    )

    # Center the data, as in the nominal tests.
    y = y - np.mean(y)
    X_init = X_init - np.mean(X_init, axis=0)

    connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1)
    ward = FeatureAgglomeration(
        n_clusters=n_clusters, connectivity=connectivity, linkage="ward"
    )

    # NOTE(review): the `match` string mirrors the library's message,
    # which spells it "Unknow method" — keep in sync with the source.
    with pytest.raises(ValueError, match="Unknow method"):
        clustered_inference(X_init, y, ward, n_clusters, method="lll")
31 changes: 28 additions & 3 deletions hidimstat/test/test_dcrt.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def test_dcrt_lasso():
"""
X, y = make_regression(n_samples=100, n_features=10, noise=0.2, random_state=2024)
# Checking if a loss != 'least_square'
with pytest.raises(Exception):
with pytest.raises(ValueError, match="test loss is not supported."):
_ = dcrt_zero(
X,
y,
Expand All @@ -27,7 +27,7 @@ def test_dcrt_lasso():
)

# Checking for a different statistic
with pytest.raises(Exception):
with pytest.raises(ValueError, match="test statistic is not supported."):
_ = dcrt_zero(
X,
y,
Expand All @@ -37,6 +37,30 @@ def test_dcrt_lasso():
random_state=2024,
)

# Checking for bad selection of screening_threshold
result_th_screen_bad = dcrt_zero(
X,
y,
screening_threshold=0,
screening=True,
verbose=False,
)
assert result_th_screen_bad.size == 0

# Checking for bad selection of screening_threshold with verbose
result_th_screen_bad = dcrt_zero(
X,
y,
screening_threshold=0,
screening=True,
verbose=True,
)

assert len(result_th_screen_bad) == 3
assert result_th_screen_bad[0].size == 0
assert np.all(result_th_screen_bad[1] == np.ones(10))
assert np.all(result_th_screen_bad[2] == np.zeros(10))

# Checking with and without screening
results_no_screening = dcrt_zero(
X, y, screening=False, verbose=True, statistic="residual", random_state=2024
Expand Down Expand Up @@ -70,7 +94,8 @@ def test_dcrt_lasso():
X,
y,
refit=True,
screening=False,
screening=True,
screening_threshold=50,
verbose=True,
statistic="residual",
random_state=2024,
Expand Down
11 changes: 11 additions & 0 deletions hidimstat/test/test_desparsified_lasso.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
from numpy.testing import assert_almost_equal, assert_equal
from scipy.linalg import toeplitz
import pytest

from hidimstat.desparsified_lasso import desparsified_group_lasso, desparsified_lasso
from hidimstat.scenario import (
Expand Down Expand Up @@ -48,6 +49,16 @@ def test_desparsified_lasso():
assert_equal(cb_max > beta, True)


def test_desparsified_lasso_exception():
    """Check that an unsupported residual method raises a ValueError."""
    # Default simulation is enough: we only need valid-looking inputs.
    X, y, beta, noise = multivariate_1D_simulation()

    expected_msg = "The only regression method available is 'lasso'"
    with pytest.raises(ValueError, match=expected_msg):
        desparsified_lasso(X, y, residual_method="test")


def test_desparsified_group_lasso():
"""Testing the procedure on a simulation with no structure and
a support of size 2. Computing one-sided p-values, we want
Expand Down
52 changes: 52 additions & 0 deletions hidimstat/test/test_ensemble_clustered_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import numpy as np
import pytest
from numpy.testing import assert_almost_equal
from sklearn.cluster import FeatureAgglomeration
from sklearn.feature_extraction import image
Expand Down Expand Up @@ -125,3 +126,54 @@ def test_ensemble_clustered_inference():
assert_almost_equal(
pval_corr[extended_support:], expected[extended_support:], decimal=1
)

beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = (
ensemble_clustered_inference(
X,
Y,
ward,
n_clusters,
n_bootstraps=n_bootstraps,
inference_method=inference_method,
ensembling_method="medians",
)
)

expected = 0.5 * np.ones(n_features)
expected[:support_size] = 0.0

assert_almost_equal(
pval_corr[:interior_support], expected[:interior_support], decimal=3
)
assert_almost_equal(
pval_corr[extended_support:], expected[extended_support:], decimal=1
)


def test_ensemble_clustered_inference_exception():
    """Check that invalid ``ensembling_method`` and ``memory`` arguments
    raise a ValueError with the expected messages."""
    n_samples, n_features = 100, 2000
    n_clusters = 10

    # Minimal simulated data and clustering object to call the API with.
    X, Y, beta, epsilon = multivariate_1D_simulation(
        n_samples=n_samples,
        n_features=n_features,
    )
    connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1)
    ward = FeatureAgglomeration(
        n_clusters=n_clusters, connectivity=connectivity, linkage="ward"
    )

    # An unsupported ensembling method must be rejected.
    with pytest.raises(ValueError, match="Unknown ensembling method."):
        ensemble_clustered_inference(
            X, Y, ward, n_clusters, ensembling_method="wrong_method"
        )

    # `memory` must be None or a path string, not an arbitrary object.
    bad_memory_msg = (
        "'memory' must be None or a string corresponding "
        "to the path of the caching directory."
    )
    with pytest.raises(ValueError, match=bad_memory_msg):
        ensemble_clustered_inference(X, Y, ward, n_clusters, memory=[])
Loading
Loading