diff --git a/idconn/io.py b/idconn/io.py
index b5f43e1..23b563c 100644
--- a/idconn/io.py
+++ b/idconn/io.py
@@ -35,7 +35,6 @@ def calc_fd(confounds):
     fd = np.sum([delta_x, delta_y, delta_z, delta_alpha, delta_beta, delta_gamma], axis=0)
     return fd
 
-
 def build_statsmodel_json(
     name,
     task,
@@ -132,7 +131,6 @@ def build_statsmodel_json(
         json.dump(statsmodel, outfile)
     return statsmodel_json
 
-
 def atlas_picker(atlas, path, key=None):
     """Takes in atlas name and path to file, if local, returns
     nifti-like object (usually file path to downloaded atlas),
@@ -192,8 +190,7 @@ def atlas_picker(atlas, path, key=None):
     return atlas, path
 
-
-def vectorize_corrmats(matrices):
+def vectorize_corrmats(matrices, diagonal=False):
    """Returns the vectorized upper triangles of a 3-dimensional array
    (i.e., node x node x matrix) of matrices. Output will be a 2-dimensional
    array (i.e., matrix x node^2)
@@ -210,11 +207,15 @@ def vectorize_corrmats(matrices):
         the input matrices.
     """
     # print(f'\n\n\n{matrices.shape}, {matrices.ndim}\n\n\n')
+    if diagonal:
+        k = 0
+    else:
+        k = 1
     num_node = matrices.shape[1]
-    upper_tri = np.triu_indices(num_node, k=1)
+    upper_tri = np.triu_indices(num_node, k=k)
     if matrices.ndim == 3:
         num_node = matrices.shape[1]
-        upper_tri = np.triu_indices(num_node, k=1)
+        upper_tri = np.triu_indices(num_node, k=k)
         num_matrices = matrices.shape[0]
         edge_vector = []
         for matrix in range(0, num_matrices):
@@ -234,7 +235,7 @@ def vectorize_corrmats(matrices):
     elif matrices.ndim == 1:
         if matrices[0].ndim == 2:
             num_node = matrices[0].shape[0]
-            upper_tri = np.triu_indices(num_node, k=1)
+            upper_tri = np.triu_indices(num_node, k=k)
             edge_vector = []
             for matrix in matrices:
                 vectorized = matrix[upper_tri]
@@ -248,7 +249,6 @@ def vectorize_corrmats(matrices):
     edge_vector = np.asarray(edge_vector)
     return edge_vector
 
-
 def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True, verbose=False):
     """Returns a node x node x (subject x session) matrix of correlation
     matrices from a BIDS derivative folder. Optionally returns a node^2 x
     (subject x session)
@@ -419,8 +419,7 @@ def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True
     ppt_df.replace({"": np.nan}, inplace=True)
     return ppt_df
 
-
-def undo_vectorize(edges, num_node=None):
+def undo_vectorize(edges, num_node=None, diagonal=False):
     """
     Puts an edge vector back into an adjacency matrix.
 
     Parameters
@@ -439,15 +438,25 @@ def undo_vectorize(edges, num_node=None):
     # num_node = (np.sqrt((8 * j) + 1) + 1) / 2
     if num_node == None:
         j = len(edges)
-        num_node = int((np.sqrt((8 * j) + 1) + 1) / 2)
+        if not diagonal:
+            num_node = int((np.sqrt((8 * j) + 1) + 1) / 2)
+        else:
+            num_node = int((np.sqrt((8 * j) + 1) - 1) / 2)
     else:
         num_node = int(num_node)
     X = np.zeros((num_node, num_node))
-    X[np.triu_indices(X.shape[0], k=1)] = edges
+    if diagonal:
+        k = 0
+    else:
+        k = 1
+    X[np.triu_indices(num_node, k=k)] = edges
+    diag_X = X[np.diag_indices(num_node, 2)]
     X = X + X.T
+    if diagonal:
+        X[np.diag_indices(num_node, 2)] = diag_X
+    # print('did undo_vectorize work?', np.allclose(X, X.T))
     return X
 
-
 def plot_edges(
     adj,
     atlas_nii,
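With the new diagonal flag, vectorize_corrmats and undo_vectorize should be inverses of each other on either triangle: n(n+1)/2 edges survive with the diagonal kept (k=0), n(n-1)/2 without it (k=1), which is exactly what the two num_node inversions above solve for. A quick round-trip sketch (toy values; assumes the elided loop body appends matrix[upper_tri] as the surrounding hunks suggest):

# Round-trip check for the diagonal=True path; illustrative only.
import numpy as np
from idconn import io

rng = np.random.default_rng(0)
A = rng.random((4, 4))
A = (A + A.T) / 2                        # symmetric toy "correlation" matrix
v = io.vectorize_corrmats(A[None, :, :], diagonal=True)
assert v.shape[-1] == 4 * 5 // 2         # 10 unique edges, diagonal included
B = io.undo_vectorize(v[0], diagonal=True)
assert np.allclose(A, B)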
@@ -499,7 +508,7 @@ def plot_edges(
     print("edge plotting threshold: ", threshold)
 
     if node_size == "strength":
-        node_strength = np.sum(adj, axis=0)
+        node_strength = np.abs(np.sum(adj, axis=0))
         # node_strength /= np.max(node_strength)
         # node_strength **= 4
         node_strength = node_strength / np.max(node_strength) * 60
@@ -535,7 +544,7 @@ def plot_edges(
         nimg = nib.load(atlas_nii)
         regn_sch_arr = nimg.get_fdata()
         for i in np.arange(0, num_node):
-            regn_sch_arr[np.where(regn_sch_arr == i + 1)] = np.sum(adj[i])
+            regn_sch_arr[np.where(regn_sch_arr == i + 1)] = np.sum((adj[i]))
         strength_nimg = nib.Nifti1Image(regn_sch_arr, nimg.affine)
         # replace this filename with BIDSy output
         # nib.save(strength_nimg, f'/Users/katherine.b/Dropbox/{title}predictive-strength.nii')
@@ -558,6 +567,7 @@ def plot_edges(
             i = plotting.plot_surf_stat_map(
                 fsaverage.pial_left,
                 texture_l,
+                bg_map=fsaverage.sulc_left,
                 symmetric_cbar=False,
                 threshold=0.5,
                 cmap=cmap,
@@ -568,6 +578,7 @@ def plot_edges(
             j = plotting.plot_surf_stat_map(
                 fsaverage.pial_left,
                 texture_l,
+                bg_map=fsaverage.sulc_left,
                 symmetric_cbar=False,
                 threshold=0.5,
                 cmap=cmap,
@@ -578,6 +589,7 @@ def plot_edges(
             k = plotting.plot_surf_stat_map(
                 fsaverage.pial_right,
                 texture_r,
+                bg_map=fsaverage.sulc_right,
                 symmetric_cbar=False,
                 threshold=0.5,
                 cmap=cmap,
@@ -588,6 +600,7 @@ def plot_edges(
             l = plotting.plot_surf_stat_map(
                 fsaverage.pial_right,
                 texture_r,
+                bg_map=fsaverage.sulc_right,
                 symmetric_cbar=False,
                 threshold=0.5,
                 cmap=cmap,
diff --git a/idconn/workflows/nbs_predict-e2.py b/idconn/workflows/nbs_predict-e2.py
new file mode 100644
index 0000000..c92d274
--- /dev/null
+++ b/idconn/workflows/nbs_predict-e2.py
@@ -0,0 +1,419 @@
+#!/usr/bin/env python3
+import pandas as pd
+import numpy as np
+import nibabel as nib
+import seaborn as sns
+import bids
+import matplotlib.pyplot as plt
+from os.path import join
+from datetime import datetime
+from time import strftime
+from scipy.stats import spearmanr
+from idconn import nbs, io
+
+from bct import threshold_proportional
+
+
+from sklearn.linear_model import LogisticRegression, Ridge
+from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate
+from sklearn.preprocessing import Normalizer, StandardScaler
+from sklearn.metrics import mean_squared_error
+from matplotlib.colors import ListedColormap
+import matplotlib as mpl
+
+
+import warnings
+import json
+
+warnings.simplefilter("ignore")
+
+today = datetime.today()
+today_str = strftime("%m_%d_%Y")
+
+TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674"
+TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset"
+DERIV_NAME = "IDConn"
+OUTCOME = "estradiol"
+CONFOUNDS = ["framewise_displacement"]
+TASK = "rest"
+ATLAS = "craddock2012"
+THRESH = 0.5
+alpha = 0.01
+atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz"
"/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +#print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=3, n_iterations=3 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model'] + +# this uses the most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... 
+# either way, I don't think cv_results is necessary
+
+# here is where we'd threshold the weighted average to use for elastic-net
+weighted_average = np.where(weighted_average > 0, weighted_average, 0)
+# print(np.sum(weighted_average))
+# nbs_vector = weighted_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+# filter = np.where(nbs_vector >= p75, True, False)
+# print(np.sum(filter))
+# print(nbs_vector.shape, filter.shape)
+
+thresh_average = threshold_proportional(weighted_average, THRESH)
+nbs_vector2 = thresh_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+filter = np.where(nbs_vector2 > 0, True, False)
+
+# mask = io.vectorize_corrmats(filter)
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter]
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    confounds_train = dat[CONFOUNDS].values
+    outcome_train = np.reshape(outcome, (outcome.shape[0],))
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # print(confounds.shape, outcome.shape)
+    if len(np.unique(outcome_train)) <= 2:
+        resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train)
+        train_outcome = outcome
+    elif len(np.unique(outcome_train)) > 3:
+        train_outcome, resid_edges = nbs.residualize(
+            X=edges_train, y=outcome_train, confounds=confounds_train
+        )
+    train_features = resid_edges
+else:
+    train_features = edges_train
+    train_outcome = outcome
+
+x_scaler = StandardScaler()
+y_scaler = StandardScaler()
+train_features = x_scaler.fit_transform(train_features)
+if len(np.unique(train_outcome)) <= 2:
+    pass
+else:
+    train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
+
+
+
+# run the model on the whole test dataset to get params
+
+# classification if the outcome is binary (for now)
+# could be extended to the multiclass case?
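The "regress out the confounds" comments carry the reasoning; here is a minimal sketch of the ordinary-least-squares residualization a helper like nbs.residualize presumably performs (signature and return order inferred from the calls above; the actual implementation may differ):

# Hypothetical sketch of confound residualization via OLS.
import numpy as np

def residualize_sketch(X, confounds, y=None):
    C = np.column_stack([np.ones(len(confounds)), confounds])  # add intercept
    beta_X, *_ = np.linalg.lstsq(C, X, rcond=None)
    X_resid = X - C @ beta_X                  # per-edge residuals
    if y is None:
        return X_resid
    beta_y, *_ = np.linalg.lstsq(C, y, rcond=None)
    return y - C @ beta_y, X_resid            # (outcome, edges) order as above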
+train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression( + penalty="l2", + solver="saga", + C=best.C_[0] + ) + train_metrics["alpha"] = best.C_[0] + #train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + + #train_metrics["l1_ratio"] = best.l1_ratio_ +#print(params) +#model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +#fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True + ) +train_metrics["in_sample_test"] = np.mean(scores['test_score']) +train_metrics["in_sample_train"] = np.mean(scores['train_score']) + +fitted = scores['estimator'][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f'{OUTCOME}_pred'] = y_pred +dat[f'{OUTCOME}_scaled'] = train_outcome + +Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']] +Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') + +train_colors = ['#a08ad1', #light + '#685690', #medium + '#3f2d69' #dark + ] +light_cmap = sns.color_palette('dark:#a08ad1') +dark_cmap = sns.color_palette('dark:#685690') + +fig,ax = plt.subplots() +g = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_pred', + style='bc', + data=Ys, + ax=ax, + palette=dark_cmap) +h = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_scaled', + style='bc', + data=Ys, + ax=ax, + palette=light_cmap) +ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? 
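One judgment call worth flagging: scores['estimator'][0] keeps whichever model happened to be fit on the first of the 50 folds. If the intent is the strongest fold's model, a one-line variant using the same names as above would be:

# Variant, not the script's current behavior: keep the best-scoring fold's model.
best_idx = int(np.argmax(scores["test_score"]))
fitted = scores["estimator"][best_idx]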
+# print(fitted.coef_.shape)
+# print(fitted.coef_)
+coeff_vec = np.zeros_like(filter, dtype=float)  # float dtype; plain zeros_like(filter) would inherit the mask's boolean dtype and truncate the betas
+j = 0
+for i in range(0, filter.shape[0]):
+    if filter[i]:
+        # print(j)
+        # print(fitted.coef_[0, j])
+        coeff_vec[i] = fitted.coef_[0, j]
+        j += 1
+    else:
+        pass
+
+# print(coeff_vec)
+print(coeff_vec)
+coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
+
+coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)
+coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv"))
+
+fig, fig2, nimg = io.plot_edges(
+    coef_mat,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Coefficients",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}")
+)
+
+
+layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
+
+test_df = test_df.loc[keep]
+
+outcome_test = test_df[OUTCOME].values
+# print(test_df)
+
+# print(outcome_test)
+matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
+    (len(test_df["adj"].dropna().index), num_node, num_node)
+)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    confounds_test = test_df[CONFOUNDS].values
+
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # print(confounds.shape, outcome.shape)
+    if len(np.unique(outcome_test)) <= 2:
+        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
+        test_outcome = outcome_test
+    elif len(np.unique(outcome_test)) > 3:
+        test_outcome, resid_edges = nbs.residualize(
+            X=edges_test, y=outcome_test, confounds=confounds_test
+        )
+    test_features = resid_edges
+else:
+    test_features = edges_test
+    test_outcome = outcome_test
+
+# scale after residualizing omg
+test_features = x_scaler.transform(test_features)
+if len(np.unique(test_outcome)) <= 2:
+    pass
+else:
+    test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1))
+# print(test_features.shape)
+# if the model is a logistic regression, i.e. with a binary outcome
+# then score is prediction accuracy
+# if the model is a linear regression, i.e., with a continuous outcome
+# then the score is R^2 (coefficient of determination)
+
+# fit trained ElasticNet, initialized via warm_start
+# prob in CV?
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome))
+# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
+test_metrics = {}
+
+# cross_validate(model, )
+y_pred = fitted.predict(X=test_features)
+score = fitted.score(X=test_features, y=np.ravel(test_outcome))
+if len(np.unique(test_outcome)) == 2:
+    test_metrics["accuracy"] = score
+else:
+    test_metrics["coefficient of determination"] = score
+corr = spearmanr(test_outcome, y_pred)
+test_metrics["pred_v_actual_corr"] = corr
+mse = mean_squared_error(test_outcome, y_pred)
+test_metrics["mean squared error"] = mse
+print("Out-of-sample prediction score:\t", score)
+print("Out-of-sample mean squared error:\t", mse)
+# print(np.mean(test_features))
+# pred_outcome = fitted.predict(test_features)
+test_df[f'{OUTCOME}_scaled'] = test_outcome
+test_df[f'{OUTCOME}_pred'] = y_pred
+Ys = test_df[[f'{OUTCOME}_scaled',
+              f'{OUTCOME}_pred',
+              'cycle_day',
+              'bc']]
+Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
+
+Ys['ppts'] = Ys.index.get_level_values(0)
+
+
+light_colors = ['#33ACE3',  # Bubbles
+                '#EA6964',  # Blossom
+                '#4AB62C']  # Buttercup
+dark_colors = ['#1278a6',
+               '#a11510',
+               '#228208']
+light = ListedColormap(light_colors, name='light_powderpuff')
+dark = ListedColormap(dark_colors, name='dark_powderpuff')
+mpl.colormaps.register(cmap=light)
+mpl.colormaps.register(cmap=dark)
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(x='cycle_day',
+                    y=f'{OUTCOME}_pred',
+                    style='bc',
+                    data=Ys,
+                    hue='ppts',
+                    hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'],
+                    ax=ax,
+                    palette='light_powderpuff')
+h = sns.scatterplot(x='cycle_day',
+                    y=f'{OUTCOME}_scaled',
+                    style='bc',
+                    data=Ys,
+                    hue='ppts',
+                    hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'],
+                    ax=ax,
+                    palette='dark_powderpuff')
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left')
+fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+
+
+# print(test_outcome, "\n", y_pred)
+# print(pred_outcome)
+if len(np.unique(test_outcome)) > 2:
+    print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr)
+    test_metrics["spearman correlation"] = corr
+with open(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(test_metrics, fp)
+np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred)
diff --git a/idconn/workflows/nbs_predict-e2xp4-bc.py b/idconn/workflows/nbs_predict-e2xp4-bc.py
new file mode 100644
index 0000000..ad6a6d8
--- /dev/null
+++ b/idconn/workflows/nbs_predict-e2xp4-bc.py
@@ -0,0 +1,422 @@
+#!/usr/bin/env python3
+import pandas as pd
+import numpy as np
+import nibabel as nib
+import seaborn as sns
+import bids
+import matplotlib.pyplot as plt
+from os.path import join
+from datetime import datetime
+from time import strftime
+from scipy.stats import spearmanr
+from idconn import nbs, io
+
+from bct import threshold_proportional
+
+
+from sklearn.linear_model import LogisticRegression, Ridge
+from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate
+from sklearn.preprocessing import Normalizer, StandardScaler
+from sklearn.metrics import mean_squared_error
+from matplotlib.colors import ListedColormap
+import matplotlib as mpl
+
+
+import warnings
+import json
+
+warnings.simplefilter("ignore")
+
+today = datetime.today()
+today_str = strftime("%m_%d_%Y")
+
+TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674"
+TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset"
+DERIV_NAME = "IDConn"
+OUTCOME = "estradiol÷progesterone"
+CONFOUNDS = ["framewise_displacement", "bc"]
+TASK = "rest"
+ATLAS = "craddock2012"
+THRESH = 0.5
+alpha = 0.01
+atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz"
+
+
+layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
+
+dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+dat['estradiol÷progesterone'] = dat['estradiol'] / dat['progesterone']
+
+keep = dat["adj"].dropna().index
+dat = dat.loc[keep]
+
+groups = dat["bc"]
+# print(dat['adj'].values.shape)
+num_node = dat.iloc[0]["adj"].shape[0]
+
+matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node))
+upper_tri = np.triu_indices(num_node, k=1)
+
+outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
+
+# print(len(np.unique(outcome)))
+
+if CONFOUNDS is not None:
+    confounds = dat[CONFOUNDS]
+    base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}"
+else:
+    confounds = None
+    base_name = f"nbs-predict_outcome-{OUTCOME}"
+# print(dat['bc'])
+
+weighted_average, cv_results = nbs.kfold_nbs(
+    matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000
+)
+
+fig, fig2, nimg = io.plot_edges(
+    weighted_average,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Precision-Weighted Average",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}")
+)
+
+
+avg_df = pd.DataFrame(
+    weighted_average,
+    index=range(0, weighted_average.shape[0]),
+    columns=range(0, weighted_average.shape[1]),
+)
+
+cv_results.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t"
+)
+avg_df.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
+)
+
+best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model']
+
+# this uses the most predictive subnetwork as features in the model
+# might replace with thresholded weighted_average
+# or use _all_ the edges in weighted_average with KRR or ElasticNet...
+# ORRR use thresholded weighted average edges with ElasticNet...
+# - stays true to NBS-Predict
+# - increases parsimony while handling multicollinearity...
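The ratio outcome defined above goes infinite or NaN wherever progesterone is zero or missing; a guarded variant (column names as in the script) might be:

# Hypothetical guard: mask non-positive progesterone before taking the ratio,
# so the dropna() calls remove those sessions instead of propagating inf.
p4 = dat['progesterone'].where(dat['progesterone'] > 0)
dat['estradiol÷progesterone'] = dat['estradiol'] / p4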
+# either way, I don't think cv_results is necessary
+
+# here is where we'd threshold the weighted average to use for elastic-net
+weighted_average = np.where(weighted_average > 0, weighted_average, 0)
+# print(np.sum(weighted_average))
+# nbs_vector = weighted_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+# filter = np.where(nbs_vector >= p75, True, False)
+# print(np.sum(filter))
+# print(nbs_vector.shape, filter.shape)
+
+thresh_average = threshold_proportional(weighted_average, THRESH)
+nbs_vector2 = thresh_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+filter = np.where(nbs_vector2 > 0, True, False)
+
+# mask = io.vectorize_corrmats(filter)
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter]
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    confounds_train = dat[CONFOUNDS].values
+    outcome_train = np.reshape(outcome, (outcome.shape[0],))
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # print(confounds.shape, outcome.shape)
+    if len(np.unique(outcome_train)) <= 2:
+        resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train)
+        train_outcome = outcome
+    elif len(np.unique(outcome_train)) > 3:
+        train_outcome, resid_edges = nbs.residualize(
+            X=edges_train, y=outcome_train, confounds=confounds_train
+        )
+    train_features = resid_edges
+else:
+    train_features = edges_train
+    train_outcome = outcome
+
+x_scaler = StandardScaler()
+y_scaler = StandardScaler()
+train_features = x_scaler.fit_transform(train_features)
+if len(np.unique(train_outcome)) <= 2:
+    pass
+else:
+    train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
+
+
+
+# run the model on the whole test dataset to get params
+
+# classification if the outcome is binary (for now)
+# could be extended to the multiclass case?
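A caveat on the cross-validation block that follows: groups is passed to cross_validate, but RepeatedKFold ignores the groups argument, so splits are fully random. If folds should keep the bc grouping intact, a group-aware splitter is needed, for example:

# Variant under the assumption that folds should respect the bc grouping;
# GroupKFold actually consumes the groups argument that (Repeated)KFold ignores.
from sklearn.model_selection import GroupKFold

cv = GroupKFold(n_splits=2)  # at most as many splits as there are unique groups
scores = cross_validate(model, train_features, train_outcome,
                        groups=groups, cv=cv,
                        return_estimator=True, return_train_score=True)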
+train_metrics = {}
+if len(np.unique(outcome)) == 2:
+    model = LogisticRegression(
+        penalty="l2",
+        solver="saga",
+        C=best.C_[0],
+    )
+    train_metrics["alpha"] = best.C_[0]
+    # train_metrics["l1_ratio"] = best.l1_ratio_
+else:
+    model = Ridge(
+        solver="auto",
+        alpha=best.alpha_,
+        fit_intercept=False,
+    )
+    train_metrics["alpha"] = best.alpha_
+
+cv = RepeatedKFold(n_splits=5, n_repeats=10)
+
+# train_metrics["l1_ratio"] = best.l1_ratio_
+# print(params)
+# model.set_params(**params)
+# train ElasticNet on full train dataset, using feature extraction from NBS-Predict
+# fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
+scores = cross_validate(
+    model,
+    train_features,
+    train_outcome,
+    groups=groups,
+    cv=cv,
+    return_estimator=True,
+    return_train_score=True,
+)
+train_metrics["in_sample_test"] = np.mean(scores['test_score'])
+train_metrics["in_sample_train"] = np.mean(scores['train_score'])
+
+fitted = scores['estimator'][0]
+y_pred = fitted.predict(X=train_features)
+train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome)
+
+dat[f'{OUTCOME}_pred'] = y_pred
+dat[f'{OUTCOME}_scaled'] = train_outcome
+
+Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']]
+Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
+
+train_colors = ['#a08ad1',  # light
+                '#685690',  # medium
+                '#3f2d69']  # dark
+light_cmap = sns.color_palette('dark:#a08ad1')
+dark_cmap = sns.color_palette('dark:#685690')
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(x='cycle_day',
+                    y=f'{OUTCOME}_pred',
+                    style='bc',
+                    data=Ys,
+                    ax=ax,
+                    palette=dark_cmap)
+h = sns.scatterplot(x='cycle_day',
+                    y=f'{OUTCOME}_scaled',
+                    style='bc',
+                    data=Ys,
+                    ax=ax,
+                    palette=light_cmap)
+ax.legend(bbox_to_anchor=(1.0, 0.5))
+fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+
+mse = mean_squared_error(train_outcome, y_pred)
+train_metrics["mean squared error"] = mse
+print("In-sample train score: ", train_metrics["in_sample_train"])
+print("In-sample test score: ", train_metrics["in_sample_test"])
+print("In-sample mean squared error: ", mse)
+# print(np.mean(train_features))
+with open(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(train_metrics, fp)
+
+# yoink the coefficients? for a more parsimonious figure?
+# print(fitted.coef_.shape)
+# print(fitted.coef_)
+coeff_vec = np.zeros_like(filter, dtype=float)  # float dtype; plain zeros_like(filter) would inherit the mask's boolean dtype and truncate the betas
+j = 0
+for i in range(0, filter.shape[0]):
+    if filter[i]:
+        # print(j)
+        # print(fitted.coef_[0, j])
+        coeff_vec[i] = fitted.coef_[0, j]
+        j += 1
+    else:
+        pass
+
+# print(coeff_vec)
+print(coeff_vec)
+coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
+
+coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)
+coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv"))
+
+fig, fig2, nimg = io.plot_edges(
+    coef_mat,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Coefficients",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}")
+)
+
+
+layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+test_df['estradiol÷progesterone'] = test_df['estradiol'] / test_df['progesterone']
+
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
+
+test_df = test_df.loc[keep]
+
+outcome_test = test_df[OUTCOME].values
+# print(test_df)
+
+# print(outcome_test)
+matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
+    (len(test_df["adj"].dropna().index), num_node, num_node)
+)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    confounds_test = test_df[CONFOUNDS].values
+
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # print(confounds.shape, outcome.shape)
+    if len(np.unique(outcome_test)) <= 2:
+        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
+        test_outcome = outcome_test
+    elif len(np.unique(outcome_test)) > 3:
+        test_outcome, resid_edges = nbs.residualize(
+            X=edges_test, y=outcome_test, confounds=confounds_test
+        )
+    test_features = resid_edges
+else:
+    test_features = edges_test
+    test_outcome = outcome_test
+
+# scale after residualizing omg
+test_features = x_scaler.transform(test_features)
+if len(np.unique(test_outcome)) <= 2:
+    pass
+else:
+    test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1))
+# print(test_features.shape)
+# if the model is a logistic regression, i.e. with a binary outcome
+# then score is prediction accuracy
+# if the model is a linear regression, i.e., with a continuous outcome
+# then the score is R^2 (coefficient of determination)
+
+# fit trained ElasticNet, initialized via warm_start
+# prob in CV?
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome))
+# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
+test_metrics = {}
+
+# cross_validate(model, )
+y_pred = fitted.predict(X=test_features)
+score = fitted.score(X=test_features, y=np.ravel(test_outcome))
+if len(np.unique(test_outcome)) == 2:
+    test_metrics["accuracy"] = score
+else:
+    test_metrics["coefficient of determination"] = score
+corr = spearmanr(test_outcome, y_pred)
+test_metrics["pred_v_actual_corr"] = corr
+mse = mean_squared_error(test_outcome, y_pred)
+test_metrics["mean squared error"] = mse
+print("Out-of-sample prediction score:\t", score)
+print("Out-of-sample mean squared error:\t", mse)
+# print(np.mean(test_features))
+# pred_outcome = fitted.predict(test_features)
+test_df[f'{OUTCOME}_scaled'] = test_outcome
+test_df[f'{OUTCOME}_pred'] = y_pred
+Ys = test_df[[f'{OUTCOME}_scaled',
+              f'{OUTCOME}_pred',
+              'cycle_day',
+              'bc']]
+Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
+
+Ys['ppts'] = Ys.index.get_level_values(0)
+
+
+light_colors = ['#33ACE3',  # Bubbles
+                '#EA6964',  # Blossom
+                '#4AB62C']  # Buttercup
+dark_colors = ['#1278a6',
+               '#a11510',
+               '#228208']
+light = ListedColormap(light_colors, name='light_powderpuff')
+dark = ListedColormap(dark_colors, name='dark_powderpuff')
+mpl.colormaps.register(cmap=light)
+mpl.colormaps.register(cmap=dark)
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(x='cycle_day',
+                    y=f'{OUTCOME}_pred',
+                    style='bc',
+                    data=Ys,
+                    hue='ppts',
+                    hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'],
+                    ax=ax,
+                    palette='light_powderpuff')
+h = sns.scatterplot(x='cycle_day',
+                    y=f'{OUTCOME}_scaled',
+                    style='bc',
+                    data=Ys,
+                    hue='ppts',
+                    hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'],
+                    ax=ax,
+                    palette='dark_powderpuff')
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left')
+fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+
+
+# print(test_outcome, "\n", y_pred)
+# print(pred_outcome)
+if len(np.unique(test_outcome)) > 2:
+    print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr)
+    test_metrics["spearman correlation"] = corr
+with open(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(test_metrics, fp)
+np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred)
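nbs_predict-e2.py, -e2xp4-bc.py, -e2xp4.py (below), and -p4.py differ only in OUTCOME, CONFOUNDS, and the NBS settings. A hedged refactor would hoist those into command-line arguments so one script serves all four runs; argument names here are illustrative, not an existing CLI:

# Sketch of a single parameterized entry point for the near-identical workflows.
import argparse

parser = argparse.ArgumentParser(description="NBS-Predict workflow")
parser.add_argument("--outcome", required=True)  # e.g. estradiol, progesterone
parser.add_argument("--confounds", nargs="*", default=["framewise_displacement"])
parser.add_argument("--n-splits", type=int, default=5)
parser.add_argument("--n-iterations", type=int, default=1000)
args = parser.parse_args()
OUTCOME, CONFOUNDS = args.outcome, args.confounds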
"/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "estradiol÷progesterone" +CONFOUNDS = ["framewise_displacement"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +dat['estradiol÷progesterone'] = dat['estradiol'] / dat['progesterone'] + +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +#print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model'] + +# this uses the most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... 
+# either way, I don't think cv_results is necessary
+
+# here is where we'd threshold the weighted average to use for elastic-net
+weighted_average = np.where(weighted_average > 0, weighted_average, 0)
+# print(np.sum(weighted_average))
+# nbs_vector = weighted_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+# filter = np.where(nbs_vector >= p75, True, False)
+# print(np.sum(filter))
+# print(nbs_vector.shape, filter.shape)
+
+thresh_average = threshold_proportional(weighted_average, THRESH)
+nbs_vector2 = thresh_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+filter = np.where(nbs_vector2 > 0, True, False)
+
+# mask = io.vectorize_corrmats(filter)
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter]
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    confounds_train = dat[CONFOUNDS].values
+    outcome_train = np.reshape(outcome, (outcome.shape[0],))
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # print(confounds.shape, outcome.shape)
+    if len(np.unique(outcome_train)) <= 2:
+        resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train)
+        train_outcome = outcome
+    elif len(np.unique(outcome_train)) > 3:
+        train_outcome, resid_edges = nbs.residualize(
+            X=edges_train, y=outcome_train, confounds=confounds_train
+        )
+    train_features = resid_edges
+else:
+    train_features = edges_train
+    train_outcome = outcome
+
+x_scaler = StandardScaler()
+y_scaler = StandardScaler()
+train_features = x_scaler.fit_transform(train_features)
+if len(np.unique(train_outcome)) <= 2:
+    pass
+else:
+    train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
+
+
+
+# run the model on the whole test dataset to get params
+
+# classification if the outcome is binary (for now)
+# could be extended to the multiclass case?
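On the comment's open question: scikit-learn's LogisticRegression already handles more than two classes, so the binary branch below could plausibly generalize by keying on the class count rather than on exactly two labels. A hedged sketch, keeping the script's solver and penalty:

# Sketch of a multiclass-friendly dispatch; the 10-class cutoff is an
# arbitrary illustrative heuristic for "small label set means classification".
n_classes = len(np.unique(outcome))
if 2 <= n_classes <= 10:
    model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0])
else:
    model = Ridge(solver="auto", alpha=best.alpha_, fit_intercept=False)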
+train_metrics = {}
+if len(np.unique(outcome)) == 2:
+    model = LogisticRegression(
+        penalty="l2",
+        solver="saga",
+        C=best.C_[0],
+    )
+    train_metrics["alpha"] = best.C_[0]
+    # train_metrics["l1_ratio"] = best.l1_ratio_
+else:
+    model = Ridge(
+        solver="auto",
+        alpha=best.alpha_,
+        fit_intercept=False,
+    )
+    train_metrics["alpha"] = best.alpha_
+
+cv = RepeatedKFold(n_splits=5, n_repeats=10)
+
+# train_metrics["l1_ratio"] = best.l1_ratio_
+# print(params)
+# model.set_params(**params)
+# train ElasticNet on full train dataset, using feature extraction from NBS-Predict
+# fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
+scores = cross_validate(
+    model,
+    train_features,
+    train_outcome,
+    groups=groups,
+    cv=cv,
+    return_estimator=True,
+    return_train_score=True,
+)
+train_metrics["in_sample_test"] = np.mean(scores['test_score'])
+train_metrics["in_sample_train"] = np.mean(scores['train_score'])
+
+fitted = scores['estimator'][0]
+y_pred = fitted.predict(X=train_features)
+train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome)
+
+dat[f'{OUTCOME}_pred'] = y_pred
+dat[f'{OUTCOME}_scaled'] = train_outcome
+
+Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']]
+Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
+
+train_colors = ['#a08ad1',  # light
+                '#685690',  # medium
+                '#3f2d69']  # dark
+light_cmap = sns.color_palette('dark:#a08ad1')
+dark_cmap = sns.color_palette('dark:#685690')
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(x='cycle_day',
+                    y=f'{OUTCOME}_pred',
+                    style='bc',
+                    data=Ys,
+                    ax=ax,
+                    palette=dark_cmap)
+h = sns.scatterplot(x='cycle_day',
+                    y=f'{OUTCOME}_scaled',
+                    style='bc',
+                    data=Ys,
+                    ax=ax,
+                    palette=light_cmap)
+ax.legend(bbox_to_anchor=(1.0, 0.5))
+fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+
+mse = mean_squared_error(train_outcome, y_pred)
+train_metrics["mean squared error"] = mse
+print("In-sample train score: ", train_metrics["in_sample_train"])
+print("In-sample test score: ", train_metrics["in_sample_test"])
+print("In-sample mean squared error: ", mse)
+# print(np.mean(train_features))
+with open(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(train_metrics, fp)
+
+# yoink the coefficients? for a more parsimonious figure?
+# print(fitted.coef_.shape)
+# print(fitted.coef_)
+coeff_vec = np.zeros_like(filter, dtype=float)  # float dtype; plain zeros_like(filter) would inherit the mask's boolean dtype and truncate the betas
+j = 0
+for i in range(0, filter.shape[0]):
+    if filter[i]:
+        # print(j)
+        # print(fitted.coef_[0, j])
+        coeff_vec[i] = fitted.coef_[0, j]
+        j += 1
+    else:
+        pass
+
+# print(coeff_vec)
+print(coeff_vec)
+coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
+
+coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)
+coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv"))
+
+fig, fig2, nimg = io.plot_edges(
+    coef_mat,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Coefficients",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}")
+)
+
+
+layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+test_df['estradiol÷progesterone'] = test_df['estradiol'] / test_df['progesterone']
+
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
+
+test_df = test_df.loc[keep]
+
+outcome_test = test_df[OUTCOME].values
+# print(test_df)
+
+# print(outcome_test)
+matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
+    (len(test_df["adj"].dropna().index), num_node, num_node)
+)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    confounds_test = test_df[CONFOUNDS].values
+
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # print(confounds.shape, outcome.shape)
+    if len(np.unique(outcome_test)) <= 2:
+        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
+        test_outcome = outcome_test
+    elif len(np.unique(outcome_test)) > 3:
+        test_outcome, resid_edges = nbs.residualize(
+            X=edges_test, y=outcome_test, confounds=confounds_test
+        )
+    test_features = resid_edges
+else:
+    test_features = edges_test
+    test_outcome = outcome_test
+
+# scale after residualizing omg
+test_features = x_scaler.transform(test_features)
+if len(np.unique(test_outcome)) <= 2:
+    pass
+else:
+    test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1))
+# print(test_features.shape)
+# if the model is a logistic regression, i.e. with a binary outcome
+# then score is prediction accuracy
+# if the model is a linear regression, i.e., with a continuous outcome
+# then the score is R^2 (coefficient of determination)
+
+# fit trained ElasticNet, initialized via warm_start
+# prob in CV?
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome))
+# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
+test_metrics = {}
+
+# cross_validate(model, )
+y_pred = fitted.predict(X=test_features)
+score = fitted.score(X=test_features, y=np.ravel(test_outcome))
+if len(np.unique(test_outcome)) == 2:
+    test_metrics["accuracy"] = score
+else:
+    test_metrics["coefficient of determination"] = score
+corr = spearmanr(test_outcome, y_pred)
+test_metrics["pred_v_actual_corr"] = corr
+mse = mean_squared_error(test_outcome, y_pred)
+test_metrics["mean squared error"] = mse
+print("Out-of-sample prediction score:\t", score)
+print("Out-of-sample mean squared error:\t", mse)
+# print(np.mean(test_features))
+# pred_outcome = fitted.predict(test_features)
+test_df[f'{OUTCOME}_scaled'] = test_outcome
+test_df[f'{OUTCOME}_pred'] = y_pred
+Ys = test_df[[f'{OUTCOME}_scaled',
+              f'{OUTCOME}_pred',
+              'cycle_day',
+              'bc']]
+Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
+
+Ys['ppts'] = Ys.index.get_level_values(0)
+
+
+light_colors = ['#33ACE3',  # Bubbles
+                '#EA6964',  # Blossom
+                '#4AB62C']  # Buttercup
+dark_colors = ['#1278a6',
+               '#a11510',
+               '#228208']
+light = ListedColormap(light_colors, name='light_powderpuff')
+dark = ListedColormap(dark_colors, name='dark_powderpuff')
+mpl.colormaps.register(cmap=light)
+mpl.colormaps.register(cmap=dark)
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(x='cycle_day',
+                    y=f'{OUTCOME}_pred',
+                    style='bc',
+                    data=Ys,
+                    hue='ppts',
+                    hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'],
+                    ax=ax,
+                    palette='light_powderpuff')
+h = sns.scatterplot(x='cycle_day',
+                    y=f'{OUTCOME}_scaled',
+                    style='bc',
+                    data=Ys,
+                    hue='ppts',
+                    hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'],
+                    ax=ax,
+                    palette='dark_powderpuff')
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left')
+fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+
+
+# print(test_outcome, "\n", y_pred)
+# print(pred_outcome)
+if len(np.unique(test_outcome)) > 2:
+    print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr)
+    test_metrics["spearman correlation"] = corr
+with open(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(test_metrics, fp)
+np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred)
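A serialization note that applies to all four workflows: test_metrics["pred_v_actual_corr"] holds the result object from spearmanr, which json.dump flattens into a bare [rho, p] list. Storing named fields is friendlier to downstream readers (attribute names per recent scipy; older releases expose corr.correlation instead of corr.statistic):

# Assumes scipy >= 1.9 attribute names; fall back to corr.correlation if older.
test_metrics["pred_v_actual_corr"] = {"rho": float(corr.statistic),
                                      "p": float(corr.pvalue)}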
"/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "progesterone" +CONFOUNDS = ["framewise_displacement"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +#print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model'] + +# this uses the most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... 
+# either way, I don't think cv_results is necessary
+
+# here is where we'd threshold the weighted average to use for elastic-net
+weighted_average = np.where(weighted_average > 0, weighted_average, 0)
+# print(np.sum(weighted_average))
+# nbs_vector = weighted_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+# filter = np.where(nbs_vector >= p75, True, False)
+# print(np.sum(filter))
+# print(nbs_vector.shape, filter.shape)
+
+thresh_average = threshold_proportional(weighted_average, THRESH)
+nbs_vector2 = thresh_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+filter = np.where(nbs_vector2 > 0, True, False)
+
+# mask = io.vectorize_corrmats(filter)
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter]
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    confounds_train = dat[CONFOUNDS].values
+    outcome_train = np.reshape(outcome, (outcome.shape[0],))
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # print(confounds.shape, outcome.shape)
+    if len(np.unique(outcome_train)) <= 2:
+        resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train)
+        train_outcome = outcome
+    elif len(np.unique(outcome_train)) > 3:
+        train_outcome, resid_edges = nbs.residualize(
+            X=edges_train, y=outcome_train, confounds=confounds_train
+        )
+    train_features = resid_edges
+else:
+    train_features = edges_train
+    train_outcome = outcome
+
+x_scaler = StandardScaler()
+y_scaler = StandardScaler()
+train_features = x_scaler.fit_transform(train_features)
+if len(np.unique(train_outcome)) <= 2:
+    pass
+else:
+    train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
+
+
+
+# run the model on the whole test dataset to get params
+
+# classification if the outcome is binary (for now)
+# could be extended to the multiclass case?
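Because train_outcome is replaced by its standardized version just above, every "_scaled" column and MSE reported downstream is in z-units. Recovering original progesterone units would take the fitted scaler, e.g. (using the y_pred computed further down):

# Sketch: map standardized predictions back to original outcome units.
y_pred_raw = y_scaler.inverse_transform(np.asarray(y_pred).reshape(-1, 1))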
+train_metrics = {}
+if len(np.unique(outcome)) == 2:
+    model = LogisticRegression(
+        penalty="l2",
+        solver="saga",
+        C=best.C_[0],
+    )
+    train_metrics["alpha"] = best.C_[0]
+    # train_metrics["l1_ratio"] = best.l1_ratio_
+else:
+    model = Ridge(
+        solver="auto",
+        alpha=best.alpha_,
+        fit_intercept=False,
+    )
+    train_metrics["alpha"] = best.alpha_
+
+cv = RepeatedKFold(n_splits=5, n_repeats=10)
+
+# train_metrics["l1_ratio"] = best.l1_ratio_
+# print(params)
+# model.set_params(**params)
+# train ElasticNet on full train dataset, using feature extraction from NBS-Predict
+# fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
+scores = cross_validate(
+    model,
+    train_features,
+    train_outcome,
+    groups=groups,
+    cv=cv,
+    return_estimator=True,
+    return_train_score=True,
+)
+train_metrics["in_sample_test"] = np.mean(scores['test_score'])
+train_metrics["in_sample_train"] = np.mean(scores['train_score'])
+
+fitted = scores['estimator'][0]
+y_pred = fitted.predict(X=train_features)
+train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome)
+
+dat[f'{OUTCOME}_pred'] = y_pred
+dat[f'{OUTCOME}_scaled'] = train_outcome
+
+Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']]
+Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
+
+train_colors = ['#a08ad1',  # light
+                '#685690',  # medium
+                '#3f2d69']  # dark
+light_cmap = sns.color_palette('dark:#a08ad1')
+dark_cmap = sns.color_palette('dark:#685690')
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(x='cycle_day',
+                    y=f'{OUTCOME}_pred',
+                    style='bc',
+                    data=Ys,
+                    ax=ax,
+                    palette=dark_cmap)
+h = sns.scatterplot(x='cycle_day',
+                    y=f'{OUTCOME}_scaled',
+                    style='bc',
+                    data=Ys,
+                    ax=ax,
+                    palette=light_cmap)
+ax.legend(bbox_to_anchor=(1.0, 0.5))
+fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+
+mse = mean_squared_error(train_outcome, y_pred)
+train_metrics["mean squared error"] = mse
+print("In-sample train score: ", train_metrics["in_sample_train"])
+print("In-sample test score: ", train_metrics["in_sample_test"])
+print("In-sample mean squared error: ", mse)
+# print(np.mean(train_features))
+with open(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(train_metrics, fp)
+
+# yoink the coefficients? for a more parsimonious figure?
+# print(fitted.coef_.shape)
+coeff_vec = np.zeros_like(filter, dtype=float)  # float dtype; plain zeros_like(filter) would inherit the mask's boolean dtype and truncate the betas
+j = 0
+for i in range(0, filter.shape[0]):
+    if filter[i]:
+        # print(j)
+        coeff_vec[i] = fitted.coef_[0, j]
+        j += 1
+    else:
+        pass
+
+# print(coeff_vec)
+
+coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
+coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)
+coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv"))
+
+fig, fig2, nimg = io.plot_edges(
+    coef_mat,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Coefficients",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}")
+)
+
+
+layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
+
+test_df = test_df.loc[keep]
+
+outcome_test = test_df[OUTCOME].values
+# print(test_df)
+
+# print(outcome_test)
+matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
+    (len(test_df["adj"].dropna().index), num_node, num_node)
+)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    confounds_test = test_df[CONFOUNDS].values
+
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # print(confounds.shape, outcome.shape)
+    if len(np.unique(outcome_test)) <= 2:
+        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
+        test_outcome = outcome_test
+    elif len(np.unique(outcome_test)) > 3:
+        test_outcome, resid_edges = nbs.residualize(
+            X=edges_test, y=outcome_test, confounds=confounds_test
+        )
+    test_features = resid_edges
+else:
+    test_features = edges_test
+    test_outcome = outcome_test
+
+# scale after residualizing omg
+test_features = x_scaler.transform(test_features)
+if len(np.unique(test_outcome)) <= 2:
+    pass
+else:
+    test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1))
+# print(test_features.shape)
+# if the model is a logistic regression, i.e. with a binary outcome
+# then score is prediction accuracy
+# if the model is a linear regression, i.e., with a continuous outcome
+# then the score is R^2 (coefficient of determination)
+
+# fit trained ElasticNet, initialized via warm_start
+# prob in CV?
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome))
+# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
+test_metrics = {}
+
+# cross_validate(model, )
+y_pred = fitted.predict(X=test_features)
+score = fitted.score(X=test_features, y=np.ravel(test_outcome))
+if len(np.unique(test_outcome)) == 2:
+    test_metrics["accuracy"] = score
+else:
+    test_metrics["coefficient of determination"] = score
+corr = spearmanr(test_outcome, y_pred)
+test_metrics["pred_v_actual_corr"] = corr
+mse = mean_squared_error(test_outcome, y_pred)
+test_metrics["mean squared error"] = mse
+print("Out-of-sample prediction score:\t", score)
+print("Out-of-sample mean squared error:\t", mse)
+# print(np.mean(test_features))
+# pred_outcome = fitted.predict(test_features)
+test_df[f'{OUTCOME}_scaled'] = test_outcome
+test_df[f'{OUTCOME}_pred'] = y_pred
+Ys = test_df[[f'{OUTCOME}_scaled',
+              f'{OUTCOME}_pred',
+              'cycle_day',
+              'bc']]
+Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
+
+Ys['ppts'] = Ys.index.get_level_values(0)
+
+
+light_colors = ['#33ACE3',  # Bubbles
+                '#EA6964',  # Blossom
+                '#4AB62C']  # Buttercup
+dark_colors = ['#1278a6',
+               '#a11510',
+               '#228208']
+light = ListedColormap(light_colors, name='light_powderpuff')
+dark = ListedColormap(dark_colors, name='dark_powderpuff')
+mpl.colormaps.register(cmap=light)
+mpl.colormaps.register(cmap=dark)
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(x='cycle_day',
+                    y=f'{OUTCOME}_pred',
+                    style='bc',
+                    data=Ys,
+                    hue='ppts',
+                    hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'],
+                    ax=ax,
+                    palette='light_powderpuff')
+h = sns.scatterplot(x='cycle_day',
+                    y=f'{OUTCOME}_scaled',
+                    style='bc',
+                    data=Ys,
+                    hue='ppts',
+                    hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'],
+                    ax=ax,
+                    palette='dark_powderpuff')
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left')
+fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+
+
+# print(test_outcome, "\n", y_pred)
+# print(pred_outcome)
+if len(np.unique(test_outcome)) > 2:
+    print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr)
+    test_metrics["spearman correlation"] = corr
+with open(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(test_metrics, fp)
+np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred)
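One ordering caveat for the nbs_predict.py hunks below: the test-set block is moved above the NBS run, but edges_test still indexes with filter, which is only computed after kfold_nbs and thresholding. The masking (and the residualization that consumes it) has to stay downstream; a minimal reordering sketch, using the script's own names:

# Sketch: load test edge vectors up front, defer the mask until filter exists.
edges_test_all = np.vstack(test_df["edge_vector"].dropna().values)
# ... run kfold_nbs and threshold the weighted average to define `filter` ...
edges_test = edges_test_all[:, filter]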
diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py
index 50563e7..46e804c 100644
--- a/idconn/workflows/nbs_predict.py
+++ b/idconn/workflows/nbs_predict.py
@@ -29,21 +29,21 @@
 today = datetime.today()
 today_str = strftime("%m_%d_%Y")
 
-TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674"
-TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset"
+TRAIN_DSET = ""
+TEST_DSET = ""
 DERIV_NAME = "IDConn"
-OUTCOME = "bc"
+OUTCOME = ""
 CONFOUNDS = "framewise_displacement"
 TASK = "rest"
 ATLAS = "craddock2012"
 THRESH = 0.5
 alpha = 0.05
-atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz"
+atlas_fname = "craddock2012_tcorr05_2level_270_2mm.nii.gz"
 
-layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
+train_layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
 
-dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+dat = io.read_corrmats(train_layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
 
 keep = dat["adj"].dropna().index
 dat = dat.loc[keep]
@@ -65,6 +65,47 @@
 base_name = f"nbs-predict_outcome-{OUTCOME}"
 # print(dat['bc'])
 
+# load in test data
+test_layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(test_layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
+
+test_df = test_df.loc[keep]
+
+outcome_test = test_df[OUTCOME].values
+# print(test_df)
+
+# print(outcome_test)
+matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
+    (len(test_df["adj"].dropna().index), num_node, num_node)
+)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
+
+
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    confounds_test = test_df[CONFOUNDS].values
+
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # print(confounds.shape, outcome.shape)
+    if len(np.unique(outcome_test)) <= 2:
+        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
+        test_outcome = outcome_test
+    elif len(np.unique(outcome_test)) > 3:
+        test_outcome, resid_edges = nbs.residualize(
+            X=edges_test, y=outcome_test, confounds=confounds_test
+        )
+    test_features = resid_edges
+else:
+    test_features = edges_test
+    test_outcome = outcome_test
+
+
 weighted_average, cv_results = nbs.kfold_nbs(
     matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000
 )
@@ -269,43 +310,6 @@
 )
 
-layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
-
-test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
-
-keep = test_df[[OUTCOME, "adj"]].dropna().index
-# print(keep)
-
-test_df = test_df.loc[keep]
-
-outcome_test = test_df[OUTCOME].values
-# print(test_df)
-
-# print(outcome_test)
-matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
-    (len(test_df["adj"].dropna().index), num_node, num_node)
-)
-edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
-
-# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
-if CONFOUNDS is not None:
-    confounds_test = test_df[CONFOUNDS].values
-
-    # regress out the confounds from each edge and the outcome variable,
-    # use the residuals for the rest of the algorithm
-    # print(confounds.shape, outcome.shape)
-    if len(np.unique(outcome_test)) <= 2:
-        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
-        test_outcome = outcome_test
-    elif len(np.unique(outcome_test)) > 3:
-        test_outcome, resid_edges = nbs.residualize(
-            X=edges_test, y=outcome_test, confounds=confounds_test
-        )
-    test_features = resid_edges
-else:
-    test_features = edges_test
-    test_outcome = outcome_test
-
 # scale after residualizing omg
 test_features = x_scaler.transform(test_features)
 if len(np.unique(test_outcome)) <= 2: