From 4b61fa8a5b5ec8d47a2a71beb9989829a5eee031 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Tue, 5 Mar 2024 12:37:20 +0000 Subject: [PATCH] Handle [rare] case where PGS has 0 variance by abstaining from PGS adjustment (outputting NAs and adding a WARNING to log). Signed-off-by: smlmbrt --- pgscatalog_utils/ancestry/tools.py | 174 ++++++++++++++++------------- 1 file changed, 95 insertions(+), 79 deletions(-) diff --git a/pgscatalog_utils/ancestry/tools.py b/pgscatalog_utils/ancestry/tools.py index 8ed8af9..11fa4a7 100644 --- a/pgscatalog_utils/ancestry/tools.py +++ b/pgscatalog_utils/ancestry/tools.py @@ -239,6 +239,7 @@ def pgs_adjust(ref_df, target_df, scorecols: list, ref_pop_col, target_pop_col, results_ref = {} results_target = {} results_models = {} # used to store regression information + scorecols_drop = set() for c_pgs in scorecols: # Makes melting easier later sum_col = 'SUM|{}'.format(c_pgs) @@ -246,6 +247,11 @@ def pgs_adjust(ref_df, target_df, scorecols: list, ref_pop_col, target_pop_col, results_target[sum_col] = target_df[c_pgs] results_models = {} + # Check that PGS has variance (e.g. 
not all 0) + if 0 in [np.var(results_ref[sum_col]), np.var(results_target[sum_col])]: + scorecols_drop.add(c_pgs) + logger.warning("Skipping adjustment: {} has 0 variance in PGS SUM".format(c_pgs)) + # Report PGS values with respect to distribution of PGS in the most similar reference population if 'empirical' in use_method: logger.debug("Adjusting PGS using most similar reference population distribution.") @@ -259,35 +265,36 @@ def pgs_adjust(ref_df, target_df, scorecols: list, ref_pop_col, target_pop_col, z_col = 'Z_MostSimilarPop|{}'.format(c_pgs) results_ref[z_col] = pd.Series(index=ref_df.index, dtype='float64') results_target[z_col] = pd.Series(index=target_df.index, dtype='float64') - - r_model = {} - # Adjust for each population - for pop in ref_populations: - r_pop = {} - i_ref_pop = (ref_df[ref_pop_col] == pop) - i_target_pop = (target_df[target_pop_col] == pop) + if c_pgs not in scorecols_drop: + r_model = {} + + # Adjust for each population + for pop in ref_populations: + r_pop = {} + i_ref_pop = (ref_df[ref_pop_col] == pop) + i_target_pop = (target_df[target_pop_col] == pop) - # Reference Score Distribution - c_pgs_pop_dist = ref_train_df.loc[ref_train_df[ref_pop_col] == pop, c_pgs] + # Reference Score Distribution + c_pgs_pop_dist = ref_train_df.loc[ref_train_df[ref_pop_col] == pop, c_pgs] - # Calculate Percentile - results_ref[percentile_col].loc[i_ref_pop] = percentileofscore(c_pgs_pop_dist, ref_df.loc[i_ref_pop, c_pgs]) - results_target[percentile_col].loc[i_target_pop] = percentileofscore(c_pgs_pop_dist, target_df.loc[i_target_pop, c_pgs]) - r_pop['percentiles'] = np.percentile(c_pgs_pop_dist, range(0,101,1)) + # Calculate Percentile + results_ref[percentile_col].loc[i_ref_pop] = percentileofscore(c_pgs_pop_dist, ref_df.loc[i_ref_pop, c_pgs]) + results_target[percentile_col].loc[i_target_pop] = percentileofscore(c_pgs_pop_dist, target_df.loc[i_target_pop, c_pgs]) + r_pop['percentiles'] = np.percentile(c_pgs_pop_dist, range(0,101,1)) - # Calculate 
Z - r_pop['mean'] = c_pgs_pop_dist.mean() - r_pop['std'] = c_pgs_pop_dist.std(ddof=0) + # Calculate Z + r_pop['mean'] = c_pgs_pop_dist.mean() + r_pop['std'] = c_pgs_pop_dist.std(ddof=0) - results_ref[z_col].loc[i_ref_pop] = (ref_df.loc[i_ref_pop, c_pgs] - r_pop['mean'])/r_pop['std'] - results_target[z_col].loc[i_target_pop] = (target_df.loc[i_target_pop, c_pgs] - r_pop['mean'])/r_pop['std'] + results_ref[z_col].loc[i_ref_pop] = (ref_df.loc[i_ref_pop, c_pgs] - r_pop['mean'])/r_pop['std'] + results_target[z_col].loc[i_target_pop] = (target_df.loc[i_target_pop, c_pgs] - r_pop['mean'])/r_pop['std'] - r_model[pop] = r_pop + r_model[pop] = r_pop - results_models['dist_empirical'][c_pgs] = r_model - # ToDo: explore handling of individuals who have low-confidence population labels - # -> Possible Soln: weighted average based on probabilities? Small Mahalanobis P-values will complicate this + results_models['dist_empirical'][c_pgs] = r_model + # ToDo: explore handling of individuals who have low-confidence population labels + # -> Possible Soln: weighted average based on probabilities? Small Mahalanobis P-values will complicate this # PCA-based adjustment if any([x in use_method for x in ['mean', 'mean+var']]): logger.debug("Adjusting PGS using PCA projections") @@ -313,63 +320,72 @@ def pgs_adjust(ref_df, target_df, scorecols: list, ref_pop_col, target_pop_col, target_norm[pc_col] = (target_norm[pc_col] - pc_mean) / pc_std for c_pgs in scorecols: - results_models['adjust_pcs']['PGS'][c_pgs] = {} - if norm_centerpgs: - pgs_mean = ref_train_df[c_pgs].mean() - ref_train_df[c_pgs] = (ref_train_df[c_pgs] - pgs_mean) - ref_norm[c_pgs] = (ref_norm[c_pgs] - pgs_mean) - target_norm[c_pgs] = (target_norm[c_pgs] - pgs_mean) - results_models['adjust_pcs']['PGS'][c_pgs]['pgs_offset'] = pgs_mean - - # Method 1 (Khera et al. 
Circulation (2019): normalize mean (doi:10.1161/CIRCULATIONAHA.118.035658) - adj_col = 'Z_norm1|{}'.format(c_pgs) - # Fit to Reference Data - pcs2pgs_fit = LinearRegression().fit(ref_train_df[cols_pcs], ref_train_df[c_pgs]) - ref_train_pgs_pred = pcs2pgs_fit.predict(ref_train_df[cols_pcs]) - ref_train_pgs_resid = ref_train_df[c_pgs] - ref_train_pgs_pred - ref_train_pgs_resid_mean = ref_train_pgs_resid.mean() - ref_train_pgs_resid_std = ref_train_pgs_resid.std(ddof=0) - - ref_pgs_resid = ref_norm[c_pgs] - pcs2pgs_fit.predict(ref_norm[cols_pcs]) - results_ref[adj_col] = ref_pgs_resid / ref_train_pgs_resid_std - # Apply to Target Data - target_pgs_pred = pcs2pgs_fit.predict(target_norm[cols_pcs]) - target_pgs_resid = target_norm[c_pgs] - target_pgs_pred - results_target[adj_col] = target_pgs_resid / ref_train_pgs_resid_std - results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm1'] = package_skl_regression(pcs2pgs_fit) - - if 'mean+var' in use_method: - # Method 2 (Khan et al. Nature Medicine (2022)): normalize variance (doi:10.1038/s41591-022-01869-1) - # Normalize based on residual deviation from mean of the distribution [equalize population sds] - # (e.g. 
reduce the correlation between genetic ancestry and how far away you are from the mean) - # USE gamma distribution for predicted variance to constrain it to be positive (b/c using linear - # regression we can get negative predictions for the sd) - adj_col = 'Z_norm2|{}'.format(c_pgs) - pcs2var_fit_gamma = GammaRegressor(max_iter=1000).fit(ref_train_df[cols_pcs], ( - ref_train_pgs_resid - ref_train_pgs_resid_mean) ** 2) - if norm2_2step: - # Return 2-step adjustment - results_ref[adj_col] = ref_pgs_resid / np.sqrt(pcs2var_fit_gamma.predict(ref_norm[cols_pcs])) - results_target[adj_col] = target_pgs_resid / np.sqrt( - pcs2var_fit_gamma.predict(target_norm[cols_pcs])) - results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm2'] = package_skl_regression(pcs2var_fit_gamma) - else: - # Return full-likelihood adjustment model - # This jointly re-fits the regression parameters from the mean and variance prediction to better - # fit the observed PGS distribution. It seems to mostly change the intercepts. This implementation is - # adapted from https://github.com/broadinstitute/palantir-workflows/blob/v0.14/ImputationPipeline/ScoringTasks.wdl, - # which is distributed under a BDS-3 license. 
- params_initial = np.concatenate([[pcs2pgs_fit.intercept_], pcs2pgs_fit.coef_, - [pcs2var_fit_gamma.intercept_], pcs2var_fit_gamma.coef_]) - pcs2full_fit = fullLL_fit(df_score=ref_train_df, scorecol=c_pgs, - predictors=cols_pcs, initial_params=params_initial) - - results_ref[adj_col] = fullLL_adjust(pcs2full_fit, ref_norm, c_pgs) - results_target[adj_col] = fullLL_adjust(pcs2full_fit, target_norm, c_pgs) - - if pcs2full_fit['params']['success'] is False: - logger.warning("{} full-likelihood: {} {}".format(c_pgs, pcs2full_fit['params']['status'], pcs2full_fit['params']['message'])) - results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm2'] = pcs2full_fit + if c_pgs in scorecols_drop: + # fill the output with NAs + adj_cols = ['Z_norm1|{}'.format(c_pgs)] + if 'mean+var' in use_method: + adj_cols.append('Z_norm2|{}'.format(c_pgs)) + for adj_col in adj_cols: + results_ref[adj_col] = pd.Series(index=ref_df.index, dtype='float64') # fill na + results_target[adj_col] = pd.Series(index=target_df.index, dtype='float64') # fill na + else: + results_models['adjust_pcs']['PGS'][c_pgs] = {} + if norm_centerpgs: + pgs_mean = ref_train_df[c_pgs].mean() + ref_train_df[c_pgs] = (ref_train_df[c_pgs] - pgs_mean) + ref_norm[c_pgs] = (ref_norm[c_pgs] - pgs_mean) + target_norm[c_pgs] = (target_norm[c_pgs] - pgs_mean) + results_models['adjust_pcs']['PGS'][c_pgs]['pgs_offset'] = pgs_mean + + # Method 1 (Khera et al. 
Circulation (2019): normalize mean (doi:10.1161/CIRCULATIONAHA.118.035658) + adj_col = 'Z_norm1|{}'.format(c_pgs) + # Fit to Reference Data + pcs2pgs_fit = LinearRegression().fit(ref_train_df[cols_pcs], ref_train_df[c_pgs]) + ref_train_pgs_pred = pcs2pgs_fit.predict(ref_train_df[cols_pcs]) + ref_train_pgs_resid = ref_train_df[c_pgs] - ref_train_pgs_pred + ref_train_pgs_resid_mean = ref_train_pgs_resid.mean() + ref_train_pgs_resid_std = ref_train_pgs_resid.std(ddof=0) + + ref_pgs_resid = ref_norm[c_pgs] - pcs2pgs_fit.predict(ref_norm[cols_pcs]) + results_ref[adj_col] = ref_pgs_resid / ref_train_pgs_resid_std + # Apply to Target Data + target_pgs_pred = pcs2pgs_fit.predict(target_norm[cols_pcs]) + target_pgs_resid = target_norm[c_pgs] - target_pgs_pred + results_target[adj_col] = target_pgs_resid / ref_train_pgs_resid_std + results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm1'] = package_skl_regression(pcs2pgs_fit) + + if 'mean+var' in use_method: + # Method 2 (Khan et al. Nature Medicine (2022)): normalize variance (doi:10.1038/s41591-022-01869-1) + # Normalize based on residual deviation from mean of the distribution [equalize population sds] + # (e.g. 
reduce the correlation between genetic ancestry and how far away you are from the mean) + # USE gamma distribution for predicted variance to constrain it to be positive (b/c using linear + # regression we can get negative predictions for the sd) + adj_col = 'Z_norm2|{}'.format(c_pgs) + pcs2var_fit_gamma = GammaRegressor(max_iter=1000).fit(ref_train_df[cols_pcs], ( + ref_train_pgs_resid - ref_train_pgs_resid_mean) ** 2) + if norm2_2step: + # Return 2-step adjustment + results_ref[adj_col] = ref_pgs_resid / np.sqrt(pcs2var_fit_gamma.predict(ref_norm[cols_pcs])) + results_target[adj_col] = target_pgs_resid / np.sqrt( + pcs2var_fit_gamma.predict(target_norm[cols_pcs])) + results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm2'] = package_skl_regression(pcs2var_fit_gamma) + else: + # Return full-likelihood adjustment model + # This jointly re-fits the regression parameters from the mean and variance prediction to better + # fit the observed PGS distribution. It seems to mostly change the intercepts. This implementation is + # adapted from https://github.com/broadinstitute/palantir-workflows/blob/v0.14/ImputationPipeline/ScoringTasks.wdl, + # which is distributed under a BSD-3 license. + params_initial = np.concatenate([[pcs2pgs_fit.intercept_], pcs2pgs_fit.coef_, + [pcs2var_fit_gamma.intercept_], pcs2var_fit_gamma.coef_]) + pcs2full_fit = fullLL_fit(df_score=ref_train_df, scorecol=c_pgs, + predictors=cols_pcs, initial_params=params_initial) + + results_ref[adj_col] = fullLL_adjust(pcs2full_fit, ref_norm, c_pgs) + results_target[adj_col] = fullLL_adjust(pcs2full_fit, target_norm, c_pgs) + + if pcs2full_fit['params']['success'] is False: + logger.warning("{} full-likelihood: {} {}".format(c_pgs, pcs2full_fit['params']['status'], pcs2full_fit['params']['message'])) + results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm2'] = pcs2full_fit # Only return results logger.debug("Outputting adjusted PGS & models") results_ref = pd.DataFrame(results_ref)