Skip to content

Commit

Permalink
Merge pull request #81 from PGScatalog/dev
Browse files Browse the repository at this point in the history
v0.5.2
  • Loading branch information
nebfield authored Mar 7, 2024
2 parents cf5d0d5 + 4b61fa8 commit bc8d87f
Show file tree
Hide file tree
Showing 9 changed files with 163 additions and 943 deletions.
2 changes: 1 addition & 1 deletion pgscatalog_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.4.2'
__version__ = "0.5.2"
19 changes: 8 additions & 11 deletions pgscatalog_utils/ancestry/ancestry_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,18 +55,15 @@ def ancestry_analysis():
scorecols = list(pgs.columns)

## There should be perfect target sample overlap
assert all(
[
x in pgs.loc["reference"].index
for x in reference_df.index.get_level_values(1)
]
), "Error: PGS data missing for reference samples with PCA data."
reference_df = pd.merge(reference_df, pgs, left_index=True, right_index=True)
assert set(reference_df.index.get_level_values(1)).issubset(pgs.loc["reference"].index),\
"Error: PGS data missing for reference samples with PCA data."
reference_df = reference_df.sort_index()
reference_df = pd.concat([reference_df, pgs.loc[reference_df.index]], axis=1)

assert all(
[x in pgs.loc[args.d_target].index for x in target_df.index.get_level_values(1)]
), "Error: PGS data missing for reference samples with PCA data."
target_df = pd.merge(target_df, pgs, left_index=True, right_index=True)
assert set(target_df.index.get_level_values(1)).issubset(pgs.loc[args.d_target].index), \
"Error: PGS data missing for target samples with PCA data."
target_df = target_df.sort_index()
target_df = pd.concat([target_df, pgs.loc[target_df.index]], axis=1)
del pgs # clear raw PGS from memory

# Compare target sample ancestry/PCs to reference panel
Expand Down
175 changes: 96 additions & 79 deletions pgscatalog_utils/ancestry/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def compare_ancestry(ref_df: pd.DataFrame, ref_pop_col: str, target_df: pd.DataF
:param p_threshold: used to define LowConfidence population assignments
:return: dataframes for reference (predictions on training set) and target (predicted labels) datasets
"""
logger.debug("Starting ancestry comparison")
# Check that datasets have the correct columns
assert method in comparison_method_threshold.keys(), 'comparison method parameter must be Mahalanobis or RF'
if method == 'Mahalanobis':
Expand Down Expand Up @@ -238,13 +239,19 @@ def pgs_adjust(ref_df, target_df, scorecols: list, ref_pop_col, target_pop_col,
results_ref = {}
results_target = {}
results_models = {} # used to store regression information
scorecols_drop = set()
for c_pgs in scorecols:
# Makes melting easier later
sum_col = 'SUM|{}'.format(c_pgs)
results_ref[sum_col] = ref_df[c_pgs]
results_target[sum_col] = target_df[c_pgs]
results_models = {}

# Check that PGS has variance (e.g. not all 0)
if 0 in [np.var(results_ref[sum_col]), np.var(results_target[sum_col])]:
scorecols_drop.add(c_pgs)
logger.warning("Skipping adjustment: {} has 0 variance in PGS SUM".format(c_pgs))

# Report PGS values with respect to distribution of PGS in the most similar reference population
if 'empirical' in use_method:
logger.debug("Adjusting PGS using most similar reference population distribution.")
Expand All @@ -258,35 +265,36 @@ def pgs_adjust(ref_df, target_df, scorecols: list, ref_pop_col, target_pop_col,
z_col = 'Z_MostSimilarPop|{}'.format(c_pgs)
results_ref[z_col] = pd.Series(index=ref_df.index, dtype='float64')
results_target[z_col] = pd.Series(index=target_df.index, dtype='float64')

r_model = {}

# Adjust for each population
for pop in ref_populations:
r_pop = {}
i_ref_pop = (ref_df[ref_pop_col] == pop)
i_target_pop = (target_df[target_pop_col] == pop)
if c_pgs not in scorecols_drop:
r_model = {}

# Adjust for each population
for pop in ref_populations:
r_pop = {}
i_ref_pop = (ref_df[ref_pop_col] == pop)
i_target_pop = (target_df[target_pop_col] == pop)

# Reference Score Distribution
c_pgs_pop_dist = ref_train_df.loc[ref_train_df[ref_pop_col] == pop, c_pgs]
# Reference Score Distribution
c_pgs_pop_dist = ref_train_df.loc[ref_train_df[ref_pop_col] == pop, c_pgs]

# Calculate Percentile
results_ref[percentile_col].loc[i_ref_pop] = percentileofscore(c_pgs_pop_dist, ref_df.loc[i_ref_pop, c_pgs])
results_target[percentile_col].loc[i_target_pop] = percentileofscore(c_pgs_pop_dist, target_df.loc[i_target_pop, c_pgs])
r_pop['percentiles'] = np.percentile(c_pgs_pop_dist, range(0,101,1))
# Calculate Percentile
results_ref[percentile_col].loc[i_ref_pop] = percentileofscore(c_pgs_pop_dist, ref_df.loc[i_ref_pop, c_pgs])
results_target[percentile_col].loc[i_target_pop] = percentileofscore(c_pgs_pop_dist, target_df.loc[i_target_pop, c_pgs])
r_pop['percentiles'] = np.percentile(c_pgs_pop_dist, range(0,101,1))

# Calculate Z
r_pop['mean'] = c_pgs_pop_dist.mean()
r_pop['std'] = c_pgs_pop_dist.std(ddof=0)
# Calculate Z
r_pop['mean'] = c_pgs_pop_dist.mean()
r_pop['std'] = c_pgs_pop_dist.std(ddof=0)

results_ref[z_col].loc[i_ref_pop] = (ref_df.loc[i_ref_pop, c_pgs] - r_pop['mean'])/r_pop['std']
results_target[z_col].loc[i_target_pop] = (target_df.loc[i_target_pop, c_pgs] - r_pop['mean'])/r_pop['std']
results_ref[z_col].loc[i_ref_pop] = (ref_df.loc[i_ref_pop, c_pgs] - r_pop['mean'])/r_pop['std']
results_target[z_col].loc[i_target_pop] = (target_df.loc[i_target_pop, c_pgs] - r_pop['mean'])/r_pop['std']

r_model[pop] = r_pop
r_model[pop] = r_pop

results_models['dist_empirical'][c_pgs] = r_model
# ToDo: explore handling of individuals who have low-confidence population labels
# -> Possible Soln: weighted average based on probabilities? Small Mahalanobis P-values will complicate this
results_models['dist_empirical'][c_pgs] = r_model
# ToDo: explore handling of individuals who have low-confidence population labels
# -> Possible Soln: weighted average based on probabilities? Small Mahalanobis P-values will complicate this
# PCA-based adjustment
if any([x in use_method for x in ['mean', 'mean+var']]):
logger.debug("Adjusting PGS using PCA projections")
Expand All @@ -312,63 +320,72 @@ def pgs_adjust(ref_df, target_df, scorecols: list, ref_pop_col, target_pop_col,
target_norm[pc_col] = (target_norm[pc_col] - pc_mean) / pc_std

for c_pgs in scorecols:
results_models['adjust_pcs']['PGS'][c_pgs] = {}
if norm_centerpgs:
pgs_mean = ref_train_df[c_pgs].mean()
ref_train_df[c_pgs] = (ref_train_df[c_pgs] - pgs_mean)
ref_norm[c_pgs] = (ref_norm[c_pgs] - pgs_mean)
target_norm[c_pgs] = (target_norm[c_pgs] - pgs_mean)
results_models['adjust_pcs']['PGS'][c_pgs]['pgs_offset'] = pgs_mean

# Method 1 (Khera et al. Circulation (2019)): normalize mean (doi:10.1161/CIRCULATIONAHA.118.035658)
adj_col = 'Z_norm1|{}'.format(c_pgs)
# Fit to Reference Data
pcs2pgs_fit = LinearRegression().fit(ref_train_df[cols_pcs], ref_train_df[c_pgs])
ref_train_pgs_pred = pcs2pgs_fit.predict(ref_train_df[cols_pcs])
ref_train_pgs_resid = ref_train_df[c_pgs] - ref_train_pgs_pred
ref_train_pgs_resid_mean = ref_train_pgs_resid.mean()
ref_train_pgs_resid_std = ref_train_pgs_resid.std(ddof=0)

ref_pgs_resid = ref_norm[c_pgs] - pcs2pgs_fit.predict(ref_norm[cols_pcs])
results_ref[adj_col] = ref_pgs_resid / ref_train_pgs_resid_std
# Apply to Target Data
target_pgs_pred = pcs2pgs_fit.predict(target_norm[cols_pcs])
target_pgs_resid = target_norm[c_pgs] - target_pgs_pred
results_target[adj_col] = target_pgs_resid / ref_train_pgs_resid_std
results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm1'] = package_skl_regression(pcs2pgs_fit)

if 'mean+var' in use_method:
# Method 2 (Khan et al. Nature Medicine (2022)): normalize variance (doi:10.1038/s41591-022-01869-1)
# Normalize based on residual deviation from mean of the distribution [equalize population sds]
# (e.g. reduce the correlation between genetic ancestry and how far away you are from the mean)
# USE gamma distribution for predicted variance to constrain it to be positive (b/c using linear
# regression we can get negative predictions for the sd)
adj_col = 'Z_norm2|{}'.format(c_pgs)
pcs2var_fit_gamma = GammaRegressor(max_iter=1000).fit(ref_train_df[cols_pcs], (
ref_train_pgs_resid - ref_train_pgs_resid_mean) ** 2)
if norm2_2step:
# Return 2-step adjustment
results_ref[adj_col] = ref_pgs_resid / np.sqrt(pcs2var_fit_gamma.predict(ref_norm[cols_pcs]))
results_target[adj_col] = target_pgs_resid / np.sqrt(
pcs2var_fit_gamma.predict(target_norm[cols_pcs]))
results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm2'] = package_skl_regression(pcs2var_fit_gamma)
else:
# Return full-likelihood adjustment model
# This jointly re-fits the regression parameters from the mean and variance prediction to better
# fit the observed PGS distribution. It seems to mostly change the intercepts. This implementation is
# adapted from https://github.com/broadinstitute/palantir-workflows/blob/v0.14/ImputationPipeline/ScoringTasks.wdl,
# which is distributed under a BSD-3 license.
params_initial = np.concatenate([[pcs2pgs_fit.intercept_], pcs2pgs_fit.coef_,
[pcs2var_fit_gamma.intercept_], pcs2var_fit_gamma.coef_])
pcs2full_fit = fullLL_fit(df_score=ref_train_df, scorecol=c_pgs,
predictors=cols_pcs, initial_params=params_initial)

results_ref[adj_col] = fullLL_adjust(pcs2full_fit, ref_norm, c_pgs)
results_target[adj_col] = fullLL_adjust(pcs2full_fit, target_norm, c_pgs)

if pcs2full_fit['params']['success'] is False:
logger.warning("{} full-likelihood: {} {}".format(c_pgs, pcs2full_fit['params']['status'], pcs2full_fit['params']['message']))
results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm2'] = pcs2full_fit
if c_pgs in scorecols_drop:
# fill the output with NAs
adj_cols = ['Z_norm1|{}'.format(c_pgs)]
if 'mean+var' in use_method:
adj_cols.append('Z_norm2|{}'.format(c_pgs))
for adj_col in adj_cols:
results_ref[adj_col] = pd.Series(index=ref_df.index, dtype='float64') # fill na
results_target[adj_col] = pd.Series(index=target_df.index, dtype='float64') # fill na
else:
results_models['adjust_pcs']['PGS'][c_pgs] = {}
if norm_centerpgs:
pgs_mean = ref_train_df[c_pgs].mean()
ref_train_df[c_pgs] = (ref_train_df[c_pgs] - pgs_mean)
ref_norm[c_pgs] = (ref_norm[c_pgs] - pgs_mean)
target_norm[c_pgs] = (target_norm[c_pgs] - pgs_mean)
results_models['adjust_pcs']['PGS'][c_pgs]['pgs_offset'] = pgs_mean

# Method 1 (Khera et al. Circulation (2019)): normalize mean (doi:10.1161/CIRCULATIONAHA.118.035658)
adj_col = 'Z_norm1|{}'.format(c_pgs)
# Fit to Reference Data
pcs2pgs_fit = LinearRegression().fit(ref_train_df[cols_pcs], ref_train_df[c_pgs])
ref_train_pgs_pred = pcs2pgs_fit.predict(ref_train_df[cols_pcs])
ref_train_pgs_resid = ref_train_df[c_pgs] - ref_train_pgs_pred
ref_train_pgs_resid_mean = ref_train_pgs_resid.mean()
ref_train_pgs_resid_std = ref_train_pgs_resid.std(ddof=0)

ref_pgs_resid = ref_norm[c_pgs] - pcs2pgs_fit.predict(ref_norm[cols_pcs])
results_ref[adj_col] = ref_pgs_resid / ref_train_pgs_resid_std
# Apply to Target Data
target_pgs_pred = pcs2pgs_fit.predict(target_norm[cols_pcs])
target_pgs_resid = target_norm[c_pgs] - target_pgs_pred
results_target[adj_col] = target_pgs_resid / ref_train_pgs_resid_std
results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm1'] = package_skl_regression(pcs2pgs_fit)

if 'mean+var' in use_method:
# Method 2 (Khan et al. Nature Medicine (2022)): normalize variance (doi:10.1038/s41591-022-01869-1)
# Normalize based on residual deviation from mean of the distribution [equalize population sds]
# (e.g. reduce the correlation between genetic ancestry and how far away you are from the mean)
# USE gamma distribution for predicted variance to constrain it to be positive (b/c using linear
# regression we can get negative predictions for the sd)
adj_col = 'Z_norm2|{}'.format(c_pgs)
pcs2var_fit_gamma = GammaRegressor(max_iter=1000).fit(ref_train_df[cols_pcs], (
ref_train_pgs_resid - ref_train_pgs_resid_mean) ** 2)
if norm2_2step:
# Return 2-step adjustment
results_ref[adj_col] = ref_pgs_resid / np.sqrt(pcs2var_fit_gamma.predict(ref_norm[cols_pcs]))
results_target[adj_col] = target_pgs_resid / np.sqrt(
pcs2var_fit_gamma.predict(target_norm[cols_pcs]))
results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm2'] = package_skl_regression(pcs2var_fit_gamma)
else:
# Return full-likelihood adjustment model
# This jointly re-fits the regression parameters from the mean and variance prediction to better
# fit the observed PGS distribution. It seems to mostly change the intercepts. This implementation is
# adapted from https://github.com/broadinstitute/palantir-workflows/blob/v0.14/ImputationPipeline/ScoringTasks.wdl,
# which is distributed under a BSD-3 license.
params_initial = np.concatenate([[pcs2pgs_fit.intercept_], pcs2pgs_fit.coef_,
[pcs2var_fit_gamma.intercept_], pcs2var_fit_gamma.coef_])
pcs2full_fit = fullLL_fit(df_score=ref_train_df, scorecol=c_pgs,
predictors=cols_pcs, initial_params=params_initial)

results_ref[adj_col] = fullLL_adjust(pcs2full_fit, ref_norm, c_pgs)
results_target[adj_col] = fullLL_adjust(pcs2full_fit, target_norm, c_pgs)

if pcs2full_fit['params']['success'] is False:
logger.warning("{} full-likelihood: {} {}".format(c_pgs, pcs2full_fit['params']['status'], pcs2full_fit['params']['message']))
results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm2'] = pcs2full_fit
# Only return results
logger.debug("Outputting adjusted PGS & models")
results_ref = pd.DataFrame(results_ref)
Expand Down
6 changes: 3 additions & 3 deletions pgscatalog_utils/scorefile/qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,11 +137,11 @@ def assign_effect_type(
) -> typing.Generator[ScoreVariant, None, None]:
for variant in variants:
match (variant.is_recessive, variant.is_dominant):
case (None, None) | ("FALSE", "FALSE"):
case (None, None) | (False, False):
pass # default value is additive, pass to break match and yield
case ("FALSE", "TRUE"):
case (False, True):
variant.effect_type = EffectType.DOMINANT
case ("TRUE", "FALSE"):
case (True, False):
variant.effect_type = EffectType.RECESSIVE
case _:
logger.critical(f"Bad effect type setting: {variant}")
Expand Down
10 changes: 8 additions & 2 deletions pgscatalog_utils/scorefile/scorevariant.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,14 @@ def __init__(

self.hm_inferOtherAllele: Optional[str] = hm_inferOtherAllele
self.hm_source: Optional[str] = hm_source
self.is_dominant: Optional[bool] = is_dominant
self.is_recessive: Optional[bool] = is_recessive

self.is_dominant: Optional[bool] = (
is_dominant == "True" if is_dominant is not None else None
)
self.is_recessive: Optional[bool] = (
is_recessive == "True" if is_recessive is not None else None
)

self.hm_rsID: Optional[str] = hm_rsID
self.hm_match_chr: Optional[str] = hm_match_chr
self.hm_match_pos: Optional[str] = hm_match_pos
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pgscatalog_utils"
version = "0.5.1"
version = "0.5.2"
description = "Utilities for working with PGS Catalog API and scoring files"
homepage = "https://github.com/PGScatalog/pgscatalog_utils"
authors = ["Benjamin Wingfield <[email protected]>", "Samuel Lambert <[email protected]>", "Laurent Gil <[email protected]>"]
Expand Down
Loading

0 comments on commit bc8d87f

Please sign in to comment.