Skip to content

Commit

Permalink
Merge pull request #81 from PGScatalog/dev
Browse files Browse the repository at this point in the history
v0.5.2
  • Loading branch information
nebfield authored Mar 7, 2024
2 parents cf5d0d5 + 4b61fa8 commit bc8d87f
Show file tree
Hide file tree
Showing 9 changed files with 163 additions and 943 deletions.
2 changes: 1 addition & 1 deletion pgscatalog_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.4.2'
__version__ = "0.5.2"
19 changes: 8 additions & 11 deletions pgscatalog_utils/ancestry/ancestry_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,18 +55,15 @@ def ancestry_analysis():
scorecols = list(pgs.columns)

## There should be perfect target sample overlap
assert all(
[
x in pgs.loc["reference"].index
for x in reference_df.index.get_level_values(1)
]
), "Error: PGS data missing for reference samples with PCA data."
reference_df = pd.merge(reference_df, pgs, left_index=True, right_index=True)
assert set(reference_df.index.get_level_values(1)).issubset(pgs.loc["reference"].index),\
"Error: PGS data missing for reference samples with PCA data."
reference_df = reference_df.sort_index()
reference_df = pd.concat([reference_df, pgs.loc[reference_df.index]], axis=1)

assert all(
[x in pgs.loc[args.d_target].index for x in target_df.index.get_level_values(1)]
), "Error: PGS data missing for reference samples with PCA data."
target_df = pd.merge(target_df, pgs, left_index=True, right_index=True)
assert set(target_df.index.get_level_values(1)).issubset(pgs.loc[args.d_target].index), \
"Error: PGS data missing for target samples with PCA data."
target_df = target_df.sort_index()
target_df = pd.concat([target_df, pgs.loc[target_df.index]], axis=1)
del pgs # clear raw PGS from memory

# Compare target sample ancestry/PCs to reference panel
Expand Down
175 changes: 96 additions & 79 deletions pgscatalog_utils/ancestry/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def compare_ancestry(ref_df: pd.DataFrame, ref_pop_col: str, target_df: pd.DataF
:param p_threshold: used to define LowConfidence population assignments
:return: dataframes for reference (predictions on training set) and target (predicted labels) datasets
"""
logger.debug("Starting ancestry comparison")
# Check that datasets have the correct columns
assert method in comparison_method_threshold.keys(), 'comparison method parameter must be Mahalanobis or RF'
if method == 'Mahalanobis':
Expand Down Expand Up @@ -238,13 +239,19 @@ def pgs_adjust(ref_df, target_df, scorecols: list, ref_pop_col, target_pop_col,
results_ref = {}
results_target = {}
results_models = {} # used to store regression information
scorecols_drop = set()
for c_pgs in scorecols:
# Makes melting easier later
sum_col = 'SUM|{}'.format(c_pgs)
results_ref[sum_col] = ref_df[c_pgs]
results_target[sum_col] = target_df[c_pgs]
results_models = {}

# Check that PGS has variance (e.g. not all 0)
if 0 in [np.var(results_ref[sum_col]), np.var(results_target[sum_col])]:
scorecols_drop.add(c_pgs)
logger.warning("Skipping adjustment: {} has 0 variance in PGS SUM".format(c_pgs))

# Report PGS values with respect to distribution of PGS in the most similar reference population
if 'empirical' in use_method:
logger.debug("Adjusting PGS using most similar reference population distribution.")
Expand All @@ -258,35 +265,36 @@ def pgs_adjust(ref_df, target_df, scorecols: list, ref_pop_col, target_pop_col,
z_col = 'Z_MostSimilarPop|{}'.format(c_pgs)
results_ref[z_col] = pd.Series(index=ref_df.index, dtype='float64')
results_target[z_col] = pd.Series(index=target_df.index, dtype='float64')

r_model = {}

# Adjust for each population
for pop in ref_populations:
r_pop = {}
i_ref_pop = (ref_df[ref_pop_col] == pop)
i_target_pop = (target_df[target_pop_col] == pop)
if c_pgs not in scorecols_drop:
r_model = {}

# Adjust for each population
for pop in ref_populations:
r_pop = {}
i_ref_pop = (ref_df[ref_pop_col] == pop)
i_target_pop = (target_df[target_pop_col] == pop)

# Reference Score Distribution
c_pgs_pop_dist = ref_train_df.loc[ref_train_df[ref_pop_col] == pop, c_pgs]
# Reference Score Distribution
c_pgs_pop_dist = ref_train_df.loc[ref_train_df[ref_pop_col] == pop, c_pgs]

# Calculate Percentile
results_ref[percentile_col].loc[i_ref_pop] = percentileofscore(c_pgs_pop_dist, ref_df.loc[i_ref_pop, c_pgs])
results_target[percentile_col].loc[i_target_pop] = percentileofscore(c_pgs_pop_dist, target_df.loc[i_target_pop, c_pgs])
r_pop['percentiles'] = np.percentile(c_pgs_pop_dist, range(0,101,1))
# Calculate Percentile
results_ref[percentile_col].loc[i_ref_pop] = percentileofscore(c_pgs_pop_dist, ref_df.loc[i_ref_pop, c_pgs])
results_target[percentile_col].loc[i_target_pop] = percentileofscore(c_pgs_pop_dist, target_df.loc[i_target_pop, c_pgs])
r_pop['percentiles'] = np.percentile(c_pgs_pop_dist, range(0,101,1))

# Calculate Z
r_pop['mean'] = c_pgs_pop_dist.mean()
r_pop['std'] = c_pgs_pop_dist.std(ddof=0)
# Calculate Z
r_pop['mean'] = c_pgs_pop_dist.mean()
r_pop['std'] = c_pgs_pop_dist.std(ddof=0)

results_ref[z_col].loc[i_ref_pop] = (ref_df.loc[i_ref_pop, c_pgs] - r_pop['mean'])/r_pop['std']
results_target[z_col].loc[i_target_pop] = (target_df.loc[i_target_pop, c_pgs] - r_pop['mean'])/r_pop['std']
results_ref[z_col].loc[i_ref_pop] = (ref_df.loc[i_ref_pop, c_pgs] - r_pop['mean'])/r_pop['std']
results_target[z_col].loc[i_target_pop] = (target_df.loc[i_target_pop, c_pgs] - r_pop['mean'])/r_pop['std']

r_model[pop] = r_pop
r_model[pop] = r_pop

results_models['dist_empirical'][c_pgs] = r_model
# ToDo: explore handling of individuals who have low-confidence population labels
# -> Possible Soln: weighted average based on probabilities? Small Mahalanobis P-values will complicate this
results_models['dist_empirical'][c_pgs] = r_model
# ToDo: explore handling of individuals who have low-confidence population labels
# -> Possible Soln: weighted average based on probabilities? Small Mahalanobis P-values will complicate this
# PCA-based adjustment
if any([x in use_method for x in ['mean', 'mean+var']]):
logger.debug("Adjusting PGS using PCA projections")
Expand All @@ -312,63 +320,72 @@ def pgs_adjust(ref_df, target_df, scorecols: list, ref_pop_col, target_pop_col,
target_norm[pc_col] = (target_norm[pc_col] - pc_mean) / pc_std

for c_pgs in scorecols:
results_models['adjust_pcs']['PGS'][c_pgs] = {}
if norm_centerpgs:
pgs_mean = ref_train_df[c_pgs].mean()
ref_train_df[c_pgs] = (ref_train_df[c_pgs] - pgs_mean)
ref_norm[c_pgs] = (ref_norm[c_pgs] - pgs_mean)
target_norm[c_pgs] = (target_norm[c_pgs] - pgs_mean)
results_models['adjust_pcs']['PGS'][c_pgs]['pgs_offset'] = pgs_mean

# Method 1 (Khera et al. Circulation (2019)): normalize mean (doi:10.1161/CIRCULATIONAHA.118.035658)
adj_col = 'Z_norm1|{}'.format(c_pgs)
# Fit to Reference Data
pcs2pgs_fit = LinearRegression().fit(ref_train_df[cols_pcs], ref_train_df[c_pgs])
ref_train_pgs_pred = pcs2pgs_fit.predict(ref_train_df[cols_pcs])
ref_train_pgs_resid = ref_train_df[c_pgs] - ref_train_pgs_pred
ref_train_pgs_resid_mean = ref_train_pgs_resid.mean()
ref_train_pgs_resid_std = ref_train_pgs_resid.std(ddof=0)

ref_pgs_resid = ref_norm[c_pgs] - pcs2pgs_fit.predict(ref_norm[cols_pcs])
results_ref[adj_col] = ref_pgs_resid / ref_train_pgs_resid_std
# Apply to Target Data
target_pgs_pred = pcs2pgs_fit.predict(target_norm[cols_pcs])
target_pgs_resid = target_norm[c_pgs] - target_pgs_pred
results_target[adj_col] = target_pgs_resid / ref_train_pgs_resid_std
results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm1'] = package_skl_regression(pcs2pgs_fit)

if 'mean+var' in use_method:
# Method 2 (Khan et al. Nature Medicine (2022)): normalize variance (doi:10.1038/s41591-022-01869-1)
# Normalize based on residual deviation from mean of the distribution [equalize population sds]
# (e.g. reduce the correlation between genetic ancestry and how far away you are from the mean)
# USE gamma distribution for predicted variance to constrain it to be positive (b/c using linear
# regression we can get negative predictions for the sd)
adj_col = 'Z_norm2|{}'.format(c_pgs)
pcs2var_fit_gamma = GammaRegressor(max_iter=1000).fit(ref_train_df[cols_pcs], (
ref_train_pgs_resid - ref_train_pgs_resid_mean) ** 2)
if norm2_2step:
# Return 2-step adjustment
results_ref[adj_col] = ref_pgs_resid / np.sqrt(pcs2var_fit_gamma.predict(ref_norm[cols_pcs]))
results_target[adj_col] = target_pgs_resid / np.sqrt(
pcs2var_fit_gamma.predict(target_norm[cols_pcs]))
results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm2'] = package_skl_regression(pcs2var_fit_gamma)
else:
# Return full-likelihood adjustment model
# This jointly re-fits the regression parameters from the mean and variance prediction to better
# fit the observed PGS distribution. It seems to mostly change the intercepts. This implementation is
# adapted from https://github.com/broadinstitute/palantir-workflows/blob/v0.14/ImputationPipeline/ScoringTasks.wdl,
# which is distributed under a BSD-3 license.
params_initial = np.concatenate([[pcs2pgs_fit.intercept_], pcs2pgs_fit.coef_,
[pcs2var_fit_gamma.intercept_], pcs2var_fit_gamma.coef_])
pcs2full_fit = fullLL_fit(df_score=ref_train_df, scorecol=c_pgs,
predictors=cols_pcs, initial_params=params_initial)

results_ref[adj_col] = fullLL_adjust(pcs2full_fit, ref_norm, c_pgs)
results_target[adj_col] = fullLL_adjust(pcs2full_fit, target_norm, c_pgs)

if pcs2full_fit['params']['success'] is False:
logger.warning("{} full-likelihood: {} {}".format(c_pgs, pcs2full_fit['params']['status'], pcs2full_fit['params']['message']))
results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm2'] = pcs2full_fit
if c_pgs in scorecols_drop:
# fill the output with NAs
adj_cols = ['Z_norm1|{}'.format(c_pgs)]
if 'mean+var' in use_method:
adj_cols.append('Z_norm2|{}'.format(c_pgs))
for adj_col in adj_cols:
results_ref[adj_col] = pd.Series(index=ref_df.index, dtype='float64') # fill na
results_target[adj_col] = pd.Series(index=target_df.index, dtype='float64') # fill na
else:
results_models['adjust_pcs']['PGS'][c_pgs] = {}
if norm_centerpgs:
pgs_mean = ref_train_df[c_pgs].mean()
ref_train_df[c_pgs] = (ref_train_df[c_pgs] - pgs_mean)
ref_norm[c_pgs] = (ref_norm[c_pgs] - pgs_mean)
target_norm[c_pgs] = (target_norm[c_pgs] - pgs_mean)
results_models['adjust_pcs']['PGS'][c_pgs]['pgs_offset'] = pgs_mean

# Method 1 (Khera et al. Circulation (2019)): normalize mean (doi:10.1161/CIRCULATIONAHA.118.035658)
adj_col = 'Z_norm1|{}'.format(c_pgs)
# Fit to Reference Data
pcs2pgs_fit = LinearRegression().fit(ref_train_df[cols_pcs], ref_train_df[c_pgs])
ref_train_pgs_pred = pcs2pgs_fit.predict(ref_train_df[cols_pcs])
ref_train_pgs_resid = ref_train_df[c_pgs] - ref_train_pgs_pred
ref_train_pgs_resid_mean = ref_train_pgs_resid.mean()
ref_train_pgs_resid_std = ref_train_pgs_resid.std(ddof=0)

ref_pgs_resid = ref_norm[c_pgs] - pcs2pgs_fit.predict(ref_norm[cols_pcs])
results_ref[adj_col] = ref_pgs_resid / ref_train_pgs_resid_std
# Apply to Target Data
target_pgs_pred = pcs2pgs_fit.predict(target_norm[cols_pcs])
target_pgs_resid = target_norm[c_pgs] - target_pgs_pred
results_target[adj_col] = target_pgs_resid / ref_train_pgs_resid_std
results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm1'] = package_skl_regression(pcs2pgs_fit)

if 'mean+var' in use_method:
# Method 2 (Khan et al. Nature Medicine (2022)): normalize variance (doi:10.1038/s41591-022-01869-1)
# Normalize based on residual deviation from mean of the distribution [equalize population sds]
# (e.g. reduce the correlation between genetic ancestry and how far away you are from the mean)
# USE gamma distribution for predicted variance to constrain it to be positive (b/c using linear
# regression we can get negative predictions for the sd)
adj_col = 'Z_norm2|{}'.format(c_pgs)
pcs2var_fit_gamma = GammaRegressor(max_iter=1000).fit(ref_train_df[cols_pcs], (
ref_train_pgs_resid - ref_train_pgs_resid_mean) ** 2)
if norm2_2step:
# Return 2-step adjustment
results_ref[adj_col] = ref_pgs_resid / np.sqrt(pcs2var_fit_gamma.predict(ref_norm[cols_pcs]))
results_target[adj_col] = target_pgs_resid / np.sqrt(
pcs2var_fit_gamma.predict(target_norm[cols_pcs]))
results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm2'] = package_skl_regression(pcs2var_fit_gamma)
else:
# Return full-likelihood adjustment model
# This jointly re-fits the regression parameters from the mean and variance prediction to better
# fit the observed PGS distribution. It seems to mostly change the intercepts. This implementation is
# adapted from https://github.com/broadinstitute/palantir-workflows/blob/v0.14/ImputationPipeline/ScoringTasks.wdl,
# which is distributed under a BSD-3 license.
params_initial = np.concatenate([[pcs2pgs_fit.intercept_], pcs2pgs_fit.coef_,
[pcs2var_fit_gamma.intercept_], pcs2var_fit_gamma.coef_])
pcs2full_fit = fullLL_fit(df_score=ref_train_df, scorecol=c_pgs,
predictors=cols_pcs, initial_params=params_initial)

results_ref[adj_col] = fullLL_adjust(pcs2full_fit, ref_norm, c_pgs)
results_target[adj_col] = fullLL_adjust(pcs2full_fit, target_norm, c_pgs)

if pcs2full_fit['params']['success'] is False:
logger.warning("{} full-likelihood: {} {}".format(c_pgs, pcs2full_fit['params']['status'], pcs2full_fit['params']['message']))
results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm2'] = pcs2full_fit
# Only return results
logger.debug("Outputting adjusted PGS & models")
results_ref = pd.DataFrame(results_ref)
Expand Down
6 changes: 3 additions & 3 deletions pgscatalog_utils/scorefile/qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,11 +137,11 @@ def assign_effect_type(
) -> typing.Generator[ScoreVariant, None, None]:
for variant in variants:
match (variant.is_recessive, variant.is_dominant):
case (None, None) | ("FALSE", "FALSE"):
case (None, None) | (False, False):
pass # default value is additive, pass to break match and yield
case ("FALSE", "TRUE"):
case (False, True):
variant.effect_type = EffectType.DOMINANT
case ("TRUE", "FALSE"):
case (True, False):
variant.effect_type = EffectType.RECESSIVE
case _:
logger.critical(f"Bad effect type setting: {variant}")
Expand Down
10 changes: 8 additions & 2 deletions pgscatalog_utils/scorefile/scorevariant.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,14 @@ def __init__(

self.hm_inferOtherAllele: Optional[str] = hm_inferOtherAllele
self.hm_source: Optional[str] = hm_source
self.is_dominant: Optional[bool] = is_dominant
self.is_recessive: Optional[bool] = is_recessive

self.is_dominant: Optional[bool] = (
is_dominant == "True" if is_dominant is not None else None
)
self.is_recessive: Optional[bool] = (
is_recessive == "True" if is_recessive is not None else None
)

self.hm_rsID: Optional[str] = hm_rsID
self.hm_match_chr: Optional[str] = hm_match_chr
self.hm_match_pos: Optional[str] = hm_match_pos
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pgscatalog_utils"
version = "0.5.1"
version = "0.5.2"
description = "Utilities for working with PGS Catalog API and scoring files"
homepage = "https://github.com/PGScatalog/pgscatalog_utils"
authors = ["Benjamin Wingfield <[email protected]>", "Samuel Lambert <[email protected]>", "Laurent Gil <[email protected]>"]
Expand Down
Loading

0 comments on commit bc8d87f

Please sign in to comment.