diff --git a/scripts/westminster_classify.py b/scripts/westminster_classify.py deleted file mode 100755 index b9be159..0000000 --- a/scripts/westminster_classify.py +++ /dev/null @@ -1,478 +0,0 @@ -#!/usr/bin/env python -from optparse import OptionParser -import joblib -import os -import pdb - -import h5py -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import seaborn as sns -from sklearn.ensemble import RandomForestClassifier -from sklearn.linear_model import RidgeClassifier -from sklearn.metrics import roc_auc_score, roc_curve -from sklearn.model_selection import KFold - -''' -westminster_classify.py - -Helper script to compute classifier accuracy. -''' - -################################################################################ -# main -################################################################################ -def main(): - usage = 'usage: %prog [options] ' - parser = OptionParser(usage) - parser.add_option('-a', dest='abs_value', - default=False, action='store_true', - help='Take features absolute value [Default: %default]') - parser.add_option('-f', dest='num_folds', - default=8, type='int', - help='Cross-validation folds [Default: %default]') - parser.add_option('-i', dest='iterations', - default=1, type='int', - help='Cross-validation iterations [Default: %default]') - parser.add_option('--indel', dest='indel', - default=False, action='store_true', - help='Add indel size as feature [Default: %default]') - parser.add_option('--iscale', dest='indel_scale', - default=1, type='float', - help='Scale indel SAD [Default: %default]') - parser.add_option('-l', dest='log', - default=False, action='store_true', - help='Take features arcsinh [Default: %default]') - parser.add_option('-m', dest='model_pkl', - help='Dimension reduction model') - parser.add_option('--msl', dest='msl', - default=1, type='int', - help='Random forest min_samples_leaf [Default: %default]') - parser.add_option('-o', dest='out_dir', - default='class_out', - help='Output directory [Default: %default]') - parser.add_option('-r', dest='random_seed', - default=44, type='int') - parser.add_option('-s', dest='save_preds', - default=False, action='store_true', - help='Save predictions across iterations [Default: %default]') - parser.add_option('--stat', dest='sad_stat', - default='SAD', - help='HDF5 key stat to consider. [Default: %default]') - parser.add_option('-t', dest='targets_file', - default=None) - (options,args) = parser.parse_args() - - if len(args) != 2: - parser.error('Must provide positive and negative variant predictions.') - else: - sadp_file = args[0] - sadn_file = args[1] - - np.random.seed(options.random_seed) - - if not os.path.isdir(options.out_dir): - os.mkdir(options.out_dir) - - # read dimension reduction model - if options.model_pkl: - model = joblib.load(options.model_pkl) - - # slice targets - if options.targets_file is None: - target_slice = None - else: - targets_df = pd.read_csv(options.targets_file, sep='\t', index_col=0) - target_slice = targets_df.index - - # read positive/negative variants - Xp = read_stats(sadp_file, options.sad_stat, target_slice) - Xn = read_stats(sadn_file, options.sad_stat, target_slice) - - # transformations - if options.log: - Xp = np.arcsinh(Xp) - Xn = np.arcsinh(Xn) - if options.abs_value: - Xp = np.abs(Xp) - Xn = np.abs(Xn) - if options.model_pkl: - Xp = model.transform(Xp) - Xn = model.transform(Xn) - - # indels - if options.indel: - Ip = read_indel(sadp_file) - In = read_indel(sadn_file) - Ip = np.expand_dims(Ip, axis=-1) - In = np.expand_dims(In, axis=-1) - Xp = np.concatenate([Xp,Ip], axis=1) - Xn = np.concatenate([Xn,In], axis=1) - elif options.indel_scale != 1: - Ip = read_indel(sadp_file, indel_bool=True) - In = read_indel(sadn_file, indel_bool=True) - Xp[Ip] = options.indel_scale*Xp[Ip] - Xn[Ip] = options.indel_scale*Xn[Ip] - - # combine - X = np.concatenate([Xp, Xn], axis=0) - y = np.array([True]*Xp.shape[0] + [False]*Xn.shape[0], dtype='bool') - - # train classifier - if X.shape[1] == 1: - aurocs, fpr_folds, tpr_folds, fpr_mean, tpr_mean = fold_roc(X, y, folds=8) - - # save preds - if options.save_preds: - np.save('%s/preds.npy' % options.out_dir, X) - else: - # if options.classifier_type == 'ridge': - # aurocs, fpr_folds, tpr_folds, fpr_full, tpr_full = ridge_roc(X, y, folds=8, alpha=10000) - # else: - aurocs, fpr_folds, tpr_folds, fpr_mean, tpr_mean, preds = randfor_roc(X, y, - folds=options.num_folds, iterations=options.iterations, - min_samples_leaf=options.msl, random_state=options.random_seed) - - # save preds - if options.save_preds: - np.save('%s/preds.npy' % options.out_dir, preds) - - # save full model - model = randfor_full(X, y, min_samples_leaf=options.msl) - joblib.dump(model, '%s/model.pkl' % options.out_dir) - - # save - np.save('%s/aurocs.npy' % options.out_dir, aurocs) - np.save('%s/fpr_mean.npy' % options.out_dir, fpr_mean) - np.save('%s/tpr_mean.npy' % options.out_dir, tpr_mean) - - # print stats - stats_out = open('%s/stats.txt' % options.out_dir, 'w') - auroc_stdev = np.std(aurocs) / np.sqrt(len(aurocs)) - print('AUROC: %.4f (%.4f)' % (np.mean(aurocs), auroc_stdev), file=stats_out) - stats_out.close() - - # plot roc - plot_roc(fpr_folds, tpr_folds, options.out_dir) - - -def fold_roc(X: np.array, y: np.array, - folds: int = 8, - random_state: int = 44): - """ - Compute ROC for a single value, sans model. - - Args: - X (:obj:`np.array`): - Feature values. - y (:obj:`np.array`): - Target values. - folds (:obj:`int`): - Cross folds. - random_state (:obj:`int`): - sklearn random_state. - """ - aurocs = [] - fpr_folds = [] - tpr_folds = [] - - fpr_mean = np.linspace(0, 1, 256) - tpr_mean = [] - - kf = KFold(n_splits=folds, shuffle=True, random_state=random_state) - - for train_index, test_index in kf.split(X): - # predict test set (as is) - preds = X[test_index,:] - - # compute ROC curve - fpr, tpr, _ = roc_curve(y[test_index], preds) - fpr_folds.append(fpr) - tpr_folds.append(tpr) - - interp_tpr = np.interp(fpr_mean, fpr, tpr) - interp_tpr[0] = 0.0 - tpr_mean.append(interp_tpr) - - # compute AUROC - aurocs.append(roc_auc_score(y[test_index], preds)) - - tpr_mean = np.array(tpr_mean).mean(axis=0) - - return np.array(aurocs), np.array(fpr_folds), np.array(tpr_folds), fpr_mean, tpr_mean - - -def plot_roc(fprs: np.array, tprs: np.array, out_dir: str): - """ - Plot ROC curve. - - Args: - fprs (:obj:`np.array`): - False positive rates - tprs (:obj:`np.array`): - True positive rates. - out_dir (:obj:`str`): - Output directory. - """ - plt.figure(figsize=(4,4)) - - for fi in range(len(fprs)): - plt.plot(fprs[fi], tprs[fi], alpha=0.25) - - ax = plt.gca() - ax.set_xlabel('False positive rate') - ax.set_ylabel('True positive rate') - - sns.despine() - plt.tight_layout() - - plt.savefig('%s/roc.pdf' % out_dir) - plt.close() - - -def randfor_full(X: np.array, y: np.array, - n_estimators: int = 100, - min_samples_leaf: int = 1, - random_state: int = 44): - """ - Fit a single random forest on the full data. - - Args: - X (:obj:`np.array`): - Feature matrix. - y (:obj:`np.array`): - Target values. - n_estimators (:obj:`int`): - Number of trees in the forest. - min_samples_leaf (:obj:`float`): - Minimum number of samples required to be at a leaf node. - random_state (:obj:`int`): - sklearn random_state. - """ - model = RandomForestClassifier(n_estimators=n_estimators, max_features='log2', - max_depth=64, min_samples_leaf=min_samples_leaf, - random_state=random_state, n_jobs=-1) - model.fit(X, y) - return model - - -def randfor_roc(X: np.array, y: np.array, - folds: int = 8, - iterations: int = 1, - n_estimators: int = 100, - min_samples_leaf: int = 1, - random_state: int = 44): - """ - Fit random forest classifiers in cross-validation. - - Args: - X (:obj:`np.array`): - Feature matrix. - y (:obj:`np.array`): - Target values. - folds (:obj:`int`): - Cross folds. - iterations (:obj:`int`): - Cross fold iterations. - n_estimators (:obj:`int`): - Number of trees in the forest. - min_samples_leaf (:obj:`float`): - Minimum number of samples required to be at a leaf node. - random_state (:obj:`int`): - sklearn random_state. - """ - aurocs = [] - fpr_folds = [] - tpr_folds = [] - fpr_fulls = [] - tpr_fulls = [] - preds_return = [] - - fpr_mean = np.linspace(0, 1, 256) - tpr_mean = [] - - for i in range(iterations): - rs_iter = random_state + i - preds_full = np.zeros(y.shape) - - kf = KFold(n_splits=folds, shuffle=True, random_state=rs_iter) - - for train_index, test_index in kf.split(X): - # fit model - if random_state is None: - rs_rf = None - else: - rs_rf = rs_iter+test_index[0] - model = RandomForestClassifier(n_estimators=n_estimators, max_features='log2', - max_depth=64, min_samples_leaf=min_samples_leaf, - random_state=rs_rf, n_jobs=-1) - model.fit(X[train_index,:], y[train_index]) - - # predict test set - preds = model.predict_proba(X[test_index,:])[:,1] - - # save - preds_full[test_index] = preds.squeeze() - - # compute ROC curve - fpr, tpr, _ = roc_curve(y[test_index], preds) - fpr_folds.append(fpr) - tpr_folds.append(tpr) - - interp_tpr = np.interp(fpr_mean, fpr, tpr) - interp_tpr[0] = 0.0 - tpr_mean.append(interp_tpr) - - # compute AUROC - aurocs.append(roc_auc_score(y[test_index], preds)) - - fpr_full, tpr_full, _ = roc_curve(y, preds_full) - fpr_fulls.append(fpr_full) - tpr_fulls.append(tpr_full) - preds_return.append(preds_full) - - aurocs = np.array(aurocs) - tpr_mean = np.array(tpr_mean).mean(axis=0) - preds_return = np.array(preds_return).T - - return aurocs, fpr_folds, tpr_folds, fpr_mean, tpr_mean, preds_return - - -def ridge_roc(X: np.array, y: np.array, - folds: int = 8, - iterations: int = 1, - alpha: float = 1, - random_state: int = 44): - """ - Fit ridge classifiers in cross-validation. - - Args: - X (:obj:`np.array`): - Feature matrix. - y (:obj:`np.array`): - Target values. - folds (:obj:`int`): - Cross folds. - iterations (:obj:`int`): - Cross fold iterations. - alpha (:obj:`float`): - Ridge alpha coefficient. - random_state (:obj:`int`): - sklearn random_state. - """ - - aurocs = [] - fpr_folds = [] - tpr_folds = [] - fpr_fulls = [] - tpr_fulls = [] - preds_return = [] - - fpr_mean = np.linspace(0, 1, 256) - tpr_mean = [] - - for i in range(iterations): - rs_iter = random_state + i - preds_full = np.zeros(y.shape) - - kf = KFold(n_splits=folds, shuffle=True, random_state=rs_iter) - - for train_index, test_index in kf.split(X): - # fit model - if random_state is None: - rs_rf = None - else: - rs_rf = rs_iter+test_index[0] - model = RidgeClassifier(alpha=alpha, random_state=rs_rf) - model.fit(X[train_index,:], y[train_index]) - - # predict test set - preds = model._predict_proba_lr(X[test_index,:])[:,1] - - # save - preds_full[test_index] = preds.squeeze() - - # compute ROC curve - fpr, tpr, _ = roc_curve(y[test_index], preds) - fpr_folds.append(fpr) - tpr_folds.append(tpr) - - interp_tpr = np.interp(fpr_mean, fpr, tpr) - interp_tpr[0] = 0.0 - tpr_mean.append(interp_tpr) - - # compute AUROC - aurocs.append(roc_auc_score(y[test_index], preds)) - - fpr_full, tpr_full, _ = roc_curve(y, preds_full) - fpr_fulls.append(fpr_full) - tpr_fulls.append(tpr_full) - preds_return.append(preds_full) - - aurocs = np.array(aurocs) - tpr_mean = np.array(tpr_mean).mean(axis=0) - preds_return = np.array(preds_return).T - - return aurocs, fpr_folds, tpr_folds, fpr_mean, tpr_mean, preds_return - - -def read_indel(h5_file, indel_abs=True, indel_bool=False): - """ - Read indel allele information from HDF5 file. - - Args: - h5_file (:obj:`str`): - Stats HDF5 file. - indel_abs (:obj:`bool`): - Take absolute value of indel size. - indel_bool (:obj:`bool`): - Return boolean indel indication vector. - """ - with h5py.File(h5_file, 'r') as h5_open: - try: - ref_alleles = [ra.decode('UTF-8') for ra in h5_open['ref_allele']] - alt_alleles = [aa.decode('UTF-8') for aa in h5_open['alt_allele']] - except KeyError: - ref_alleles = [ra.decode('UTF-8') for ra in h5_open['ref']] - alt_alleles = [aa.decode('UTF-8') for aa in h5_open['alt']] - num_variants = len(ref_alleles) - indels = np.array([len(ref_alleles[vi])-len(alt_alleles[vi]) for vi in range(num_variants)]) - if indel_abs: - indels = np.abs(indels) - if indel_bool: - indels = (indels != 0) - return indels - - -def read_stats(stats_h5_file: str, stat_key: str, target_slice: np.array): - """ - Read stats from HDF5 file. - - Args: - stats_h5_file (:obj:`str`): - Stats HDF5 file. - stat_key (:obj:`str`): - Stat HDF5 key. - target_slice (:obj:`np.array`): - Target axis slice. - """ - with h5py.File(stats_h5_file, 'r') as stats_h5: - sad = stats_h5[stat_key][:] - # TEMP - insertions only - # ref = [ra.decode('UTF-8') for ra in stats_h5['ref_allele']] - # alt = [aa.decode('UTF-8') for aa in stats_h5['alt_allele']] - # num_snps = len(ref) - # indel = np.array([len(alt[si]) - len(ref[si]) for si in range(num_snps)]) - # sad = sad[indel < 0] - if target_slice is not None: - sad = sad[...,target_slice] - sad = np.nan_to_num(sad).astype('float32') - return sad - - -################################################################################ -# __main__ -################################################################################ -if __name__ == '__main__': - main() diff --git a/scripts/westminster_gtex_cmp.py b/scripts/westminster_gtex_cmp.py deleted file mode 100755 index e755bdb..0000000 --- a/scripts/westminster_gtex_cmp.py +++ /dev/null @@ -1,234 +0,0 @@ -#!/usr/bin/env python -from optparse import OptionParser -import glob -import pdb -import os -import sys - -import h5py -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -from scipy.stats import wilcoxon -import seaborn as sns -from tabulate import tabulate - -from westminster.stats import ttest_alt - -''' -westminster_gtex_cmp.py - -Compare multiple variant score sets on the GTEx fine mapped eQTL benchmark. -''' - -################################################################################ -# main -################################################################################ -def main(): - usage = 'usage: %prog [options] ...' - parser = OptionParser(usage) - parser.add_option('-a', '--alt', dest='alternative', - default='two-sided', help='Statistical test alternative [Default: %default]') - parser.add_option('--hue', dest='plot_hue', - default=False, action='store_true', - help='Scatter plot variant number as hue [Default: %default]') - parser.add_option('-l', dest='labels') - parser.add_option('-o', dest='out_dir', - default='compare_scores') - parser.add_option('--stats', dest='stats', - default='logD2') - parser.add_option('-v', dest='min_variants', - default=0, type='int', - help='Minimum variants to include tissue [Default: %default]') - (options,args) = parser.parse_args() - - if len(args) == 0: - parser.error('Must provide classification output directories') - else: - bench_dirs = args - - if not os.path.isdir(options.out_dir): - os.mkdir(options.out_dir) - - num_benches = len(bench_dirs) - sad_stats = [stat for stat in options.stats.split(',')] - if len(sad_stats) == 1: - sad_stats = sad_stats*num_benches - - sns.set(font_scale=1.2, style='ticks') - - if options.labels is None: - options.labels = [os.path.split(bd)[1] for bd in bench_dirs] - else: - options.labels = options.labels.split(',') - assert(len(options.labels) == num_benches) - - # initialize data frame lists - df_tissues = [] - df_variants = [] - df_label1 = [] - df_label2 = [] - df_auroc1 = [] - df_auroc2 = [] - df_mwp = [] - df_tp = [] - - # determine tissues - tissue_bench_dirs0 = glob.glob('%s/*_class-%s' % (bench_dirs[0], sad_stats[0])) - tissue_class_dirs = [tbd.split('/')[-1] for tbd in tissue_bench_dirs0] - tissues = [tcd[:tcd.find('_class')] for tcd in tissue_class_dirs] - - for tissue in tissues: - tissue_out_dir = '%s/%s' % (options.out_dir, tissue) - if not os.path.isdir(tissue_out_dir): - os.mkdir(tissue_out_dir) - - # count variants - tissue_scores_file = '%s/%s_pos/scores.h5' % (bench_dirs[0],tissue) - # TEMP while I still have 'sad' around - if not os.path.isfile(tissue_scores_file): - tissue_scores_file = '%s/%s_pos/sad.h5' % (bench_dirs[0],tissue) - with h5py.File(tissue_scores_file, 'r') as tissue_scores_h5: - sad_stat_up = sad_stats[0] - num_variants = tissue_scores_h5[sad_stat_up].shape[0] - - # filter variants - if num_variants >= options.min_variants: - # read TPRs and FPRs - bench_tpr_mean = [] - bench_fpr_mean = [] - bench_aurocs = [] - for i in range(num_benches): - tissue_class_dir_i = '%s/%s_class-%s' % (bench_dirs[i],tissue,sad_stats[i]) - try: - tpr_mean = np.load('%s/tpr_mean.npy' % tissue_class_dir_i) - fpr_mean = np.load('%s/fpr_mean.npy' % tissue_class_dir_i) - aurocs = np.load('%s/aurocs.npy' % tissue_class_dir_i) - bench_tpr_mean.append(tpr_mean) - bench_fpr_mean.append(fpr_mean) - bench_aurocs.append(aurocs) - except FileNotFoundError: - print('Failed run for %s w/ %d variants' % (tissue, num_variants), file=sys.stderr) - - if len(bench_aurocs) == num_benches: - # mean ROC plot - plt.figure(figsize=(6,6)) - for i in range(num_benches): - label_i = '%s AUROC %.4f' % (options.labels[i], bench_aurocs[i].mean()) - plt.plot(bench_fpr_mean[i], bench_tpr_mean[i], alpha=0.5, label=label_i) - plt.legend() - ax = plt.gca() - ax.set_xlabel('False positive rate') - ax.set_ylabel('True positive rate') - sns.despine() - plt.tight_layout() - plt.savefig('%s/roc_full.pdf' % tissue_out_dir) - plt.close() - - # scatter plot versions' fold AUROCss - for i in range(num_benches): - for j in range(i+1, num_benches): - if len(bench_aurocs[i]) == len(bench_aurocs[j]): - plt.figure(figsize=(6,6)) - sns.scatterplot(x=bench_aurocs[i], y=bench_aurocs[j], - color='black', linewidth=0, alpha=0.5) - ax = plt.gca() - - vmin = min(bench_aurocs[i].min(), bench_aurocs[j].min()) - vmax = max(bench_aurocs[i].max(), bench_aurocs[j].max()) - ax.plot([vmin,vmax], [vmin,vmax], linestyle='--', color='gold') - ax.set_xlabel('%s fold AUROC' % options.labels[i]) - ax.set_ylabel('%s fold AUROC' % options.labels[j]) - sns.despine() - plt.tight_layout() - plt.savefig('%s/auroc_%s_%s.pdf' % (tissue_out_dir, options.labels[i], options.labels[j])) - plt.close() - - # append lists - df_tissues.append(tissue) - df_variants.append(num_variants) - df_label1.append(options.labels[i]) - df_label2.append(options.labels[j]) - df_auroc1.append(bench_aurocs[i].mean()) - df_auroc2.append(bench_aurocs[j].mean()) - if len(bench_aurocs[i]) == len(bench_aurocs[j]): - df_mwp.append(wilcoxon(bench_aurocs[i], bench_aurocs[j], - alternative=options.alternative)[1]) - df_tp.append(ttest_alt(bench_aurocs[i], bench_aurocs[j], - alternative=options.alternative)[1]) - else: - df_mwp.append(0) - df_tp.append(0) - - # make comparison table - df_cmp = pd.DataFrame({ - 'tissue':df_tissues, - 'variants':df_variants, - 'label1':df_label1, - 'label2':df_label2, - 'auroc1':df_auroc1, - 'auroc2':df_auroc2, - 'wilcoxon':df_mwp, - 'ttest':df_tp - }) - - # print table - df_cmp.sort_values('variants', inplace=True) - df_cmp.to_csv('%s/table_cmp.tsv' % options.out_dir, sep='\t') - table_cmp = tabulate(df_cmp, headers='keys', tablefmt='github') - border = table_cmp.split('\n')[1].replace('|','-') - print(border) - print(table_cmp) - print(border) - - if num_benches == 1: - print('%s AUROC: %.4f' % (options.labels[0], np.mean(bench_aurocs))) - else: - # scatter plot pairs - for i in range(num_benches): - for j in range(i+1, num_benches): - mask_ij = (df_cmp.label1 == options.labels[i]) & (df_cmp.label2 == options.labels[j]) - df_cmp_ij = df_cmp[mask_ij] - - hue_var = None - if options.plot_hue: - hue_var = 'variants' - - plt.figure(figsize=(6,6)) - sns.scatterplot(x='auroc1', y='auroc2', data=df_cmp_ij, - hue=hue_var, linewidth=0, alpha=0.8) - ax = plt.gca() - - vmin = min(df_cmp_ij.auroc1.min(), df_cmp_ij.auroc2.min()) - vmax = max(df_cmp_ij.auroc1.max(), df_cmp_ij.auroc2.max()) - ax.plot([vmin,vmax], [vmin,vmax], linestyle='--', color='black') - - eps = 0.05 - ax.text(1-eps, eps, 'Mean %.3f'%df_cmp_ij.auroc1.mean(), - horizontalalignment='right', transform=ax.transAxes) - ax.text(eps, 1-eps, 'Mean %.3f'%df_cmp_ij.auroc2.mean(), - verticalalignment='top', transform=ax.transAxes) - - ax.set_xlabel('%s AUROC' % options.labels[i]) - ax.set_ylabel('%s AUROC' % options.labels[j]) - sns.despine() - plt.tight_layout() - plt.savefig('%s/auroc_%s_%s.pdf' % (options.out_dir, options.labels[i], options.labels[j])) - plt.close() - - wilcoxon_p = wilcoxon(df_cmp_ij.auroc1, df_cmp_ij.auroc2, - alternative=options.alternative)[1] - ttest_p = ttest_alt(df_cmp_ij.auroc1, df_cmp_ij.auroc2, - alternative=options.alternative)[1] - print('') - print('%s AUROC: %.4f' % (options.labels[i], df_cmp_ij.auroc1.mean())) - print('%s AUROC: %.4f' % (options.labels[j], df_cmp_ij.auroc2.mean())) - print('Wilcoxon p: %.3g' % wilcoxon_p) - print('T-test p: %.3g' % ttest_p) - - -################################################################################ -# __main__ -################################################################################ -if __name__ == '__main__': - main()