diff --git a/src/westminster/scripts/westminster_gtex_folds.py b/src/westminster/scripts/westminster_gtex_folds.py
index aa59dad..aee13b6 100755
--- a/src/westminster/scripts/westminster_gtex_folds.py
+++ b/src/westminster/scripts/westminster_gtex_folds.py
@@ -52,15 +52,15 @@ def main():
     snp_options.add_option(
         "-f",
         dest="genome_fasta",
-        default="%s/data/hg38.fa" % os.environ["BASENJIDIR"],
+        default="%s/assembly/ucsc/hg38.fa" % os.environ["HG38"],
         help="Genome FASTA for sequences [Default: %default]",
     )
     snp_options.add_option(
-        '--indel_stitch',
-        dest='indel_stitch',
+        "--indel_stitch",
+        dest="indel_stitch",
         default=True,
-        action='store_true',
-        help="Stitch indel compensation shifts [Default: %default]"
+        action="store_true",
+        help="Stitch indel compensation shifts [Default: %default]",
     )
     snp_options.add_option(
         "-o",
@@ -406,10 +406,10 @@ def main():
     # fit classifiers
 
     # SNPs
-    cmd_base = 'westminster_classify.py -f 8 -i 100 -r 44 -s'
+    cmd_base = "westminster_classify.py -f 8 -i 100 -r 44 -s"
     # indels
     # cmd_base = 'westminster_classify.py -f 6 -i 64 -r 44 -s'
-    cmd_base += ' --msl %d' % options.msl
+    cmd_base += " --msl %d" % options.msl
 
     if options.class_targets_file is not None:
         cmd_base += " -t %s" % options.class_targets_file
diff --git a/src/westminster/scripts/westminster_gtexg_coef.py b/src/westminster/scripts/westminster_gtexg_coef.py
index 2973119..37a4d29 100755
--- a/src/westminster/scripts/westminster_gtexg_coef.py
+++ b/src/westminster/scripts/westminster_gtexg_coef.py
@@ -35,7 +35,7 @@ def main():
     parser.add_argument(
         "-g",
         "--gtex_vcf_dir",
-        default="/home/drk/seqnn/data/gtex_fine/susie_pip90",
+        default="/home/drk/seqnn/data/gtex_fine/susie_pip90r",
         help="GTEx VCF directory",
     )
     parser.add_argument(
@@ -44,7 +44,7 @@ def main():
     parser.add_argument(
         "-s",
         "--snp_stat",
-        default="logSED",
+        default="logSUM",
         help="SNP statistic. [Default: %(default)s]",
     )
     parser.add_argument("-v", "--verbose", action="store_true")
@@ -95,7 +95,7 @@ def main():
         eqtl_df = read_eqtl(tissue, args.gtex_vcf_dir)
         if eqtl_df is not None:
             # read model predictions
-            gtex_scores_file = f"{args.gtex_dir}/{tissue}_pos/sed.h5"
+            gtex_scores_file = f"{args.gtex_dir}/{tissue}_pos/scores.h5"
             eqtl_df = add_scores(
                 gtex_scores_file, keyword, eqtl_df, args.snp_stat, verbose=args.verbose
             )
@@ -107,9 +107,7 @@ def main():
             coef_r = spearmanr(eqtl_df.coef, eqtl_df.score)[0]
 
             # classification AUROC
-            class_auroc = classify_auroc(
-                gtex_scores_file, keyword, eqtl_df, args.snp_stat
-            )
+            class_auroc = classify_auroc(gtex_scores_file, keyword, args.snp_stat)
 
             if args.plot:
                 eqtl_df.to_csv(f"{args.out_dir}/{tissue}.tsv", index=False, sep="\t")
@@ -210,18 +208,17 @@ def add_scores(
     """
     with h5py.File(gtex_scores_file, "r") as gtex_scores_h5:
         # read data
-        snp_i = gtex_scores_h5["si"][:]
-        snps = np.array([snp.decode("UTF-8") for snp in gtex_scores_h5["snp"]])
-        ref_allele = np.array(
-            [ref.decode("UTF-8") for ref in gtex_scores_h5["ref_allele"]]
-        )
-        genes = np.array([snp.decode("UTF-8") for snp in gtex_scores_h5["gene"]])
-        target_ids = np.array(
-            [ref.decode("UTF-8") for ref in gtex_scores_h5["target_ids"]]
-        )
-        target_labels = np.array(
-            [ref.decode("UTF-8") for ref in gtex_scores_h5["target_labels"]]
-        )
+        snps = [snp.decode("UTF-8") for snp in gtex_scores_h5["snp"]]
+        ref_allele = [ref.decode("UTF-8") for ref in gtex_scores_h5["ref_allele"]]
+        target_ids = [tid.decode("UTF-8") for tid in gtex_scores_h5["target_ids"]]
+        target_labels = [tl.decode("UTF-8") for tl in gtex_scores_h5["target_labels"]]
+        gtex_scores = gtex_scores_h5[score_key]
+
+        # map gene identifiers
+        gene_map = {}
+        for gene in gtex_scores_h5["gene"]:
+            gene = gene.decode("UTF-8")
+            gene_map[trim_dot(gene)] = gene
 
         # determine matching GTEx targets
         match_tis = []
@@ -236,62 +233,50 @@ def add_scores(
                     match_tis.append(ti)
         match_tis = np.array(match_tis)
 
-        # read scores and take mean across targets
-        score_table = gtex_scores_h5[score_key][..., match_tis].mean(
-            axis=-1, dtype="float32"
-        )
-        score_table = np.arcsinh(score_table)
-
-    # hash scores to (snp,gene)
-    snpgene_scores = {}
-    for sgi in range(score_table.shape[0]):
-        snp = snps[snp_i[sgi]]
-        gene = trim_dot(genes[sgi])
-        snpgene_scores[(snp, gene)] = score_table[sgi]
-
-    # add scores to eQTL table
-    #  flipping when allele1 doesn't match
-    snp_ref = dict(zip(snps, ref_allele))
-    eqtl_df_scores = []
-    for sgi, eqtl in eqtl_df.iterrows():
-        sgs = snpgene_scores.get((eqtl.variant, eqtl.gene), 0)
-        if sgs != 0 and snp_ref[eqtl.variant] != eqtl.allele1:
-            sgs *= -1
-        eqtl_df_scores.append(sgs)
-    eqtl_df["score"] = eqtl_df_scores
+        # add scores to eQTL table
+        #  flipping when allele1 doesn't match
+        snp_ref = dict(zip(snps, ref_allele))
+        eqtl_df_scores = []
+        for _, eqtl in eqtl_df.iterrows():
+            if eqtl.variant in gtex_scores:
+                snp_scores = gtex_scores[eqtl.variant]
+                egene = gene_map.get(eqtl.gene, "")
+                if egene in snp_scores.keys():
+                    sgs = snp_scores[egene][match_tis].mean(dtype="float32")
+                else:
+                    sgs = 0
+            else:
+                sgs = 0
+
+            # flip
+            if sgs != 0 and snp_ref[eqtl.variant] != eqtl.allele1:
+                sgs *= -1
+
+            eqtl_df_scores.append(sgs)
+        eqtl_df["score"] = eqtl_df_scores
 
     return eqtl_df
 
 
-def classify_auroc(
-    gtex_scores_file: str, keyword: str, eqtl_df: pd.DataFrame, score_key: str = "SED"
-):
+def classify_auroc(gtex_scores_file: str, keyword: str, score_key: str = "SED"):
     """Read eQTL RNA predictions for negatives from the given tissue.
 
     Args:
       gtex_scores_file (str): Variant scores HDF5.
       tissue_keyword (str): tissue keyword, for matching GTEx targets
-      eqtl_df (pd.DataFrame): eQTL dataframe
       score_key (str): score key in HDF5 file
       verbose (bool): Print matching targets.
 
     Returns:
       class_auroc (float): Classification AUROC.
     """
-    gtex_nscores_file = gtex_scores_file.replace("_pos", "_neg")
-    with h5py.File(gtex_nscores_file, "r") as gtex_scores_h5:
-        # read data
-        snp_i = gtex_scores_h5["si"][:]
-        snps = np.array([snp.decode("UTF-8") for snp in gtex_scores_h5["snp"]])
-        genes = np.array([snp.decode("UTF-8") for snp in gtex_scores_h5["gene"]])
-        target_ids = np.array(
-            [ref.decode("UTF-8") for ref in gtex_scores_h5["target_ids"]]
-        )
-        target_labels = np.array(
-            [ref.decode("UTF-8") for ref in gtex_scores_h5["target_labels"]]
-        )
+    # rescore positives using all genes
+    with h5py.File(gtex_scores_file, "r") as gtex_scores_h5:
+        gtex_scores = gtex_scores_h5[score_key]
 
         # determine matching GTEx targets
+        target_ids = [tid.decode("UTF-8") for tid in gtex_scores_h5["target_ids"]]
+        target_labels = [tl.decode("UTF-8") for tl in gtex_scores_h5["target_labels"]]
         match_tis = []
         for ti in range(len(target_ids)):
             if (
@@ -302,26 +287,33 @@ def classify_auroc(
                     match_tis.append(ti)
         match_tis = np.array(match_tis)
 
-        # read scores and take mean across targets
-        score_table = gtex_scores_h5[score_key][..., match_tis].mean(
-            axis=-1, dtype="float32"
-        )
-        # score_table = np.arcsinh(score_table)
+        # aggregate across genes w/ sum abs
+        psnp_scores = {}
+        for snp in gtex_scores.keys():
+            gtex_snp_scores = gtex_scores[snp]
+            psnp_scores[snp] = 0
+            for gene in gtex_snp_scores.keys():
+                sgs = gtex_snp_scores[gene][match_tis].mean(dtype="float32")
+                psnp_scores[snp] += np.abs(sgs)
 
-    # aggregate across genes w/ sum abs
-    nsnp_scores = {}
-    for sgi in range(score_table.shape[0]):
-        snp = snps[snp_i[sgi]]
-        nsnp_scores[snp] = nsnp_scores.get(snp, 0) + np.abs(score_table[sgi])
+    # score negatives
+    gtex_nscores_file = gtex_scores_file.replace("_pos", "_neg")
+    with h5py.File(gtex_nscores_file, "r") as gtex_scores_h5:
+        gtex_scores = gtex_scores_h5[score_key]
 
-    psnp_scores = {}
-    for sgi, eqtl in eqtl_df.iterrows():
-        snp = eqtl.variant
-        psnp_scores[snp] = psnp_scores.get(snp, 0) + np.abs(eqtl.score)
+        # aggregate across genes w/ sum abs
+        nsnp_scores = {}
+        for snp in gtex_scores.keys():
+            gtex_snp_scores = gtex_scores[snp]
+            nsnp_scores[snp] = 0
+            for gene in gtex_snp_scores.keys():
+                sgs = gtex_snp_scores[gene][match_tis].mean(dtype="float32")
+                nsnp_scores[snp] += np.abs(sgs)
 
     # compute AUROC
     Xp = list(psnp_scores.values())
     Xn = list(nsnp_scores.values())
+    print(keyword, len(Xp), len(Xn))
     X = Xp + Xn
     y = [1] * len(Xp) + [0] * len(Xn)
 
diff --git a/src/westminster/scripts/westminster_gtexg_folds.py b/src/westminster/scripts/westminster_gtexg_folds.py
index 0f02186..6471b5e 100755
--- a/src/westminster/scripts/westminster_gtexg_folds.py
+++ b/src/westminster/scripts/westminster_gtexg_folds.py
@@ -18,14 +18,13 @@
 import pickle
 import pdb
 import os
+import sys
 
 import h5py
 import numpy as np
 
 import slurm
 
-from westminster.multi import nonzero_h5
-
 """
 westminster_gtexg_folds.py
 
@@ -41,75 +40,96 @@ def main():
     parser = OptionParser(usage)
 
     # sed options
-    sed_options = OptionGroup(parser, "borzoi_sed.py options")
-    sed_options.add_option(
-        "-b",
-        dest="bedgraph",
-        default=False,
-        action="store_true",
-        help="Write ref/alt predictions as bedgraph [Default: %default]",
+    snp_options = OptionGroup(parser, "hound_snpgene.py options")
+    snp_options.add_option(
+        "-c",
+        dest="cluster_pct",
+        default=0,
+        type="float",
+        help="Cluster genes within a %% of the seq length to make a single ref pred [Default: %default]",
     )
-    sed_options.add_option(
+    snp_options.add_option(
         "-f",
         dest="genome_fasta",
-        default="%s/data/hg38.fa" % os.environ["BASENJIDIR"],
+        default="%s/assembly/ucsc/hg38.fa" % os.environ["HG38"],
         help="Genome FASTA for sequences [Default: %default]",
     )
-    sed_options.add_option(
+    snp_options.add_option(
         "-g",
         dest="genes_gtf",
         default="%s/genes/gencode41/gencode41_basic_nort.gtf" % os.environ["HG38"],
         help="GTF for gene definition [Default %default]",
     )
-    sed_options.add_option(
+    snp_options.add_option(
         "-o",
         dest="out_dir",
         default="sed",
         help="Output directory for tables and plots [Default: %default]",
     )
-    sed_options.add_option(
+    snp_options.add_option(
         "--rc",
         dest="rc",
         default=False,
         action="store_true",
         help="Average forward and reverse complement predictions [Default: %default]",
     )
-    sed_options.add_option(
+    snp_options.add_option(
         "--shifts",
         dest="shifts",
         default="0",
         type="str",
         help="Ensemble prediction shifts [Default: %default]",
     )
-    sed_options.add_option(
+    snp_options.add_option(
         "--span",
         dest="span",
         default=False,
         action="store_true",
         help="Aggregate entire gene span [Default: %default]",
     )
-    sed_options.add_option(
+    snp_options.add_option(
         "--stats",
-        dest="sed_stats",
-        default="SED",
+        dest="snp_stats",
+        default="logSUM",
         help="Comma-separated list of stats to save. [Default: %default]",
     )
-    sed_options.add_option(
+    snp_options.add_option(
         "-t",
         dest="targets_file",
         default=None,
         type="str",
         help="File specifying target indexes and labels in table format",
     )
-    sed_options.add_option(
+    snp_options.add_option(
         "-u", dest="untransform_old", default=False, action="store_true"
     )
-    parser.add_option_group(sed_options)
+    snp_options.add_option(
+        "--gcs",
+        dest="gcs",
+        default=False,
+        action="store_true",
+        help="Input and output are in gcs",
+    )
+    snp_options.add_option(
+        "--require_gpu",
+        dest="require_gpu",
+        default=False,
+        action="store_true",
+        help="Only run on GPU",
+    )
+    snp_options.add_option(
+        "--tensorrt",
+        dest="tensorrt",
+        default=False,
+        action="store_true",
+        help="Model type is tensorrt optimized",
+    )
+    parser.add_option_group(snp_options)
 
     # cross-fold
     fold_options = OptionGroup(parser, "cross-fold options")
     fold_options.add_option(
-        "-c",
+        "--crosses",
         dest="crosses",
         default=1,
         type="int",
@@ -193,7 +213,7 @@ def main():
     gtex_out_dir = options.out_dir
 
     # split SNP stats
-    sed_stats = options.sed_stats.split(",")
+    snp_stats = options.snp_stats.split(",")
 
     # merge study/tissue variants
     mpos_vcf_file = "%s/pos_merge.vcf" % options.gtex_vcf_dir
@@ -235,7 +255,7 @@ def main():
             options_pkl.close()
 
             # create base fold command
-            cmd_fold = "%s time borzoi_sed.py %s %s %s" % (
+            cmd_fold = "%s hound_snpgene.py %s %s %s" % (
                 cmd_base,
                 options_pkl_file,
                 params_file,
@@ -243,8 +263,8 @@ def main():
             )
 
             for pi in range(options.processes):
-                sed_file = "%s/job%d/sed.h5" % (options.out_dir, pi)
-                if not nonzero_h5(sed_file, sed_stats):
+                scores_file = "%s/job%d/scores.h5" % (options.out_dir, pi)
+                if not nonzero_h5(scores_file, snp_stats):
                     cmd_job = "%s %s %d" % (cmd_fold, mneg_vcf_file, pi)
                     j = slurm.Job(
                         cmd_job,
@@ -272,7 +292,7 @@ def main():
             options_pkl.close()
 
             # create base fold command
-            cmd_fold = "%s time borzoi_sed.py %s %s %s" % (
+            cmd_fold = "%s hound_snpgene.py %s %s %s" % (
                 cmd_base,
                 options_pkl_file,
                 params_file,
@@ -280,8 +300,8 @@ def main():
             )
 
             for pi in range(options.processes):
-                sed_file = "%s/job%d/sed.h5" % (options.out_dir, pi)
-                if not nonzero_h5(sed_file, sed_stats):
+                scores_file = "%s/job%d/scores.h5" % (options.out_dir, pi)
+                if not nonzero_h5(scores_file, snp_stats):
                     cmd_job = "%s %s %d" % (cmd_fold, mpos_vcf_file, pi)
                     j = slurm.Job(
                         cmd_job,
@@ -310,13 +330,15 @@ def main():
 
             # collect negatives
             neg_out_dir = "%s/merge_neg" % it_out_dir
-            if not os.path.isfile("%s/sed.h5" % neg_out_dir):
-                collect_scores(neg_out_dir, options.processes, "sed.h5")
+            if not os.path.isfile("%s/scores.h5" % neg_out_dir):
+                print(f"Collecting {neg_out_dir}")
+                collect_scores(neg_out_dir, options.processes, "scores.h5")
 
             # collect positives
             pos_out_dir = "%s/merge_pos" % it_out_dir
-            if not os.path.isfile("%s/sed.h5" % pos_out_dir):
-                collect_scores(pos_out_dir, options.processes, "sed.h5")
+            if not os.path.isfile("%s/scores.h5" % pos_out_dir):
+                print(f"Collecting {pos_out_dir}")
+                collect_scores(pos_out_dir, options.processes, "scores.h5")
 
     ################################################################
     # split study/tissue variants
@@ -324,13 +346,13 @@ def main():
     for ci in range(options.crosses):
         for fi in range(num_folds):
             it_out_dir = "%s/f%dc%d/%s" % (exp_dir, fi, ci, gtex_out_dir)
-            print(it_out_dir)
+            print(f"Splitting {it_out_dir}")
 
             # split positives
-            split_scores(it_out_dir, "pos", options.gtex_vcf_dir, sed_stats)
+            split_scores(it_out_dir, "pos", options.gtex_vcf_dir, snp_stats)
 
             # split negatives
-            split_scores(it_out_dir, "neg", options.gtex_vcf_dir, sed_stats)
+            split_scores(it_out_dir, "neg", options.gtex_vcf_dir, snp_stats)
 
     ################################################################
     # ensemble
@@ -344,36 +366,37 @@ def main():
         os.mkdir(gtex_dir)
 
     for gtex_pos_vcf in glob.glob("%s/*_pos.vcf" % options.gtex_vcf_dir):
+        print(f"Ensembling {gtex_pos_vcf}")
         gtex_neg_vcf = gtex_pos_vcf.replace("_pos.", "_neg.")
         pos_base = os.path.splitext(os.path.split(gtex_pos_vcf)[1])[0]
         neg_base = os.path.splitext(os.path.split(gtex_neg_vcf)[1])[0]
 
-        # collect SED files
-        sed_pos_files = []
-        sed_neg_files = []
+        # collect score files
+        score_pos_files = []
+        score_neg_files = []
         for ci in range(options.crosses):
             for fi in range(num_folds):
                 it_dir = "%s/f%dc%d" % (exp_dir, fi, ci)
                 it_out_dir = "%s/%s" % (it_dir, gtex_out_dir)
 
-                sed_pos_file = "%s/%s/sed.h5" % (it_out_dir, pos_base)
-                sed_pos_files.append(sed_pos_file)
+                score_pos_file = "%s/%s/scores.h5" % (it_out_dir, pos_base)
+                score_pos_files.append(score_pos_file)
 
-                sed_neg_file = "%s/%s/sed.h5" % (it_out_dir, neg_base)
-                sed_neg_files.append(sed_neg_file)
+                score_neg_file = "%s/%s/scores.h5" % (it_out_dir, neg_base)
+                score_neg_files.append(score_neg_file)
 
         # ensemble
         ens_pos_dir = "%s/%s" % (gtex_dir, pos_base)
         os.makedirs(ens_pos_dir, exist_ok=True)
-        ens_pos_file = "%s/sed.h5" % (ens_pos_dir)
+        ens_pos_file = "%s/scores.h5" % (ens_pos_dir)
         if not os.path.isfile(ens_pos_file):
-            ensemble_h5(ens_pos_file, sed_pos_files, sed_stats)
+            ensemble_h5(ens_pos_file, score_pos_files, snp_stats)
 
         ens_neg_dir = "%s/%s" % (gtex_dir, neg_base)
         os.makedirs(ens_neg_dir, exist_ok=True)
-        ens_neg_file = "%s/sed.h5" % (ens_neg_dir)
+        ens_neg_file = "%s/scores.h5" % (ens_neg_dir)
         if not os.path.isfile(ens_neg_file):
-            ensemble_h5(ens_neg_file, sed_neg_files, sed_stats)
+            ensemble_h5(ens_neg_file, score_neg_files, snp_stats)
 
     ################################################################
     # coefficient analysis
@@ -386,9 +409,9 @@ def main():
             it_dir = "%s/f%dc%d" % (exp_dir, fi, ci)
             it_out_dir = "%s/%s" % (it_dir, gtex_out_dir)
 
-            for sed_stat in sed_stats:
-                coef_out_dir = f"{it_out_dir}/coef-{sed_stat}"
-                cmd_coef = f"{cmd_base} -o {coef_out_dir} -s {sed_stat} {it_out_dir}"
+            for snp_stat in snp_stats:
+                coef_out_dir = f"{it_out_dir}/coef-{snp_stat}"
+                cmd_coef = f"{cmd_base} -o {coef_out_dir} -s {snp_stat} {it_out_dir}"
                 j = slurm.Job(
                     cmd_coef,
                     "coef",
@@ -403,9 +426,9 @@ def main():
 
     # ensemble
     it_out_dir = f"{exp_dir}/ensemble/{gtex_out_dir}"
-    for sed_stat in sed_stats:
-        coef_out_dir = f"{it_out_dir}/coef-{sed_stat}"
-        cmd_coef = f"{cmd_base} -o {coef_out_dir} -s {sed_stat} {it_out_dir}"
+    for snp_stat in snp_stats:
+        coef_out_dir = f"{it_out_dir}/coef-{snp_stat}"
+        cmd_coef = f"{cmd_base} -o {coef_out_dir} -s {snp_stat} {it_out_dir}"
         j = slurm.Job(
             cmd_coef,
             "coef",
@@ -421,7 +444,7 @@ def main():
     slurm.multi_run(jobs, verbose=True)
 
 
-def collect_scores(out_dir: str, num_jobs: int, h5f_name: str = "sad.h5"):
+def collect_scores(out_dir: str, num_jobs: int, h5f_name: str = "scores.h5"):
     """Collect parallel SAD jobs' output into one HDF5.
 
     Args:
@@ -430,144 +453,141 @@ def collect_scores(out_dir: str, num_jobs: int, h5f_name: str = "sad.h5"):
     """
     # count variants
     num_variants = 0
-    num_rows = 0
     for pi in range(num_jobs):
         # open job
         job_h5_file = "%s/job%d/%s" % (out_dir, pi, h5f_name)
-        job_h5_open = h5py.File(job_h5_file, "r")
-        num_variants += len(job_h5_open["snp"])
-        num_rows += len(job_h5_open["si"])
-        job_h5_open.close()
-
-    # initialize final h5
-    final_h5_file = "%s/%s" % (out_dir, h5f_name)
-    final_h5_open = h5py.File(final_h5_file, "w")
-
-    # SNP stats
-    snp_stats = {}
-
-    job0_h5_file = "%s/job0/%s" % (out_dir, h5f_name)
-    job0_h5_open = h5py.File(job0_h5_file, "r")
-    for key in job0_h5_open.keys():
-        if key in ["target_ids", "target_labels"]:
-            # copy
-            final_h5_open.create_dataset(key, data=job0_h5_open[key])
-
-        elif key in ["snp", "chr", "pos", "ref_allele", "alt_allele", "gene"]:
-            snp_stats[key] = []
-
-        elif job0_h5_open[key].ndim == 1:
-            final_h5_open.create_dataset(
-                key, shape=(num_rows,), dtype=job0_h5_open[key].dtype
-            )
-
-        else:
-            num_targets = job0_h5_open[key].shape[1]
-            final_h5_open.create_dataset(
-                key, shape=(num_rows, num_targets), dtype=job0_h5_open[key].dtype
-            )
+        with h5py.File(job_h5_file, "r") as job_h5_open:
+            num_variants += len(job_h5_open["snp"])
 
-    job0_h5_open.close()
+    final_dict = {}
 
-    # set values
-    vgi = 0
-    vi = 0
     for pi in range(num_jobs):
         # open job
         job_h5_file = "%s/job%d/%s" % (out_dir, pi, h5f_name)
         with h5py.File(job_h5_file, "r") as job_h5_open:
-            job_snps = len(job_h5_open["snp"])
-            job_rows = job_h5_open["si"].shape[0]
-
-            # append to final
             for key in job_h5_open.keys():
-                try:
-                    if key in ["target_ids", "target_labels"]:
-                        # once is enough
-                        pass
-
-                    elif key in [
-                        "snp",
-                        "chr",
-                        "pos",
-                        "ref_allele",
-                        "alt_allele",
-                        "gene",
-                    ]:
-                        snp_stats[key] += list(job_h5_open[key])
-
-                    elif key == "si":
-                        # re-index SNPs
-                        final_h5_open[key][vgi : vgi + job_rows] = (
-                            job_h5_open[key][:] + vi
-                        )
-
-                    else:
-                        final_h5_open[key][vgi : vgi + job_rows] = job_h5_open[key]
-
-                except TypeError as e:
-                    print(e)
-                    print(
-                        f"{job_h5_file} {key} has the wrong shape. Remove this file and rerun"
-                    )
-                    exit()
-
-        vgi += job_rows
-        vi += job_snps
-
-    # create final SNP stat datasets
-    for key in snp_stats:
-        if key == "pos":
-            final_h5_open.create_dataset(key, data=np.array(snp_stats[key]))
-        else:
-            final_h5_open.create_dataset(key, data=np.array(snp_stats[key], dtype="S"))
+                if key in ["target_ids", "target_labels"]:
+                    final_dict[key] = job_h5_open[key][:]
+                elif key in ["snp", "chr", "pos", "ref_allele", "alt_allele", "gene"]:
+                    final_dict.setdefault(key, []).append(job_h5_open[key][:])
+                elif isinstance(job_h5_open[key], h5py.Group):
+                    snp_stat = key
+                    if snp_stat not in final_dict:
+                        final_dict[snp_stat] = {}
+                    for snp in job_h5_open[snp_stat].keys():
+                        final_dict[snp_stat][snp] = {}
+                        for gene in job_h5_open[snp_stat][snp].keys():
+                            final_dict[snp_stat][snp][gene] = job_h5_open[snp_stat][
+                                snp
+                            ][gene][:]
+                else:
+                    print(f"During collection, unknown key {key}")
 
-    final_h5_open.close()
+    # initialize final h5
+    final_h5_file = "%s/%s" % (out_dir, h5f_name)
+    with h5py.File(final_h5_file, "w") as final_h5_open:
+        for key in final_dict.keys():
+            if key in ["target_ids", "target_labels"]:
+                final_h5_open.create_dataset(key, data=final_dict[key])
+            elif key in ["snp", "chr", "pos", "ref_allele", "alt_allele", "gene"]:
+                fdv = np.concatenate(final_dict[key])
+                final_h5_open.create_dataset(key, data=fdv)
+
+            else:
+                snp_stat = key
+                final_h5_open.create_group(snp_stat)
+                for snp in final_dict[snp_stat].keys():
+                    final_h5_open[snp_stat].create_group(snp)
+                    for gene in final_dict[snp_stat][snp].keys():
+                        final_h5_open[snp_stat][snp].create_dataset(
+                            gene, data=final_dict[snp_stat][snp][gene]
+                        )
 
 
-def ensemble_h5(ensemble_h5_file: str, scores_files: list, sed_stats: list):
+def ensemble_h5(ensemble_h5_file: str, scores_files: list, snp_stats: list):
     """Ensemble scores from multiple files into a single file.
 
     Args:
       ensemble_h5_file (str): ensemble score HDF5.
       scores_files ([str]): list of replicate score HDFs.
-      sed_stats ([str]): SED stats to average over folds.
+      snp_stats ([str]): SNP stats to average over folds.
     """
     # open ensemble
     ensemble_h5 = h5py.File(ensemble_h5_file, "w")
 
-    # transfer non-SED keys
-    sed_shapes = {}
-    scores0_h5 = h5py.File(scores_files[0], "r")
-    for key in scores0_h5.keys():
-        if key not in sed_stats:
-            ensemble_h5.create_dataset(key, data=scores0_h5[key])
-        else:
-            sed_shapes[key] = scores0_h5[key].shape
-    scores0_h5.close()
+    with h5py.File(scores_files[0], "r") as scores0_h5:
+        for key in scores0_h5.keys():
+            if key in snp_stats:
+                ensemble_h5.create_group(key)
+            else:
+                ensemble_h5.create_dataset(key, data=scores0_h5[key])
 
     # average stats
     num_folds = len(scores_files)
-    for sed_stat in sed_stats:
-        # initialize ensemble array
-        sed_values = np.zeros(shape=sed_shapes[sed_stat], dtype="float32")
-
-        # read and add folds
+    for snp_stat in snp_stats:
+        # sum scores across folds
+        snpgene_scores = {}
         for scores_file in scores_files:
             with h5py.File(scores_file, "r") as scores_h5:
-                sed_values += scores_h5[sed_stat][:].astype("float32")
+                for snp in scores_h5[snp_stat].keys():
+                    if snp not in snpgene_scores:
+                        snpgene_scores[snp] = {}
+                    for gene in scores_h5[snp_stat][snp].keys():
+                        if gene not in snpgene_scores[snp]:
+                            snpgene_scores[snp][gene] = scores_h5[snp_stat][snp][gene][
+                                :
+                            ].astype("float32")
+                        else:
+                            snpgene_scores[snp][gene] += scores_h5[snp_stat][snp][gene][
+                                :
+                            ].astype("float32")
+
+        # write average score
+        for snp in snpgene_scores:
+            ensemble_h5[snp_stat].create_group(snp)
+            for gene in snpgene_scores[snp]:
+                ensemble_score = snpgene_scores[snp][gene] / num_folds
+                ensemble_h5[snp_stat][snp].create_dataset(
+                    gene, data=ensemble_score.astype("float16")
+                )
 
-        # normalize and downcast
-        sed_values /= num_folds
-        sed_values = sed_values.astype("float16")
+    ensemble_h5.close()
 
-        # save
-        ensemble_h5.create_dataset(sed_stat, data=sed_values)
 
-    ensemble_h5.close()
+def nonzero_h5(h5_file: str, stat_keys):
+    """Verify the HDF5 exists, and there are nonzero values
+      for each stat key given.
+
+    Args:
+      h5_file (str): HDF5 file name.
+      stat_keys ([str]): List of SNP stat keys.
+    """
+    if os.path.isfile(h5_file):
+        try:
+            with h5py.File(h5_file, "r") as h5_open:
+                snps_all = set([snp.decode("UTF-8") for snp in h5_open["snp"]])
+                for sk in stat_keys:
+                    snps_stat = set(h5_open[sk].keys())
+                    snps_ovl = snps_all & snps_stat
+                    if len(snps_ovl) == 0:
+                        print(f"{h5_file}: {sk} empty.")
+                        return False
+                    else:
+                        for snp in list(snps_ovl)[:5]:
+                            for gene in h5_open[sk][snp].keys():
+                                score = h5_open[sk][snp][gene][:]
+                                if score.var(dtype="float64") == 0:
+                                    print(f"{h5_file}: {sk} {snp} {gene} zero var.")
+                                    return False
+                return True
+        except:
+            print(f"{h5_file}: error", sys.exc_info()[0])
+            return False
+    else:
+        return False
 
 
-def split_scores(it_out_dir: str, posneg: str, vcf_dir: str, sed_stats):
+def split_scores(it_out_dir: str, posneg: str, vcf_dir: str, snp_stats):
     """Split merged VCF predictions in HDF5 into tissue-specific
     predictions in HDF5.
 
@@ -575,23 +595,14 @@ def split_scores(it_out_dir: str, posneg: str, vcf_dir: str, sed_stats):
       it_out_dir (str): output directory for iteration.
       posneg (str): 'pos' or 'neg'.
       vcf_dir (str): directory containing tissue-specific VCFs.
-      sed_stats ([str]]): list of SED stats.
+      snp_stats ([str]]): list of SED stats.
     """
-    merge_h5_file = "%s/merge_%s/sed.h5" % (it_out_dir, posneg)
+    merge_h5_file = "%s/merge_%s/scores.h5" % (it_out_dir, posneg)
     merge_h5 = h5py.File(merge_h5_file, "r")
 
-    # read merged data
-    merge_si = merge_h5["si"][:]
-    merge_snps = [snp.decode("UTF-8") for snp in merge_h5["snp"]]
-    merge_gene = [gene.decode("UTF-8") for gene in merge_h5["gene"]]
-    merge_scores = {}
-    for ss in sed_stats:
-        merge_scores[ss] = merge_h5[ss][:]
-
-    # hash snps to row indexes
-    snp_ri = {}
-    for ri, si in enumerate(merge_si):
-        snp_ri.setdefault(merge_snps[si], []).append(ri)
+    # hash scored SNPs
+    all_snps = set([snp.decode("UTF-8") for snp in merge_h5["snp"]])
+    scored_snps = set([snp for snp in merge_h5[snp_stats[0]].keys()])
 
     # for each tissue VCF
     vcf_glob = "%s/*_%s.vcf" % (vcf_dir, posneg)
@@ -601,74 +612,64 @@ def split_scores(it_out_dir: str, posneg: str, vcf_dir: str, sed_stats):
         tissue_label = tissue_label.replace("_neg.vcf", "")
 
         # initialize HDF5 arrays
-        sed_snp = []
-        sed_chr = []
-        sed_pos = []
-        sed_ref = []
-        sed_alt = []
-        sed_gene = []
-        sed_snpi = []
-        sed_scores = {}
-        for ss in sed_stats:
-            sed_scores[ss] = []
+        snpg_snp = []
+        snpg_chr = []
+        snpg_pos = []
+        snpg_ref = []
+        snpg_alt = []
 
         # fill HDF5 arrays with ordered SNPs
-        si = 0
         for line in open(tissue_vcf_file):
             if not line.startswith("#"):
                 a = line.split()
                 chrm, pos, snp, ref, alt = a[:5]
 
                 # SNPs w/o genes disappear
-                if snp in snp_ri:
-                    sed_snp.append(snp)
-                    sed_chr.append(chrm)
-                    sed_pos.append(int(pos))
-                    sed_ref.append(ref)
-                    sed_alt.append(alt)
-
-                    for ri in snp_ri[snp]:
-                        sed_snpi.append(si)
-                        sed_gene.append(merge_gene[ri])
-                        for ss in sed_stats:
-                            sed_scores[ss].append(merge_scores[ss][ri])
-
-                    si += 1
+                if snp in all_snps:
+                    snpg_snp.append(snp)
+                    snpg_chr.append(chrm)
+                    snpg_pos.append(int(pos))
+                    snpg_ref.append(ref)
+                    snpg_alt.append(alt)
 
         # write tissue HDF5
         tissue_dir = "%s/%s_%s" % (it_out_dir, tissue_label, posneg)
         os.makedirs(tissue_dir, exist_ok=True)
-        with h5py.File("%s/sed.h5" % tissue_dir, "w") as tissue_h5:
+        with h5py.File("%s/scores.h5" % tissue_dir, "w") as tissue_h5:
             # write SNPs
-            tissue_h5.create_dataset("snp", data=np.array(sed_snp, "S"))
+            tissue_h5.create_dataset("snp", data=np.array(snpg_snp, "S"))
 
             # write chr
-            tissue_h5.create_dataset("chr", data=np.array(sed_chr, "S"))
+            tissue_h5.create_dataset("chr", data=np.array(snpg_chr, "S"))
 
             # write SNP pos
-            tissue_h5.create_dataset("pos", data=np.array(sed_pos, dtype="uint32"))
+            tissue_h5.create_dataset("pos", data=np.array(snpg_pos, dtype="uint32"))
 
             # write ref allele
-            tissue_h5.create_dataset("ref_allele", data=np.array(sed_ref, dtype="S"))
+            tissue_h5.create_dataset("ref_allele", data=np.array(snpg_ref, dtype="S"))
 
             # write alt allele
-            tissue_h5.create_dataset("alt_allele", data=np.array(sed_alt, dtype="S"))
-
-            # write SNP i
-            tissue_h5.create_dataset("si", data=np.array(sed_snpi))
-
-            # write gene
-            tissue_h5.create_dataset("gene", data=np.array(sed_gene, "S"))
+            tissue_h5.create_dataset("alt_allele", data=np.array(snpg_alt, dtype="S"))
 
             # write targets
             tissue_h5.create_dataset("target_ids", data=merge_h5["target_ids"])
             tissue_h5.create_dataset("target_labels", data=merge_h5["target_labels"])
 
-            # write sed stats
-            for ss in sed_stats:
-                tissue_h5.create_dataset(
-                    ss, data=np.array(sed_scores[ss], dtype="float16")
-                )
+            # write SNP stats
+            genes = set()
+            for ss in snp_stats:
+                tissue_h5.create_group(ss)
+                for snp in snpg_snp:
+                    if snp in scored_snps:
+                        tissue_h5[ss].create_group(snp)
+                        for gene in merge_h5[ss][snp].keys():
+                            tissue_h5[ss][snp].create_dataset(
+                                gene, data=merge_h5[ss][snp][gene][:]
+                            )
+                            genes.add(gene)
+
+            # write genes
+            tissue_h5.create_dataset("gene", data=np.array(sorted(genes), "S"))
 
     merge_h5.close()
 
diff --git a/src/westminster/scripts/westminster_gtexsed_folds.py b/src/westminster/scripts/westminster_gtexsed_folds.py
new file mode 100755
index 0000000..0f02186
--- /dev/null
+++ b/src/westminster/scripts/westminster_gtexsed_folds.py
@@ -0,0 +1,680 @@
+#!/usr/bin/env python
+# Copyright 2023 Calico LLC
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     https://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========================================================================
+from optparse import OptionParser, OptionGroup
+import glob
+import pickle
+import pdb
+import os
+
+import h5py
+import numpy as np
+
+import slurm
+
+from westminster.multi import nonzero_h5
+
+"""
+westminster_gtexg_folds.py
+
+Benchmark Baskerville model replicates on GTEx eQTL coefficient task.
+"""
+
+
+################################################################################
+# main
+################################################################################
+def main():
+    usage = "usage: %prog [options] <params_file> <data_dir>"
+    parser = OptionParser(usage)
+
+    # sed options
+    sed_options = OptionGroup(parser, "borzoi_sed.py options")
+    sed_options.add_option(
+        "-b",
+        dest="bedgraph",
+        default=False,
+        action="store_true",
+        help="Write ref/alt predictions as bedgraph [Default: %default]",
+    )
+    sed_options.add_option(
+        "-f",
+        dest="genome_fasta",
+        default="%s/data/hg38.fa" % os.environ["BASENJIDIR"],
+        help="Genome FASTA for sequences [Default: %default]",
+    )
+    sed_options.add_option(
+        "-g",
+        dest="genes_gtf",
+        default="%s/genes/gencode41/gencode41_basic_nort.gtf" % os.environ["HG38"],
+        help="GTF for gene definition [Default %default]",
+    )
+    sed_options.add_option(
+        "-o",
+        dest="out_dir",
+        default="sed",
+        help="Output directory for tables and plots [Default: %default]",
+    )
+    sed_options.add_option(
+        "--rc",
+        dest="rc",
+        default=False,
+        action="store_true",
+        help="Average forward and reverse complement predictions [Default: %default]",
+    )
+    sed_options.add_option(
+        "--shifts",
+        dest="shifts",
+        default="0",
+        type="str",
+        help="Ensemble prediction shifts [Default: %default]",
+    )
+    sed_options.add_option(
+        "--span",
+        dest="span",
+        default=False,
+        action="store_true",
+        help="Aggregate entire gene span [Default: %default]",
+    )
+    sed_options.add_option(
+        "--stats",
+        dest="sed_stats",
+        default="SED",
+        help="Comma-separated list of stats to save. [Default: %default]",
+    )
+    sed_options.add_option(
+        "-t",
+        dest="targets_file",
+        default=None,
+        type="str",
+        help="File specifying target indexes and labels in table format",
+    )
+    sed_options.add_option(
+        "-u", dest="untransform_old", default=False, action="store_true"
+    )
+    parser.add_option_group(sed_options)
+
+    # cross-fold
+    fold_options = OptionGroup(parser, "cross-fold options")
+    fold_options.add_option(
+        "-c",
+        dest="crosses",
+        default=1,
+        type="int",
+        help="Number of cross-fold rounds [Default:%default]",
+    )
+    fold_options.add_option(
+        "-d",
+        dest="data_head",
+        default=None,
+        type="int",
+        help="Index for dataset/head [Default: %default]",
+    )
+    fold_options.add_option(
+        "-e",
+        dest="conda_env",
+        default="tf210",
+        help="Anaconda environment [Default: %default]",
+    )
+    fold_options.add_option(
+        "--gtex",
+        dest="gtex_vcf_dir",
+        default="/home/drk/seqnn/data/gtex_fine/susie_pip90",
+    )
+    fold_options.add_option(
+        "--name",
+        dest="name",
+        default="gtex",
+        help="SLURM name prefix [Default: %default]",
+    )
+    fold_options.add_option(
+        "--max_proc",
+        dest="max_proc",
+        default=None,
+        type="int",
+        help="Maximum concurrent processes [Default: %default]",
+    )
+    fold_options.add_option(
+        "-p",
+        dest="processes",
+        default=None,
+        type="int",
+        help="Number of processes, passed by multi script. \
+            (Unused, but needs to appear as dummy.)",
+    )
+    fold_options.add_option(
+        "-q",
+        dest="queue",
+        default="geforce",
+        help="SLURM queue on which to run the jobs [Default: %default]",
+    )
+    parser.add_option_group(fold_options)
+
+    (options, args) = parser.parse_args()
+
+    if len(args) != 2:
+        parser.error("Must provide parameters file and cross-fold directory")
+    else:
+        params_file = args[0]
+        exp_dir = args[1]
+
+    #######################################################
+    # prep work
+
+    # count folds
+    num_folds = 0
+    fold0_dir = "%s/f%dc0" % (exp_dir, num_folds)
+    model_file = "%s/train/model_best.h5" % fold0_dir
+    if options.data_head is not None:
+        model_file = "%s/train/model%d_best.h5" % (fold0_dir, options.data_head)
+    while os.path.isfile(model_file):
+        num_folds += 1
+        fold0_dir = "%s/f%dc0" % (exp_dir, num_folds)
+        model_file = "%s/train/model_best.h5" % fold0_dir
+        if options.data_head is not None:
+            model_file = "%s/train/model%d_best.h5" % (fold0_dir, options.data_head)
+    print("Found %d folds" % num_folds)
+    if num_folds == 0:
+        exit(1)
+
+    # extract output subdirectory name
+    gtex_out_dir = options.out_dir
+
+    # split SNP stats
+    sed_stats = options.sed_stats.split(",")
+
+    # merge study/tissue variants
+    mpos_vcf_file = "%s/pos_merge.vcf" % options.gtex_vcf_dir
+    mneg_vcf_file = "%s/neg_merge.vcf" % options.gtex_vcf_dir
+
+    ################################################################
+    # SED
+
+    # SED command base
+    cmd_base = ". /home/drk/anaconda3/etc/profile.d/conda.sh;"
+    cmd_base += " conda activate %s;" % options.conda_env
+    cmd_base += " echo $HOSTNAME;"
+
+    jobs = []
+
+    for ci in range(options.crosses):
+        for fi in range(num_folds):
+            it_dir = "%s/f%dc%d" % (exp_dir, fi, ci)
+            name = "%s-f%dc%d" % (options.name, fi, ci)
+
+            # update output directory
+            it_out_dir = "%s/%s" % (it_dir, gtex_out_dir)
+            os.makedirs(it_out_dir, exist_ok=True)
+
+            # choose model
+            model_file = "%s/train/model_best.h5" % it_dir
+            if options.data_head is not None:
+                model_file = "%s/train/model%d_best.h5" % (it_dir, options.data_head)
+
+            ########################################
+            # negative jobs
+
+            # pickle options
+            options.out_dir = "%s/merge_neg" % it_out_dir
+            os.makedirs(options.out_dir, exist_ok=True)
+            options_pkl_file = "%s/options.pkl" % options.out_dir
+            options_pkl = open(options_pkl_file, "wb")
+            pickle.dump(options, options_pkl)
+            options_pkl.close()
+
+            # create base fold command
+            cmd_fold = "%s time borzoi_sed.py %s %s %s" % (
+                cmd_base,
+                options_pkl_file,
+                params_file,
+                model_file,
+            )
+
+            for pi in range(options.processes):
+                sed_file = "%s/job%d/sed.h5" % (options.out_dir, pi)
+                if not nonzero_h5(sed_file, sed_stats):
+                    cmd_job = "%s %s %d" % (cmd_fold, mneg_vcf_file, pi)
+                    j = slurm.Job(
+                        cmd_job,
+                        "%s_neg%d" % (name, pi),
+                        "%s/job%d.out" % (options.out_dir, pi),
+                        "%s/job%d.err" % (options.out_dir, pi),
+                        "%s/job%d.sb" % (options.out_dir, pi),
+                        queue=options.queue,
+                        gpu=1,
+                        cpu=2,
+                        mem=30000,
+                        time="7-0:0:0",
+                    )
+                    jobs.append(j)
+
+            ########################################
+            # positive jobs
+
+            # pickle options
+            options.out_dir = "%s/merge_pos" % it_out_dir
+            os.makedirs(options.out_dir, exist_ok=True)
+            options_pkl_file = "%s/options.pkl" % options.out_dir
+            options_pkl = open(options_pkl_file, "wb")
+            pickle.dump(options, options_pkl)
+            options_pkl.close()
+
+            # create base fold command
+            cmd_fold = "%s time borzoi_sed.py %s %s %s" % (
+                cmd_base,
+                options_pkl_file,
+                params_file,
+                model_file,
+            )
+
+            for pi in range(options.processes):
+                sed_file = "%s/job%d/sed.h5" % (options.out_dir, pi)
+                if not nonzero_h5(sed_file, sed_stats):
+                    cmd_job = "%s %s %d" % (cmd_fold, mpos_vcf_file, pi)
+                    j = slurm.Job(
+                        cmd_job,
+                        "%s_pos%d" % (name, pi),
+                        "%s/job%d.out" % (options.out_dir, pi),
+                        "%s/job%d.err" % (options.out_dir, pi),
+                        "%s/job%d.sb" % (options.out_dir, pi),
+                        queue=options.queue,
+                        gpu=1,
+                        cpu=2,
+                        mem=30000,
+                        time="7-0:0:0",
+                    )
+                    jobs.append(j)
+
+    slurm.multi_run(
+        jobs, max_proc=options.max_proc, verbose=True, launch_sleep=10, update_sleep=60
+    )
+
+    #######################################################
+    # collect output
+
+    for ci in range(options.crosses):
+        for fi in range(num_folds):
+            it_out_dir = "%s/f%dc%d/%s" % (exp_dir, fi, ci, gtex_out_dir)
+
+            # collect negatives
+            neg_out_dir = "%s/merge_neg" % it_out_dir
+            if not os.path.isfile("%s/sed.h5" % neg_out_dir):
+                collect_scores(neg_out_dir, options.processes, "sed.h5")
+
+            # collect positives
+            pos_out_dir = "%s/merge_pos" % it_out_dir
+            if not os.path.isfile("%s/sed.h5" % pos_out_dir):
+                collect_scores(pos_out_dir, options.processes, "sed.h5")
+
+    ################################################################
+    # split study/tissue variants
+
+    for ci in range(options.crosses):
+        for fi in range(num_folds):
+            it_out_dir = "%s/f%dc%d/%s" % (exp_dir, fi, ci, gtex_out_dir)
+            print(it_out_dir)
+
+            # split positives
+            split_scores(it_out_dir, "pos", options.gtex_vcf_dir, sed_stats)
+
+            # split negatives
+            split_scores(it_out_dir, "neg", options.gtex_vcf_dir, sed_stats)
+
+    ################################################################
+    # ensemble
+
+    ensemble_dir = "%s/ensemble" % exp_dir
+    if not os.path.isdir(ensemble_dir):
+        os.mkdir(ensemble_dir)
+
+    gtex_dir = "%s/%s" % (ensemble_dir, gtex_out_dir)
+    if not os.path.isdir(gtex_dir):
+        os.mkdir(gtex_dir)
+
+    for gtex_pos_vcf in glob.glob("%s/*_pos.vcf" % options.gtex_vcf_dir):
+        gtex_neg_vcf = gtex_pos_vcf.replace("_pos.", "_neg.")
+        pos_base = os.path.splitext(os.path.split(gtex_pos_vcf)[1])[0]
+        neg_base = os.path.splitext(os.path.split(gtex_neg_vcf)[1])[0]
+
+        # collect SED files
+        sed_pos_files = []
+        sed_neg_files = []
+        for ci in range(options.crosses):
+            for fi in range(num_folds):
+                it_dir = "%s/f%dc%d" % (exp_dir, fi, ci)
+                it_out_dir = "%s/%s" % (it_dir, gtex_out_dir)
+
+                sed_pos_file = "%s/%s/sed.h5" % (it_out_dir, pos_base)
+                sed_pos_files.append(sed_pos_file)
+
+                sed_neg_file = "%s/%s/sed.h5" % (it_out_dir, neg_base)
+                sed_neg_files.append(sed_neg_file)
+
+        # ensemble
+        ens_pos_dir = "%s/%s" % (gtex_dir, pos_base)
+        os.makedirs(ens_pos_dir, exist_ok=True)
+        ens_pos_file = "%s/sed.h5" % (ens_pos_dir)
+        if not os.path.isfile(ens_pos_file):
+            ensemble_h5(ens_pos_file, sed_pos_files, sed_stats)
+
+        ens_neg_dir = "%s/%s" % (gtex_dir, neg_base)
+        os.makedirs(ens_neg_dir, exist_ok=True)
+        ens_neg_file = "%s/sed.h5" % (ens_neg_dir)
+        if not os.path.isfile(ens_neg_file):
+            ensemble_h5(ens_neg_file, sed_neg_files, sed_stats)
+
+    ################################################################
+    # coefficient analysis
+
+    cmd_base = "westminster_gtexg_coef.py -g %s" % options.gtex_vcf_dir
+
+    jobs = []
+    for ci in range(options.crosses):
+        for fi in range(num_folds):
+            it_dir = "%s/f%dc%d" % (exp_dir, fi, ci)
+            it_out_dir = "%s/%s" % (it_dir, gtex_out_dir)
+
+            for sed_stat in sed_stats:
+                coef_out_dir = f"{it_out_dir}/coef-{sed_stat}"
+                cmd_coef = f"{cmd_base} -o {coef_out_dir} -s {sed_stat} {it_out_dir}"
+                j = slurm.Job(
+                    cmd_coef,
+                    "coef",
+                    f"{coef_out_dir}.out",
+                    f"{coef_out_dir}.err",
+                    queue="standard",
+                    cpu=2,
+                    mem=22000,
+                    time="12:0:0",
+                )
+                jobs.append(j)
+
+    # ensemble
+    it_out_dir = f"{exp_dir}/ensemble/{gtex_out_dir}"
+    for sed_stat in sed_stats:
+        coef_out_dir = f"{it_out_dir}/coef-{sed_stat}"
+        cmd_coef = f"{cmd_base} -o {coef_out_dir} -s {sed_stat} {it_out_dir}"
+        j = slurm.Job(
+            cmd_coef,
+            "coef",
+            f"{coef_out_dir}.out",
+            f"{coef_out_dir}.err",
+            queue="standard",
+            cpu=2,
+            mem=22000,
+            time="12:0:0",
+        )
+        jobs.append(j)
+
+    slurm.multi_run(jobs, verbose=True)
+
+
+def collect_scores(out_dir: str, num_jobs: int, h5f_name: str = "sad.h5"):
+    """Collect parallel SAD jobs' output into one HDF5.
+
+    Args:
+      out_dir (str): Output directory.
+      num_jobs (int): Number of jobs to combine results from.
+    """
+    # count variants
+    num_variants = 0
+    num_rows = 0
+    for pi in range(num_jobs):
+        # open job
+        job_h5_file = "%s/job%d/%s" % (out_dir, pi, h5f_name)
+        job_h5_open = h5py.File(job_h5_file, "r")
+        num_variants += len(job_h5_open["snp"])
+        num_rows += len(job_h5_open["si"])
+        job_h5_open.close()
+
+    # initialize final h5
+    final_h5_file = "%s/%s" % (out_dir, h5f_name)
+    final_h5_open = h5py.File(final_h5_file, "w")
+
+    # SNP stats
+    snp_stats = {}
+
+    job0_h5_file = "%s/job0/%s" % (out_dir, h5f_name)
+    job0_h5_open = h5py.File(job0_h5_file, "r")
+    for key in job0_h5_open.keys():
+        if key in ["target_ids", "target_labels"]:
+            # copy
+            final_h5_open.create_dataset(key, data=job0_h5_open[key])
+
+        elif key in ["snp", "chr", "pos", "ref_allele", "alt_allele", "gene"]:
+            snp_stats[key] = []
+
+        elif job0_h5_open[key].ndim == 1:
+            final_h5_open.create_dataset(
+                key, shape=(num_rows,), dtype=job0_h5_open[key].dtype
+            )
+
+        else:
+            num_targets = job0_h5_open[key].shape[1]
+            final_h5_open.create_dataset(
+                key, shape=(num_rows, num_targets), dtype=job0_h5_open[key].dtype
+            )
+
+    job0_h5_open.close()
+
+    # set values
+    vgi = 0
+    vi = 0
+    for pi in range(num_jobs):
+        # open job
+        job_h5_file = "%s/job%d/%s" % (out_dir, pi, h5f_name)
+        with h5py.File(job_h5_file, "r") as job_h5_open:
+            job_snps = len(job_h5_open["snp"])
+            job_rows = job_h5_open["si"].shape[0]
+
+            # append to final
+            for key in job_h5_open.keys():
+                try:
+                    if key in ["target_ids", "target_labels"]:
+                        # once is enough
+                        pass
+
+                    elif key in [
+                        "snp",
+                        "chr",
+                        "pos",
+                        "ref_allele",
+                        "alt_allele",
+                        "gene",
+                    ]:
+                        snp_stats[key] += list(job_h5_open[key])
+
+                    elif key == "si":
+                        # re-index SNPs
+                        final_h5_open[key][vgi : vgi + job_rows] = (
+                            job_h5_open[key][:] + vi
+                        )
+
+                    else:
+                        final_h5_open[key][vgi : vgi + job_rows] = job_h5_open[key]
+
+                except TypeError as e:
+                    print(e)
+                    print(
+                        f"{job_h5_file} {key} has the wrong shape. Remove this file and rerun"
+                    )
+                    exit()
+
+        vgi += job_rows
+        vi += job_snps
+
+    # create final SNP stat datasets
+    for key in snp_stats:
+        if key == "pos":
+            final_h5_open.create_dataset(key, data=np.array(snp_stats[key]))
+        else:
+            final_h5_open.create_dataset(key, data=np.array(snp_stats[key], dtype="S"))
+
+    final_h5_open.close()
+
+
+def ensemble_h5(ensemble_h5_file: str, scores_files: list, sed_stats: list):
+    """Ensemble scores from multiple files into a single file.
+
+    Args:
+      ensemble_h5_file (str): ensemble score HDF5.
+      scores_files ([str]): list of replicate score HDFs.
+      sed_stats ([str]): SED stats to average over folds.
+    """
+    # open ensemble
+    ensemble_h5 = h5py.File(ensemble_h5_file, "w")
+
+    # transfer non-SED keys
+    sed_shapes = {}
+    scores0_h5 = h5py.File(scores_files[0], "r")
+    for key in scores0_h5.keys():
+        if key not in sed_stats:
+            ensemble_h5.create_dataset(key, data=scores0_h5[key])
+        else:
+            sed_shapes[key] = scores0_h5[key].shape
+    scores0_h5.close()
+
+    # average stats
+    num_folds = len(scores_files)
+    for sed_stat in sed_stats:
+        # initialize ensemble array
+        sed_values = np.zeros(shape=sed_shapes[sed_stat], dtype="float32")
+
+        # read and add folds
+        for scores_file in scores_files:
+            with h5py.File(scores_file, "r") as scores_h5:
+                sed_values += scores_h5[sed_stat][:].astype("float32")
+
+        # normalize and downcast
+        sed_values /= num_folds
+        sed_values = sed_values.astype("float16")
+
+        # save
+        ensemble_h5.create_dataset(sed_stat, data=sed_values)
+
+    ensemble_h5.close()
+
+
+def split_scores(it_out_dir: str, posneg: str, vcf_dir: str, sed_stats):
+    """Split merged VCF predictions in HDF5 into tissue-specific
+    predictions in HDF5.
+
+    Args:
+      it_out_dir (str): output directory for iteration.
+      posneg (str): 'pos' or 'neg'.
+      vcf_dir (str): directory containing tissue-specific VCFs.
+      sed_stats ([str]]): list of SED stats.
+    """
+    merge_h5_file = "%s/merge_%s/sed.h5" % (it_out_dir, posneg)
+    merge_h5 = h5py.File(merge_h5_file, "r")
+
+    # read merged data
+    merge_si = merge_h5["si"][:]
+    merge_snps = [snp.decode("UTF-8") for snp in merge_h5["snp"]]
+    merge_gene = [gene.decode("UTF-8") for gene in merge_h5["gene"]]
+    merge_scores = {}
+    for ss in sed_stats:
+        merge_scores[ss] = merge_h5[ss][:]
+
+    # hash snps to row indexes
+    snp_ri = {}
+    for ri, si in enumerate(merge_si):
+        snp_ri.setdefault(merge_snps[si], []).append(ri)
+
+    # for each tissue VCF
+    vcf_glob = "%s/*_%s.vcf" % (vcf_dir, posneg)
+    for tissue_vcf_file in glob.glob(vcf_glob):
+        tissue_label = tissue_vcf_file.split("/")[-1]
+        tissue_label = tissue_label.replace("_pos.vcf", "")
+        tissue_label = tissue_label.replace("_neg.vcf", "")
+
+        # initialize HDF5 arrays
+        sed_snp = []
+        sed_chr = []
+        sed_pos = []
+        sed_ref = []
+        sed_alt = []
+        sed_gene = []
+        sed_snpi = []
+        sed_scores = {}
+        for ss in sed_stats:
+            sed_scores[ss] = []
+
+        # fill HDF5 arrays with ordered SNPs
+        si = 0
+        for line in open(tissue_vcf_file):
+            if not line.startswith("#"):
+                a = line.split()
+                chrm, pos, snp, ref, alt = a[:5]
+
+                # SNPs w/o genes disappear
+                if snp in snp_ri:
+                    sed_snp.append(snp)
+                    sed_chr.append(chrm)
+                    sed_pos.append(int(pos))
+                    sed_ref.append(ref)
+                    sed_alt.append(alt)
+
+                    for ri in snp_ri[snp]:
+                        sed_snpi.append(si)
+                        sed_gene.append(merge_gene[ri])
+                        for ss in sed_stats:
+                            sed_scores[ss].append(merge_scores[ss][ri])
+
+                    si += 1
+
+        # write tissue HDF5
+        tissue_dir = "%s/%s_%s" % (it_out_dir, tissue_label, posneg)
+        os.makedirs(tissue_dir, exist_ok=True)
+        with h5py.File("%s/sed.h5" % tissue_dir, "w") as tissue_h5:
+            # write SNPs
+            tissue_h5.create_dataset("snp", data=np.array(sed_snp, "S"))
+
+            # write chr
+            tissue_h5.create_dataset("chr", data=np.array(sed_chr, "S"))
+
+            # write SNP pos
+            tissue_h5.create_dataset("pos", data=np.array(sed_pos, dtype="uint32"))
+
+            # write ref allele
+            tissue_h5.create_dataset("ref_allele", data=np.array(sed_ref, dtype="S"))
+
+            # write alt allele
+            tissue_h5.create_dataset("alt_allele", data=np.array(sed_alt, dtype="S"))
+
+            # write SNP i
+            tissue_h5.create_dataset("si", data=np.array(sed_snpi))
+
+            # write gene
+            tissue_h5.create_dataset("gene", data=np.array(sed_gene, "S"))
+
+            # write targets
+            tissue_h5.create_dataset("target_ids", data=merge_h5["target_ids"])
+            tissue_h5.create_dataset("target_labels", data=merge_h5["target_labels"])
+
+            # write sed stats
+            for ss in sed_stats:
+                tissue_h5.create_dataset(
+                    ss, data=np.array(sed_scores[ss], dtype="float16")
+                )
+
+    merge_h5.close()
+
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == "__main__":
+    main()