Merge pull request #2 from calico/snpclust

Snpclust
calico · Sep 13, 2023 · cf08b90
2 parents 8688ad4 + a2c5e3a
commit cf08b90
Show file tree

Hide file tree

Showing 10 changed files with 430 additions and 182 deletions.
diff --git a/src/baskerville/blocks.py b/src/baskerville/blocks.py
@@ -18,6 +18,7 @@
 
 from baskerville import layers
 
+
 ############################################################
 # Convolution
 ############################################################
@@ -892,7 +893,7 @@ def conv_tower(
     divisible_by=1,
     repeat=1,
     reprs=[],
-    **kwargs
+    **kwargs,
 ):
     """Construct a reducing convolution block.
 
@@ -943,7 +944,7 @@ def conv_tower_nac(
     divisible_by=1,
     repeat=1,
     reprs=[],
-    **kwargs
+    **kwargs,
 ):
     """Construct a reducing convolution block.
 
@@ -1000,7 +1001,7 @@ def res_tower(
     repeat=1,
     num_convs=2,
     reprs=[],
-    **kwargs
+    **kwargs,
 ):
     """Construct a reducing convolution block.
 
@@ -1087,7 +1088,7 @@ def convnext_tower(
     repeat=1,
     num_convs=2,
     reprs=[],
-    **kwargs
+    **kwargs,
 ):
     """Abc.
 
@@ -1129,7 +1130,7 @@ def _round(x):
             filters=rep_filters_int,
             kernel_size=kernel_size,
             dropout=dropout,
-            **kwargs
+            **kwargs,
         )
         current0 = current
 
@@ -1141,7 +1142,7 @@ def _round(x):
                 filters=rep_filters_int,
                 kernel_size=kernel_size,
                 dropout=dropout,
-                **kwargs
+                **kwargs,
             )
 
         # residual add
@@ -1187,7 +1188,7 @@ def transformer(
     qkv_width=1,
     mha_initializer="he_normal",
     kernel_initializer="he_normal",
-    **kwargs
+    **kwargs,
 ):
     """Construct a transformer block.
 
@@ -1255,7 +1256,7 @@ def transformer_split(
     qkv_width=1,
     mha_initializer="he_normal",
     kernel_initializer="he_normal",
-    **kwargs
+    **kwargs,
 ):
     """Construct a transformer block.
 
@@ -1393,7 +1394,7 @@ def transformer2(
     dropout=0.25,
     dense_expansion=2.0,
     qkv_width=1,
-    **kwargs
+    **kwargs,
 ):
     """Construct a transformer block, with length-wise pooling before
        returning to full length.
@@ -1416,7 +1417,7 @@ def transformer2(
         filters=min(4 * key_size, inputs.shape[-1]),
         kernel_size=3,
         pool_size=2,
-        **kwargs
+        **kwargs,
     )
 
     # layer norm
@@ -1517,7 +1518,7 @@ def squeeze_excite(
     additive=False,
     norm_type=None,
     bn_momentum=0.9,
-    **kwargs
+    **kwargs,
 ):
     return layers.SqueezeExcite(
         activation, additive, bottleneck_ratio, norm_type, bn_momentum
@@ -1545,7 +1546,7 @@ def dilated_dense(
     conv_type="standard",
     dropout=0,
     repeat=1,
-    **kwargs
+    **kwargs,
 ):
     """Construct a residual dilated dense block.
 
@@ -1570,7 +1571,7 @@ def dilated_dense(
             kernel_size=kernel_size,
             dilation_rate=int(np.round(dilation_rate)),
             conv_type=conv_type,
-            **kwargs
+            **kwargs,
         )
 
         # dense concat
@@ -1592,7 +1593,7 @@ def dilated_residual(
     conv_type="standard",
     norm_type=None,
     round=False,
-    **kwargs
+    **kwargs,
 ):
     """Construct a residual dilated convolution block.
 
@@ -1619,7 +1620,7 @@ def dilated_residual(
             conv_type=conv_type,
             norm_type=norm_type,
             norm_gamma="ones",
-            **kwargs
+            **kwargs,
         )
 
         # return
@@ -1629,7 +1630,7 @@ def dilated_residual(
             dropout=dropout,
             norm_type=norm_type,
             norm_gamma="zeros",
-            **kwargs
+            **kwargs,
         )
 
         # InitZero
@@ -1672,7 +1673,7 @@ def dilated_residual_nac(
             filters=filters,
             kernel_size=kernel_size,
             dilation_rate=int(np.round(dilation_rate)),
-            **kwargs
+            **kwargs,
         )
 
         # return
@@ -1697,7 +1698,7 @@ def dilated_residual_2d(
     dropout=0,
     repeat=1,
     symmetric=True,
-    **kwargs
+    **kwargs,
 ):
     """Construct a residual dilated convolution block."""
 
@@ -1717,7 +1718,7 @@ def dilated_residual_2d(
             kernel_size=kernel_size,
             dilation_rate=int(np.round(dilation_rate)),
             norm_gamma="ones",
-            **kwargs
+            **kwargs,
         )
 
         # return
@@ -1726,7 +1727,7 @@ def dilated_residual_2d(
             filters=rep_input.shape[-1],
             dropout=dropout,
             norm_gamma="zeros",
-            **kwargs
+            **kwargs,
         )
 
         # residual add
@@ -1818,7 +1819,7 @@ def dense_block(
     bn_momentum=0.99,
     norm_gamma=None,
     kernel_initializer="he_normal",
-    **kwargs
+    **kwargs,
 ):
     """Construct a single convolution block.
 
@@ -1909,7 +1910,7 @@ def dense_nac(
     bn_momentum=0.99,
     norm_gamma=None,
     kernel_initializer="he_normal",
-    **kwargs
+    **kwargs,
 ):
     """Construct a single convolution block.
 
@@ -1991,7 +1992,7 @@ def final(
     kernel_initializer="he_normal",
     l2_scale=0,
     l1_scale=0,
-    **kwargs
+    **kwargs,
 ):
     """Final simple transformation before comparison to targets.
 

diff --git a/src/baskerville/metrics.py b/src/baskerville/metrics.py
@@ -24,6 +24,7 @@
 for device in gpu_devices:
     tf.config.experimental.set_memory_growth(device, True)
 
+
 ################################################################################
 # Losses
 ################################################################################

diff --git a/src/baskerville/scripts/hound_eval_spec.py b/src/baskerville/scripts/hound_eval_spec.py
@@ -35,6 +35,7 @@
 Test the accuracy of a trained model on targets/predictions normalized across targets.
 """
 
+
 ################################################################################
 # main
 ################################################################################

diff --git a/src/baskerville/scripts/hound_snp.py b/src/baskerville/scripts/hound_snp.py
@@ -17,20 +17,28 @@
 import pdb
 import pickle
 import os
-from baskerville.snps import calculate_sad
+from baskerville.snps import score_snps
 
 """
 hound_snp.py
 
 Compute variant effect predictions for SNPs in a VCF file.
 """
 
+
 ################################################################################
 # main
 ################################################################################
 def main():
     usage = "usage: %prog [options] <params_file> <model_file> <vcf_file>"
     parser = OptionParser(usage)
+    parser.add_option(
+        "-c",
+        dest="cluster_snps_pct",
+        default=0,
+        type="float",
+        help="Cluster SNPs within a %% of the seq length to make a single ref pred [Default: %default]",
+    )
     parser.add_option(
         "-f",
         dest="genome_fasta",
@@ -66,8 +74,8 @@ def main():
     )
     parser.add_option(
         "--stats",
-        dest="sad_stats",
-        default="SAD",
+        dest="snp_stats",
+        default="logSAD",
         help="Comma-separated list of stats to save. [Default: %default]",
     )
     parser.add_option(
@@ -129,17 +137,20 @@ def main():
     else:
         parser.error("Must provide parameters and model files and QTL VCF file")
 
+    if options.targets_file is None:
+        parser.error("Must provide targets file")
+
     if not os.path.isdir(options.out_dir):
         os.mkdir(options.out_dir)
 
     options.shifts = [int(shift) for shift in options.shifts.split(",")]
-    options.sad_stats = options.sad_stats.split(",")
+    options.snp_stats = options.snp_stats.split(",")
 
     # calculate SAD scores:
     if options.processes is not None:
-        calculate_sad(params_file, model_file, vcf_file, worker_index, options)
+        score_snps(params_file, model_file, vcf_file, worker_index, options)
     else:
-        calculate_sad(params_file, model_file, vcf_file, 0, options)
+        score_snps(params_file, model_file, vcf_file, 0, options)
 
 
 ################################################################################

diff --git a/src/baskerville/scripts/hound_snp_slurm.py b/src/baskerville/scripts/hound_snp_slurm.py
@@ -33,6 +33,7 @@
 parallelized across a slurm cluster.
 """
 
+
 ################################################################################
 # main
 ################################################################################
@@ -41,6 +42,13 @@ def main():
     parser = OptionParser(usage)
 
     # snp
+    parser.add_option(
+        "-c",
+        dest="cluster_snps_pct",
+        default=0,
+        type="float",
+        help="Cluster SNPs within a %% of the seq length to make a single ref pred [Default: %default]",
+    )
     parser.add_option(
         "-f",
         dest="genome_fasta",
@@ -69,8 +77,8 @@ def main():
     )
     parser.add_option(
         "--stats",
-        dest="sad_stats",
-        default="SAD",
+        dest="snp_stats",
+        default="logSAD",
         help="Comma-separated list of stats to save. [Default: %default]",
     )
     parser.add_option(
@@ -80,12 +88,19 @@ def main():
         type="str",
         help="File specifying target indexes and labels in table format",
     )
+    parser.add_option(
+        "-u",
+        dest="untransform_old",
+        default=False,
+        action="store_true",
+        help="Untransform old models [Default: %default]",
+    )
 
     # multi
     parser.add_option(
         "-e",
         dest="conda_env",
-        default="tf210",
+        default="tf12",
         help="Anaconda environment [Default: %default]",
     )
     parser.add_option(

diff --git a/src/baskerville/scripts/hound_train.py b/src/baskerville/scripts/hound_train.py
@@ -163,7 +163,6 @@ def main():
         strategy = tf.distribute.MirroredStrategy()
 
         with strategy.scope():
-
             if not args.keras_fit:
                 # distribute data
                 for di in range(len(args.data_dirs)):

diff --git a/src/baskerville/seqnn.py b/src/baskerville/seqnn.py
@@ -524,7 +524,7 @@ def predict(
         stream: bool = False,
         step: int = 1,
         dtype: str = "float32",
-        **kwargs
+        **kwargs,
     ):
         """Predict targets for SeqDataset, with more options.
diff --git a/src/baskerville/scripts/hound_eval_spec.py b/src/baskerville/scripts/hound_eval_spec.py
@@ -35,6 +35,7 @@
 Test the accuracy of a trained model on targets/predictions normalized across targets.
 """
 
+
 ################################################################################
 # main
 ################################################################################