Rewrite the SplitVariants task command in TasksGenotypeBatch.wdl to call svtk only once #618

Merged · 15 commits · Jan 5, 2024
Changes from 10 commits
2 changes: 1 addition & 1 deletion inputs/values/dockers.json
@@ -13,7 +13,7 @@
"sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2023-07-28-v0.28.1-beta-e70dfbd7",
"sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-base-mini:5994670",
"sv_pipeline_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
"sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
"sv_pipeline_docker": "us.gcr.io/talkowski-sv-gnomad/kveerara/sv-pipeline:kv_split_variants_8d7ca52",
"sv_pipeline_hail_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
"sv_pipeline_updates_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
"sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
89 changes: 89 additions & 0 deletions src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
@@ -0,0 +1,89 @@
#!/usr/bin/env python
import argparse
import logging


# Function to process the bed file by checking for conditions
def process_bed_file(input_bed, n_per_split, bca=True):
    svtype_field = 4
    end_pos = 2
    start_pos = 1

    # Dictionary to store the conditions to be checked with matching prefixes
    condition_prefixes = {
        'gt5kb': {
            'condition': lambda curr_1: (curr_1[svtype_field] == 'DEL' or curr_1[svtype_field] == 'DUP') and (int(curr_1[end_pos]) - int(curr_1[start_pos]) >= 5000)},
        'lt5kb': {
            'condition': lambda curr_2: (curr_2[svtype_field] == 'DEL' or curr_2[svtype_field] == 'DUP') and (int(curr_2[end_pos]) - int(curr_2[start_pos]) < 5000)},
        'bca': {'condition': lambda curr_3: bca and (
            curr_3[svtype_field] != 'DEL' and curr_3[svtype_field] != 'DUP' and curr_3[svtype_field] != 'INS')},
        'ins': {'condition': lambda curr_4: bca and curr_4[svtype_field] == 'INS'}
    }

    current_lines = {prefix: [] for prefix in condition_prefixes.keys()}
    current_counts = {prefix: 0 for prefix in condition_prefixes.keys()}
    current_suffixes = {prefix: 'a' for prefix in condition_prefixes.keys()}

    # Open the bed file and process it line by line
    with open(input_bed, 'r') as infile:
        for line in infile:
            line = line.strip().split('\t')

            # Check which condition and prefix the current line matches, append it to the
            # corresponding list, and increment the counter for that list
            for prefix, conditions in condition_prefixes.items():
                if conditions['condition'](line):
                    current_lines[prefix].append('\t'.join(line))
                    current_counts[prefix] += 1

                    # Once a list reaches the maximum allowed number of lines, write it to a file
                    # named with the current suffix and start a new list with the next suffix
                    if current_counts[prefix] == n_per_split:
                        output_suffix = current_suffixes[prefix].rjust(6, 'a')
                        output_file = f"{prefix}.{output_suffix}.bed"
                        with open(output_file, 'w') as outfile:
                            outfile.write('\n'.join(current_lines[prefix]))

                        logging.info(f"File '{output_file}' written.")
                        current_lines[prefix] = []
                        current_counts[prefix] = 0
                        current_suffixes[prefix] = increment_suffix(current_suffixes[prefix])

    # Handle remaining lines after the loop
    for prefix, lines in current_lines.items():
        if lines:
            output_suffix = current_suffixes[prefix].rjust(6, 'a')
            output_file = f"{prefix}.{output_suffix}.bed"
            with open(output_file, 'w') as outfile:
                outfile.write('\n'.join(lines))

            logging.info(f"File '{output_file}' written.")


# Function to generate the next suffix in the file-naming pattern
def increment_suffix(suffix):
    # Treat the six-character suffix as a base-26 number over 'a'-'z' and add one,
    # carrying from the rightmost character, so the sequence runs
    # 'aaaaaa', 'aaaaab', ..., 'aaaaaz', 'aaaaba', ... (mirrors the suffixes of `split -a 6`)
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    if suffix == 'z' * 6:
        raise ValueError('All possible file suffixes generated.')
    chars = list(suffix.rjust(6, 'a'))
    for i in range(len(chars) - 1, -1, -1):
        index = alphabet.index(chars[i])
        if index < len(alphabet) - 1:
            chars[i] = alphabet[index + 1]
            return ''.join(chars)
        chars[i] = 'a'
    return ''.join(chars)


def main():
    # Surface the "File ... written." messages emitted by process_bed_file
    logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bed", help="Path to input bed file", required=True)
    parser.add_argument("--n", help="Number of variants per output file", type=int, required=True)
    parser.add_argument("--bca", default=False, action='store_true',
                        help="Also split BCA (non-DEL/DUP/INS) and INS variants into separate files")
    args = parser.parse_args()
    process_bed_file(args.bed, args.n, args.bca)


if __name__ == '__main__':
    main()
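
A quick, illustrative way to preview the file-naming suffixes the script produces (not part of this diff; it assumes split_variants.py is importable as split_variants, e.g. when run from the scripts directory):

# Illustrative only: preview the first few output suffixes used in names like gt5kb.aaaaaa.bed.
# Assumes split_variants.py is importable from the current working directory.
from split_variants import increment_suffix

suffix = 'a'
for _ in range(5):
    print(suffix.rjust(6, 'a'))  # aaaaaa, aaaaab, aaaaac, aaaaad, aaaaae
    suffix = increment_suffix(suffix)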
20 changes: 5 additions & 15 deletions wdl/TasksGenotypeBatch.wdl
@@ -28,22 +28,12 @@ task SplitVariants {
Array[File] ins_beds = glob("ins.*")
}
command <<<

set -euo pipefail
svtk vcf2bed ~{vcf} stdout \
| awk -v OFS="\t" '(($5=="DEL" || $5=="DUP") && $3-$2>=5000) {print $1, $2, $3, $4, $6, $5}' \
| split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - gt5kb.
svtk vcf2bed ~{vcf} stdout \
| awk -v OFS="\t" '(($5=="DEL" || $5=="DUP") && $3-$2<5000) {print $1, $2, $3, $4, $6, $5}' \
| split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - lt5kb.
if [ ~{generate_bca} == "true" ]; then
svtk vcf2bed ~{vcf} stdout \
| awk -v OFS="\t" '($5!="DEL" && $5!="DUP" && $5!="INS") {print $1, $2, $3, $4, $6, $5}' \
| split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - bca.
svtk vcf2bed ~{vcf} stdout \
| awk -v OFS="\t" '($5=="INS") {print $1, $2, $3, $4, $6, $5}' \
| split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - ins.
fi
svtk vcf2bed ~{vcf} bed_file.bed
python /opt/sv-pipeline/04_variant_resolution/scripts/split_variants.py \
--bed bed_file.bed \
~{"--n " + n_per_split} \
~{if generate_bca then "--bca" else ""}

>>>
runtime {
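
For a quick local check outside of Cromwell, the rewritten command reduces to a single svtk vcf2bed call followed by the splitter script; a minimal sketch under the assumptions that svtk is on PATH, that test.vcf and a local copy of split_variants.py are placeholder inputs, and that 5000 is an arbitrary choice for --n:

# Minimal local sketch of the rewritten SplitVariants command (not part of this diff).
# Assumes svtk is installed on PATH; test.vcf and split_variants.py are placeholder paths.
import subprocess

# One svtk call converts the VCF to a bed file...
subprocess.run(["svtk", "vcf2bed", "test.vcf", "bed_file.bed"], check=True)
# ...and the Python script performs the four-way split in a single pass over that bed file.
subprocess.run(
    ["python", "split_variants.py", "--bed", "bed_file.bed", "--n", "5000", "--bca"],
    check=True,
)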