Rewrite the SplitVariants task command in TasksGenotypeBatch.wdl to call svtk only once #618

Merged · 15 commits · Jan 5, 2024
Changes from 10 commits
2 changes: 1 addition & 1 deletion inputs/values/dockers.json
@@ -13,7 +13,7 @@
"sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2023-07-28-v0.28.1-beta-e70dfbd7",
"sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-base-mini:5994670",
"sv_pipeline_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
"sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
"sv_pipeline_docker": "us.gcr.io/talkowski-sv-gnomad/kveerara/sv-pipeline:kv_split_variants_8d7ca52",
"sv_pipeline_hail_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
"sv_pipeline_updates_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
"sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
89 changes: 89 additions & 0 deletions src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
@@ -0,0 +1,89 @@
#!/usr/bin/env python
import argparse
import logging


# Function to process the bed file by checking for conditions
def process_bed_file(input_bed, n_per_split, bca=True):
    svtype_field = 4
    end_pos = 2
    start_pos = 1

    # Dictionary to store the conditions to be checked with matching prefixes
    condition_prefixes = {
        'gt5kb': {
            'condition': lambda curr_1: (curr_1[svtype_field] == 'DEL' or curr_1[svtype_field] == 'DUP') and (int(curr_1[end_pos]) - int(curr_1[start_pos]) >= 5000)},
        'lt5kb': {
            'condition': lambda curr_2: (curr_2[svtype_field] == 'DEL' or curr_2[svtype_field] == 'DUP') and (int(curr_2[end_pos]) - int(curr_2[start_pos]) < 5000)},
        'bca': {'condition': lambda curr_3: bca and (
            curr_3[svtype_field] != 'DEL' and curr_3[svtype_field] != 'DUP' and curr_3[svtype_field] != 'INS')},
        'ins': {'condition': lambda curr_4: bca and curr_4[svtype_field] == 'INS'}
    }

    current_lines = {prefix: [] for prefix in condition_prefixes.keys()}
    current_counts = {prefix: 0 for prefix in condition_prefixes.keys()}
    current_suffixes = {prefix: 'a' for prefix in condition_prefixes.keys()}

    # Open the bed file and process it line by line
    with open(input_bed, 'r') as infile:
        for line in infile:
            line = line.strip().split('\t')

            # Check which condition and prefix the current line matches, append it to the
            # corresponding list, and increment the counter for that list
            for prefix, conditions in condition_prefixes.items():
                if conditions['condition'](line):
                    current_lines[prefix].append('\t'.join(line))
                    current_counts[prefix] += 1

                    # Once a list reaches the maximum allowed number of lines, write it to a file
                    # named with the current suffix and start a new list with the next suffix
                    if current_counts[prefix] == n_per_split:
                        output_suffix = current_suffixes[prefix].rjust(6, 'a')
                        output_file = f"{prefix}.{output_suffix}.bed"
                        with open(output_file, 'w') as outfile:
                            outfile.write('\n'.join(current_lines[prefix]))

                        logging.info(f"File '{output_file}' written.")
                        current_lines[prefix] = []
                        current_counts[prefix] = 0
                        current_suffixes[prefix] = increment_suffix(current_suffixes[prefix])

    # Handle remaining lines after the loop
    for prefix, lines in current_lines.items():
        if lines:
            output_suffix = current_suffixes[prefix].rjust(6, 'a')
            output_file = f"{prefix}.{output_suffix}.bed"
            with open(output_file, 'w') as outfile:
                outfile.write('\n'.join(lines))

            logging.info(f"File '{output_file}' written.")


# Function to generate the next suffix in the file-naming pattern
def increment_suffix(suffix):
    # Treat the six-character suffix as a base-26 number over 'a'-'z' and add one,
    # carrying from the rightmost character, so the sequence runs
    # 'aaaaaa', 'aaaaab', ..., 'aaaaaz', 'aaaaba', ... (mirrors the suffixes of `split -a 6`)
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    if suffix == 'z' * 6:
        raise ValueError('All possible file suffixes generated.')
    chars = list(suffix.rjust(6, 'a'))
    for i in range(len(chars) - 1, -1, -1):
        index = alphabet.index(chars[i])
        if index < len(alphabet) - 1:
            chars[i] = alphabet[index + 1]
            return ''.join(chars)
        chars[i] = 'a'
    return ''.join(chars)


def main():
    # Surface the "File ... written." messages emitted by process_bed_file
    logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bed", help="Path to input bed file", required=True)
    parser.add_argument("--n", help="Number of variants per output file", type=int, required=True)
    parser.add_argument("--bca", default=False, action='store_true',
                        help="Also split BCA (non-DEL/DUP/INS) and INS variants into separate files")
    args = parser.parse_args()
    process_bed_file(args.bed, args.n, args.bca)


if __name__ == '__main__':
    main()
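
A quick, illustrative way to preview the file-naming suffixes the script produces (not part of this diff; it assumes split_variants.py is importable as split_variants, e.g. when run from the scripts directory):

# Illustrative only: preview the first few output suffixes used in names like gt5kb.aaaaaa.bed.
# Assumes split_variants.py is importable from the current working directory.
from split_variants import increment_suffix

suffix = 'a'
for _ in range(5):
    print(suffix.rjust(6, 'a'))  # aaaaaa, aaaaab, aaaaac, aaaaad, aaaaae
    suffix = increment_suffix(suffix)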
20 changes: 5 additions & 15 deletions wdl/TasksGenotypeBatch.wdl
@@ -28,22 +28,12 @@ task SplitVariants {
Array[File] ins_beds = glob("ins.*")
}
command <<<

set -euo pipefail
svtk vcf2bed ~{vcf} stdout \
| awk -v OFS="\t" '(($5=="DEL" || $5=="DUP") && $3-$2>=5000) {print $1, $2, $3, $4, $6, $5}' \
| split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - gt5kb.
svtk vcf2bed ~{vcf} stdout \
| awk -v OFS="\t" '(($5=="DEL" || $5=="DUP") && $3-$2<5000) {print $1, $2, $3, $4, $6, $5}' \
| split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - lt5kb.
if [ ~{generate_bca} == "true" ]; then
svtk vcf2bed ~{vcf} stdout \
| awk -v OFS="\t" '($5!="DEL" && $5!="DUP" && $5!="INS") {print $1, $2, $3, $4, $6, $5}' \
| split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - bca.
svtk vcf2bed ~{vcf} stdout \
| awk -v OFS="\t" '($5=="INS") {print $1, $2, $3, $4, $6, $5}' \
| split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - ins.
fi
svtk vcf2bed ~{vcf} bed_file.bed
python /opt/sv-pipeline/04_variant_resolution/scripts/split_variants.py \
--bed bed_file.bed \
~{"--n " + n_per_split} \
~{if generate_bca then "--bca" else ""}

>>>
runtime {
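
For a quick local check outside of Cromwell, the rewritten command reduces to a single svtk vcf2bed call followed by the splitter script; a minimal sketch under the assumptions that svtk is on PATH, that test.vcf and a local copy of split_variants.py are placeholder inputs, and that 5000 is an arbitrary choice for --n:

# Minimal local sketch of the rewritten SplitVariants command (not part of this diff).
# Assumes svtk is installed on PATH; test.vcf and split_variants.py are placeholder paths.
import subprocess

# One svtk call converts the VCF to a bed file...
subprocess.run(["svtk", "vcf2bed", "test.vcf", "bed_file.bed"], check=True)
# ...and the Python script performs the four-way split in a single pass over that bed file.
subprocess.run(
    ["python", "split_variants.py", "--bed", "bed_file.bed", "--n", "5000", "--bca"],
    check=True,
)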