From 2ec2e386bc2cf132e547689633bd498792ee204b Mon Sep 17 00:00:00 2001
From: William Rowell
Date: Fri, 19 Jul 2024 21:22:02 -0700
Subject: [PATCH] Added tests to compare md5sums for VCF or SAM records while ignoring variable headers.

---
 .../wdl_tests/calculate_sam_record_md5sum.wdl | 59 +++++++++++++++++++
 .../wdl_tests/calculate_vcf_record_md5sum.wdl | 59 +++++++++++++++++++
 2 files changed, 118 insertions(+)
 create mode 100644 src/wdlci/wdl_tests/calculate_sam_record_md5sum.wdl
 create mode 100644 src/wdlci/wdl_tests/calculate_vcf_record_md5sum.wdl

diff --git a/src/wdlci/wdl_tests/calculate_sam_record_md5sum.wdl b/src/wdlci/wdl_tests/calculate_sam_record_md5sum.wdl
new file mode 100644
index 0000000..ac83833
--- /dev/null
+++ b/src/wdlci/wdl_tests/calculate_sam_record_md5sum.wdl
@@ -0,0 +1,59 @@
+version 1.0
+
+# Compare md5sums of SAM records in input files, e.g., BAMs or CRAMs, while ignoring headers
+# Input type: SAM/BAM/CRAM
+#
+# Only the records printed by `samtools view` are hashed, so header differences between
+# runs do not affect the result. Records are hashed in file order.
+# NOTE(review): samtools may need the original reference to decode CRAM input; confirm it is
+# reachable in the test environment before using this test on CRAMs.
+
+task calculate_sam_record_md5sum {
+	input {
+		File current_run_output
+		File validated_output
+	}
+
+	# Size of both inputs plus 50 GB of headroom
+	Int disk_size = ceil(size(current_run_output, "GB") + size(validated_output, "GB") + 50)
+
+	command <<<
+		# pipefail: a samtools failure fails the task instead of hashing truncated output
+		set -euo pipefail
+
+		# Print an error message to stderr
+		err() {
+			message=$1
+
+			echo -e "[ERROR] $message" >&2
+		}
+
+		# Compare files
+		# Input paths are quoted so names containing spaces or glob characters are passed intact
+		echo "Comparing SAM record md5sums"
+		current_run_md5sum=$(samtools view "~{current_run_output}" | md5sum | cut -d ' ' -f 1)
+		validated_output_md5sum=$(samtools view "~{validated_output}" | md5sum | cut -d ' ' -f 1)
+
+		if [[ "$current_run_md5sum" != "$validated_output_md5sum" ]]; then
+			err "SAM record md5sums did not match:
+				Expected md5sum: [$validated_output_md5sum]
+				Current run md5sum: [$current_run_md5sum]"
+			exit 1
+		else
+			echo "SAM record md5sums matched for file [~{basename(validated_output)}]"
+		fi
+	>>>
+
+	# No outputs are declared; the command's exit status carries the result
+	output {
+	}
+
+	runtime {
+		docker: "dnastack/dnastack-wdl-ci-tools:0.0.1"
+		cpu: 1
+		memory: "3.75 GB"
+		disk: disk_size + " GB"
+		disks: "local-disk " + disk_size + " HDD"
+		preemptible: 1
+	}
+}
diff --git a/src/wdlci/wdl_tests/calculate_vcf_record_md5sum.wdl b/src/wdlci/wdl_tests/calculate_vcf_record_md5sum.wdl
new file mode 100644
index 0000000..071aed0
--- /dev/null
+++ b/src/wdlci/wdl_tests/calculate_vcf_record_md5sum.wdl
@@ -0,0 +1,59 @@
+version 1.0
+
+# Compare md5sums of VCF records in input files, ignoring headers
+# Input type: VCF/BCF/VCF.gz/BCF.gz
+#
+# `bcftools view -H` prints the records without the header, so only the records are
+# hashed and header differences between runs do not affect the result. Records are
+# hashed in file order.
+# NOTE(review): "BCF.gz" is kept from the original list; confirm the intended extension.
+
+task calculate_vcf_record_md5sum {
+	input {
+		File current_run_output
+		File validated_output
+	}
+
+	# Size of both inputs plus 50 GB of headroom
+	Int disk_size = ceil(size(current_run_output, "GB") + size(validated_output, "GB") + 50)
+
+	command <<<
+		# pipefail: a bcftools failure fails the task instead of hashing truncated output
+		set -euo pipefail
+
+		# Print an error message to stderr
+		err() {
+			message=$1
+
+			echo -e "[ERROR] $message" >&2
+		}
+
+		# Compare files
+		# Input paths are quoted so names containing spaces or glob characters are passed intact
+		echo "Comparing VCF record md5sums"
+		current_run_md5sum=$(bcftools view -H "~{current_run_output}" | md5sum | cut -d ' ' -f 1)
+		validated_output_md5sum=$(bcftools view -H "~{validated_output}" | md5sum | cut -d ' ' -f 1)
+
+		if [[ "$current_run_md5sum" != "$validated_output_md5sum" ]]; then
+			err "VCF record md5sums did not match:
+				Expected md5sum: [$validated_output_md5sum]
+				Current run md5sum: [$current_run_md5sum]"
+			exit 1
+		else
+			echo "VCF record md5sums matched for file [~{basename(validated_output)}]"
+		fi
+	>>>
+
+	# No outputs are declared; the command's exit status carries the result
+	output {
+	}
+
+	runtime {
+		docker: "dnastack/dnastack-wdl-ci-tools:0.0.1"
+		cpu: 1
+		memory: "3.75 GB"
+		disk: disk_size + " GB"
+		disks: "local-disk " + disk_size + " HDD"
+		preemptible: 1
+	}
+}