From a6f602b938d2687c8ff2f9cc8ab8c3c9d5149a72 Mon Sep 17 00:00:00 2001 From: Kevin Wamae Date: Sat, 22 Apr 2023 18:11:00 +0300 Subject: [PATCH] generate summary statistics --- README.md | 52 ++++--- workflow/envs/variant-calling-stats.yaml | 11 ++ workflow/scripts/get_bam_stats.py | 61 +++++++++ workflow/scripts/get_raw_fastq_stats.py | 54 ++++++++ workflow/scripts/get_trimmed_fastq_stats.py | 128 ++++++++++++++++++ workflow/scripts/get_variant_calling_stats.sh | 15 ++ 6 files changed, 301 insertions(+), 20 deletions(-) create mode 100644 workflow/envs/variant-calling-stats.yaml create mode 100644 workflow/scripts/get_bam_stats.py create mode 100644 workflow/scripts/get_raw_fastq_stats.py create mode 100644 workflow/scripts/get_trimmed_fastq_stats.py create mode 100644 workflow/scripts/get_variant_calling_stats.sh diff --git a/README.md b/README.md index 7b86a8e..c1188bb 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,26 @@ -# **Snakemake workflow: variant calling using GATK4 best practices** +# **Snakemake workflow: variant calling using the Genome Analysis Toolkit (GATK) best practices** -![GitHub release (release name instead of tag name)](https://img.shields.io/github/v/release/kevin-wamae/gatk-variant-voyage) -![GitHub](https://img.shields.io/github/license/kevin-wamae/gatk-variant-voyage?color=red) +![GitHub](https://img.shields.io/github/license/kevin-wamae/variant-calling-with-Snakemake-and-GATK?color=red) [![conda](https://img.shields.io/badge/conda->=23.1.0-brightgreen.svg)](https://github.com/conda/conda) [![snakemake](https://img.shields.io/badge/snakemake-7.24.2-brightgreen.svg)](https://snakemake.readthedocs.io) -[![bedops](https://img.shields.io/badge/bedops-2.4.41-brightgreen.svg)](https://bedops.readthedocs.io/en/latest/) -[![trimmomatic](https://img.shields.io/badge/trimmomatic-0.39-brightgreen.svg)](http://www.usadellab.org/cms/?page=trimmomatic) -[![fastp](https://img.shields.io/badge/fastp-0.23.2-brightgreen.svg)](https://github.com/OpenGene/fastp) -[![bwa](https://img.shields.io/badge/bwa-0.7.17-brightgreen.svg)](https://github.com/lh3/bwa) -[![samtools](https://img.shields.io/badge/samtools-1.16.1-brightgreen.svg)](https://github.com/samtools/samtools) -[![gatk4](https://img.shields.io/badge/gatk4-4.4.0.0-brightgreen.svg)](https://github.com/broadinstitute/gatk) -[![samblaster](https://img.shields.io/badge/samblaster-0.1.26-brightgreen.svg)](https://github.com/GregoryFaust/samblaster) -[![bcftools](https://img.shields.io/badge/bcftools-1.17-brightgreen.svg)](https://github.com/samtools/bcftools) -[![r](https://img.shields.io/badge/R-4.2.3-brightgreen.svg)](https://anaconda.org/conda-forge/r-base) -[![ggplot2](https://img.shields.io/badge/ggplot2-3.4.2-brightgreen.svg)](https://ggplot2.tidyverse.org/) -[![snpeff](https://img.shields.io/badge/snpeff-5.1-brightgreen.svg)](http://pcingola.github.io/SnpEff/) -[![dask](https://img.shields.io/badge/dask-2023.3.2-brightgreen.svg)](https://www.dask.org/) + +[![trimmomatic](https://img.shields.io/badge/trimmomatic-0.39-blue.svg)](http://www.usadellab.org/cms/?page=trimmomatic) +[![fastp](https://img.shields.io/badge/fastp-0.23.2-blue.svg)](https://github.com/OpenGene/fastp) +[![bwa](https://img.shields.io/badge/bwa-0.7.17-blue.svg)](https://github.com/lh3/bwa) +[![samtools](https://img.shields.io/badge/samtools-1.16.1-blue.svg)](https://github.com/samtools/samtools) +[![gatk4](https://img.shields.io/badge/gatk4-4.4.0.0-blue.svg)](https://github.com/broadinstitute/gatk) +[![samblaster](https://img.shields.io/badge/samblaster-0.1.26-blue.svg)](https://github.com/GregoryFaust/samblaster) +[![bcftools](https://img.shields.io/badge/bcftools-1.17-blue.svg)](https://github.com/samtools/bcftools) +[![snpeff](https://img.shields.io/badge/snpeff-5.1-blue.svg)](http://pcingola.github.io/SnpEff/) + --- ## **Table of contents** -- [**Snakemake workflow: variant calling using GATK4 best practices**](#snakemake-workflow-variant-calling-using-gatk4-best-practices) +- [**Snakemake workflow: variant calling using the Genome Analysis Toolkit (GATK) best practices**](#snakemake-workflow-variant-calling-using-the-genome-analysis-toolkit-gatk-best-practices) - [**Table of contents**](#table-of-contents) - [**Motivation**](#motivation) - [**Pipeline sections**](#pipeline-sections) @@ -115,9 +112,9 @@ - [Linux](https://docs.conda.io/projects/conda/en/latest/user-guide/install/linux.html) - [MacOS](https://docs.conda.io/projects/conda/en/latest/user-guide/install/macos.html) - Clone this project using the following command in your terminal: - - `git clone https://github.com/kevin-wamae/gatk-variant-voyage.git` + - `git clone https://github.com/kevin-wamae/variant-calling-with-Snakemake-and-GATK.git` - Type the following command in your terminal to navigate into the cloned directory using the command below. This will be the root directory of the project: - - `cd gatk-variant-voyage` + - `cd variant-calling-with-Snakemake-and-GATK` - **_Note: All subsequent commands should be run from the root directory of this project. However, users can modify the scripts to their liking_** @@ -187,13 +184,28 @@ After navigating into the root directory of the project, run the analysis by exe 5. Once the analysis is complete, look through **output/** directory to view the results of the analysis -6. Finally, you can deactivate the conda environment by running the following command to exit this conda environment: +6. Summary statistics can be generated with stand alone scripts in the `workflow/scripts/` directory: + - To do this, create an conda environment with the following command: + - `conda env create --file workflow/envs/variant-calling-stats.yaml` + - activate the conda environment by running the following command: `conda activate variant-calling-stats` + - To generate a summary of the raw reads, run the following command and look through the `stats_1_raw_fastq.tsv` file in the project directory: + - `python workflow/scripts/get_raw_fastq_stats.py` + - To generate a summary of the trimmed reads, run the following command and look through the `stats_2_trimmed_fastq.tsv` file: + - `python workflow/scripts/get_trimmed_fastq_stats.py` + - To generate a summary of the mapped reads, run the following command and look through the `stats_3_mapped_reads.tsv` file: + - `python workflow/scripts/get_mapped_reads_stats.py` + - To generate a summary of the variants called, run the following command and look through the `stats_4_variant_calling.tsv` file: + - `bash workflow/scripts/get_variant_calling_stats.sh` + - Exit this conda environment by running the following command: + - `conda deactivate variant-calling-stats` + +7. Finally, you can deactivate the variant calling conda environment by running the following command: - `conda deactivate variant-calling-gatk` --- ## **Feedback and Issues** -Report any issues or bugs by openning an issue [here](https://github.com/kevin-wamae/gatk-variant-voyage/issues) or contact me via email at **wamaekevin[at]gmail.com** +Report any issues or bugs by openning an issue [here](https://github.com/kevin-wamae/variant-calling-with-Snakemake-and-GATK/issues) or contact me via email at **wamaekevin[at]gmail.com** diff --git a/workflow/envs/variant-calling-stats.yaml b/workflow/envs/variant-calling-stats.yaml new file mode 100644 index 0000000..ce62b1c --- /dev/null +++ b/workflow/envs/variant-calling-stats.yaml @@ -0,0 +1,11 @@ +name: variant-calling-stats +channels: + - bioconda + - conda-forge +dependencies: + - bioconda::seqkit=2.4.0 + - conda-forge::python=3.11.3 + - conda-forge::pandas=2.0.0 + - conda-forge::pyyaml=6.0 + - conda-forge::yaml=0.2.5 + - bioconda::bcftools=1.17 diff --git a/workflow/scripts/get_bam_stats.py b/workflow/scripts/get_bam_stats.py new file mode 100644 index 0000000..ad00d3b --- /dev/null +++ b/workflow/scripts/get_bam_stats.py @@ -0,0 +1,61 @@ +import os +import re +import yaml + + +def main(): + # Read input directory from the config file + with open("config/config.yaml", "r") as yaml_file: + config = yaml.safe_load(yaml_file) + + dir_path = os.path.join(config["map_qual_stats"]["dir"], "samtools/flagstat") + + # Create an empty list to store the results + results = [] + + # Loop through all files in the directory + for file in os.listdir(dir_path): + if file.endswith(".bam.flagstat.txt"): + with open(os.path.join(dir_path, file), "r") as f: + content = f.read() + + # Extract sample ID + sample_id = re.sub(r"\.bam\.flagstat\.txt$", "", file) + + # Extract total reads + total_reads = int( + re.search(r"^(\d+) \+ \d+ in total", content, re.MULTILINE).group(1) + ) + + # Extract total mapped reads + mapped_reads = int( + re.search(r"^(\d+) \+ \d+ mapped", content, re.MULTILINE).group(1) + ) + + # Calculate total unmapped reads + unmapped_reads = total_reads - mapped_reads + + # Extract total duplicates + duplicates = int( + re.search(r"^(\d+) \+ \d+ duplicates", content, re.MULTILINE).group( + 1 + ) + ) + + # Append the results to the list + results.append([sample_id, mapped_reads, duplicates, unmapped_reads]) + + # Save the results to a file + with open("stats_3_mapped_reads.tsv", "w") as f: + # Write the header + f.write( + "sample_id\treads_mapped_total\treads_mapped_duplicates\treads_mapped_unmapped\n" + ) + + # Write the results in a tab-separated format + for result in results: + f.write("\t".join(map(str, result)) + "\n") + + +if __name__ == "__main__": + main() diff --git a/workflow/scripts/get_raw_fastq_stats.py b/workflow/scripts/get_raw_fastq_stats.py new file mode 100644 index 0000000..95f1a31 --- /dev/null +++ b/workflow/scripts/get_raw_fastq_stats.py @@ -0,0 +1,54 @@ +import os +import re +import pandas as pd +from subprocess import Popen, PIPE +import yaml + + +def get_stats(fastq_file): + process = Popen( + f"seqkit stats {fastq_file}", stdout=PIPE, stderr=PIPE, shell=True, text=True + ) + header, data = process.communicate()[0].strip().split("\n") + stats = { + k: v.replace(",", "") + for k, v in zip(header.split(), data.split()) + if k in ["num_seqs", "min_len", "avg_len", "max_len"] + } + return stats + + +# Read input directory from the config file +with open("config/config.yaml", "r") as yaml_file: + config = yaml.safe_load(yaml_file) + +input_dir = config["input"]["fastq"] + +output_file = "stats_1_raw_fastq.tsv" + +stats_list = [ + { + **get_stats(os.path.join(input_dir, f)), + "id": re.sub(r"_R?[12]\.fastq\.gz", "", f), + } + for f in os.listdir(input_dir) + if f.endswith(("_R1.fastq.gz", "_R2.fastq.gz", "_1.fastq.gz", "_2.fastq.gz")) +] + +df = pd.DataFrame(stats_list).astype( + {"num_seqs": int, "min_len": int, "avg_len": float, "max_len": int} +) +output_df = ( + df.groupby("id") + .agg({"num_seqs": "sum", "min_len": "min", "avg_len": "mean", "max_len": "max"}) + .reset_index() +) +output_df.columns = [ + "id", + "reads_raw_total", + "reads_raw_len_min", + "reads_raw_len_avg", + "reads_raw_len_max", +] + +output_df.to_csv(output_file, sep="\t", index=False) diff --git a/workflow/scripts/get_trimmed_fastq_stats.py b/workflow/scripts/get_trimmed_fastq_stats.py new file mode 100644 index 0000000..48281a0 --- /dev/null +++ b/workflow/scripts/get_trimmed_fastq_stats.py @@ -0,0 +1,128 @@ +import os +import re +import pandas as pd +from subprocess import Popen, PIPE +import yaml + + +def get_stats(fastq_file, prefix): + process = Popen( + f"seqkit stats {fastq_file}", stdout=PIPE, stderr=PIPE, shell=True, text=True + ) + header, data = process.communicate()[0].strip().split("\n") + stats = { + f"{prefix}_{k}": v.replace(",", "") + for k, v in zip(header.split(), data.split()) + if k in ["num_seqs", "min_len", "avg_len", "max_len"] + } + return stats + + +def main(): + config_file = "config/config.yaml" + with open(config_file, "r") as yaml_file: + config = yaml.safe_load(yaml_file) + + output_dir = config["trim_reads"]["dir"] + + paired_dir = os.path.join(output_dir, "paired") + unpaired_dir = os.path.join(output_dir, "unpaired") + output_file = "stats_2_trimmed_fastq.tsv" + + paired_stats_list = [] + unpaired_stats_list = [] + + for f in os.listdir(paired_dir): + if f.endswith( + ( + "_R1.trimmed.fastq.gz", + "_R2.trimmed.fastq.gz", + "_1.trimmed.fastq.gz", + "_2.trimmed.fastq.gz", + ) + ): + paired_stats_list.append( + { + **get_stats(os.path.join(paired_dir, f), "trimmed"), + "id": re.sub(r"_R?[12].trimmed.fastq.gz", "", f), + } + ) + + for f in os.listdir(unpaired_dir): + if f.endswith( + ( + "_R1.unpaired.fastq.gz", + "_R2.unpaired.fastq.gz", + "_1.unpaired.fastq.gz", + "_2.unpaired.fastq.gz", + ) + ): + unpaired_stats_list.append( + { + **get_stats(os.path.join(unpaired_dir, f), "unpaired"), + "id": re.sub(r"_R?[12].unpaired.fastq.gz", "", f), + } + ) + + df_paired = pd.DataFrame(paired_stats_list).astype( + { + "trimmed_num_seqs": int, + "trimmed_min_len": int, + "trimmed_avg_len": float, + "trimmed_max_len": int, + } + ) + df_unpaired = pd.DataFrame(unpaired_stats_list).astype( + { + "unpaired_num_seqs": int, + "unpaired_min_len": int, + "unpaired_avg_len": float, + "unpaired_max_len": int, + } + ) + + output_df_paired = ( + df_paired.groupby("id") + .agg( + { + "trimmed_num_seqs": "sum", + "trimmed_min_len": "min", + "trimmed_avg_len": "mean", + "trimmed_max_len": "max", + } + ) + .reset_index() + ) + output_df_unpaired = ( + df_unpaired.groupby("id") + .agg( + { + "unpaired_num_seqs": "sum", + "unpaired_min_len": "min", + "unpaired_avg_len": "mean", + "unpaired_max_len": "max", + } + ) + .reset_index() + ) + + output_df = output_df_paired.merge(output_df_unpaired, on="id") + + column_mapping = { + "trimmed_num_seqs": "reads_trim_total", + "trimmed_min_len": "reads_trim_len_min", + "trimmed_avg_len": "reads_trim_len_avg", + "trimmed_max_len": "reads_trim_len_max", + "unpaired_num_seqs": "reads_unpaired_total", + "unpaired_min_len": "reads_unpaired_len_min", + "unpaired_avg_len": "reads_unpaired_len_avg", + "unpaired_max_len": "reads_unpaired_len_max", + } + + output_df = output_df.rename(columns=column_mapping) + + output_df.to_csv(output_file, sep="\t", index=False) + + +if __name__ == "__main__": + main() diff --git a/workflow/scripts/get_variant_calling_stats.sh b/workflow/scripts/get_variant_calling_stats.sh new file mode 100644 index 0000000..89fb62e --- /dev/null +++ b/workflow/scripts/get_variant_calling_stats.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# Set the VCF file paths +unfiltered_vcf="output/6_variant_filtering/c_gatk_variants_merged/unfiltered.vcf.gz" +filtered_vcf="output/6_variant_filtering/c_gatk_variants_merged/filtered.vcf.gz" + +# Extract the counts of SNPs and indels for each VCF file +snps_raw=$(bcftools view -v snps "$unfiltered_vcf" | grep -v "^#" | wc -l) +snps_pass=$(bcftools view -v snps "$filtered_vcf" | grep -v "^#" | wc -l) +indels_raw=$(bcftools view -v indels "$unfiltered_vcf" | grep -v "^#" | wc -l) +indels_pass=$(bcftools view -v indels "$filtered_vcf" | grep -v "^#" | wc -l) + +# Save the results to a file +echo -e "snps_raw\tsnps_pass\tindels_raw\tindels_pass" >stats_4_variant_calling.tsv +echo -e "${snps_raw}\t${snps_pass}\t${indels_raw}\t${indels_pass}" >>stats_4_variant_calling.tsv