-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
31e199c
commit a6f602b
Showing
6 changed files
with
301 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Conda environment for the variant-calling statistics scripts.
# bioconda supplies the bioinformatics tools; conda-forge the general stack.
name: variant-calling-stats
channels:
  - bioconda
  - conda-forge
dependencies:
  - bioconda::seqkit=2.4.0      # FASTQ statistics (`seqkit stats`) used by the read-stats scripts
  - conda-forge::python=3.11.3
  - conda-forge::pandas=2.0.0   # tabular aggregation of per-file read statistics
  - conda-forge::pyyaml=6.0     # reading config/config.yaml
  - conda-forge::yaml=0.2.5     # libyaml C backend for pyyaml
  - bioconda::bcftools=1.17     # SNP/indel counting from VCF files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
import os | ||
import re | ||
import yaml | ||
|
||
|
||
def main(): | ||
# Read input directory from the config file | ||
with open("config/config.yaml", "r") as yaml_file: | ||
config = yaml.safe_load(yaml_file) | ||
|
||
dir_path = os.path.join(config["map_qual_stats"]["dir"], "samtools/flagstat") | ||
|
||
# Create an empty list to store the results | ||
results = [] | ||
|
||
# Loop through all files in the directory | ||
for file in os.listdir(dir_path): | ||
if file.endswith(".bam.flagstat.txt"): | ||
with open(os.path.join(dir_path, file), "r") as f: | ||
content = f.read() | ||
|
||
# Extract sample ID | ||
sample_id = re.sub(r"\.bam\.flagstat\.txt$", "", file) | ||
|
||
# Extract total reads | ||
total_reads = int( | ||
re.search(r"^(\d+) \+ \d+ in total", content, re.MULTILINE).group(1) | ||
) | ||
|
||
# Extract total mapped reads | ||
mapped_reads = int( | ||
re.search(r"^(\d+) \+ \d+ mapped", content, re.MULTILINE).group(1) | ||
) | ||
|
||
# Calculate total unmapped reads | ||
unmapped_reads = total_reads - mapped_reads | ||
|
||
# Extract total duplicates | ||
duplicates = int( | ||
re.search(r"^(\d+) \+ \d+ duplicates", content, re.MULTILINE).group( | ||
1 | ||
) | ||
) | ||
|
||
# Append the results to the list | ||
results.append([sample_id, mapped_reads, duplicates, unmapped_reads]) | ||
|
||
# Save the results to a file | ||
with open("stats_3_mapped_reads.tsv", "w") as f: | ||
# Write the header | ||
f.write( | ||
"sample_id\treads_mapped_total\treads_mapped_duplicates\treads_mapped_unmapped\n" | ||
) | ||
|
||
# Write the results in a tab-separated format | ||
for result in results: | ||
f.write("\t".join(map(str, result)) + "\n") | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
import os | ||
import re | ||
import pandas as pd | ||
from subprocess import Popen, PIPE | ||
import yaml | ||
|
||
|
||
def get_stats(fastq_file):
    """Run ``seqkit stats`` on *fastq_file* and return selected fields.

    Returns a dict with ``num_seqs``, ``min_len``, ``avg_len`` and
    ``max_len`` as strings (thousands separators stripped).

    Raises RuntimeError when seqkit exits non-zero, instead of failing
    later with a confusing unpack error on empty output.
    """
    # Pass the command as an argument list (no shell): filenames with
    # spaces or shell metacharacters cannot break or inject a command.
    process = Popen(
        ["seqkit", "stats", fastq_file], stdout=PIPE, stderr=PIPE, text=True
    )
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        raise RuntimeError(f"seqkit stats failed for {fastq_file}: {stderr.strip()}")
    header, data = stdout.strip().split("\n")
    return {
        k: v.replace(",", "")
        for k, v in zip(header.split(), data.split())
        if k in ["num_seqs", "min_len", "avg_len", "max_len"]
    }
|
||
|
||
# Load the pipeline configuration to locate the raw FASTQ directory.
with open("config/config.yaml", "r") as yaml_file:
    config = yaml.safe_load(yaml_file)

input_dir = config["input"]["fastq"]

output_file = "stats_1_raw_fastq.tsv"

# Collect per-file seqkit statistics for every recognised FASTQ file,
# tagging each record with a sample id derived from the filename.
fastq_suffixes = ("_R1.fastq.gz", "_R2.fastq.gz", "_1.fastq.gz", "_2.fastq.gz")
stats_list = []
for fname in os.listdir(input_dir):
    if not fname.endswith(fastq_suffixes):
        continue
    record = get_stats(os.path.join(input_dir, fname))
    record["id"] = re.sub(r"_R?[12]\.fastq\.gz", "", fname)
    stats_list.append(record)

# seqkit reports numbers as strings; cast before aggregating.
df = pd.DataFrame(stats_list).astype(
    {"num_seqs": int, "min_len": int, "avg_len": float, "max_len": int}
)

# Collapse the R1/R2 rows of each sample into one summary row.
grouped = df.groupby("id").agg(
    {"num_seqs": "sum", "min_len": "min", "avg_len": "mean", "max_len": "max"}
)
output_df = grouped.reset_index()
output_df.columns = [
    "id",
    "reads_raw_total",
    "reads_raw_len_min",
    "reads_raw_len_avg",
    "reads_raw_len_max",
]

output_df.to_csv(output_file, sep="\t", index=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
import os | ||
import re | ||
import pandas as pd | ||
from subprocess import Popen, PIPE | ||
import yaml | ||
|
||
|
||
def get_stats(fastq_file, prefix):
    """Run ``seqkit stats`` on *fastq_file* and return selected fields.

    Keys are prefixed with *prefix* (e.g. ``trimmed_num_seqs``) so the
    paired and unpaired statistics can coexist in one DataFrame. Values
    are strings with thousands separators removed.

    Raises RuntimeError when seqkit exits non-zero, instead of failing
    later with a confusing unpack error on empty output.
    """
    # Pass the command as an argument list (no shell): filenames with
    # spaces or shell metacharacters cannot break or inject a command.
    process = Popen(
        ["seqkit", "stats", fastq_file], stdout=PIPE, stderr=PIPE, text=True
    )
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        raise RuntimeError(f"seqkit stats failed for {fastq_file}: {stderr.strip()}")
    header, data = stdout.strip().split("\n")
    return {
        f"{prefix}_{k}": v.replace(",", "")
        for k, v in zip(header.split(), data.split())
        if k in ["num_seqs", "min_len", "avg_len", "max_len"]
    }
|
||
|
||
def main(): | ||
config_file = "config/config.yaml" | ||
with open(config_file, "r") as yaml_file: | ||
config = yaml.safe_load(yaml_file) | ||
|
||
output_dir = config["trim_reads"]["dir"] | ||
|
||
paired_dir = os.path.join(output_dir, "paired") | ||
unpaired_dir = os.path.join(output_dir, "unpaired") | ||
output_file = "stats_2_trimmed_fastq.tsv" | ||
|
||
paired_stats_list = [] | ||
unpaired_stats_list = [] | ||
|
||
for f in os.listdir(paired_dir): | ||
if f.endswith( | ||
( | ||
"_R1.trimmed.fastq.gz", | ||
"_R2.trimmed.fastq.gz", | ||
"_1.trimmed.fastq.gz", | ||
"_2.trimmed.fastq.gz", | ||
) | ||
): | ||
paired_stats_list.append( | ||
{ | ||
**get_stats(os.path.join(paired_dir, f), "trimmed"), | ||
"id": re.sub(r"_R?[12].trimmed.fastq.gz", "", f), | ||
} | ||
) | ||
|
||
for f in os.listdir(unpaired_dir): | ||
if f.endswith( | ||
( | ||
"_R1.unpaired.fastq.gz", | ||
"_R2.unpaired.fastq.gz", | ||
"_1.unpaired.fastq.gz", | ||
"_2.unpaired.fastq.gz", | ||
) | ||
): | ||
unpaired_stats_list.append( | ||
{ | ||
**get_stats(os.path.join(unpaired_dir, f), "unpaired"), | ||
"id": re.sub(r"_R?[12].unpaired.fastq.gz", "", f), | ||
} | ||
) | ||
|
||
df_paired = pd.DataFrame(paired_stats_list).astype( | ||
{ | ||
"trimmed_num_seqs": int, | ||
"trimmed_min_len": int, | ||
"trimmed_avg_len": float, | ||
"trimmed_max_len": int, | ||
} | ||
) | ||
df_unpaired = pd.DataFrame(unpaired_stats_list).astype( | ||
{ | ||
"unpaired_num_seqs": int, | ||
"unpaired_min_len": int, | ||
"unpaired_avg_len": float, | ||
"unpaired_max_len": int, | ||
} | ||
) | ||
|
||
output_df_paired = ( | ||
df_paired.groupby("id") | ||
.agg( | ||
{ | ||
"trimmed_num_seqs": "sum", | ||
"trimmed_min_len": "min", | ||
"trimmed_avg_len": "mean", | ||
"trimmed_max_len": "max", | ||
} | ||
) | ||
.reset_index() | ||
) | ||
output_df_unpaired = ( | ||
df_unpaired.groupby("id") | ||
.agg( | ||
{ | ||
"unpaired_num_seqs": "sum", | ||
"unpaired_min_len": "min", | ||
"unpaired_avg_len": "mean", | ||
"unpaired_max_len": "max", | ||
} | ||
) | ||
.reset_index() | ||
) | ||
|
||
output_df = output_df_paired.merge(output_df_unpaired, on="id") | ||
|
||
column_mapping = { | ||
"trimmed_num_seqs": "reads_trim_total", | ||
"trimmed_min_len": "reads_trim_len_min", | ||
"trimmed_avg_len": "reads_trim_len_avg", | ||
"trimmed_max_len": "reads_trim_len_max", | ||
"unpaired_num_seqs": "reads_unpaired_total", | ||
"unpaired_min_len": "reads_unpaired_len_min", | ||
"unpaired_avg_len": "reads_unpaired_len_avg", | ||
"unpaired_max_len": "reads_unpaired_len_max", | ||
} | ||
|
||
output_df = output_df.rename(columns=column_mapping) | ||
|
||
output_df.to_csv(output_file, sep="\t", index=False) | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
#!/bin/bash
# Count raw vs. filtered SNPs and indels from the merged GATK VCFs and
# write them to stats_4_variant_calling.tsv as a one-row TSV.

# Fail fast: without this, a missing/corrupt VCF makes bcftools error
# out, the pipe produces nothing, and the script silently records 0.
set -euo pipefail

# Set the VCF file paths
unfiltered_vcf="output/6_variant_filtering/c_gatk_variants_merged/unfiltered.vcf.gz"
filtered_vcf="output/6_variant_filtering/c_gatk_variants_merged/filtered.vcf.gz"

# count_variants TYPE VCF — print the number of TYPE records in VCF.
# `bcftools view -H` suppresses the header, so no `grep -v "^#"` needed.
count_variants() {
    bcftools view -H -v "$1" "$2" | wc -l
}

# Extract the counts of SNPs and indels for each VCF file
snps_raw=$(count_variants snps "$unfiltered_vcf")
snps_pass=$(count_variants snps "$filtered_vcf")
indels_raw=$(count_variants indels "$unfiltered_vcf")
indels_pass=$(count_variants indels "$filtered_vcf")

# Save the results to a file
printf 'snps_raw\tsnps_pass\tindels_raw\tindels_pass\n' >stats_4_variant_calling.tsv
printf '%s\t%s\t%s\t%s\n' "$snps_raw" "$snps_pass" "$indels_raw" "$indels_pass" >>stats_4_variant_calling.tsv