-
Notifications
You must be signed in to change notification settings - Fork 0
/
coverage_summary.sh
executable file
·121 lines (103 loc) · 4.87 KB
/
coverage_summary.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/bin/bash
set -e
set -u
set -o pipefail
# Define usage message
function usage() {
echo -e "\
$0: \n\
\n\
Usage: ./coverage_summary.sh [bam_list] [bam_dir] [region_file] [out_dir] \n\
\n\
NOTE: arguments must be provided in this order. \n\
\n\
where: \n\
1. [bam_list] is a list of bam files to calculate coverage summary \n\
2. [bam_dir] is where our current BAM files are located \n\
3. [region_file] includes regions we are interested in (i.e. BED file) \n\
4. [out_dir] is where we want our files to go \n\
\n\
Required dependencies:
1. Bedtools v2.28.0 or greater
2. datamash v1.1.0 or greater
3. parallel
" >&2
exit 1
}
if [[ $# -lt 3 ]]; then usage; fi
# User provided input arguments
# List of bam files
BAM_LIST=$1
# Exome capture regions we are using to calculate coverage
REGION_FILE=$2
# Where do our output files go?
OUT_DIR=$3
# What is the name of our project?
PROJECT=$4
# Define array to store list of BAM files
declare -a BAM_ARRAY=($(cat "${BAM_LIST}"))
# Check if output directories exist, if not make them
mkdir -p ${OUT_DIR} ${OUT_DIR}/Coverage_Summary ${OUT_DIR}/Coverage_Summary/Histograms
# Function to calculate coverage
# Command lines adapted from sequence_handling
# (https://github.com/MorrellLAB/sequence_handling/blob/master/Handlers/Coverage_Mapping.sh)
function Calc_Coverage() {
local bam_file="$1"
local region_file="$2"
local out_dir="$3"
local project="$4"
# Sample name
sampleName=$(basename "${bam_file}" .bam)
# Generate coverage hist
bedtools coverage -hist -abam "${bam_file}.bam" -b "${region_file}" > ${out_dir}/Histograms/${sampleName}.hist
# Pull out histogram summarizing coverage among "all" features in A
grep ^all ${out_dir}/Histograms/${sampleName}.hist > ${out_dir}/Histograms/${sampleName}.hist.all.txt
# Begin calculating statistics per bp
# The minimum is the coverage on the first line of the "all" fields since they're already sorted
min=$(head -n 1 ${out_dir}/Histograms/${sampleName}.hist.all.txt | awk -F "\t" '{ print $2 }')
# The maximum is the coverage on the last line of the "all" fields
max=$(tail -n 1 ${out_dir}/Histograms/${sampleName}.hist.all.txt | awk -F "\t" '{ print $2 }')
# The mean is the sum of (each coverage * the percent of the genome at that coverage)
mean=$(awk '{ sum += $2*$5 } END { print sum }' ${out_dir}/Histograms/${sampleName}.hist.all.txt)
# The mode is the coverage that has the highest percent of the genome at that coverge (excludes zero coverage)
mode=$(tail -n +2 ${out_dir}/Histograms/${sampleName}.hist.all.txt | sort -grk5,5 | head -1 | awk -F "\t" '{ print $2 }')
# The quantiles are a bit tricky...
# row_count will count how many rows down the "all" fields we are
row_count="0"
# freq_sum will be the sum of the frequency fields (column 5) from row 0 to row_count
freq_sum="0"
# While freq_sum < 0.25
while [ $(echo "if (${freq_sum} < 0.25) 1 else 0" | bc) -eq 1 ]
do
((row_count += 1))
# freq is the value of the frequency field (column 5) on the row corresponding to row_count
freq=$(head -n ${row_count} ${out_dir}/Histograms/${sampleName}.hist.all.txt | tail -1 | awk -F "\t" '{print $5}')
# Add freq to freq_sum until the while loop exits
freq_sum=$(echo "${freq_sum} + ${freq}" | bc -l)
done
# The first quantile is the coverage on the row at which the cumulative frequency hits 0.25 or greater
Q1=$(head -n ${row_count} ${out_dir}/Histograms/${sampleName}.hist.all.txt | tail -1 | awk -F "\t" '{print $2}')
# Repeat for Q2 (median)
while [ $(echo "if (${freq_sum} < 0.5) 1 else 0" | bc) -eq 1 ]
do
((row_count += 1))
freq=$(head -n ${row_count} ${out_dir}/Histograms/${sampleName}.hist.all.txt | tail -1 | awk -F "\t" '{print $5}')
freq_sum=$(echo "${freq_sum} + ${freq}" | bc -l)
done
Q2=$(head -n ${row_count} ${out_dir}/Histograms/${sampleName}.hist.all.txt | tail -1 | awk -F "\t" '{print $2}')
# Repeat for Q3
while [ $(echo "if (${freq_sum} < 0.75) 1 else 0" | bc) -eq 1 ]
do
((row_count += 1))
freq=$(head -n ${row_count} ${out_dir}/Histograms/${sampleName}.hist.all.txt | tail -1 | awk -F "\t" '{print $5}')
freq_sum=$(echo "${freq_sum} + ${freq}" | bc -l)
done
Q3=$(grep "all" "${out_dir}/Histograms/${sampleName}.hist" | head -n ${row_count} | tail -1 | awk -F "\t" '{print $2}')
# Append the statistics to the summary file
echo -e "${sampleName}"'\t'"${min}"'\t'"${Q1}"'\t'"${mode}"'\t'"${Q2}"'\t'"${mean}"'\t'"${Q3}"'\t'"${max}" >> "${out_dir}/${project}_coverage_summary_unfinished.txt"
# Put a call to plotCoverage here
}
# Export function
export -f Calc_Coverage
# Calculate coverage
parallel --jobs 2 --xapply Calc_Coverage {1} "${BAM_DIR}" "${REGION_FILE}" "${OUT_DIR}" :::: "${BAM_LIST}" ::: "${SAMPLE_NAMES[@]}"