-
Notifications
You must be signed in to change notification settings - Fork 6
/
extract_bedgraph.py
executable file
·109 lines (87 loc) · 3.81 KB
/
extract_bedgraph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python
__author__ = "Jens Luebeck (jluebeck [at] ucsd.edu)"
import argparse
import os
import subprocess
try:
from subprocess import DEVNULL # Python 3.
except ImportError:
DEVNULL = open(os.devnull, 'wb')
# run a command and yield the stdout on the fly, derived from user "tokland" on StackOverflow
# https://stackoverflow.com/questions/4417546/constantly-print-subprocess-output-while-process-is-running
def execute(cmd, redirect_stderr = False):
stderr_loc = None
if redirect_stderr:
stderr_loc = DEVNULL
popen = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, universal_newlines=True, stderr=stderr_loc)
for stdout_line in iter(popen.stdout.readline, ""):
yield stdout_line
popen.stdout.close()
return_code = popen.wait()
if return_code:
raise subprocess.CalledProcessError(return_code, cmd)
# MAIN
#inputs: bam file, bed file
parser = argparse.ArgumentParser(description="Get coverage bedgraph and baseline coverage")
parser.add_argument("--bam",type=str, help="path to bam file", required=True)
parser.add_argument("--bed",type=str, help="path to bed file", required=True)
parser.add_argument("--estimate_average_coverage", help="get an estimate of the average coverage per chromosome",
action='store_true')
parser.add_argument("-o", type=str,help="output file prefix. Default to prefix of input bed")
args = parser.parse_args()
with open(args.bed) as infile:
regions = []
for line in infile:
fields = line.rstrip().rsplit()
regions.append(fields)
base_cmd = "samtools mpileup -B -d 50000 -q 5 -r "
bambase = os.path.splitext(os.path.basename(args.bam))[0]
if not args.o:
bedbase = os.path.splitext(os.path.basename(args.bed))[0]
bgout = bedbase + "_position_coverage.bedgraph"
else:
bgout = args.o + "_position_coverage.bedgraph"
# make the bedgraph
with open(bgout, 'w') as outfile:
for ind, r in enumerate(regions):
prevpoint = ["",0,0,0]
cmd = base_cmd + r[0] + ":" + r[1] + "-" + r[2] + " " + args.bam + " | cut -f 1-2,4"
for pline in execute(cmd):
fields = pline.rstrip().rsplit()
c, p, n = fields[0], int(fields[1]), int(fields[2])
if c != prevpoint[0] or n != prevpoint[3] or p > prevpoint[2] + 1:
if prevpoint[0]:
outfile.write("\t".join([str(x) for x in prevpoint]) + "\n")
prevpoint = [c, p, p, n]
else:
prevpoint[2] = p
if prevpoint[0]:
outfile.write("\t".join([str(x) for x in prevpoint]) + "\n")
# get a bedgraph of the mean coverage values
if args.estimate_average_coverage:
# first get the average read length
# command based on solution by Chris Miller at BioStars. https://www.biostars.org/p/65216/
print("estimating read length")
cmd = "samtools view -F 4 " + args.bam + " | head -n 2000000 | cut -f 10 | perl -ne \'chomp;print length($_) . " + "\"" + "\\n" + "\"" + "' | sort | uniq -c"
print(cmd)
runT = 0.0
runW = 0.0
for i in execute(cmd, redirect_stderr=True):
c, l = [int(x) for x in i.rstrip().lstrip().rsplit()]
runT+=c
runW+=(c*l)
meanRL = runW/runT
# now get the number of aligned reads per chromosome, the length of the chr and do Lander-Waterman stats
print("getting chromosome coverage")
cmd = "samtools idxstats " + args.bam
print(cmd)
with open(bambase + "_chromosome_coverage.bedgraph",'w') as outfile:
for covline in execute(cmd):
fields = covline.rstrip().rsplit()
if fields[0] != '*':
G = int(fields[1])
n = int(fields[2])
a = 0
if G > 0:
a = n*meanRL/G
outfile.write("\t".join([fields[0], "1", fields[1], str(a)]) + "\n")