-
Notifications
You must be signed in to change notification settings - Fork 2
/
makedensitygenesegmentation-BAM.py
220 lines (203 loc) · 8.09 KB
/
makedensitygenesegmentation-BAM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
##################################
# #
# Last modified 06/04/2012 #
# #
# Georgi Marinov #
# #
##################################
import sys
import string
import math
import pysam
def countReads(samfile,chr,rmin,rmax,noMulti,CorrectionDict):
reads = 0
for alignedread in samfile.fetch(chr, rmin, rmax):
ID=str(alignedread).split('\t')[0]
if alignedread.is_read1:
ID = ID + '/1'
if alignedread.is_read2:
ID = ID + '/2'
if noMulti and CorrectionDict[ID] > 1:
continue
reads += 1.0/CorrectionDict[ID]
return reads
def main(argv):
if len(argv) < 9:
print 'usage: python %s refGene.txt labelFields chrFieldID leftFieldID rightFieldID strandFieldID BAM chrom.sizes otufilename [-TSSupstream bp bins] [-TSSdownstream bp bins] [-genebodybins bins] [-3UTR bpdownstream bins]' % argv[0]
print "\tNote: labelFields comma-separated"
print "\tNote: Genes that are shorter than the -TSSdownstream distance plus the number of genebodybins times the length of TSSdownstream bins will not be considered"
print "\tDefault values: "
print "\t\t\tTSSupstreambp = 2000"
print "\t\t\tTSSupstreambins = 40"
print "\t\t\tTSSdownstreambp = 1000"
print "\t\t\tTSSdownstreambins = 20"
print "\t\t\tgenebodybins = 100"
print "\t\t\tUTRdownstreambp = 5000"
print "\t\t\tUTRdownstreambins = 50"
sys.exit(1)
genes = argv[1]
fields=argv[2].split(',')
LabelFields=[]
for ID in fields:
LabelFields.append(int(ID))
chrFieldID = int(argv[3])
leftFieldID = int(argv[4])
rightFieldID = int(argv[5])
strandFieldID = int(argv[6])
BAM = argv[7]
chrominfo=argv[8]
chromInfoList=[]
linelist=open(chrominfo)
for line in linelist:
fields=line.strip().split('\t')
chr=fields[0]
start=0
end=int(fields[1])
chromInfoList.append((chr,start,end))
outfilename = argv[9]
TSSupstreambp = 2000
TSSupstreambins = 40
TSSdownstreambp = 1000
TSSdownstreambins = 20
genebodybins = 100
UTRdownstreambp = 5000
UTRdownstreambins = 50
if '-TSSupstream' in argv:
TSSupstreambp = int(argv[argv.index('-TSSupstream') + 1])
TSSupstreambins = int(argv[argv.index('-TSSupstream') + 2])
if '-TSSdownstream' in argv:
TSSdownstreambp = int(argv[argv.index('-TSSdownstream') + 1])
TSSdownstreambins = int(argv[argv.index('-TSSdownstream') + 2])
if '-genebodybins' in argv:
genebodybins = int(argv[argv.index('-genebodybins')+1])
if '-zeros' in argv:
doZeros=True
zeros = float(argv[argv.index('-zeros')+1])
if '-3UTR' in argv:
UTRdownstreambp = int(argv[argv.index('-3UTR')+1])
UTRdownstreambins = int(argv[argv.index('-3UTR')+2])
noMulti=False
if '-nomulti' in argv:
noMulti = True
ReadDict={}
CorrectionDict={}
i=0
unique=0
multi=0
samfile = pysam.Samfile(BAM, "rb" )
for (chr,start,end) in chromInfoList:
try:
for alignedread in samfile.fetch(chr, 0, 100):
a='b'
except:
print 'region', chr,start,end, 'not found in bam file, skipping'
continue
for alignedread in samfile.fetch(chr, start, end):
i+=1
if i % 5000000 == 0:
print 'examining read multiplicity', str(i/1000000) + 'M alignments processed processed', len(CorrectionDict.keys()), 'reads found', chr, start, alignedread.pos, end
fields=str(alignedread).split('\t')
ID=fields[0]
if alignedread.is_read1:
ID = ID + '/1'
if alignedread.is_read2:
ID = ID + '/2'
if CorrectionDict.has_key(ID):
CorrectionDict[ID]+=1
else:
CorrectionDict[ID]=1
readNumber = len(CorrectionDict.keys())
print 'found', readNumber, 'reads'
normalizeBy = readNumber/1000000.
print 'RPM normalization Factor =', normalizeBy
outfile = open(outfilename, 'w')
linelist= open(genes)
j=1
for line in linelist:
if line.startswith('#'):
continue
if j % 1000 == 0:
print j
j+=1
fields = line.strip().split('\t')
chr = fields[chrFieldID]
left = int(fields[leftFieldID])
right = int(fields[rightFieldID])
try:
for alignedread in samfile.fetch(chr, 0, 100):
a='b'
except:
continue
strand = fields[strandFieldID]
if right - left < TSSdownstreambp + TSSdownstreambins*genebodybins:
continue
values = []
if strand == 'F' or strand == '+':
current=left-TSSupstreambp
slide=float(TSSupstreambp)/TSSupstreambins
for i in range(TSSupstreambins-1):
rmin=int(current)
rmax=int(current+slide)
values.append(countReads(samfile,chr,rmin,rmax,noMulti,CorrectionDict)/(normalizeBy*slide/1000))
current=current+slide
current=left
slide=float(TSSdownstreambp)/TSSdownstreambins
for i in range(TSSdownstreambins-1):
rmin=int(current)
rmax=int(current+slide)
values.append(countReads(samfile,chr,rmin,rmax,noMulti,CorrectionDict)/(normalizeBy*slide/1000))
current=current+slide
current=left+TSSdownstreambp
slide=float(right-current)/genebodybins
for i in range(genebodybins-1):
rmin=int(current)
rmax=int(current+slide)
values.append(countReads(samfile,chr,rmin,rmax,noMulti,CorrectionDict)/(normalizeBy*slide/1000))
current=current+slide
current=right
slide=float(UTRdownstreambp)/UTRdownstreambins
for i in range(UTRdownstreambins-1):
rmin=int(current)
rmax=int(current+slide)
values.append(countReads(samfile,chr,rmin,rmax,noMulti,CorrectionDict)/(normalizeBy*slide/1000))
current=current+slide
if strand == 'F' or strand == '-':
start=right+TSSupstreambp
current=start
slide=float(TSSupstreambp)/TSSupstreambins
for i in range(TSSupstreambins-1):
rmin=int(current-slide)
rmax=int(current)
values.append(countReads(samfile,chr,rmin,rmax,noMulti,CorrectionDict)/(normalizeBy*slide/1000))
current=current-slide
current=right
slide=float(TSSdownstreambp)/TSSdownstreambins
for i in range(TSSdownstreambins-1):
rmin=int(current-slide)
rmax=int(current)
values.append(countReads(samfile,chr,rmin,rmax,noMulti,CorrectionDict)/(normalizeBy*slide/1000))
current=current-slide
current=right-TSSdownstreambp
slide=float(current-left)/genebodybins
for i in range(genebodybins-1):
rmin=int(current-slide)
rmax=int(current)
values.append(countReads(samfile,chr,rmin,rmax,noMulti,CorrectionDict)/(normalizeBy*slide/1000))
current=current-slide
current=left
slide=float(UTRdownstreambp)/UTRdownstreambins
for i in range(UTRdownstreambins-1):
rmin=int(current-slide)
rmax=int(current)
values.append(countReads(samfile,chr,rmin,rmax,noMulti,CorrectionDict)/(normalizeBy*slide/1000))
current=current-slide
outline = ''
for geneID in LabelFields:
outline = outline + fields[geneID] + '\t'
outline = outline + chr + '\t' + str(left) + '\t' + str(right) + '\t' + strand
for value in values:
outline = outline + '\t' + str(value)
outfile.write(outline + '\n')
outfile.close()
if __name__ == '__main__':
main(sys.argv)