-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrmdup_collapsed.py
executable file
·255 lines (193 loc) · 8.54 KB
/
rmdup_collapsed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
#!/usr/bin/env python
#
# Based on 'FilterUniqueBAM' by
# Martin Kircher
#
"""paleomix rmdup_collapsed [options] < sorted.bam > out.bam
The rmdup_collapsed filters a BAM file for PCR duplicates unpaired reads under
the assumption that any unpaired read have been generated by the merging of
overlapping paired-end reads, and thereby represent the complete template
sequence. PCR duplicates are therefore detected based on both the 5' and 3'
alignment coordinate.
Paired reads (0x1), unmapped reads (0x4), secondary alignments (0x100),
reads that failed QC (0x200), and chimeric alignments (0x800), as identified
using the BAM record flags, are not filtered, but simply written to the output.
By default, filtered reads are flagged using the "duplicate" flag (0x400), and
written to the output. Use the --remove-duplicates command-line option to
instead remove these records from the output.
"""
import collections
import random
import sys
from argparse import ArgumentParser
import pysam
_FILTERED_FLAGS = 0x1 # PE reads
_FILTERED_FLAGS |= 0x4 # Unmapped
_FILTERED_FLAGS |= 0x100 # Secondary alignment
_FILTERED_FLAGS |= 0x200 # Failed QC
_FILTERED_FLAGS |= 0x800 # Chimeric alignment
_CIGAR_SOFTCLIP = 4
_CIGAR_HARDCLIP = 5
def read_quality(read):
qualities = read.query_alignment_qualities
if qualities is None:
# Generate value in range (-1; 0]
return -random.random()
return sum(qualities)
def copy_number(read):
# has_tag is faster than try/except, since most reads lack the tag.
if read.has_tag('XP'):
return read.get_tag('XP')
return 0
def mark_duplicate_reads(reads):
"""Identifies the best read from a set of PCR duplicates, and marks all
other reads as duplicates. The best read is selected by quality, among the
reads sharing the most common CIGAR string.
"""
by_cigar = collections.defaultdict(list)
for read in reads:
key = tuple(read.cigartuples)
by_cigar[key].append(read)
# Select the most common CIGAR strings, favoring simple CIGARs
best_count, best_cigar_len = max((len(values), -len(cigar))
for cigar, values in by_cigar.iteritems())
best_cigar_len = -best_cigar_len
best_read = None
best_quality = -1
copies = len(reads)
for cigar, candidates in by_cigar.iteritems():
if len(cigar) == best_cigar_len and len(candidates) == best_count:
for read in candidates:
copies += copy_number(read)
quality = read_quality(read)
if quality > best_quality:
best_read = read
best_quality = quality
else:
copies += sum(copy_number(read) for read in candidates)
best_read.set_tag('XP', copies, 'i')
for read in reads:
read.is_duplicate = (read is not best_read)
def write_read(args, out, read_and_alignment, duplicates_by_alignment):
read, alignment = read_and_alignment
if alignment is not None:
duplicates = duplicates_by_alignment.pop(alignment)
if len(duplicates) > 1:
# Select the best read and mark the others as duplicates.
mark_duplicate_reads(duplicates)
else:
duplicates[0].is_duplicate = False
duplicates[0].set_tag('XP', 1, 'i')
if not (args.remove_duplicates and read.is_duplicate):
out.write(read)
def can_write_read(read_and_alignment, current_position):
"""Returns true if the first read in the cache can safely be written. This
will be the case if the read was not the first in a set of reads with the
same alignment, or if the current position has gone beyond the last base
covered in that alignment.
"""
_, alignment = read_and_alignment
if alignment is None:
return True
current_ref_id, current_ref_start = current_position
alignment_ref_id, _, _, alignment_ref_end = alignment
return alignment_ref_id != current_ref_id \
or alignment_ref_end < current_ref_start
def clipped_bases_at_front(cigartuples):
"""Returns number of bases soft or hard clipped at start of the CIGAR."""
total = 0
for (operation, length) in cigartuples:
if operation != _CIGAR_SOFTCLIP and operation != _CIGAR_HARDCLIP:
break
total += length
return total
def unclipped_alignment_coordinates(read):
"""Returns tuple describing the alignment, with external coordinates
modified to account for clipped bases, assuming an ungapped alignment to
the reference. This is equivalent to the behavior of Picard MarkDuplicates.
"""
cigartuples = read.cigartuples
start = read.reference_start - clipped_bases_at_front(cigartuples)
end = read.reference_end + clipped_bases_at_front(reversed(cigartuples))
return (read.reference_id, read.is_reverse, start, end)
def process_aligned_read(cache, duplicates_by_alignment, read):
"""Processes an aligned read, either pairing it with an existing read, or
creating a new alignment block to track copies of this copies.
"""
alignment = unclipped_alignment_coordinates(read)
try:
duplicates_by_alignment[alignment].append(read)
cache.append((read, None))
except KeyError:
# No previous reads with matching alignment; this read will
# serve to track any other reads with the same alignment.
duplicates_by_alignment[alignment] = [read]
cache.append((read, alignment))
def is_trailing_unmapped_read(read):
return read.is_unmapped \
and read.reference_id == -1 \
and read.reference_start == -1
def process(args, infile, outfile):
cache = collections.deque()
duplicates_by_alignment = {}
last_position = (0, 0)
read_num = 1
for read_num, read in enumerate(infile, start=read_num):
current_position = (read.reference_id, read.reference_start)
if last_position > current_position:
# Check also catches trailing unmapped reads mapped to (-1, -1).
if not is_trailing_unmapped_read(read):
sys.stderr.write("ERROR: Input file is not sorted by "
"coordinates at read %i. Aborting!\n"
% (read_num,))
return 1
cache.append((read, None))
break
elif read.flag & _FILTERED_FLAGS:
cache.append((read, None))
else:
process_aligned_read(cache, duplicates_by_alignment, read)
last_position = current_position
while cache and can_write_read(cache[0], current_position):
write_read(args, outfile, cache.popleft(), duplicates_by_alignment)
while cache:
write_read(args, outfile, cache.popleft(), duplicates_by_alignment)
assert not duplicates_by_alignment, duplicates_by_alignment
for read_num, read in enumerate(infile, start=read_num + 1):
if not is_trailing_unmapped_read(read):
sys.stderr.write("ERROR: Input file is not sorted by "
"coordinates at read %i. Aborting!\n"
% (read_num,))
return 1
outfile.write(read)
return 0
def parse_args(argv):
parser = ArgumentParser(usage=__doc__)
parser.add_argument("input", default="-", nargs="?",
help="BAM file; if not set, input is read from STDIN.")
parser.add_argument("--remove-duplicates",
help="Remove duplicates from output; by default "
"duplicates are only flagged (flag = 0x400).",
default=False, action="store_true")
parser.add_argument("--seed", default=None, type=int,
help="Seed used for randomly selecting representative "
"reads when no reads have quality scores assigned"
"[default: initialized using system time].")
return parser.parse_args(argv)
def main(argv):
args = parse_args(argv)
# Initialize seed used when selecting among reads without quality scores
random.seed(args.seed)
if args.input == "-" and sys.stdin.isatty():
sys.stderr.write("STDIN is a terminal, terminating!\n")
return 1
elif sys.stdout.isatty():
sys.stderr.write("STDOUT is a terminal, terminating!\n")
return 1
with pysam.AlignmentFile(args.input, "rb") as infile:
with pysam.AlignmentFile("-", "wb", template=infile) as outfile:
return process(args, infile, outfile)
return 0
if __name__ == '__main__':
sys.exit(main(sys.argv[1:]))