forked from brianpenghe/bamsam-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
resample_bam.py
executable file
·68 lines (62 loc) · 2.32 KB
/
resample_bam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/python
import argparse
import shutil
import sys
import pysam
import numpy as np
def calculate_samples(bamfile, count, ref_prefix=None):
refcounts = dict()
for s in pysam.idxstats(bamfile).split("\n"):
tok = s.rstrip().split("\t")
if ref_prefix is not None and tok[0].startswith(ref_prefix) == False:
continue
refcounts[tok[0]] = int(tok[2])
coef = (count * 1.0) / sum(refcounts.values())
return dict((k, (v, int(np.round(v * coef)))) for k, v in refcounts.items())
def get_random_alignments(bfd, obfd, ref, refcounts):
refsizes = dict(zip(bfd.references, bfd.lengths))
count = refcounts[ref][1]
total = refcounts[ref][0]
#q = np.arange(total)
#np.random.shuffle(q)
#q = sorted(q[:count])
q = sorted(np.random.permutation(np.arange(total))[:count])
qi = 0
algni = 0
if len(q) == 0:
return
for algn in bfd.fetch(ref):
if algni == q[qi]:
obfd.write(algn)
qi += 1
if qi == len(q):
break
algni += 1
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="")
parser.add_argument("-b", "--bamfile", required=True)
parser.add_argument("-f", "--outfile", required=True)
parser.add_argument("-c", "--count", required=True, type=int)
parser.add_argument("-v", "--verbose", action="store_true", default=False)
args = parser.parse_args()
refcounts = calculate_samples(args.bamfile, args.count, "chr")
bfd = pysam.Samfile(args.bamfile, "rb")
if bfd.mapped < args.count:
print "number of reads specified is greater than number of reads in source file"
exit(1)
obfd = pysam.Samfile(args.outfile, "wb", template=bfd)
for ref in sorted(refcounts):
if args.verbose:
print "processing %s %d" % (ref, refcounts[ref][1])
sys.stdout.flush()
get_random_alignments(bfd, obfd, ref, refcounts)
obfd.close()
bfd.close()
sortedprefix = "%s.sorted" % ".".join(args.outfile.split(".")[:-1])
if args.verbose:
print "sorting %s -> %s.bam" % (args.outfile, sortedprefix)
pysam.sort("-o", "%s.bam" % sortedprefix, args.outfile)
shutil.move("%s.bam" % sortedprefix, args.outfile)
if args.verbose:
print "indexing %s" % args.outfile
pysam.index(args.outfile)