forked from mshcheg/Cartwright_next-gen_scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
shuffle_resamp.py
160 lines (137 loc) · 5.98 KB
/
shuffle_resamp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
'''
This script creates subsampled datasets of paired-end Illumina Reads.
The Screed library is required to execute this script. Screed can be downloaded
from: [email protected]:ctb/screed.git .
usage: python shuffle_resamp.py filein percet number
filein: a fastq file containing both read pairs.
percent: % of dataset to resample.
number: number of resampled datasets to create.
'''
#!/usr/bin/python
from __future__ import print_function
import screed
import sys
import os
import logging, logging.handlers
from random import randint
from math import ceil
def resamp(percent, tempfile, mypath, fileout, db, size, numresamp):
percent, size = float(percent), float(size)
print('The size of the original population is ',size)
NewSize = (percent/100)*size
myrange = ceil(size/NewSize)
NumFiles = dict([(x,(x*NewSize-(NewSize-1),x*NewSize)) for x in range(1,int(myrange)+1)])
print(percent,'% will be resampled from the population. ',2*NewSize,' sequences will be written into ',numresamp,' files.')
print(NumFiles)
i = 0
with open(tempfile,'r') as temp:
for line in temp:
i += 1
myNum = randint(1,size)
if i%1000 == 0:
print('Processing sequence ',i)
for x in range(1,int(numresamp)+1):
myfile = fileout + '_' + str(x)
if NumFiles[x][0] <= myNum and NumFiles[x][1] >= myNum:
R1 = myfile + 'R1'
R2 = myfile + 'R2'
write_R1 = open(os.path.join(mypath,R1),'a')
write_R2 = open(os.path.join(mypath,R2),'a')
myname = line.strip('\n')
match2 = db[myname.replace(" 1:"," 2:")]
write_R1.write("@"+db[myname]['name'].replace(" ","_")+'/1\n'+db[myname]['sequence']+'\n'+'+'+'\n'+db[myname]['accuracy']+'\n')
write_R2.write('@'+match2['name'].replace(" ","_")+'/2\n'+match2['sequence']+'\n'+'+'+'\n'+match2['accuracy']+'\n')
write_R1.close()
write_R2.close()
else:
continue
def log(Mypath, ReadNum, Filein, Fileout, Singleton1, Singleton2,n,j,k,l):
# Make a global logging object.
x = logging.getLogger("logfun")
x.setLevel(logging.DEBUG)
# This handler writes a log file.
h1 = logging.FileHandler(os.path.join(Mypath,"shuffle_resamp.log"))
f = logging.Formatter("%(asctime)s %(lineno)d %(message)s")
h1.setFormatter(f)
h1.setLevel(logging.DEBUG)
x.addHandler(h1)
logfun = logging.getLogger("logfun")
logfun.debug("There are "+str(ReadNum)+" sequences in "+Filein)
logfun.debug(str(n)+" sequences processed.")
logfun.debug(str(j)+ " sequences written to "+Fileout)
logfun.debug(str(k)+ " sequences written to R1 "+Singleton1)
logfun.debug(str(l)+ " sequences written to R2 "+Singleton2)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Mypath, File = os.path.split(sys.argv[1])
Filein = sys.argv[1]
Fileout = Filein + '.shuffled'
Singleton1 = Filein + '.singletons_R1'
Singleton2 = Filein + '.singletons_R2'
NumResamp = sys.argv[3] #number of resamp datasets to produce
try:
Percent = sys.argv[2]
try:
tempfile = open(os.path.join(Mypath,'temp'),'r')
tempfile.close()
except IOError:
pass
if not Percent.isdigit():
print ('You did not enter a valid percent cutoff for resamplaing.')
sys.exit()
except IndexError:
Percent = 100
screeddb = Filein + '_screed'
if os.path.exists(screeddb):
db = screed.ScreedDB(screeddb)
if Percent != 100 and os.path.exists(os.path.join(Mypath,'temp')):
print("Screed database and shuffled files already exists proceeding to resampling.")
resamp(Percent, os.path.join(Mypath,'temp'), Mypath, Fileout, db, sys.argv[3], NumResamp)
os.remove(os.path.join(Mypath,'temp'))
print('Shuffleing done with resampleing. No log file written.')
sys.exit()
else:
print("Screed database already exists proceeding to shuffleing.")
else:
print("Creating screed database.")
db = screed.read_fastq_sequences(Filein)
print(dict(db))
ReadNum = len(db)
N='name'
S='sequence'
A='accuracy'
n, j, k, l = 0, 0, 0, 0
with open(Fileout, 'w') as outfile:
with open(Singleton1, 'w') as R1:
with open(Singleton2, 'w') as R2:
for record, thing in db.iteritems():
n+=1
if " 1:" in thing[N]:
try:
match2 = db[thing[N].replace(" 1:"," 2:")]
outfile.write("@"+thing[N].replace(" ","_")+'/1\n'+thing[S]+'\n'+'+'+'\n'+thing[A]+'\n'+'@'+match2[N].replace(" ","_")+'/2\n'+match2[S]+'\n'+'+'+'\n'+match2[A]+'\n')
j+=2
if Percent != 100:
with open(os.path.join(Mypath,'temp'),'a') as tempfile:
tempfile.write(thing[N]+'\n')
except KeyError:
R1.write("@"+thing[N].replace(" ","_")+'/1\n'+thing[S]+'\n'+'+'+'\n'+thing[A]+'\n')
k+=1
if " 2:" in thing[N]:
try:
match2 = db[thing[N].replace(" 2:"," 1:")]
except KeyError:
R2.write("@"+thing[N].replace(" ","_")+'/2\n'+thing[S]+'\n'+'+'+'\n'+thing[A]+'\n')
l+=1
if n%1000 == 0:
print("Shuffleing sequence ",n)
if k == 0:
os.remove(Singleton1)
if l == 0:
os.remove(Singleton2)
log(Mypath, ReadNum, Filein, Fileout, Singleton1, Singleton2, n, j, k, l)
if Percent !=100:
resamp(Percent, os.path.join(Mypath,'temp'), Mypath, Fileout, db, j/2, NumResamp)
print('Shuffleing done with resampleing. Check shuffle_resamp.log for a summary of the run.')
os.remove(os.path.join(Mypath,'temp'))
else:
print('Shuffleing done without resampleing. Check shuffle_resamp.log for a summary of the run.')