-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake_matrices.py
executable file
·403 lines (356 loc) · 16.4 KB
/
make_matrices.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
#!/usr/bin/env python
"""
Generate contact matrices from Hi-C reads in FASTQ format.
If quality assessment and trimming are necessary for Hi-C reads, they must be
made before launching the script on these reads.
Usage:
make_matrices.py (-1 <left_fastq_reads> -2 <right_fastq_reads> -p <reads_prefix> | -r <fastq_reads>) -g <fasta_ref_genome> -i <ref_genome_index> -o <output_directory> [-e <restriction_enzyme> -c <list_of_chromosome_names> -R <matrix_resolution> -t <threads_number> --clean --tmp-dir <temp_directory>]
Options:
-h --help Show this screen.
--version Show version.
-1 <left_fastq_reads> FASTQ file with left (forward) Hi-C reads.
-2 <right_fastq_reads> FASTQ file with right (reverse) Hi-C reads.
-p <output_prefix> Prefix for output files with information about both left and right Hi-C reads.
-r <fastq_reads> FASTQ file with both left (forward) and right (reverse) Hi-C reads.
-g <fasta_ref_genome> FASTA file with reference genome.
-i <ref_genome_index> Reference genome index for GEMtools.
-e <restriction_enzyme> Name of the restriction enzyme. Default: HindIII. The full list of possible restriction enzyme names see https://github.com/3DGenomes/tadbit/blob/master/_pytadbit/mapping/restriction_enzymes.py
-c <list_of_chromosome_names> Text file with chromosome names (one name per line, without empty spaces). Default: chr1, ..., chr22, chrX, chrY (Homo sapiens kariotype).
-R <matrix_resolution> Resolution of the output Hi-C matrices (bp). Default: 100000.
-o <output_directory> Output directory.
-t <threads_number> Number of threads for read mapping. Default: 8.
--clean Remove all SAM and TSV files from the output directory (they're necessary for only the counstruction of contact matrices).
--tmp-dir <temp-directory> Temp directory. Default: /tmp.
"""
# Some code from the TADbit tutorial is used here
import sys
print
modules = ["docopt", "pytadbit", "os"]
exit_flag = False
for module in modules:
try:
__import__(module)
except ImportError:
exit_flag = True
sys.stderr.write("Error: Python module " + module + " is not installed.\n")
if exit_flag:
sys.stderr.write("You can install these modules with a command: pip install <module>\n")
sys.stderr.write("(Administrator privileges may be required.)\n")
sys.exit(1)
from docopt import docopt
from pytadbit.mapping.mapper import iterative_mapping
from pytadbit.parsers.sam_parser import parse_sam
from pytadbit.parsers.genome_parser import parse_fasta
from pytadbit.mapping.mapper import get_intersection
from pytadbit.mapping.filter import filter_reads
from pytadbit.mapping.filter import apply_filter
from pytadbit.mapping.analyze import hic_map
from os.path import basename
from os.path import splitext
from os.path import join
from os.path import exists
from os.path import isfile
from os import rename
from os import remove
from sys import stdout
def calc_left_right_ranges(reads_fastq):
# called when both left and right reads are stored in one file
with open(reads_fastq, 'r') as src:
read_id = src.readline()
read_seq = src.readline().rstrip('\n')
read_len = len(read_seq) / 2
print 'Read lenght:', read_len
stdout.flush()
min_len = 20
step = 5
range_len = (read_len - min_len + 1) / step + 1
# calc range_start_left
range_start_left = [1] * range_len
# calc range_stop_left
first_left_stop_pos = min_len
last_left_stop_pos = first_left_stop_pos + (range_len - 1) * step
range_stop_left = [pos for pos in range(first_left_stop_pos, last_left_stop_pos + 1, step)]
# calc range_start_right
right_start_pos = read_len + 1
range_start_right = [right_start_pos] * range_len
# calc range_stop_right
first_right_stop_pos = right_start_pos + min_len - 1
last_right_stop_pos = first_right_stop_pos + (range_len - 1) * step
range_stop_right = [pos for pos in range(first_right_stop_pos, \
last_right_stop_pos + 1, step)]
return range_start_left, range_stop_left, range_start_right, range_stop_right
def calc_range(left_reads_fastq):
# called when reads are separated in two files
# to calc ranges we need only one of the two files here
with open(left_reads_fastq, 'r') as src:
read_id = src.readline()
read_seq = src.readline().rstrip('\n')
read_len = len(read_seq)
print 'Read lenght:', read_len
stdout.flush()
min_len = 20
step = 5
range_len = (read_len - min_len + 1) / step + 1
# calc range_start
range_start = [1] * range_len
# calc range_stop
first_stop_pos = min_len
last_stop_pos = first_stop_pos + (range_len - 1) * step
range_stop = [pos for pos in range(first_stop_pos, last_stop_pos + 1, step)]
range_start_left = range_start
range_start_right = range_start[:]
range_stop_left = range_stop
range_stop_right = range_stop[:]
return range_start_left, range_stop_left, range_start_right, range_stop_right
def count_lines(filename):
with open(filename, 'r') as src:
for i, line in enumerate(src):
pass
return i + 1
def add_resolution(chromosomes, resolution, output_directory):
stdout.flush()
for chr in chromosomes:
chr_filename = join(output_directory, chr + '.mat')
chr_new_filename = join(output_directory, chr + '_' + str(resolution) + '.mat')
rename(chr_filename, chr_new_filename)
stdout.flush()
def add_headers(chromosomes, resolution, output_directory):
stdout.flush()
for chr in chromosomes:
chr_filename = join(output_directory, chr + '_' + str(resolution) + '.mat')
matrix_size = count_lines(chr_filename)
id_list = [chr + '_' + str(i) for i in range(1, matrix_size + 1)]
header = '\t'.join(id_list)
chr_filename_tmp = chr_filename + '.tmp'
with open(chr_filename, 'r') as src, open(chr_filename_tmp, 'w') as dst:
stdout.flush()
dst.write(header + '\n')
for id, line in zip(id_list, src):
dst.write(id + '\t' + line)
rename(chr_filename_tmp, chr_filename)
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq, genome_fasta, genome_index, \
output_directory, output_prefix, enzyme, res, chromosomes, threads_number, \
clean_tmp, tmp_dir):
print 'Begin to process reads.'
left_reads = ''
right_reads = ''
if reads_fastq != '': # left and right reads are stored in one file
range_start_left, range_stop_left, \
range_start_right, range_stop_right = calc_left_right_ranges(reads_fastq)
print 'Reads: ', reads_fastq
left_reads = reads_fastq
right_reads = reads_fastq
else: # left and right reads are stored separately
range_start_left, range_stop_left, \
range_start_right, range_stop_right = calc_range(left_reads_fastq)
print 'Left reads: ', left_reads_fastq
print 'Right reads: ', right_reads_fastq
print 'Output prefix: ', output_prefix
left_reads = left_reads_fastq
right_reads = right_reads_fastq
print 'Reference genome FASTA: ', genome_fasta
print 'Reference genome GEM index:', genome_index
print 'Output directory: ', output_directory
print 'Temp directory: ', tmp_dir
print 'Enzyme: ', enzyme
print 'Resolution: ', res, 'bp'
print 'Number of threads: ', threads_number
print 'Start pos for left reads: ', range_start_left
print 'Stop pos for left reads: ', range_stop_left
print 'Start pos for right reads: ', range_start_right
print 'Stop pos for right reads: ', range_stop_right
stdout.flush()
# map left reads to reference genome
out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam'
out_sam_left_path = join(output_directory, out_sam_left_name)
print 'Iterative mapping of left reads (using ' + str(threads_number) + ' threads)...'
stdout.flush()
sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path, \
range_start_left, range_stop_left, nthreads=threads_number,
temp_dir=tmp_dir)
print 'Done.'
stdout.flush()
# map right reads to reference genome
out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam'
out_sam_right_path = join(output_directory, out_sam_right_name)
print 'Iterative mapping of right reads (using ' + str(threads_number) + ' threads)...'
stdout.flush()
sams_right = iterative_mapping(genome_index, right_reads, out_sam_right_path, \
range_start_right, range_stop_right, nthreads=threads_number,
temp_dir=tmp_dir)
print 'Done.'
stdout.flush()
# load reference genome sequence
print 'Load reference genome sequence...'
stdout.flush()
chroms = chromosomes[:]
genome_seq = parse_fasta(genome_fasta, chr_names=chroms)
print 'Done.'
stdout.flush()
# create files with information about every left and right read
# and about their placement with respect to restriction sites
tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv'
tsv_left = join(output_directory, tsv_left_name)
tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv'
tsv_right = join(output_directory, tsv_right_name)
print 'Get information about restriction sites and reads placement...'
stdout.flush()
parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme, \
verbose=True, ncpus=8)
print 'Done.'
stdout.flush()
# create file with both left and right reads that uniquelly mapped to reference genome
if reads_fastq != '': # left and right reads are stored in one file
common_reads_prefix = splitext(basename(reads_fastq))[0]
else: # left and right reads are stored separately
common_reads_prefix = output_prefix
uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv'
uniq_reads = join(output_directory, uniq_reads_name)
print 'Merge info about left and right reads in one file...'
stdout.flush()
get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True)
print 'Done.'
stdout.flush()
# find read IDs that are filtered by default TADbit filters
print 'Mask reads...'
stdout.flush()
masked = filter_reads(uniq_reads)
print 'Done.'
stdout.flush()
# apply all filters (exclude reads that were filtered)
print 'Filter masked reads...'
stdout.flush()
filtered_reads_name = common_reads_prefix + '_filtered.tsv'
filtered_reads = join(output_directory, filtered_reads_name)
apply_filter(uniq_reads, filtered_reads, masked)
print 'Done.'
stdout.flush()
# create matrices (one matrix per chromosome)
print 'Create Hi-C maps (one per chromosome)...'
stdout.flush()
hic_map(filtered_reads, resolution=res, by_chrom='intra', savedata=output_directory)
print 'Done.'
stdout.flush()
print 'Add resolution (' + str(resolution) + ') to matrix filenames...'
stdout.flush()
add_resolution(chromosomes, resolution, output_directory)
print 'Done.'
stdout.flush()
print 'Add headers to matrix files...'
stdout.flush()
add_headers(chromosomes, resolution, output_directory)
print 'Done.'
stdout.flush()
if clean_tmp: # Remove all SAM and TSV files from the output directory
print 'Remove SAM and TSV files from the output directory.'
stdout.flush()
remove(out_sam_left_path + '*')
remove(out_sam_right_path + '*')
remove(join(output_directory, '*.tsv'))
print 'Done.'
stdout.flush()
def get_chromosome_names(chromosome_names_file):
chrom_names = []
if not exists(chromosome_names_file):
print "Error: Can't find text file with chromosome names: no such file '" + \
chromosome_names_file + "'. Exit.\n"
sys.exit(1)
if not isfile(chromosome_names_file):
print "Error: File with chromosome names must be a regular file. " + \
"Something else given. Exit.\n"
sys.exit(1)
with open(chromosome_names_file, 'r') as src:
for line in src:
name = line.rstrip('\n')
chrom_names.append(name)
return chrom_names
if __name__ == '__main__':
arguments = docopt(__doc__, version='make_matrices 0.8')
if arguments["-r"] != None: # left and right reads are stored in one file
reads_fastq = arguments["-r"]
if not exists(reads_fastq):
print "Error: Can't find FASTQ file with reads: no such file '" + \
reads_fastq + "'. Exit.\n"
sys.exit(1)
if not isfile(reads_fastq):
print "Error: FASTQ file with reads must be a regular file. " + \
"Something else given. Exit.\n"
sys.exit(1)
left_reads_fastq = ''
right_reads_fastq = ''
output_prefix = ''
else: # left and right reads are stored separately
left_reads_fastq = arguments["-1"]
if not exists(left_reads_fastq):
print "Error: Can't find FASTQ file with left (forward) reads: no such file '" + \
left_reads_fastq + "'. Exit.\n"
sys.exit(1)
if not isfile(left_reads_fastq):
print "Error: FASTQ file with left (forward) reads must be a regular file. " + \
"Something else given. Exit.\n"
sys.exit(1)
right_reads_fastq = arguments["-2"]
if not exists(right_reads_fastq):
print "Error: Can't find FASTQ file with right (reverse) reads: no such file '" + \
right_reads_fastq + "'. Exit.\n"
sys.exit(1)
if not isfile(right_reads_fastq):
print "Error: FASTQ file with reads must be a regular file. " + \
"Something else given. Exit.\n"
sys.exit(1)
output_prefix = arguments["-p"]
reads_fastq = ''
genome_fasta = arguments["-g"]
if not exists(genome_fasta):
print "Error: Can't find FASTA file with reference genome sequence: " + \
"no such file '" + genome_fasta + "'. Exit.\n"
sys.exit(1)
if not isfile(genome_fasta):
print "Error: FASTA file with reference genome sequence must be a regular file. " + \
"Something else given. Exit.\n"
sys.exit(1)
genome_index = arguments["-i"]
if not exists(genome_index):
print "Error: Can't find GEM index file for reference genome: no such file '" + \
genome_index + "'. Exit.\n"
sys.exit(1)
if not isfile(genome_index):
print "Error: GEM index file for reference genome must be a regular file. " + \
"Something else given. Exit.\n"
sys.exit(1)
output_directory = arguments["-o"].rstrip('/')
if arguments["-e"] != None:
enzyme = arguments["-e"]
else:
enzyme = 'HindIII'
if arguments["-R"] != None:
try:
resolution = int(arguments["-R"])
except ValueError:
print "Error: Resolution must be an integer greater than 0. Exit.\n"
sys.exit(1)
else:
resolution = 100000
if arguments["-c"] != None:
chromosomes = get_chromosome_names(arguments["-c"]) # Exit if file not found
else:
chromosomes = ['chr' + str(c) for c in range(1, 23) + ['X', 'Y']]
if arguments["-t"] != None:
try:
threads_number = int(arguments["-t"])
except ValueError:
print "Error: Thread number must be an integer greater than 0. Exit.\n"
sys.exit(1)
else:
threads_number = 8
if arguments["--clean"]:
clean_tmp = True
else:
clean_tmp = False
if arguments["--tmp-dir"] != None:
tmp_dir = arguments["--tmp-dir"]
else:
tmp_dir = "/tmp"
make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq, genome_fasta, genome_index, \
output_directory, output_prefix, enzyme, resolution, chromosomes, threads_number, \
clean_tmp, tmp_dir)