-
Notifications
You must be signed in to change notification settings - Fork 1
/
panX.py
executable file
·338 lines (315 loc) · 17.6 KB
/
panX.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
#!/usr/bin/env python
import argparse
import os, sys, time
panX_script_dir= os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0,panX_script_dir+'/scripts/')
from pangenome_computation import pangenome
from sf_miscellaneous import times, check_dependency
'''
panX clusters genes from a set of microbial genomes into orthologous clusters
and exports alignments, phylogenies, and meta data for interactive web visualization
and powerful pan-genomic data exploration.
USAGE EXAMPLE: ./panX -fn data/TestSet -sl TestSet-RefSeq.txt 1>TestSet.log 2>TestSet.err
For help, type:
./panX -h
'''
parser = argparse.ArgumentParser(description=\
'panX: Software for computing core and pan-genome from a set of genome sequences.'
' The results will be exported as json files for visualization in the browser.',
usage='./%(prog)s'+' -h (help)')
parser.add_argument('-fn', '--folder_name', type = str, required=True,
help='the absolute path for project folder ', metavar='')
parser.add_argument('-sl', '--species_name', type = str, required=True,
help='species name as prefix for some temporary folders (e.g.: P_aeruginosa)', metavar='')
parser.add_argument('-ngbk', '--gbk_present', action='store_false',
help='use nucleotide/amino acid sequence files (fna/faa) when no genBank files given (this option does not consider annotations)')
parser.add_argument('-st', '--steps', nargs='+', type = int, default = ['all'],
help='run specific steps or run all steps by default', metavar='')
parser.add_argument('-mo', '--metainfo_organism', action='store_true',
help='add organism information in metadata table.')
parser.add_argument('-mr', '--metainfo_reconcile', action='store_true',
help='use reconciled metadata (redundancy removed) instead of original metadata.')
parser.add_argument('-rt', '--raxml_max_time', type = int, default = 30,
help='RAxML tree optimization: maximal runing time (minutes, default:30min)' , metavar='')
parser.add_argument('-t', '--threads', type = int, default = 1,
help='number of threads', metavar='')
parser.add_argument('-v', '--version', action='version', version='panX analysis v1.5.1')
#/*==================================
# clustering
#==================================*/
parser.add_argument('-bp', '--blast_file_path', type = str, default = 'none',
help='the absolute path for blast result (e.g.: /path/blast.out)' , metavar='')
parser.add_argument('-rp', '--roary_file_path', type = str, default = 'none',
help='the absolute path for roary result (e.g.: /path/roary.out)' , metavar='')
parser.add_argument('-op', '--orthofinder_file_path', type = str, default = 'none',
help='the absolute path for orthofinder result (e.g.: /path/orthofinder.out)' , metavar='')
parser.add_argument('-otp', '--other_tool_fpath', type = str, default = 'none',
help='the absolute path for result from other orthology inference tool (e.g.: /path/other_tool.out)' , metavar='')
parser.add_argument('-mi', '--metainfo_fpath', type = str, default = 'none',
help='the absolute path for meta_information file (e.g.: /path/meta.out)' , metavar='')
parser.add_argument('-dmp', '--diamond_path', type = str, default = '',
help='alternative diamond path provided by user', metavar='')
parser.add_argument('-dme', '--diamond_evalue', type = str, default = '0.001',
help='default: e-value threshold below 0.001', metavar='')
parser.add_argument('-dmt', '--diamond_max_target_seqs', type = str, default = '600',
help='Diamond: maximum number of target sequences per query\
Estimation: #strain * #max_duplication (50*10=500)', metavar='')
parser.add_argument('-dmi', '--diamond_identity', type = str, default = '0',
help='Diamond: sequence identity threshold to report an alignment. Default: no restriction (0)', metavar='')
parser.add_argument('-dmqc', '--diamond_query_cover', type = str, default = '0',
help='Diamond: query sequence coverage threshold to report an alignment. Default: no restriction (0)', metavar='')
parser.add_argument('-dmsc', '--diamond_subject_cover', type = str, default = '0',
help='Diamond: subject sequence coverage threshold to report an alignment. Default: no restriction (0)', metavar='')
parser.add_argument('-dmdc', '--diamond_divide_conquer', action='store_true',
help='running diamond alignment in divide-and-conquer(DC) algorithm for large dataset')
parser.add_argument('-dcs', '--subset_size', type = int, default = 50,
help='subset_size (number of strains in a subset) for divide-and-conquer(DC) algorithm. Default:50',\
metavar='')
parser.add_argument('-dmsi', '--diamond_identity_subproblem', type = str, default = '90',
help='Diamond divide-and-conquer subproblem: sequence identity threshold to report an alignment.', metavar='')
parser.add_argument('-dmsqc', '--diamond_query_cover_subproblem', type = str, default = '90',
help='Diamond divide-and-conquer subproblem: query sequence coverage threshold to report an alignment', metavar='')
parser.add_argument('-dmssc', '--diamond_subject_cover_subproblem', type = str, default = '90',
help='Diamond divide-and-conquer subproblem: subject sequence coverage threshold to report an alignment', metavar='')
parser.add_argument('-imcl', '--mcl_inflation', type = float, default = 1.5,
help='MCL: inflation parameter (this parameter affects granularity) ', metavar='')
parser.add_argument('-bmt', '--blastn_RNA_max_target_seqs', type = str, default = '1000',
help='Blastn on RNAs: the maximum number of target sequences per query\
Estimation: #strain * #max_duplication', metavar='')
#/*=======================================
# post-processing
#=======================================*/
parser.add_argument('-np', '--disable_cluster_postprocessing', action='store_true',
help='disable postprocessing (split overclustered genes and paralogs, and cluster unclustered genes)')
parser.add_argument('-nsl', '--disable_long_branch_splitting', action='store_true',
help='disable splitting long branch')
parser.add_argument('-rna', '--enable_RNA_clustering', action='store_true',
help='cluster rRNAs')
## split tree via breaking up long branches (resolving over-clustering)
#parser.add_argument('-sf', '--split_long_branch_factor', type = float, default = 3.0,
# help='use (0.1+3.0*core_diversity)/(1+3.0*core_diversity) to decide split_long_branch_cutoff',metavar='')
parser.add_argument('-fcd', '--factor_core_diversity', type = float, default = 2.0,
help='default: factor used to refine raw core genome diversity, \
apply (0.1+2.0*core_diversity)/(1+2.0*core_diversity) to decide split_long_branch_cutoff', metavar='')
parser.add_argument('-slb', '--split_long_branch_cutoff', type = float, default = 0.0,
help='split long branch cutoff provided by user (by default: 0.0 as not given):',metavar='')
## split paralogy
parser.add_argument('-pep', '--explore_paralog_plot', action='store_true',
help='default: not plot paralog statistics')
parser.add_argument('-pfc', '--paralog_frac_cutoff', type = float, default = 0.33,
help='fraction of strains required for splitting paralogy. Default: 0.33', metavar='')
parser.add_argument('-pbc', '--paralog_branch_cutoff', type = float, default = 0.0,
help='branch_length cutoff used in paralogy splitting', metavar='')
## resolve peaks (unclustered records)
parser.add_argument('-ws', '--window_size_smoothed', type = int, default = 5,
help='postprocess_unclustered_genes: window size for smoothed cluster length distribution',
metavar='')
parser.add_argument('-spr', '--strain_proportion', type = float, default = 0.3,
help='postprocess_unclustered_genes: strain proportion', metavar='')
parser.add_argument('-ss', '--sigma_scale', type = int, default = 3,
help='postprocess_unclustered_genes: sigma scale', metavar='')
## core genome cutoff
parser.add_argument('-cg', '--core_genome_threshold', type = float, default = 1.0,
help='percentage of strains used to decide whether a gene is core.\
Default: 1.0 for strictly core gene; < 1.0 for soft core genes',
metavar='')
## core gene strain constraint
parser.add_argument('-csf', '--core_gene_strain_fpath', type = str, default = '',
help='file path for user-provided subset of strains (core genes should be present in all strains in this list)',
metavar='')
parser.add_argument('-sitr', '--simple_tree', action='store_true',
help='simple tree: does not use treetime for ancestral inference')
## gene gain loss inference
parser.add_argument('-dgl', '--disable_gain_loss', action='store_true',
help='disable enable gene gain and loss inference (not recommended)')
parser.add_argument('-mglo', '--merged_gain_loss_output', action='store_true',
help='not split gene presence/absence and gain/loss pattern into separate files for each cluster')
## branch association inference
parser.add_argument('-iba', '--infer_branch_association', action='store_true',
help='infer branch association')
parser.add_argument('-bamin', '--min_strain_fraction_branch_association', type = float, default = 0.9,
help='minimal fraction of the total number of strains for branch association',metavar='')
parser.add_argument('-pamin', '--min_strain_fraction_presence_association', type = float, default = 0.02,
help='minimal fraction of the total number of strains for presence/absence association',metavar='')
parser.add_argument('-pamax', '--max_strain_fraction_presence_association', type = float, default = 0.95,
help='maximal fraction of the total number of strains for presence/absence association',metavar='')
## other options
parser.add_argument('-slt', '--store_locus_tag', action='store_true',
help='store locus_tags in a separate file instead of saving locus_tags in gene cluster json for large dataset')
parser.add_argument('-rlt', '--raw_locus_tag', action='store_true',
help='use raw locus_tag from GenBank instead of strain_ID + locus_tag')
parser.add_argument('-otc', '--optional_table_column', action='store_true',
help='add customized column in gene cluster json file for visualization.')
parser.add_argument('-mtf', '--meta_data_config', type = str, default = '',
help='file path for pre-defined metadata structure (discrete/continuous data type, etc.)', metavar='')
parser.add_argument('-rxm', '--raxml_path', type = str, default = '',
help='absolute path of raxml', metavar='')
parser.add_argument('-ct', '--clean_temporary_files', action='store_true',
help='default: keep temporary files')
params = parser.parse_args()
path = os.path.abspath(params.folder_name)+'/'
if params.steps[0]=='all':
## run all steps
params.steps=range(1,12)
print 'Running panX in main folder: %s'%path
#species=params.species_name
programs={'mcl':'mcl', 'mafft':'mafft', 'fasttree':'FastTree', 'raxml':'raxmlHPC'}
if params.diamond_path=='':
programs['diamond']='diamond'
for program_alias, program_name in programs.items():
passed=False
## check whether program_alias exists (if yes, test passed)
if check_dependency(program_alias):
continue
## if program_alias does not exist, check whether origin program_name exists
if check_dependency(program_name):
continue
## if the program is not installed, exit
if not passed:
if program_name=='diamond':
warning='\ndiamond not found:\nplease make sure that diamond is installed '+\
'and diamond binary file is included in the executable search path (e.g.: /usr/bin/diamond);\n'+\
'alternatively, one can specify diamond path via the parameter -dmp (e.g.: ./panX.py -dmp /mypath/diamond -fn ...)'
print warning
else:
print 'program '+program_name+' not found, please install it.'
exit()
myPangenome=pangenome(
path=path,
#species=params.strain_list.split('-RefSeq')[0],
species=params.species_name,
gbk_present=params.gbk_present,
threads=params.threads,
metainfo_organism=params.metainfo_organism,
metainfo_reconcile=params.metainfo_reconcile,
raxml_max_time=params.raxml_max_time,
blast_fpath=params.blast_file_path,
roary_fpath=params.roary_file_path,
orthofinder_fpath=params.orthofinder_file_path,
other_tool_fpath=params.other_tool_fpath,
metainfo_fpath=params.metainfo_fpath,
diamond_path=params.diamond_path,
diamond_evalue=params.diamond_evalue,
diamond_max_target_seqs=params.diamond_max_target_seqs,
diamond_identity=params.diamond_identity,
diamond_query_cover=params.diamond_query_cover,
diamond_subject_cover=params.diamond_subject_cover,
diamond_identity_subproblem=params.diamond_identity_subproblem,
diamond_query_cover_subproblem=params.diamond_query_cover_subproblem,
diamond_subject_cover_subproblem=params.diamond_subject_cover_subproblem,
diamond_dc_used=params.diamond_divide_conquer,
diamond_dc_subset_size=params.subset_size,
mcl_inflation=params.mcl_inflation,
blastn_RNA_max_target_seqs=params.blastn_RNA_max_target_seqs,
disable_cluster_postprocessing=params.disable_cluster_postprocessing,
disable_long_branch_splitting=params.disable_long_branch_splitting,
enable_RNA_clustering=params.enable_RNA_clustering,
factor_core_diversity=params.factor_core_diversity,
split_long_branch_cutoff=params.split_long_branch_cutoff,
explore_paralog_plot=params.explore_paralog_plot,
paralog_frac_cutoff=params.paralog_frac_cutoff,
paralog_branch_cutoff=params.paralog_branch_cutoff,
window_size_smoothed=params.window_size_smoothed,
strain_proportion=params.strain_proportion,
sigma_scale=params.sigma_scale,
core_genome_threshold=params.core_genome_threshold,
core_gene_strain_fpath=params.core_gene_strain_fpath,
disable_gain_loss=params.disable_gain_loss,
simple_tree=params.simple_tree,
merged_gain_loss_output=params.merged_gain_loss_output,
store_locus_tag=params.store_locus_tag,
raw_locus_tag=params.raw_locus_tag,
meta_data_config=params.meta_data_config,
raxml_path=params.raxml_path,
min_strain_fraction_branch_association=params.min_strain_fraction_branch_association,
min_strain_fraction_presence_association=params.min_strain_fraction_presence_association,
max_strain_fraction_presence_association=params.max_strain_fraction_presence_association,
optional_table_column=params.optional_table_column,
clean_temporary_files=params.clean_temporary_files
)
if 1 in params.steps:#step 01:
myPangenome.make_strain_list()
print '====== step01: strain list successfully loaded'
## deactivated (activation:'2' -> 2)
if '2' in params.steps:# step02:
print '====== starting step02: download NCBI RefSeq GenBank file from strain list'
start = time.time()
fetch_refseq(path, strain_list)
print '====== time for step02:'
print times(start),'\n'
if 3 in params.steps:# step03:
print '====== starting step03: extract sequences from GenBank file'
start = time.time()
myPangenome.extract_gbk_sequences()
print '====== time for step03:'
print times(start),'\n'
if 4 in params.steps:# step04:
print '====== starting step04: extract metadata from GenBank file'
start = time.time()
myPangenome.extract_gbk_metadata()
print '====== time for step04:'
print times(start),'\n'
if 5 in params.steps:# step05:
print '====== starting step05: cluster proteins'
start = time.time()
if params.diamond_divide_conquer:
## clustering with divide_and_conquer method for large dataset
myPangenome.clustering_protein_divide_conquer()
else:
#if 0:## pre-clustering
# myPangenome.finding_gene_copies()
## clustering
myPangenome.clustering_protein_sequences()
## clustering RNA when option activated
if params.enable_RNA_clustering:
myPangenome.RNA_clustering()
print '====== time for step05:'
print times(start),'\n'
if 6 in params.steps:# step06:
print '====== starting step06: align genes in geneCluster by mafft and build gene trees'
start = time.time()
myPangenome.process_clusters()
if params.enable_RNA_clustering:
myPangenome.make_RNACluster_alignment_and_tree()
print '====== time for step06:'
print times(start),'\n'
if 7 in params.steps:# step07:
print '====== starting step07: call SNPs from core genes'
start = time.time()
myPangenome.create_SNP_alignment()
print '====== time for step07:'
print times(start),'\n'
if 8 in params.steps:# step08:
print '====== starting step08: run fasttree and raxml for tree construction'
start = time.time()
myPangenome.build_core_tree()
print '====== time for step08:'
print times(start),'\n'
if 9 in params.steps:# step09:
print '====== starting step09: infer presence/absence and gain/loss patterns of all genes'
start = time.time()
myPangenome.compute_gene_presence_pattern()
if not params.disable_gain_loss:
myPangenome.infer_gene_gain_loss_pattern()
if params.infer_branch_association:
myPangenome.inferAssociations()
print '====== time for step09:'
print times(start),'\n'
if 10 in params.steps:# step10:
print '====== starting step10: create json file for geneDataTable visualization'
start = time.time()
myPangenome.export_geneCluster_json()
print '====== time for step10:'
print times(start),'\n'
if 11 in params.steps:# step11:
print '====== starting step11: extract json files for tree and treeDataTable visualization,etc'
start = time.time()
myPangenome.export_coreTree_json()
print '====== time for step11:'
print times(start),'\n'
if 12 in params.steps:# step12:
from SF12_create_panGenome import ete_tree_split
print '====== starting step12: create pan-genome'
start = time.time()
ete_tree_split(path, species)
print times(start),'\n'