-
Notifications
You must be signed in to change notification settings - Fork 17
/
2020plus.py
executable file
·276 lines (254 loc) · 12.9 KB
/
2020plus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
#!/usr/bin/env python
# import print function for printing to stderr
from __future__ import print_function
# force the project root directory to be in the path. Otherwise
# package imports will fail if this script is run from another
# directory.
import os
import sys
sys.path.append(os.path.abspath(os.path.dirname(__file__)))
# regular imports
import logging
import datetime
import traceback
import argparse
# define exit status
EXCEPTION_EXIT_STATUS = 1
BAD_ARG_EXIT_STATUS = 2
def handle_uncaught_exceptions(t, ex, tb):
    """Report an uncaught exception and terminate the program.

    Takes the three arguments of a ``sys.excepthook`` callback. A short
    banner pointing at the log file is written to stderr, the exception
    details are sent to the root logger at ERROR level, and the process
    exits with ``EXCEPTION_EXIT_STATUS``.

    Parameters
    ----------
    t : type
        Class of the exception that was raised.
    ex : BaseException
        The exception instance.
    tb : traceback
        Traceback object associated with the exception.
    """
    # format_tb(tb) is shorthand for format_list(extract_tb(tb))
    formatted_frames = ''.join(traceback.format_tb(tb))

    # banner on stderr so the user notices even when logging goes to a file
    banner = '*' * 40
    for line in (banner, 'AN ERROR HAS OCCURRED: check the log file', banner):
        print(line, file=sys.stderr)

    # full details go to the log
    logging.error('Type: {0}'.format(t))
    logging.error('Exception: {0}'.format(ex))
    logging.error('Traceback:\n ' + formatted_frames)

    sys.exit(EXCEPTION_EXIT_STATUS)
def _classify():
    """Dispatch target for the ``classify`` sub-command.

    Reads the module-level ``args`` namespace, converts it to a plain
    dictionary and passes it to ``src.classify.python.classifier.main``.
    """
    cli_options = vars(args)  # argparse namespace -> dict of CLI options
    classifier_main = src.classify.python.classifier.main
    classifier_main(cli_options)
def _train():
    """Dispatch target for the ``train`` sub-command.

    Reads the module-level ``args`` namespace, converts it to a plain
    dictionary and passes it to ``src.train.python.train.main``.
    """
    cli_options = vars(args)  # argparse namespace -> dict of CLI options
    train_main = src.train.python.train.main
    train_main(cli_options)
def _features():
    """Dispatch target for the ``features`` sub-command.

    Reads the module-level ``args`` namespace, converts it to a plain
    dictionary and passes it to ``src.features.python.features.main``.
    """
    cli_options = vars(args)  # argparse namespace -> dict of CLI options
    features_main = src.features.python.features.main
    features_main(cli_options)
if __name__ == '__main__':
    # Script entry point. Steps, in order:
    #   1. install the uncaught-exception hook
    #   2. build the argument parser (global options + three sub-commands)
    #   3. parse the command line and reject a missing sub-command
    #   4. start logging
    #   5. import the pipeline modules
    #   6. create the output directory and run the chosen sub-command

    # initializations
    sys.excepthook = handle_uncaught_exceptions  # handle exceptions

    # parse command line arguments
    parser = argparse.ArgumentParser(description='Run 20/20+ pipeline')
    parser.add_argument('--out-dir',
                        type=str,
                        action='store',
                        default=None,
                        help='Path to output directory. Used by all positional arguments. '
                             '(Default: result/)')
    parser.add_argument('-ll', '--log-level',
                        type=str,
                        action='store',
                        default='',
                        help='Write a log file (--log-level=DEBUG for debug mode, '
                             '--log-level=INFO for info mode)')
    parser.add_argument('-l', '--log',
                        type=str,
                        action='store',
                        default='stdout',
                        help='Path to log file. (accepts stdout)')
    parser.add_argument('-v', '--verbose',
                        action='store_true',
                        default=False,
                        help='Flag for more verbose log output')
    subparser = parser.add_subparsers(help='sub-command help')

    # features sub-command
    help_string = ('Generate the features used in classification.'
                   ' This command should be ran before "classify".'
                   ' Features are saved as a text file.')
    parser_features = subparser.add_parser('features',
                                           help=help_string,
                                           description=help_string)
    parser_features.set_defaults(func=_features)
    help_str = 'mutation annotate output from probabilistic 20/20'
    parser_features.add_argument('-s', '--summary',
                                 type=str, required=True,
                                 help=help_str)
    help_str = 'TSG output from probabilistic 20/20 ("probabilistic2020 tsg")'
    parser_features.add_argument('-tsg-test', '--tsg-test',
                                 type=str, required=True,
                                 help=help_str)
    help_str = 'Oncogene output from probabilistic 20/20 ("probabilistic2020 oncogene")'
    parser_features.add_argument('-og-test', '--og-test',
                                 type=str, required=True,
                                 help=help_str)
    help_str = 'Mutsigcv covariate features (Default: use config file)'
    parser_features.add_argument('-c', '--covariates',
                                 type=str, default=None,
                                 help=help_str)
    help_str = 'BioGrid interaction network statistics (Default: use config file)'
    parser_features.add_argument('-b', '--biogrid',
                                 type=str, default=None,
                                 help=help_str)
    help_str = 'Randomly permute biogrid features (use for null distribution only)'
    parser_features.add_argument('-p', '--permute-biogrid',
                                 action='store_true', default=False,
                                 help=help_str)
    parser_features.add_argument('-rs', '--random-seed',
                                 type=int, action='store',
                                 default=71,
                                 help='Random seed for permute biogrid option (default: 71)')
    help_str = 'Output feature file for 20/20+'
    parser_features.add_argument('-o', '--output',
                                 type=str, required=True,
                                 help=help_str)

    # train sub-command
    parser_train = subparser.add_parser('train',
                                        help='Train random forest classifier (only used for null distribution)',
                                        description='Train random forest classifier (only used for null distribution)')
    parser_train.add_argument('-f', '--features',
                              type=str,
                              action='store', required=True,
                              help='Path to file containing features in tab '
                                   'separated format. Defaults to path specified '
                                   'in config.')
    parser_train.add_argument('-d', '--driver-rate',
                              type=float,
                              action='store',
                              default=.7,
                              help='Sample rate for R\'s random forest for '
                                   'oncogenes and TSGs. (default: .7)')
    parser_train.add_argument('-o', '--other-ratio',
                              type=float,
                              action='store',
                              default=1.,
                              help='Ratio of sample size for R\'s random forest for '
                                   '"other" genes. (default: 1.0)')
    parser_train.add_argument('-n', '--ntrees',
                              type=int,
                              action='store',
                              default=500,
                              help='Number of decision trees for random forests. '
                                   '(default: 500)')
    parser_train.add_argument('-c', '--cv',
                              action='store_true',
                              default=False,
                              help='Train a gene hold-out cross-validated classifier')
    parser_train.add_argument('-m', '--min-count',
                              type=int,
                              action='store',
                              default=0,
                              help='Minimum number of mutations in a gene '
                                   'for the gene to be considered in classification.'
                                   ' (default: 0)')
    parser_train.add_argument('-rs', '--random-seed',
                              type=int, action='store',
                              default=71,
                              help='Random seed (default: 71)')
    parser_train.add_argument('-r', '--output',
                              type=str, required=True,
                              help="Store the .Rdata file containing the trained"
                                   " random forest classifier")
    parser_train.set_defaults(func=_train)

    # classify sub-command
    parser_classify = subparser.add_parser('classify',
                                           help='Runs classification either with '
                                                'a provided trained classifier (using '
                                                'train) or '
                                                'using k-fold cross-validation (no train command needed).')
    parser_classify.add_argument('-t', '--trained-classifier',
                                 type=str,
                                 action='store',
                                 default=None,
                                 help='If provided, use trained classifier from '
                                      'the train sub-command. Otherwise, perform '
                                      'cross-validation within the data set (default: None)')
    parser_classify.add_argument('-f', '--features',
                                 type=str,
                                 action='store',
                                 default=None,
                                 help='Path to file containing features in tab '
                                      'separated format. Defaults to path specified '
                                      'in config.')
    parser_classify.add_argument('-nd', '--null-distribution',
                                 type=str,
                                 action='store',
                                 default=None,
                                 help='Path to file outputing null distiribution for p-values '
                                      'if provided input represents simulated data. '
                                      '(Default: None)')
    parser_classify.add_argument('-s', '--simulated',
                                 action='store_true',
                                 default=False,
                                 help='Flag indicating if input features were simulated. '
                                      'Simulated data is used to construct a null distribution. '
                                      '(Default: False)')
    parser_classify.add_argument('-m', '--min-count',
                                 type=int,
                                 action='store',
                                 default=0,
                                 help='Minimum number of mutations in a gene '
                                      'for the gene to be considered in classification.'
                                      ' (default: 0)')
    parser_classify.add_argument('-d', '--driver-rate',
                                 type=float,
                                 action='store',
                                 default=.7,
                                 help='Sample rate for R\'s random forest for '
                                      'oncogenes and TSGs. (default: .7)')
    parser_classify.add_argument('-o', '--other-ratio',
                                 type=float,
                                 action='store',
                                 default=1.,
                                 help='Ratio of sample size for R\'s random forest for '
                                      '"other" genes. (default: 1.0)')
    parser_classify.add_argument('-n', '--ntrees',
                                 type=int,
                                 action='store',
                                 default=200,
                                 help='Number of decision trees for random forests. '
                                      '(default: 200)')
    parser_classify.add_argument('-c', '--cv',
                                 action='store_true',
                                 default=False,
                                 help='Classify using a gene hold-out cross '
                                      'validated classifier (from the trained_clasifier option)')
    parser_classify.add_argument('-rs', '--random-seed',
                                 type=int, action='store',
                                 default=71,
                                 help='Random seed (default: 71)')
    parser_classify.set_defaults(func=_classify)

    parser.set_defaults(database='genes')  # by default work on sqlite db
    args = parser.parse_args()  # parse the command line options

    # On Python 3 argparse treats sub-commands as optional, so invoking the
    # script without one yields a namespace that has no ``func`` attribute.
    # Report that as a usage error right away (parser.error prints the usage
    # message and exits with status 2) instead of letting the final
    # ``args.func()`` call fail with an AttributeError after logging and all
    # pipeline imports have already run.
    if getattr(args, 'func', None) is None:
        parser.error('too few arguments: a sub-command '
                     '(features, train or classify) is required')

    # handle logging: an explicit --log path wins; with only a log level the
    # file name is left empty for start_logging to pick; with neither, log
    # output is discarded.
    if args.log:
        log_file = args.log
    elif args.log_level:
        log_file = ''  # auto-name the log file
    else:
        log_file = os.devnull
    import src.utils.python.util as _utils
    log_level = args.log_level
    _utils.start_logging(log_file=log_file,
                         log_level=log_level,
                         verbose=args.verbose)  # start logging

    # log user entered command
    import src
    logging.info('Version: {0}'.format(src.__version__))
    logging.info('Command: {0}'.format(' '.join(sys.argv)))

    # import all the modules for 20/20+; the wrapper functions above look up
    # ``src.<package>...`` at call time, so these imports must happen before
    # ``args.func()`` runs
    import src.classify.python.classifier
    import src.features.python.features
    import src.savedb.python.gene_tsv
    import src.savedb.python.gene_features
    import src.savedb.python.gene_maf
    import src.savedb.python.merge_mutations
    import src.train.python.train

    # make output directory if specified by user
    save_dir = args.out_dir
    _utils.make_result_dir(save_dir)

    args.func()  # run function corresponding to user's command
    logging.info('FINISHED SUCCESSFULLY!')