-
Notifications
You must be signed in to change notification settings - Fork 0
/
MCMCTrace.py
executable file
·305 lines (269 loc) · 10.4 KB
/
MCMCTrace.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
from Utilities.Utilities import GetTrainedEmulator, PlotTrace
from Preprocessor.PipeLine import *
from scipy.stats import multivariate_normal as mvn
from numpy import array
import pymc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from contextlib import contextmanager
import tempfile
import shutil
import math
import argparse
from types import SimpleNamespace
import multiprocessing as mp
import os
import random
import sys
import tables
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
if sys.version_info > (3, 0):
import pickle
else:
import cPickle as pickle
def GenerateTrace(
emulators,
exp_Ys,
exp_Yerrs,
prior,
id_,
iter,
output_filename,
burnin=1000,
adaptive=False,
step_sd_scale=1):
"""
The main function to generate pandas trace file after comparing the emulator with experimental value
Uses pymc2 as it is found to be faster
"""
pymc.numpy.random.seed(random.randint(0, 1000) + id_)
n_models = len(emulators)
emulators_list = []
id_to_model_names = []
parameters = []
pRanges = []
for i, ename in enumerate(sorted(emulators.keys())):
id_to_model_names.append(ename)
emulators_list.append(emulators[ename])
ind_parameters = []
ind_pRanges = []
for name, row in prior.iterrows():
ind_pRanges.append(float(row['Max']) - float(row['Min']))
if row["Type"] == "Uniform" or row["Type"] == 'Uniform with mean':
ind_parameters.append(
pymc.Uniform(
name if i == 0 else '%s_%d' % (name, i),
float(row["Min"]),
float(row["Max"]),
value=float(row['Mean']) if row['Type'] == 'Uniform with mean' else 0.5 * (float(row["Min"]) + float(row["Max"])),
)
)
else:
ind_parameters.append(
pymc.TruncatedNormal(
name if i == 0 else '%s_%d' % (name, i),
mu=float(row["Mean"]),
tau=1.0 / float(row["SD"]) ** 2,
a=float(row["Min"]),
b=float(row["Max"]),
value=float(row["Mean"]),
)
)
parameters.append(ind_parameters)
pRanges.append(ind_pRanges)
# transpose emulator_list
emulators_list = list(map(list, zip(*emulators_list)))
if n_models == 1:
model_choice = 0
else:
model_choice = pymc.DiscreteUniform('ModelChoice', lower=0, upper=n_models-1)
for emu, exp_Y, exp_Yerr in zip(emulators_list, exp_Ys, exp_Yerrs):
exp_cov = np.diag(np.square(exp_Yerr))
@pymc.stochastic(observed=True)
def emulator_result(
value=exp_Y,
x=parameters,
exp_cov=exp_cov,
emulator=emu,
mc=model_choice):
mean, var = emulator[mc].Predict(np.array(x[mc]).reshape(1, -1))
return np.array(
mvn.logpdf(
value,
np.squeeze(mean),
np.squeeze(var) +
exp_cov))
# model = pymc.Model(parameters)
# prepare for MCMC
new_output_filename = "%s_%d.h5" % (output_filename, id_)
mcmc = pymc.MCMC(
parameters if model_choice == 0 else parameters + [model_choice],
dbname=new_output_filename,
db="hdf5",
dbmode="w")
for prs, ps in zip(pRanges, parameters):
for pr, p in zip(prs, ps):
if adaptive:
mcmc.use_step_method(pymc.AdaptiveMetropolis, p, shrink_if_necessary=False, cov=np.atleast_2d(np.square(step_sd_scale*pr)))
else:
mcmc.use_step_method(pymc.Metropolis, p, proposal_sd=step_sd_scale*pr)
# sample from our posterior distribution 50,000 times, but
# throw the first 20,000 samples out to ensure that we're only
# sampling from our steady-state posterior distribution
mcmc.sample(iter, burn=burnin)
mcmc.db.close()
return new_output_filename, id_to_model_names # pd.DataFrame.from_dict(trace_dict)
def MCMCParallel(config_file, dirpath=None, nevents=10000, burnin=1000, model_comp=False, adaptive=False, step_sd_scale=1):
if isinstance(config_file, str):
config_file = [config_file]
prior_filename = config_file[0]
if model_comp:
config_file = GroupConfigFiles(config_file)
else:
config_file = {None: config_file}
clf = {}
prior = None
exp_Y = []
exp_Yerr = []
first_model = True
for name, files in config_file.items():
clf[name] = []
for file_ in files:
args = GetTrainedEmulator(file_)
clf[name].append(args[0])
if first_model:
exp_Y.append(args[2])
exp_Yerr.append(args[3])
if file_ == prior_filename:
prior = args[1]
first_model = False
"""
If config_file is a list of strings, then it means we want to chain up multiple emulators
Check if they have the same prior. Can't work if they don't agree
"""
# if not all([all(prior[0].columns == prior1.columns) for prior1 in prior]):
# raise RuntimeError("The variables list from all files are not consistent.")
if dirpath is None:
dirpath = tempfile.mkdtemp()
result = GenerateTrace(
clf,
exp_Y,
exp_Yerr,
prior,
mp.current_process().pid,
nevents,
os.path.join(dirpath, "temp"),
burnin,
adaptive,
step_sd_scale
)
return result
def GroupConfigFiles(config_files):
files_by_model = {}
for filename in config_files:
with pd.HDFStore(filename, 'r') as store:
ynames = tuple(store['Exp_Y'].index)
attr = store.get_storer("PriorAndConfig").attrs.my_attribute
if 'name' not in attr:
raise RuntimeError('Model name is not specified in file ' + filename)
model_name = attr['name']
if model_name in files_by_model:
files_by_model[model_name].append((ynames, filename))
else:
files_by_model[model_name] = [(ynames, filename)]
obsToId = {}
first_filenames = []
first_modelname = None
first = True
for name in files_by_model.keys():
res = []
if first:
for idx, (obs_names, filename) in enumerate(files_by_model[name]):
obsToId[obs_names] = idx
res.append(filename)
first_modelname = name
first_filenames = res
else:
res = ['']*len(obsToId)
for obs_names, filename in files_by_model[name]:
try:
res[obsToId[obs_names]] = filename
except Exception as e:
raise RuntimeError('Files that corresponds to %s of model %s is missing from model %s' % (filename, name, first_modelname))
if len(files_by_model[name]) != len(res):
missing_file = first_filenames[res.index('')]
raise RuntimeError('Files that corresponds to %s of model %s is missing from model %s' % (missing_file, first_modelname, name))
files_by_model[name] = res
first = False
return files_by_model
def Merging(config_file, list_of_traces, clear_trace=False):
chained_files = None
if isinstance(config_file, list):
# store list of config file as meta data
chained_files = config_file[1:]
config_file = config_file[0]
with pd.HDFStore(config_file) as store:
id_to_model = []
if clear_trace and "trace" in store:
store.remove("trace")
for f in list_of_traces:
if isinstance(f, tuple):
id_to_model = f[1]
f = f[0]
tab = tables.open_file(f)
df = pd.DataFrame.from_records(tab.root.chain0.PyMCsamples.read())
store.append(key="trace", value=df.astype(float))
tab.close()
prior = store["PriorAndConfig"]
if len(id_to_model) > 0:
store.get_storer('trace').attrs.model_names = id_to_model
if chained_files is not None:
#store.get_storer('trace').meta = SimpleNamespace()
store.get_storer('trace').attrs.chained_files = chained_files
return prior
if __name__ == "__main__":
#from mpi4py import MPI
from Utilities.ControllerDeviceMP import ControllerDevice, ThreadsException
#comm = MPI.COMM_WORLD
#size = comm.Get_size()
size = 5
work_environment = ControllerDevice(None, ncores=size)#comm)
parser = argparse.ArgumentParser(
description="This script will choose an optimal set of hyperparameters by minizing loss function")
parser.add_argument(
"-i",
"--inputs",
nargs='+',
help="Location of the trained parameter (output from Training.py)",
required=True
)
parser.add_argument("-n", type=int, help="Number of events", required=True)
parser.add_argument("-c", action='store_true', help="Enable model comparison")
parser.add_argument("-b", type=int, help="burn in")
parser.add_argument("-s", type=float, help="MCMC step scale")
parser.add_argument('-p', '--plot', action='store_true', help='Use this if you want to plot posterior immediatly after trace generation')
args = vars(parser.parse_args())
#MCMCParallel(**{"config_file": args['inputs'], "nevents": args['n'], "model_comp": args['c']})
work_environment.Submit(
MCMCParallel, **{"config_file": args['inputs'], "nevents": args['n'], "model_comp": args['c'],
"burnin": args['b'], "step_sd_scale": args['s']})
refresh_rate = 0.2
refresh_interval = refresh_rate * size
# some stdio must be discarded for MPI network efficiency
work_environment.RefreshRate(refresh_interval)
try:
while work_environment.IsRunning():
out = work_environment.stdout[1]
if out is not None:
print(out, flush=True)
except ThreadsException as ex:
print(ex)
else:
prior = Merging(args["inputs"], work_environment.results, clear_trace=True)
# shutil.rmtree(work_environment.results)
if args['plot']:
PlotTrace(args["inputs"][0])
plt.show()
# work_environment.Close()