From 31fd884442f6bceb7d4623b116b6998171c7227b Mon Sep 17 00:00:00 2001 From: Juan Pavez Date: Sat, 19 Mar 2016 21:00:16 +0100 Subject: [PATCH] Full clean up of ACAT branch, pep8 formatting, deleted unused code --- DecomposingTest_10D.py | 421 ++++----- DecomposingTest_1D.py | 177 ++-- decomposed_test.py | 1885 ++++++++++++++++++++-------------------- logistic_sgd.py | 90 +- make_data.py | 694 +++++++++------ mlp.py | 112 ++- pyMorphWrapper.py | 501 ----------- train_classifiers.py | 177 ++-- utils.py | 1174 ++++++++++++------------- xgboost_wrapper.py | 44 +- 10 files changed, 2422 insertions(+), 2853 deletions(-) delete mode 100644 pyMorphWrapper.py diff --git a/DecomposingTest_10D.py b/DecomposingTest_10D.py index 94a4cda..32c115b 100644 --- a/DecomposingTest_10D.py +++ b/DecomposingTest_10D.py @@ -1,8 +1,8 @@ #!/usr/bin/env python ''' - This python script can be used to reproduce the results on 10D distributions - on the article Experiments using machine learning to approximate likelihood ratios - for mixture models (ACAT 2016). + This python script can be used to reproduce the results on 10D distributions + on the article Experiments using machine learning to approximate likelihood ratios + for mixture models (ACAT 2016). ''' __author__ = "Pavez J. " @@ -11,279 +11,186 @@ import ROOT import numpy as np from sklearn import svm, linear_model -from sklearn.externals import joblib -from sklearn.metrics import roc_curve, auc from sklearn.ensemble import GradientBoostingClassifier import sys import os.path -import pdb - -import matplotlib.pyplot as plt -from mpl_toolkits.mplot3d import Axes3D - from mlp import MLPTrainer from make_data import makeData, makeModelND, makeModelPrivateND,\ - makeModel + makeModel from utils import printMultiFrame, printFrame, saveFig, loadData,\ - makeROC, makeSigBkg, makePlotName + makeROC, makeSigBkg, makePlotName from train_classifiers import trainClassifiers, predict from decomposed_test import DecomposedTest from xgboost_wrapper import XGBoostClassifier -''' - A simple example for the work on the section - 5.4 of the paper 'Approximating generalized +''' + A simple example for the work on the section + 5.4 of the paper 'Approximating generalized likelihood ratio test with calibrated discriminative classifiers' by Kyle Cranmer -''' - -def evalC1C2Likelihood(test,c0,c1,dir='/afs/cern.ch/user/j/jpavezse/systematics', - workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', - c1_g='',model_g='mlp',use_log=False,true_dist=False,vars_g=None,clf=None, - verbose_printing=False): - - f = ROOT.TFile('{0}/{1}'.format(dir,workspace)) - w = f.Get('w') - f.Close() - if true_dist == True: - vars = ROOT.TList() - for var in vars_g: - vars.Add(w.var(var)) - x = ROOT.RooArgSet(vars) - else: - x = None - - score = ROOT.RooArgSet(w.var('score')) - if use_log == True: - evaluateRatio = test.evaluateLogDecomposedRatio - post = 'log' - else: - evaluateRatio = test.evaluateDecomposedRatio - post = '' - - npoints = 25 - csarray = np.linspace(0.01,0.2,npoints) - cs2array = np.linspace(0.1,0.4,npoints) - testdata = np.loadtxt('{0}/data/{1}/{2}/{3}_{4}.dat'.format(dir,model_g,c1_g,'test','F1')) - - decomposedLikelihood = np.zeros((npoints,npoints)) - trueLikelihood = np.zeros((npoints,npoints)) - c1s = np.zeros(c1.shape[0]) - c0s = np.zeros(c1.shape[0]) - pre_pdf = [] - pre_dist = [] - pre_pdf.extend([[],[]]) - pre_dist.extend([[],[]]) - for k,c0_ in enumerate(c0): - pre_pdf[0].append([]) - pre_pdf[1].append([]) - pre_dist[0].append([]) - pre_dist[1].append([]) - for 
j,c1_ in enumerate(c1): - if k <> j: - f0pdf = w.function('bkghistpdf_{0}_{1}'.format(k,j)) - f1pdf = w.function('sighistpdf_{0}_{1}'.format(k,j)) - outputs = predict('{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(dir,model_g,c1_g, - 'adaptive',k,j),testdata,model_g=model_g,clf=clf) - f0pdfdist = np.array([test.evalDist(score,f0pdf,[xs]) for xs in outputs]) - f1pdfdist = np.array([test.evalDist(score,f1pdf,[xs]) for xs in outputs]) - pre_pdf[0][k].append(f0pdfdist) - pre_pdf[1][k].append(f1pdfdist) - else: - pre_pdf[0][k].append(None) - pre_pdf[1][k].append(None) - if true_dist == True: - f0 = w.pdf('f{0}'.format(k)) - f1 = w.pdf('f{0}'.format(j)) - if len(testdata.shape) > 1: - f0dist = np.array([test.evalDist(x,f0,xs) for xs in testdata]) - f1dist = np.array([test.evalDist(x,f1,xs) for xs in testdata]) - else: - f0dist = np.array([test.evalDist(x,f0,[xs]) for xs in testdata]) - f1dist = np.array([test.evalDist(x,f1,[xs]) for xs in testdata]) - pre_dist[0][k].append(f0dist) - pre_dist[1][k].append(f1dist) - - # Evaluate Likelihood in different c1[0] and c1[1] values - for i,cs in enumerate(csarray): - for j, cs2 in enumerate(cs2array): - c1s[:] = c1[:] - c1s[0] = cs - c1s[1] = cs2 - c1s[2] = 1.-cs-cs2 - decomposedRatios,trueRatios = evaluateRatio(w,testdata, - x=x,plotting=False,roc=False,c0arr=c0,c1arr=c1s,true_dist=true_dist, - pre_evaluation=pre_pdf, - pre_dist=pre_dist) - - if use_log == False: - decomposedLikelihood[i,j] = np.log(decomposedRatios).sum() - trueLikelihood[i,j] = np.log(trueRatios).sum() - else: - decomposedLikelihood[i,j] = decomposedRatios.sum() - trueLikelihood[i,j] = trueRatios.sum() - - decomposedLikelihood = decomposedLikelihood - decomposedLikelihood.min() - X,Y = np.meshgrid(csarray, cs2array) - decMin = np.unravel_index(decomposedLikelihood.argmin(), decomposedLikelihood.shape) - min_value = [csarray[decMin[0]],cs2array[decMin[1]]] - if verbose_printing == True: - saveFig(X,[Y,decomposedLikelihood,trueLikelihood],makePlotName('comp','train',type='multilikelihood'),labels=['composed','true'],contour=True,marker=True,dir=dir,marker_value=(c1[0],c1[1]),print_pdf=True,min_value=min_value) - if true_dist == True: - trueLikelihood = trueLikelihood - trueLikelihood.min() - trueMin = np.unravel_index(trueLikelihood.argmin(), trueLikelihood.shape) - return [[csarray[trueMin[0]],cs2array[trueMin[1]]], [csarray[decMin[0]],cs2array[decMin[1]]]] - else: - return [[0.,0.],[csarray[decMin[0]],cs2array[decMin[1]]]] - - -def fitCValues(test,c0,c1,dir='/afs/cern.ch/user/j/jpavezse/systematics', - c1_g='',model_g='mlp',true_dist=False,vars_g=None, - workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', - use_log=False, clf=None): - if use_log == True: - post = 'log' - else: - post = '' - n_hist_c = 200 - keys = ['true','dec'] - c1_ = dict((key,np.zeros(n_hist_c)) for key in keys) - c1_values = dict((key,np.zeros(n_hist_c)) for key in keys) - c2_values = dict((key,np.zeros(n_hist_c)) for key in keys) - - fil2 = open('{0}/fitting_values_c1c2{1}.txt'.format(dir,post),'w') - - for i in range(n_hist_c): +''' - makeData(vars_g, c0,c1, num_train=200000,num_test=500,no_train=True, - workspace=workspace,dir=dir,c1_g=c1_g) - if i == 0: - verbose_printing = True +def plotCValues( + test, + c0, + c1, + dir='/afs/cern.ch/user/j/jpavezse/systematics', + c1_g='', + model_g='mlp', + true_dist=False, + vars_g=None, + workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', + use_log=False, + n_hist_c=100): + if use_log: + post = 'log' else: - verbose_printing = False - - 
((c1_true,c2_true),(c1_dec,c2_dec)) = evalC1C2Likelihood(test,c0,c1,dir=dir, - c1_g=c1_g,model_g=model_g, true_dist=true_dist,vars_g=vars_g, - workspace=workspace,use_log=use_log,clf=clf, verbose_printing=verbose_printing) - - print '2: {0} {1} {2} {3}'.format(c1_true, c1_dec, c2_true, c2_dec) - - fil2.write('{0} {1} {2} {3}\n'.format(c1_true, c1_dec, c2_true, c2_dec)) - - fil2.close() - -def plotCValues(test,c0,c1,dir='/afs/cern.ch/user/j/jpavezse/systematics', - c1_g='',model_g='mlp',true_dist=False,vars_g=None, - workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', - use_log=False): - if use_log == True: - post = 'log' - else: - post = '' - - n_hist_c = 200 - keys = ['true','dec'] - c1_values = dict((key,np.zeros(n_hist_c)) for key in keys) - c2_values = dict((key,np.zeros(n_hist_c)) for key in keys) - c1_2 = np.loadtxt('{0}/fitting_values_c1c2{1}.txt'.format(dir,post)) - c1_values['true'] = c1_2[:,0] - c1_values['dec'] = c1_2[:,1] - c2_values['true'] = c1_2[:,2] - c2_values['dec'] = c1_2[:,3] - - saveFig([],[c1_values['true'],c1_values['dec']], - makePlotName('c1c2','train',type='c1_hist{0}'.format(post)),hist=True, - axis=['signal weight'],marker=True,marker_value=c1[0], - labels=['true','composed'],x_range=[0.,0.2],dir=dir, - model_g=model_g,title='Histogram for estimated values signal weight',print_pdf=True) - saveFig([],[c2_values['true'],c2_values['dec']], - makePlotName('c1c2','train',type='c2_hist{0}'.format(post)),hist=True, - axis=['bkg. weight'],marker=True,marker_value=c1[1], - labels=['true','composed'],x_range=[0.1,0.4],dir=dir, - model_g=model_g,title='Histogram for estimated values bkg. weight',print_pdf=True) + post = '' + + keys = ['true', 'dec'] + c1_values = dict((key, np.zeros(n_hist_c)) for key in keys) + c2_values = dict((key, np.zeros(n_hist_c)) for key in keys) + c1_2 = np.loadtxt('{0}/fitting_values_c1c2{1}.txt'.format(dir, post)) + c1_values['true'] = c1_2[:, 0] + c1_values['dec'] = c1_2[:, 1] + c2_values['true'] = c1_2[:, 2] + c2_values['dec'] = c1_2[:, 3] + + saveFig([], [c1_values['true'], c1_values['dec']], + makePlotName('c1c2', 'train', type='c1_hist{0}'.format(post)), type='hist', + axis=['signal weight'], marker=True, marker_value=c1[0], + labels=['Exact', 'Approx. Decomposed'], x_range=[0., 0.2], dir=dir, + model_g=model_g, title='Histogram for estimated values signal weight', print_pdf=True) + saveFig([], [c2_values['true'], c2_values['dec']], + makePlotName('c1c2', 'train', type='c2_hist{0}'.format(post)), type='hist', + axis=['bkg. weight'], marker=True, marker_value=c1[1], + labels=['Exact', 'Approx. Decomposed'], x_range=[0.1, 0.4], dir=dir, + model_g=model_g, title='Histogram for estimated values bkg. 
weight', print_pdf=True) if __name__ == '__main__': - # Setting the classifier to use - model_g = None - classifiers = {'svc':svm.NuSVC(probability=True),'svr':svm.NuSVR(), - 'logistic': linear_model.LogisticRegression(), - 'bdt':GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, - max_depth=5, random_state=0), - 'mlp':MLPTrainer(n_hidden=40, L2_reg=0.0001), - 'xgboost': XGBoostClassifier(num_class=2, nthread=4, silent=1, - num_boost_round=100, eta=0.5, max_depth=4)} - clf = None - if (len(sys.argv) > 1): - model_g = sys.argv[1] - clf = classifiers.get(sys.argv[1]) - if clf == None: - model_g = 'logistic' - clf = classifiers['logistic'] - print 'Not found classifier, Using logistic instead' - - # parameters of the mixture model - c0 = np.array([.0,.3, .7]) - c1 = np.array([.1,.3, .7]) - c1_g = '' - - c0 = c0/c0.sum() - c1[0] = sys.argv[2] - if c1[0] < 0.01: - c1_g = "%.3f"%c1[0] - else: - c1_g = "%.2f"%c1[0] - c1[0] = (c1[0]*(c1[1]+c1[2]))/(1.-c1[0]) - c1 = c1 / c1.sum() - - verbose_printing = True - dir = '/afs/cern.ch/user/j/jpavezse/systematics' - workspace_file = 'workspace_DecomposingTestOfMixtureModelsClassifiers.root' - - # features - vars_g = ['x0','x1','x2','x3','x4','x5','x6','x7','x8','x9'] - - ROOT.gROOT.SetBatch(ROOT.kTRUE) - ROOT.RooAbsPdf.defaultIntegratorConfig().setEpsRel(1E-15) - ROOT.RooAbsPdf.defaultIntegratorConfig().setEpsAbs(1E-15) - # Set this value to False if only final plots are needed - verbose_printing = True - - if (len(sys.argv) > 3): - print 'Setting seed: {0} '.format(sys.argv[3]) - ROOT.RooRandom.randomGenerator().SetSeed(int(sys.argv[3])) - np.random.seed(int(sys.argv[3])) - - # make private mixture model - makeModelPrivateND(vars_g=vars_g,c0=c0,c1=c1,workspace=workspace_file,dir=dir, - model_g=model_g,verbose_printing=verbose_printing,load_cov=True) - - # make sintetic data to train the classifiers - makeData(vars_g=vars_g,c0=c0,c1=c1,num_train=100000,num_test=50000, - workspace=workspace_file,dir=dir, c1_g=c1_g, model_g='mlp') - - # train the pairwise classifiers - trainClassifiers(clf,3,dir=dir, model_g=model_g, - c1_g=c1_g ,model_file='adaptive') - - # class which implement the decomposed method - test = DecomposedTest(c0,c1,dir=dir,c1_g=c1_g,model_g=model_g, - input_workspace=workspace_file, verbose_printing = verbose_printing, - dataset_names=['0','1','2'], clf=clf if model_g=='mlp' else None) - test.fit(data_file='test',importance_sampling=False, true_dist=True,vars_g=vars_g) - - test.computeRatios(true_dist=True,vars_g=vars_g,use_log=False) - - # compute likelihood for c0[0] and c0[1] values - fitCValues(test,c0,c1,dir=dir,c1_g=c1_g,model_g=model_g,true_dist=True,vars_g=vars_g, - workspace=workspace_file,use_log=False, clf=clf if model_g=='mlp' else None) - - plotCValues(test,c0,c1,dir=dir,c1_g=c1_g,model_g=model_g,true_dist=True,vars_g=vars_g, - workspace=workspace_file,use_log=False) - - + # Setting the classifier to use + model_g = None + classifiers = { + 'svc': svm.NuSVC( + probability=True), + 'svr': svm.NuSVR(), + 'logistic': linear_model.LogisticRegression(), + 'bdt': GradientBoostingClassifier( + n_estimators=100, + learning_rate=1.0, + max_depth=5, + random_state=0), + 'mlp': MLPTrainer( + n_hidden=40, + L2_reg=0.0001), + 'xgboost': XGBoostClassifier( + num_class=2, + nthread=4, + silent=1, + num_boost_round=100, + eta=0.5, + max_depth=4)} + clf = None + if (len(sys.argv) > 1): + model_g = sys.argv[1] + clf = classifiers.get(sys.argv[1]) + if clf is None: + model_g = 'logistic' + clf = classifiers['logistic'] + print 'Not 
found classifier, Using logistic instead' + + # parameters of the mixture model + c0 = np.array([.0, .3, .7]) + c1 = np.array([.1, .3, .7]) + c1_g = '' + + c0 = c0 / c0.sum() + c1[0] = sys.argv[2] + if c1[0] < 0.01: + c1_g = "%.3f" % c1[0] + else: + c1_g = "%.2f" % c1[0] + c1[0] = (c1[0] * (c1[1] + c1[2])) / (1. - c1[0]) + c1 = c1 / c1.sum() + + verbose_printing = True + dir = '/afs/cern.ch/user/j/jpavezse/systematics' + workspace_file = 'workspace_DecomposingTestOfMixtureModelsClassifiers.root' + + # features + vars_g = ['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9'] + + ROOT.gROOT.SetBatch(ROOT.kTRUE) + ROOT.RooAbsPdf.defaultIntegratorConfig().setEpsRel(1E-15) + ROOT.RooAbsPdf.defaultIntegratorConfig().setEpsAbs(1E-15) + # Set this value to False if only final plots are needed + verbose_printing = True + + if (len(sys.argv) > 3): + print 'Setting seed: {0} '.format(sys.argv[3]) + ROOT.RooRandom.randomGenerator().SetSeed(int(sys.argv[3])) + np.random.seed(int(sys.argv[3])) + + # make private mixture model + makeModelPrivateND( + vars_g=vars_g, + c0=c0, + c1=c1, + workspace=workspace_file, + dir=dir, + model_g=model_g, + verbose_printing=verbose_printing, + load_cov=True) + + # make sintetic data to train the classifiers + makeData(vars_g=vars_g, c0=c0, c1=c1, num_train=100000, num_test=50000, + workspace=workspace_file, dir=dir, c1_g=c1_g, model_g='mlp') + + # train the pairwise classifiers + trainClassifiers(clf, 3, dir=dir, model_g=model_g, + c1_g=c1_g, model_file='adaptive') + + # class which implement the decomposed method + test = DecomposedTest( + c0, + c1, + dir=dir, + c1_g=c1_g, + model_g=model_g, + input_workspace=workspace_file, + verbose_printing=verbose_printing, + dataset_names=[ + '0', + '1', + '2'], + clf=clf if model_g == 'mlp' else None) + test.fit( + data_file='test', + true_dist=True, + vars_g=vars_g) + + test.computeRatios(true_dist=True, vars_g=vars_g, use_log=False) + + n_hist = 200 + # compute likelihood for c0[0] and c0[1] values + test.fitCValues( + c0, + c1, + dir=dir, + c1_g=c1_g, + model_g=model_g, + true_dist=True, + vars_g=vars_g, + workspace=workspace_file, + use_log=False, + clf=clf if model_g == 'mlp' else None, + n_hist_c=n_hist) + + plotCValues(test,c0,c1,dir=dir,c1_g=c1_g,model_g=model_g,true_dist=True,vars_g=vars_g, + workspace=workspace_file,use_log=False, n_hist_c=n_hist) diff --git a/DecomposingTest_1D.py b/DecomposingTest_1D.py index 5b92ea6..29a7684 100644 --- a/DecomposingTest_1D.py +++ b/DecomposingTest_1D.py @@ -1,19 +1,16 @@ #!/usr/bin/env python ''' - This python script can be used to reproduce the results on 1D distributions - on the article Experiments using machine learning to approximate likelihood ratios - for mixture models (ACAT 2016). + This python script can be used to reproduce the results on 1D distributions + on the article Experiments using machine learning to approximate likelihood ratios + for mixture models (ACAT 2016). ''' __author__ = "Pavez J. 
" - import ROOT import numpy as np from sklearn import svm, linear_model -from sklearn.externals import joblib -from sklearn.metrics import roc_curve, auc from sklearn.ensemble import GradientBoostingClassifier import sys @@ -23,9 +20,9 @@ from mlp import MLPTrainer from make_data import makeData, makeModelND, makeModelPrivateND,\ - makeModel + makeModel from utils import printMultiFrame, printFrame, saveFig, loadData,\ - makeROC, makeSigBkg, makePlotName + makeROC, makeSigBkg, makePlotName from train_classifiers import trainClassifiers, predict from decomposed_test import DecomposedTest @@ -34,75 +31,95 @@ if __name__ == '__main__': - # Setting the classifier to use - model_g = None - classifiers = {'svc':svm.NuSVC(probability=True),'svr':svm.NuSVR(), - 'logistic': linear_model.LogisticRegression(), - 'bdt':GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, - max_depth=5, random_state=0), - 'mlp':MLPTrainer(n_hidden=4, L2_reg=0), - 'xgboost': XGBoostClassifier(num_class=2, nthread=4, silent=1, - num_boost_round=100, eta=0.5, max_depth=4)} - clf = None - if (len(sys.argv) > 1): - model_g = sys.argv[1] - clf = classifiers.get(sys.argv[1]) - if clf == None: - model_g = 'logistic' - clf = classifiers['logistic'] - print 'Not found classifier, Using logistic instead' - - # parameters of the mixture model - c0 = np.array([.0,.3, .7]) - c1 = np.array([.1,.3, .7]) - c1_g = '' - - c0 = c0/c0.sum() - c1[0] = sys.argv[2] - if c1[0] < 0.01: - c1_g = "%.3f"%c1[0] - else: - c1_g = "%.2f"%c1[0] - c1[0] = (c1[0]*(c1[1]+c1[2]))/(1.-c1[0]) - c1 = c1 / c1.sum() - - verbose_printing = True - dir = '.' - workspace_file = 'workspace_DecomposingTestOfMixtureModelsClassifiers.root' - - # features - vars_g = ['x'] - - ROOT.gROOT.SetBatch(ROOT.kTRUE) - ROOT.RooAbsPdf.defaultIntegratorConfig().setEpsRel(1E-15) - - # Set this value to False if only final plots are needed - verbose_printing = True - - if (len(sys.argv) > 3): - print 'Setting seed: {0} '.format(sys.argv[3]) - ROOT.RooRandom.randomGenerator().SetSeed(int(sys.argv[3])) - np.random.seed(int(sys.argv[3])) - - # Create models to sample from - makeModel(c0=c0,c1=c1,workspace=workspace_file,dir=dir,verbose_printing= - verbose_printing) - - # make sintetic data to train the classifiers - makeData(vars_g=vars_g,c0=c0,c1=c1,num_train=100000,num_test=50000, - workspace=workspace_file,dir=dir, c1_g=c1_g, model_g='mlp') - - # train the pairwise classifiers - trainClassifiers(clf,3,dir=dir, model_g=model_g, - c1_g=c1_g ,model_file='adaptive') - - # class which implement the decomposed method - test = DecomposedTest(c0,c1,dir=dir,c1_g=c1_g,model_g=model_g, - input_workspace=workspace_file, verbose_printing = verbose_printing, - dataset_names=['0','1','2'],clf=clf if model_g=='mlp' else None) - - test.fit(data_file='test',importance_sampling=False, true_dist=True,vars_g=vars_g) - test.computeRatios(true_dist=True,vars_g=vars_g,use_log=False) - - - + # Setting the classifier to use + model_g = None + classifiers = { + 'svc': svm.NuSVC( + probability=True), + 'svr': svm.NuSVR(), + 'logistic': linear_model.LogisticRegression(), + 'bdt': GradientBoostingClassifier( + n_estimators=100, + learning_rate=1.0, + max_depth=5, + random_state=0), + 'mlp': MLPTrainer( + n_hidden=4, + L2_reg=0), + 'xgboost': XGBoostClassifier( + num_class=2, + nthread=4, + silent=1, + num_boost_round=100, + eta=0.5, + max_depth=4)} + clf = None + if (len(sys.argv) > 1): + model_g = sys.argv[1] + clf = classifiers.get(sys.argv[1]) + if clf is None: + model_g = 'logistic' + clf = 
classifiers['logistic'] + print 'Not found classifier, Using logistic instead' + + # parameters of the mixture model + c0 = np.array([.0, .3, .7]) + c1 = np.array([.1, .3, .7]) + c1_g = '' + + c0 = c0 / c0.sum() + c1[0] = sys.argv[2] + if c1[0] < 0.01: + c1_g = "%.3f" % c1[0] + else: + c1_g = "%.2f" % c1[0] + c1[0] = (c1[0] * (c1[1] + c1[2])) / (1. - c1[0]) + c1 = c1 / c1.sum() + + verbose_printing = True + dir = '.' + workspace_file = 'workspace_DecomposingTestOfMixtureModelsClassifiers.root' + + # features + vars_g = ['x'] + + ROOT.gROOT.SetBatch(ROOT.kTRUE) + ROOT.RooAbsPdf.defaultIntegratorConfig().setEpsRel(1E-15) + + # Set this value to False if only final plots are needed + verbose_printing = True + + if (len(sys.argv) > 3): + print 'Setting seed: {0} '.format(sys.argv[3]) + ROOT.RooRandom.randomGenerator().SetSeed(int(sys.argv[3])) + np.random.seed(int(sys.argv[3])) + + # Create models to sample from + makeModel(c0=c0,c1=c1,workspace=workspace_file,dir=dir,verbose_printing= + verbose_printing) + + # make sintetic data to train the classifiers + makeData(vars_g=vars_g,c0=c0,c1=c1,num_train=100000,num_test=50000, + workspace=workspace_file,dir=dir, c1_g=c1_g, model_g='mlp') + + # train the pairwise classifiers + trainClassifiers(clf,3,dir=dir, model_g=model_g, + c1_g=c1_g ,model_file='adaptive') + + # class which implement the decomposed method + test = DecomposedTest( + c0, + c1, + dir=dir, + c1_g=c1_g, + model_g=model_g, + input_workspace=workspace_file, + verbose_printing=verbose_printing, + dataset_names=[ + '0', + '1', + '2'], + clf=clf if model_g == 'mlp' else None) + + test.fit(data_file='test',true_dist=True,vars_g=vars_g) + test.computeRatios(true_dist=True, vars_g=vars_g, use_log=False) diff --git a/decomposed_test.py b/decomposed_test.py index 583db11..7d4ec4c 100644 --- a/decomposed_test.py +++ b/decomposed_test.py @@ -4,988 +4,975 @@ import ROOT import numpy as np -from sklearn import svm, linear_model -from sklearn.externals import joblib -from sklearn.metrics import roc_curve, auc -from sklearn.ensemble import GradientBoostingClassifier import sys import os.path -import pdb -import matplotlib.pyplot as plt -from mpl_toolkits.mplot3d import Axes3D - -from make_data import makeData, makeModelND from utils import printMultiFrame, printFrame, saveFig, loadData, printFrame, makePlotName,\ - loadData,makeSigBkg,makeROC, makeMultiROC,saveMultiFig,preProcessing + makeSigBkg, makeROC, makeMultiROC, saveMultiFig + +from make_data import makeData from train_classifiers import predict -from pyMorphWrapper import MorphingWrapper - class DecomposedTest: - ''' - Class which implement the decomposed test on - the data - ''' - def __init__(self,c0,c1,model_file='adaptive', + ''' + Class which implement the decomposed test on + the data + ''' + + def __init__( + self, + c0, + c1, + model_file='adaptive', input_workspace=None, output_workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', dir='/afs/cern.ch/user/j/jpavezse/systematics', - c1_g='',model_g='mlp', + c1_g='', + model_g='mlp', verbose_printing=False, dataset_names=None, - preprocessing=False, - scaler=None, seed=1234, F1_dist='F1', F0_dist='F0', - cross_section=None, all_couplings=None, F1_couplings=None, basis_indexes=None, clf=None): - self.input_workspace = input_workspace - self.workspace = output_workspace - self.c0 = c0 - self.c1 = c1 - self.model_file = model_file - self.dir = dir - self.c1_g = c1_g - self.model_g = model_g - self.verbose_printing=verbose_printing - self.preprocessing = preprocessing - 
self.scaler = scaler - self.seed=seed - self.F1_dist=F1_dist - self.F0_dist=F0_dist - self.cross_section=cross_section - self.dataset_names=dataset_names - self.basis_indexes = basis_indexes if basis_indexes <> None else range(len(dataset_names)) - self.F1_couplings=F1_couplings - self.all_couplings=all_couplings - self.nsamples = len(dataset_names) - self.clf = clf - - - def fit(self, data_file='test',importance_sampling=False, true_dist=True,vars_g=None): - ''' - Create pdfs for the classifier - score to be used later on the ratio - test, input workspace only needed in case - there exist true pdfs for the distributions - the models being used are ./model/{model_g}/{c1_g}/{model_file}_i_j.pkl - and the data files are ./data/{model_g}/{c1_g}/{data_file}_i_j.dat - ''' - - bins = 40 - low = 0. - high = 1. - - if self.input_workspace <> None: - #f = ROOT.TFile('{0}/{1}'.format('/afs/cern.ch/work/j/jpavezse/private',self.workspace)) - f = ROOT.TFile('{0}/{1}'.format(self.dir,self.workspace)) - - w = f.Get('w') - # TODO test this when workspace is present - w = ROOT.RooWorkspace('w') if w == None else w - f.Close() - else: - w = ROOT.RooWorkspace('w') - w.Print() - - print 'Generating Score Histograms' - - w.factory('score[{0},{1}]'.format(low,high)) - s = w.var('score') - - if importance_sampling == True: - if true_dist == True: - vars = ROOT.TList() - for var in vars_g: - vars.Add(w.var(var)) - x = ROOT.RooArgSet(vars) - else: - x = None - - #This is because most of the data of the full model concentrate around 0 - bins_full = 40 - low_full = 0. - high_full = 1. - w.factory('scoref[{0},{1}]'.format(low_full, high_full)) - s_full = w.var('scoref') - histos = [] - histos_names = [] - inv_histos = [] - inv_histos_names = [] - sums_histos = [] - def saveHistos(w,outputs,s,bins,low,high,pos=None,importance_sampling=False,importance_data=None, - importance_outputs=None): - if pos <> None: - k,j = pos - else: - k,j = ('F0','F1') - print 'Estimating {0} {1}'.format(k,j) - for l,name in enumerate(['sig','bkg']): - data = ROOT.RooDataSet('{0}data_{1}_{2}'.format(name,k,j),"data", - ROOT.RooArgSet(s)) - hist = ROOT.TH1F('{0}hist_{1}_{2}'.format(name,k,j),'hist',bins,low,high) - values = outputs[l] - #values = values[self.findOutliers(values)] - for val in values: - hist.Fill(val) - s.setVal(val) - data.add(ROOT.RooArgSet(s)) - norm = 1./hist.Integral() - hist.Scale(norm) - - s.setBins(bins) - datahist = ROOT.RooDataHist('{0}datahist_{1}_{2}'.format(name,k,j),'hist', - ROOT.RooArgList(s),hist) - #histpdf = ROOT.RooHistPdf('{0}histpdf_{1}_{2}'.format(name,k,j),'hist', - # ROOT.RooArgSet(s), datahist, 1) - histpdf = ROOT.RooHistFunc('{0}histpdf_{1}_{2}'.format(name,k,j),'hist', - ROOT.RooArgSet(s), datahist, 1) - #histpdf.setUnitNorm(True) - #testvalues = np.array([self.evalDist(ROOT.RooArgSet(s), histpdf, [xs]) for xs in values]) - - #histpdf.specialIntegratorConfig(ROOT.kTRUE).method1D().setLabel('RooBinIntegrator') - - #print 'INTEGRAL' - #print histpdf.createIntegral(ROOT.RooArgSet(s)).getVal() - #print histpdf.Integral() - - #histpdf.specialIntegratorConfig(ROOT.kTRUE).method1D().setLabel('RooAdaptiveGaussKronrodIntegrator1D') - - getattr(w,'import')(hist) - getattr(w,'import')(data) - getattr(w,'import')(datahist) # work around for morph = w.import(morph) - getattr(w,'import')(histpdf) # work around for morph = w.import(morph) - score_str = 'scoref' if pos == None else 'score' - # Calculate the density of the classifier output using kernel density - 
#w.factory('KeysPdf::{0}dist_{1}_{2}({3},{0}data_{1}_{2},RooKeysPdf::NoMirror,2)'.format(name,k,j,score_str)) - - # Print histograms pdfs and estimated densities - if self.verbose_printing == True and name == 'bkg' and k <> j: - full = 'full' if pos == None else 'dec' - if k < j and k <> 'F0': - histos.append([w.function('sighistpdf_{0}_{1}'.format(k,j)), w.function('bkghistpdf_{0}_{1}'.format(k,j))]) - histos_names.append(['f{0}-f{1}_f{1}(signal)'.format(k,j), 'f{0}-f{1}_f{0}(background)'.format(k,j)]) - if j < k and k <> 'F0': - inv_histos.append([w.function('sighistpdf_{0}_{1}'.format(k,j)), w.function('bkghistpdf_{0}_{1}'.format(k,j))]) - inv_histos_names.append(['f{0}-f{1}_f{1}(signal)'.format(k,j), 'f{0}-f{1}_f{0}(background)'.format(k,j)]) - - if self.scaler == None: - self.scaler = {} - - # change this - for k in range(self.nsamples): - for j in range(self.nsamples): - if k == j: - continue - #if k <> 2 and j <> 2: - # continue - if self.dataset_names <> None: - name_k, name_j = (self.dataset_names[k], self.dataset_names[j]) + self.input_workspace = input_workspace + self.workspace = output_workspace + self.c0 = c0 + self.c1 = c1 + self.model_file = model_file + self.dir = dir + self.c1_g = c1_g + self.model_g = model_g + self.verbose_printing = verbose_printing + self.seed = seed + self.F1_dist = F1_dist + self.F0_dist = F0_dist + self.dataset_names = dataset_names + self.basis_indexes = basis_indexes if basis_indexes is not None else range( + len(dataset_names)) + self.F1_couplings = F1_couplings + self.all_couplings = all_couplings + self.nsamples = len(dataset_names) + self.clf = clf + + def fit( + self, + data_file='test', + true_dist=True, + vars_g=None): + ''' + Create pdfs for the classifier + score to be used later on the ratio + test, input workspace only needed in case + there exist true pdfs for the distributions + the models being used are ./model/{model_g}/{c1_g}/{model_file}_i_j.pkl + and the data files are ./data/{model_g}/{c1_g}/{data_file}_i_j.dat + ''' + + bins = 40 + low = 0. + high = 1. 
+ + if self.input_workspace is not None: + f = ROOT.TFile('{0}/{1}'.format(self.dir, self.workspace)) + w = f.Get('w') + w = ROOT.RooWorkspace('w') if w is None else w + f.Close() else: - name_k, name_j = (k,j) - print 'Loading {0}:{1} {2}:{3}'.format(k,name_k, j,name_j) - traindata, targetdata = loadData(data_file,name_k,name_j,dir=self.dir,c1_g=self.c1_g, - preprocessing=self.preprocessing,scaler=self.scaler,persist=True) - - numtrain = traindata.shape[0] - size2 = traindata.shape[1] if len(traindata.shape) > 1 else 1 - #output = [predict('/afs/cern.ch/work/j/jpavezse/private/{0}_{1}_{2}.pkl'.format(self.model_file,k,j),traindata[targetdata == 1],model_g=self.model_g), - # predict('/afs/cern.ch/work/j/jpavezse/private/{0}_{1}_{2}.pkl'.format(self.model_file,k,j),traindata[targetdata == 0],model_g=self.model_g)] - output = [predict('{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file,k,j),traindata[targetdata==1],model_g=self.model_g,clf=self.clf), - predict('{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file,k,j),traindata[targetdata==0],model_g=self.model_g,clf=self.clf)] - saveHistos(w,output,s,bins,low,high,(k,j)) - #w.writeToFile('{0}/{1}'.format('/afs/cern.ch/work/j/jpavezse/private',self.workspace)) - w.writeToFile('{0}/{1}'.format(self.dir,self.workspace)) - - if self.verbose_printing==True: - for ind in range(1,(len(histos)/3+1)): - print_histos = histos[(ind-1)*3:(ind-1)*3+3] - print_histos_names = histos_names[(ind-1)*3:(ind-1)*3+3] - printMultiFrame(w,['score']*len(print_histos),print_histos, makePlotName('dec{0}'.format(ind-1),'all',type='hist',dir=self.dir,c1_g=self.c1_g,model_g=self.model_g),print_histos_names, - dir=self.dir,model_g=self.model_g,y_text='score(x)',print_pdf=True,title='Pairwise score distributions') - # Full model - traindata, targetdata = loadData(data_file,self.F0_dist,self.F1_dist,dir=self.dir,c1_g=self.c1_g, - preprocessing=self.preprocessing, scaler=self.scaler) - numtrain = traindata.shape[0] - size2 = traindata.shape[1] if len(traindata.shape) > 1 else 1 - outputs = [predict('{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file),traindata[targetdata==1],model_g=self.model_g,clf=self.clf), - predict('{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file),traindata[targetdata==0],model_g=self.model_g,clf=self.clf)] - #outputs = [predict('/afs/cern.ch/work/j/jpavezse/private/{0}_F0_F1.pkl'.format(self.model_file),traindata[targetdata==1],model_g=self.model_g), - # predict('/afs/cern.ch/work/j/jpavezse/private/{0}_F0_F1.pkl'.format(self.model_file),traindata[targetdata==0],model_g=self.model_g)] - - saveHistos(w,outputs,s_full, bins_full, low_full, high_full,importance_sampling=False) - if self.verbose_printing == True: - printFrame(w,['scoref'],[w.function('sighistpdf_F0_F1'),w.function('bkghistpdf_F0_F1')], makePlotName('full','all',type='hist',dir=self.dir,c1_g=self.c1_g,model_g=self.model_g),['signal','bkg'], - dir=self.dir,model_g=self.model_g,y_text='score(x)',print_pdf=True,title='Pairwise score distributions') - - #w.writeToFile('{0}/{1}'.format('/afs/cern.ch/work/j/jpavezse/private',self.workspace)) - w.writeToFile('{0}/{1}'.format(self.dir,self.workspace)) - - w.Print() - - # To calculate the ratio between single functions - def singleRatio(self,f0,f1): - ratio = f1 / f0 - ratio[np.abs(ratio) == np.inf] = 0 - ratio[np.isnan(ratio)] = 0 - return ratio - - def evalDist(self,x,f0,val): - iter = 
x.createIterator() - v = iter.Next() - i = 0 - while v: - v.setVal(val[i]) - v = iter.Next() - i = i+1 - return f0.getVal(x) - - # To calculate the ratio between single functions - def __regFunc(self,x,f0,f1,val): - iter = x.createIterator() - v = iter.Next() - i = 0 - while v: - v.setVal(val[i]) - v = iter.Next() - i = i+1 - if (f0.getVal(x) + f1.getVal(x)) == 0.: - return 0. - return f1.getVal(x) / (f0.getVal(x) + f1.getVal(x)) - - - def evaluateDecomposedRatio(self,w,evalData,x=None,plotting=True, roc=False,gridsize=None,c0arr=None, c1arr=None,true_dist=False,pre_evaluation=None,pre_dist=None,data_type='test',debug=False,cross_section=None,indexes=None): - ''' - Compute composed ratio for dataset 'evalData'. - Single ratios can be precomputed in pre_evaluation - ''' + w = ROOT.RooWorkspace('w') + w.Print() - # pair-wise ratios - # and decomposition computation - #f = ROOT.TFile('{0}/{1}'.format(self.dir,self.workspace)) - #w = f.Get('w') - #f.Close() - - if indexes == None: - indexes = self.basis_indexes - - score = ROOT.RooArgSet(w.var('score')) - npoints = evalData.shape[0] - fullRatios = np.zeros(npoints) - fullRatiosReal = np.zeros(npoints) - c0arr = self.c0 if c0arr == None else c0arr - c1arr = self.c1 if c1arr == None else c1arr - - true_score = [] - train_score = [] - all_targets = [] - all_positions = [] - all_ratios = [] - for k,c in enumerate(c0arr): - innerRatios = np.zeros(npoints) - innerTrueRatios = np.zeros(npoints) - if c == 0: - continue - for j,c_ in enumerate(c1arr): - index_k, index_j = (indexes[k],indexes[j]) - f0pdf = w.function('bkghistpdf_{0}_{1}'.format(index_k,index_j)) - f1pdf = w.function('sighistpdf_{0}_{1}'.format(index_k,index_j)) - if index_k<>index_j: - if pre_evaluation == None: - traindata = evalData - if self.preprocessing == True: - traindata = preProcessing(evalData,self.dataset_names[min(index_k,index_j)], - self.dataset_names[max(index_k,index_j)],self.scaler) - outputs = predict('{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file,k,j),traindata,model_g=self.model_g,clf=self.clf) - #outputs = predict('/afs/cern.ch/work/j/jpavezse/private/{0}_{1}_{2}.pkl'.format(self.model_file,index_k, - #index_j),traindata,model_g=self.model_g) - f0pdfdist = np.array([self.evalDist(score,f0pdf,[xs]) for xs in outputs]) - f1pdfdist = np.array([self.evalDist(score,f1pdf,[xs]) for xs in outputs]) - else: - f0pdfdist = pre_evaluation[0][index_k][index_j] - f1pdfdist = pre_evaluation[1][index_k][index_j] - if f0pdfdist == None or f1pdfdist == None: - pdb.set_trace() - pdfratios = self.singleRatio(f0pdfdist,f1pdfdist) - else: - pdfratios = np.ones(npoints) - all_ratios.append(pdfratios) - innerRatios += (c_/c) * pdfratios - if true_dist == True: - if pre_dist == None: - f0 = w.pdf('f{0}'.format(index_k)) - f1 = w.pdf('f{0}'.format(index_j)) - if len(evalData.shape) > 1: - f0dist = np.array([self.evalDist(x,f0,xs) for xs in evalData]) - f1dist = np.array([self.evalDist(x,f1,xs) for xs in evalData]) - else: - f0dist = np.array([self.evalDist(x,f0,[xs]) for xs in evalData]) - f1dist = np.array([self.evalDist(x,f1,[xs]) for xs in evalData]) - else: - f0dist = pre_dist[0][index_k][index_j] - f1dist = pre_dist[1][index_k][index_j] - ratios = self.singleRatio(f0dist, f1dist) - innerTrueRatios += (c_/c) * ratios - # ROC curves for pair-wise ratios - if (roc == True or plotting==True) and k < j: - all_positions.append((k,j)) - if roc == True: - if self.dataset_names <> None: - name_k, name_j = (self.dataset_names[index_k], 
self.dataset_names[index_j]) - else: - name_k, name_j = (index_k,index_j) - testdata, testtarget = loadData(data_type,name_k,name_j,dir=self.dir,c1_g=self.c1_g, - preprocessing=self.preprocessing, scaler=self.scaler) - else: - testdata = evalData - size2 = testdata.shape[1] if len(testdata.shape) > 1 else 1 - outputs = predict('{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file,k,j),testdata,model_g=self.model_g,clf=self.clf) - #outputs = predict('/afs/cern.ch/work/j/jpavezse/private/{0}_{1}_{2}.pkl'.format(self.model_file,index_k, - # index_j),testdata.reshape(testdata.shape[0],size2),model_g=self.model_g) - f0pdfdist = np.array([self.evalDist(score,f0pdf,[xs]) for xs in outputs]) - f1pdfdist = np.array([self.evalDist(score,f1pdf,[xs]) for xs in outputs]) - clfRatios = self.singleRatio(f0pdfdist,f1pdfdist) - train_score.append(clfRatios) - if roc == True: - all_targets.append(testtarget) - #individual ROC - #makeROC(clfRatios, testtarget,makePlotName('dec','train',k,j,type='roc',dir=self.dir, - #model_g=self.model_g,c1_g=self.c1_g),dir=self.dir,model_g=self.model_g) - if true_dist == True: - if len(evalData.shape) > 1: - f0dist = np.array([self.evalDist(x,f0,xs) for xs in testdata]) - f1dist = np.array([self.evalDist(x,f1,xs) for xs in testdata]) + print 'Generating Score Histograms' + + w.factory('score[{0},{1}]'.format(low, high)) + s = w.var('score') + + bins_full = 40 + low_full = 0. + high_full = 1. + w.factory('scoref[{0},{1}]'.format(low_full, high_full)) + s_full = w.var('scoref') + + histos = [] + histos_names = [] + inv_histos = [] + inv_histos_names = [] + sums_histos = [] + + def saveHistos( + w, + outputs, + s, + bins, + low, + high, + pos=None): + if pos is not None: + k, j = pos else: - f0dist = np.array([self.evalDist(x,f0,[xs]) for xs in testdata]) - f1dist = np.array([self.evalDist(x,f1,[xs]) for xs in testdata]) - - trRatios = self.singleRatio(f0dist,f1dist) - - true_score.append(trRatios) - - # makeROC(trRatios, testtarget, makePlotName('dec','truth',k,j,type='roc', - # dir=self.dir,model_g=self.model_g,c1_g=self.c1_g),dir=self.dir,model_g=self.model_g) - - - innerRatios = 1./innerRatios - innerRatios[np.abs(innerRatios) == np.inf] = 0. - fullRatios += innerRatios - if true_dist == True: - innerTrueRatios = 1./innerTrueRatios - innerTrueRatios[np.abs(innerTrueRatios) == np.inf] = 0. 
- fullRatiosReal += innerTrueRatios - if roc == True: - for ind in range(1,(len(train_score)/3+1)): - print_scores = train_score[(ind-1)*3:(ind-1)*3+3] - print_targets = all_targets[(ind-1)*3:(ind-1)*3+3] - print_positions = all_positions[(ind-1)*3:(ind-1)*3+3] - if true_dist == True: - makeMultiROC(print_scores, print_targets,makePlotName('all{0}'.format(ind-1),'comparison',type='roc', - dir=self.dir,model_g=self.model_g,c1_g=self.c1_g),dir=self.dir,model_g=self.model_g, - true_score = true_score,print_pdf=True,title='ROC for pairwise trained classifier',pos=print_positions) - else: - makeMultiROC(print_scores, print_targets,makePlotName('all{0}'.format(ind-1),'comparison',type='roc', - dir=self.dir,model_g=self.model_g,c1_g=self.c1_g),dir=self.dir,model_g=self.model_g, - print_pdf=True,title='ROC for pairwise trained classifier',pos=print_positions) + k, j = ('F0', 'F1') + print 'Estimating {0} {1}'.format(k, j) + for l, name in enumerate(['sig', 'bkg']): + data = ROOT.RooDataSet( + '{0}data_{1}_{2}'.format( + name, k, j), "data", ROOT.RooArgSet(s)) + hist = ROOT.TH1F( + '{0}hist_{1}_{2}'.format( + name, k, j), 'hist', bins, low, high) + values = outputs[l] + for val in values: + hist.Fill(val) + s.setVal(val) + data.add(ROOT.RooArgSet(s)) + norm = 1. / hist.Integral() + hist.Scale(norm) + + s.setBins(bins) + datahist = ROOT.RooDataHist( + '{0}datahist_{1}_{2}'.format( + name, k, j), 'hist', ROOT.RooArgList(s), hist) + + histpdf = ROOT.RooHistFunc('{0}histpdf_{1}_{2}'.format( + name, k, j), 'hist', ROOT.RooArgSet(s), datahist, 1) + + getattr(w, 'import')(hist) + getattr(w, 'import')(data) + getattr(w, 'import')(datahist) + getattr(w, 'import')(histpdf) + score_str = 'scoref' if pos is None else 'score' + # Calculate the density of the classifier output using kernel density + # w.factory('KeysPdf::{0}dist_{1}_{2}({3},{0}data_{1}_{2},RooKeysPdf::NoMirror,2)'.format(name,k,j,score_str)) + + # Print histograms pdfs and estimated densities + if self.verbose_printing and name == 'bkg' and k != j: + full = 'full' if pos is None else 'dec' + if k < j and k != 'F0': + histos.append([w.function('sighistpdf_{0}_{1}'.format( + k, j)), w.function('bkghistpdf_{0}_{1}'.format(k, j))]) + histos_names.append( + ['f{0}-f{1}_f{1}(signal)'.format(k, j), 'f{0}-f{1}_f{0}(background)'.format(k, j)]) + if j < k and k != 'F0': + inv_histos.append([w.function('sighistpdf_{0}_{1}'.format( + k, j)), w.function('bkghistpdf_{0}_{1}'.format(k, j))]) + inv_histos_names.append( + ['f{0}-f{1}_f{1}(signal)'.format(k, j), 'f{0}-f{1}_f{0}(background)'.format(k, j)]) - if plotting == True: - saveMultiFig(evalData,[x for x in zip(train_score,true_score)], - makePlotName('all_dec','train',type='ratio'),labels=[['f0-f1(trained)','f0-f1(truth)'],['f0-f2(trained)','f0-f2(truth)'],['f1-f2(trained)','f1-f2(truth)']],title='Pairwise Ratios',print_pdf=True,dir=self.dir) + for k in range(self.nsamples): + for j in range(self.nsamples): + if k == j: + continue + if self.dataset_names is not None: + name_k, name_j = ( + self.dataset_names[k], self.dataset_names[j]) + else: + name_k, name_j = (k, j) + print 'Loading {0}:{1} {2}:{3}'.format(k, name_k, j, name_j) + traindata, targetdata = loadData(data_file, name_k, name_j, dir=self.dir, c1_g=self.c1_g) + + numtrain = traindata.shape[0] + size2 = traindata.shape[1] if len(traindata.shape) > 1 else 1 + output = [ + predict( + '{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format( + self.dir, self.model_g, self.c1_g, self.model_file, k, j), traindata[ + targetdata == 1], model_g=self.model_g, 
clf=self.clf), predict( + '{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format( + self.dir, self.model_g, self.c1_g, self.model_file, k, j), traindata[ + targetdata == 0], model_g=self.model_g, clf=self.clf)] + saveHistos(w, output, s, bins, low, high, (k, j)) + w.writeToFile('{0}/{1}'.format(self.dir, self.workspace)) + + if self.verbose_printing: + for ind in range(1, (len(histos) / 3 + 1)): + print_histos = histos[(ind - 1) * 3:(ind - 1) * 3 + 3] + print_histos_names = histos_names[ + (ind - 1) * 3:(ind - 1) * 3 + 3] + printMultiFrame( + w, + ['score'] * len(print_histos), + print_histos, + makePlotName( + 'dec{0}'.format( + ind - 1), + 'all', + type='hist', + dir=self.dir, + c1_g=self.c1_g, + model_g=self.model_g), + print_histos_names, + dir=self.dir, + model_g=self.model_g, + y_text='score(x)', + print_pdf=True, + title='Pairwise score distributions') + # Full model + traindata, targetdata = loadData(data_file, self.F0_dist, self.F1_dist, dir=self.dir, c1_g=self.c1_g) + numtrain = traindata.shape[0] + size2 = traindata.shape[1] if len(traindata.shape) > 1 else 1 + outputs = [predict('{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(self.dir, + self.model_g, + self.c1_g, + self.model_file), + traindata[targetdata == 1], + model_g=self.model_g, + clf=self.clf), + predict('{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(self.dir, + self.model_g, + self.c1_g, + self.model_file), + traindata[targetdata == 0], + model_g=self.model_g, + clf=self.clf)] + saveHistos( + w, + outputs, + s_full, + bins_full, + low_full, + high_full) + if self.verbose_printing: + printFrame(w, + ['scoref'], + [w.function('sighistpdf_F0_F1'), + w.function('bkghistpdf_F0_F1')], + makePlotName('full', + 'all', + type='hist', + dir=self.dir, + c1_g=self.c1_g, + model_g=self.model_g), + ['signal', + 'bkg'], + dir=self.dir, + model_g=self.model_g, + y_text='score(x)', + print_pdf=True, + title='Pairwise score distributions') + + w.writeToFile('{0}/{1}'.format(self.dir, self.workspace)) + + w.Print() + + # To calculate the ratio between single functions + def singleRatio(self, f0, f1): + ratio = f1 / f0 + ratio[~np.isfinite(ratio)] = 0 + return ratio + + def evalDist(self, x, f0, val): + iter = x.createIterator() + v = iter.Next() + i = 0 + while v: + v.setVal(val[i]) + v = iter.Next() + i = i + 1 + return f0.getVal(x) + + # To calculate the ratio between single functions + def __regFunc(self, x, f0, f1, val): + iter = x.createIterator() + v = iter.Next() + i = 0 + while v: + v.setVal(val[i]) + v = iter.Next() + i = i + 1 + if (f0.getVal(x) + f1.getVal(x)) == 0.: + return 0. + return f1.getVal(x) / (f0.getVal(x) + f1.getVal(x)) + + def evaluateDecomposedRatio( + self, + w, + evalData, + x=None, + roc=False, + c0arr=None, + c1arr=None, + true_dist=False, + pre_evaluation=None, + pre_dist=None, + data_type='test', + indexes=None, + plotting=None): + ''' + Compute composed ratio for dataset 'evalData'. 
+ Single ratios can be precomputed in pre_evaluation + ''' + + plotting = plotting if plotting <> None else self.verbose_printing + if indexes is None: + indexes = self.basis_indexes + + score = ROOT.RooArgSet(w.var('score')) + npoints = evalData.shape[0] + fullRatios = np.zeros(npoints) + fullRatiosReal = np.zeros(npoints) + c0arr = self.c0 if c0arr is None else c0arr + c1arr = self.c1 if c1arr is None else c1arr + + true_score = [] + train_score = [] + all_targets = [] + all_positions = [] + all_ratios = [] + for k, c in enumerate(c0arr): + innerRatios = np.zeros(npoints) + innerTrueRatios = np.zeros(npoints) + if c == 0: + continue + for j, c_ in enumerate(c1arr): + index_k, index_j = (indexes[k], indexes[j]) + f0pdf = w.function( + 'bkghistpdf_{0}_{1}'.format( + index_k, index_j)) + f1pdf = w.function( + 'sighistpdf_{0}_{1}'.format( + index_k, index_j)) + if index_k != index_j: + if pre_evaluation is None: + traindata = evalData + outputs = predict( + '{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format( + self.dir, + self.model_g, + self.c1_g, + self.model_file, + k, + j), + traindata, + model_g=self.model_g, + clf=self.clf) + + f0pdfdist = np.array( + [self.evalDist(score, f0pdf, [xs]) for xs in outputs]) + f1pdfdist = np.array( + [self.evalDist(score, f1pdf, [xs]) for xs in outputs]) + else: + f0pdfdist = pre_evaluation[0][index_k][index_j] + f1pdfdist = pre_evaluation[1][index_k][index_j] + pdfratios = self.singleRatio(f0pdfdist, f1pdfdist) + else: + pdfratios = np.ones(npoints) + all_ratios.append(pdfratios) + innerRatios += (c_ / c) * pdfratios + if true_dist: + if pre_dist is None: + f0 = w.pdf('f{0}'.format(index_k)) + f1 = w.pdf('f{0}'.format(index_j)) + if len(evalData.shape) > 1: + f0dist = np.array( + [self.evalDist(x, f0pdf, xs) for xs in evalData]) + f1dist = np.array( + [self.evalDist(x, f1pdf, xs) for xs in evalData]) + else: + f0dist = np.array( + [self.evalDist(x, f0pdf, [xs]) for xs in evalData]) + f1dist = np.array( + [self.evalDist(x, f1pdf, [xs]) for xs in evalData]) + else: + f0dist = pre_dist[0][index_k][index_j] + f1dist = pre_dist[1][index_k][index_j] + ratios = self.singleRatio(f0dist, f1dist) + innerTrueRatios += (c_ / c) * ratios + # ROC curves for pair-wise ratios + if (roc or plotting) and k < j: + all_positions.append((k, j)) + if roc: + if self.dataset_names is not None: + name_k, name_j = ( + self.dataset_names[index_k], self.dataset_names[index_j]) + else: + name_k, name_j = (index_k, index_j) + testdata, testtarget = loadData(data_type, name_k, name_j, dir=self.dir, c1_g=self.c1_g) + else: + testdata = evalData + size2 = testdata.shape[1] if len(testdata.shape) > 1 else 1 + outputs = predict( + '{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format( + self.dir, + self.model_g, + self.c1_g, + self.model_file, + k, + j), + testdata, + model_g=self.model_g, + clf=self.clf) + f0pdfdist = np.array( + [self.evalDist(score, f0pdf, [xs]) for xs in outputs]) + f1pdfdist = np.array( + [self.evalDist(score, f1pdf, [xs]) for xs in outputs]) + clfRatios = self.singleRatio(f0pdfdist, f1pdfdist) + train_score.append(clfRatios) + if roc: + all_targets.append(testtarget) + if true_dist: + if len(evalData.shape) > 1: + f0dist = np.array( + [self.evalDist(x, f0pdf, xs) for xs in testdata]) + f1dist = np.array( + [self.evalDist(x, f1pdf, xs) for xs in testdata]) + else: + f0dist = np.array( + [self.evalDist(x, f0pdf, [xs]) for xs in testdata]) + f1dist = np.array( + [self.evalDist(x, f1pdf, [xs]) for xs in testdata]) + + trRatios = self.singleRatio(f0dist, f1dist) + + 
true_score.append(trRatios) + + innerRatios = 1. / innerRatios + innerRatios[np.abs(innerRatios) == np.inf] = 0. + fullRatios += innerRatios + if true_dist: + innerTrueRatios = 1. / innerTrueRatios + innerTrueRatios[np.abs(innerTrueRatios) == np.inf] = 0. + fullRatiosReal += innerTrueRatios + if roc: + for ind in range(1, (len(train_score) / 3 + 1)): + print_scores = train_score[(ind - 1) * 3:(ind - 1) * 3 + 3] + print_targets = all_targets[(ind - 1) * 3:(ind - 1) * 3 + 3] + print_positions = all_positions[ + (ind - 1) * 3:(ind - 1) * 3 + 3] + if true_dist: + makeMultiROC( + print_scores, + print_targets, + makePlotName( + 'all{0}'.format( + ind - 1), + 'comparison', + type='roc', + dir=self.dir, + model_g=self.model_g, + c1_g=self.c1_g), + dir=self.dir, + model_g=self.model_g, + true_score=true_score, + print_pdf=True, + title='ROC for pairwise trained classifier', + pos=print_positions) + else: + makeMultiROC( + print_scores, + print_targets, + makePlotName( + 'all{0}'.format( + ind - 1), + 'comparison', + type='roc', + dir=self.dir, + model_g=self.model_g, + c1_g=self.c1_g), + dir=self.dir, + model_g=self.model_g, + print_pdf=True, + title='ROC for pairwise trained classifier', + pos=print_positions) + + return fullRatios, fullRatiosReal + + def findOutliers(self, x): + q5, q95 = np.percentile(x, [5, 95]) + iqr = 2.0 * (q95 - q5) + outliers = (x <= q95 + iqr) & (x >= q5 - iqr) + return outliers + + def computeRatios(self, true_dist=False, vars_g=None, + data_file='test', use_log=False): + ''' + Use the computed score densities to compute + the decomposed ratio test. + set true_dist to True if workspace have the true distributions to + make plots, in that case vars_g also must be provided + Final result is histogram for ratios and signal - bkf rejection curves + ''' + + f = ROOT.TFile('{0}/{1}'.format(self.dir, self.workspace)) + w = f.Get('w') + f.Close() + + c1 = self.c1 + c0 = self.c0 + c1 = c1 / c1.sum() + c0 = c0 / c0.sum() + + print 'Calculating ratios' + + npoints = 50 + + if true_dist: + vars = ROOT.TList() + for var in vars_g: + vars.Add(w.var(var)) + x = ROOT.RooArgSet(vars) + + if use_log: + evaluateRatio = self.evaluateLogDecomposedRatio + post = 'log' + else: + evaluateRatio = self.evaluateDecomposedRatio + post = '' - return fullRatios,fullRatiosReal + score = ROOT.RooArgSet(w.var('score')) + scoref = ROOT.RooArgSet(w.var('scoref')) - def findOutliers(self, x): - q5, q95 = np.percentile(x, [5,95]) - iqr = 2.0*(q95 - q5) - outliers = (x <= q95 + iqr) & (x >= q5 - iqr) - return outliers + if use_log: + getRatio = self.singleLogRatio + else: + getRatio = self.singleRatio + + # NN trained on complete model + F0pdf = w.function('bkghistpdf_F0_F1') + F1pdf = w.function('sighistpdf_F0_F1') + + # TODO Here assuming that signal is first dataset + testdata, testtarget = loadData( + data_file, self.F0_dist, 0, dir=self.dir, c1_g=self.c1_g) + if len(vars_g) == 1: + xarray = np.linspace(0, 5, npoints) + fullRatios, _ = evaluateRatio( + w, xarray, x=x, roc=False, true_dist=True) + + F1dist = np.array([self.evalDist(x, w.pdf('F1'), [xs]) + for xs in xarray]) + F0dist = np.array([self.evalDist(x, w.pdf('F0'), [xs]) + for xs in xarray]) + y2 = getRatio(F1dist, F0dist) + + # NN trained on complete model + outputs = predict( + '{0}/model/{1}/{2}/adaptive_F0_F1.pkl'.format( + self.dir, self.model_g, self.c1_g), xarray.reshape( + xarray.shape[0], 1), model_g=self.model_g, clf=self.clf) + F1fulldist = np.array( + [self.evalDist(scoref, F1pdf, [xs]) for xs in outputs]) + F0fulldist = np.array( + 
[self.evalDist(scoref, F0pdf, [xs]) for xs in outputs]) + + pdfratios = getRatio(F1fulldist, F0fulldist) + + saveFig(xarray, + [fullRatios, + y2, + pdfratios], + makePlotName('all', + 'train', + type='ratio' + post), + title='Likelihood Ratios', + labels=['Approx. Decomposed', + 'Exact', + 'Approx.'], + print_pdf=True, + dir=self.dir) + + if true_dist: + decomposedRatio, _ = evaluateRatio( + w, testdata, x=x, roc=self.verbose_printing, true_dist=True) + else: + decomposedRatio, _ = evaluateRatio( + w, testdata, c0arr=c0, c1arr=c1, roc=True, data_type=data_file) + if len(testdata.shape) > 1: + outputs = predict( + '{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format( + self.dir, + self.model_g, + self.c1_g, + self.model_file), + testdata, + model_g=self.model_g, + clf=self.clf) - def computeRatios(self,true_dist=False, vars_g=None, - data_file='test',use_log=False): - ''' - Use the computed score densities to compute - the decomposed ratio test. - set true_dist to True if workspace have the true distributions to - make plots, in that case vars_g also must be provided - Final result is histogram for ratios and signal - bkf rejection curves - ''' + else: + outputs = predict( + '{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format( + self.dir, self.model_g, self.c1_g, self.model_file), testdata.reshape( + testdata.shape[0], 1), model_g=self.model_g, clf=self.clf) + + F1fulldist = np.array( + [self.evalDist(scoref, F1pdf, [xs]) for xs in outputs]) + F0fulldist = np.array( + [self.evalDist(scoref, F0pdf, [xs]) for xs in outputs]) + + completeRatio = getRatio(F1fulldist, F0fulldist) + if true_dist: + if len(testdata.shape) > 1: + F1dist = np.array([self.evalDist(x, w.pdf('F1'), xs) + for xs in testdata]) + F0dist = np.array([self.evalDist(x, w.pdf('F0'), xs) + for xs in testdata]) + else: + F1dist = np.array( + [self.evalDist(x, w.pdf('F1'), [xs]) for xs in testdata]) + F0dist = np.array( + [self.evalDist(x, w.pdf('F0'), [xs]) for xs in testdata]) + + realRatio = getRatio(F1dist, F0dist) + + decomposed_target = testtarget + complete_target = testtarget + real_target = testtarget + + # Removing outliers + numtest = decomposedRatio.shape[0] + + if true_dist: + real_outliers = np.zeros(numtest, dtype=bool) + real_outliers = self.findOutliers(realRatio) + + + all_ratios_plots = [] + all_names_plots = [] + bins = 70 + low = 0.6 + high = 1.2 + if use_log: + low = -1.0 + high = 1.0 + low = [] + high = [] + low = [] + high = [] + ratios_vars = [] + for l, name in enumerate(['sig', 'bkg']): + if true_dist: + ratios_names = ['true', 'full', 'composed'] + ratios_vec = [realRatio, completeRatio, decomposedRatio] + target_vec = [real_target, complete_target, decomposed_target] + + minimum = min([realRatio[real_target == 1 - l].min(), + completeRatio[complete_target == 1 - l].min(), + decomposedRatio[decomposed_target == 1 - l].min()]) + maximum = max([realRatio[real_target == 1 - l].max(), + completeRatio[complete_target == 1 - l].max(), + decomposedRatio[decomposed_target == 1 - l].max()]) - f = ROOT.TFile('{0}/{1}'.format(self.dir,self.workspace)) - w = f.Get('w') - f.Close() + else: + ratios_names = ['full', 'composed'] + ratios_vec = [completeRatio, decomposedRatio] + target_vec = [complete_target, decomposed_target] + minimum = min([completeRatio[complete_target == 1 - l].min(), + decomposedRatio[decomposed_target == 1 - l].min()]) + maximum = max([completeRatio[complete_target == 1 - l].max(), + decomposedRatio[decomposed_target == 1 - l].max()]) + + low.append(minimum - ((maximum - minimum) / bins) * 10) + high.append(maximum 
+ ((maximum - minimum) / bins) * 10) + w.factory('ratio{0}[{1},{2}]'.format(name, low[l], high[l])) + ratios_vars.append(w.var('ratio{0}'.format(name))) + for curr, curr_ratios, curr_targets in zip( + ratios_names, ratios_vec, target_vec): + numtest = curr_ratios.shape[0] + for l, name in enumerate(['sig', 'bkg']): + hist = ROOT.TH1F( + '{0}_{1}hist_F0_f0'.format( + curr, name), 'hist', bins, low[l], high[l]) + for val in curr_ratios[curr_targets == 1 - l]: + hist.Fill(val) + datahist = ROOT.RooDataHist( + '{0}_{1}datahist_F0_f0'.format( + curr, name), 'hist', ROOT.RooArgList( + ratios_vars[l]), hist) + ratios_vars[l].setBins(bins) + histpdf = ROOT.RooHistFunc( + '{0}_{1}histpdf_F0_f0'.format( + curr, name), 'hist', ROOT.RooArgSet( + ratios_vars[l]), datahist, 0) + + histpdf.specialIntegratorConfig( + ROOT.kTRUE).method1D().setLabel('RooBinIntegrator') + getattr(w, 'import')(hist) + getattr(w, 'import')(datahist) + getattr(w, 'import')(histpdf) + if name == 'bkg': + all_ratios_plots.append([w.function('{0}_sighistpdf_F0_f0'.format( + curr)), w.function('{0}_bkghistpdf_F0_f0'.format(curr))]) + all_names_plots.append( + ['sig_{0}'.format(curr), 'bkg_{0}'.format(curr)]) + + all_ratios_plots = [[all_ratios_plots[j][i] for j, _ in enumerate( + all_ratios_plots)] for i, _ in enumerate(all_ratios_plots[0])] + all_names_plots = [[all_names_plots[j][i] for j, _ in enumerate( + all_names_plots)] for i, _ in enumerate(all_names_plots[0])] + + printMultiFrame(w, + ['ratiosig', + 'ratiobkg'], + all_ratios_plots, + makePlotName('ratio', + 'comparison', + type='hist' + post, + dir=self.dir, + model_g=self.model_g, + c1_g=self.c1_g), + all_names_plots, + setLog=True, + dir=self.dir, + model_g=self.model_g, + y_text='Count', + title='Histograms for ratios', + x_text='ratio value', + print_pdf=True) + + + if use_log: + decomposedRatio = np.exp(decomposedRatio) + completeRatio = np.exp(completeRatio) + if true_dist: + realRatio = np.exp(realRatio) + if true_dist: + ratios_list = [decomposedRatio / decomposedRatio.max(), + completeRatio / completeRatio.max(), + realRatio / realRatio.max()] + targets_list = [decomposed_target, complete_target, real_target] + legends_list = ['Approx. 
Decomposed', 'Approx.', 'Exact'] + else: - - #TODO: This are Harcoded for now - c1 = self.c1 - c0 = self.c0 - #c1 = np.multiply(c1, self.cross_section) - c1 = c1/c1.sum() - c0 = c0/c0.sum() - - print 'Calculating ratios' - - npoints = 50 - - if true_dist == True: - vars = ROOT.TList() - for var in vars_g: - vars.Add(w.var(var)) - x = ROOT.RooArgSet(vars) - - if use_log == True: - evaluateRatio = self.evaluateLogDecomposedRatio - post = 'log' - else: - evaluateRatio = self.evaluateDecomposedRatio - post = '' - - score = ROOT.RooArgSet(w.var('score')) - scoref = ROOT.RooArgSet(w.var('scoref')) - - if use_log == True: - getRatio = self.singleLogRatio - else: - getRatio = self.singleRatio - - if self.preprocessing == True: - if self.scaler == None: - self.scaler = {} - for k in range(self.nsamples): - for j in range(self.nsamples): - if k < j: - self.scaler[(k,j)] = joblib.load('{0}/model/{1}/{2}/{3}_{4}_{5}.dat'.format(self.dir,'mlp',self.c1_g,'scaler',self.dataset_names[k],self.dataset_names[j])) - - - # NN trained on complete model - F0pdf = w.function('bkghistpdf_F0_F1') - F1pdf = w.function('sighistpdf_F0_F1') - - # TODO Here assuming that signal is first dataset - testdata, testtarget = loadData(data_file,self.F0_dist,0,dir=self.dir,c1_g=self.c1_g,preprocessing=False) - if len(vars_g) == 1: - xarray = np.linspace(0,5,npoints) - fullRatios,_ = evaluateRatio(w,xarray,x=x,plotting=True,roc=False,true_dist=True) - - F1dist = np.array([self.evalDist(x,w.pdf('F1'),[xs]) for xs in xarray]) - F0dist = np.array([self.evalDist(x,w.pdf('F0'),[xs]) for xs in xarray]) - y2 = getRatio(F1dist, F0dist) - - # NN trained on complete model - outputs = predict('{0}/model/{1}/{2}/adaptive_F0_F1.pkl'.format(self.dir,self.model_g,self.c1_g),xarray.reshape(xarray.shape[0],1),model_g=self.model_g,clf=self.clf) - F1fulldist = np.array([self.evalDist(scoref,F1pdf,[xs]) for xs in outputs]) - F0fulldist = np.array([self.evalDist(scoref,F0pdf,[xs]) for xs in outputs]) - - pdfratios = getRatio(F1fulldist, F0fulldist) - - saveFig(xarray, [fullRatios, y2, pdfratios], makePlotName('all','train',type='ratio'+post),title='Likelihood Ratios',labels=['Composed trained', 'True', 'Full trained'],print_pdf=True,dir=self.dir) - - if true_dist == True: - decomposedRatio,_ = evaluateRatio(w,testdata,x=x,plotting=False,roc=self.verbose_printing,true_dist=True) - else: - decomposedRatio,_ = evaluateRatio(w,testdata,c0arr=c0,c1arr=c1,plotting=True, - roc=True,data_type=data_file) - if len(testdata.shape) > 1: - outputs = predict('{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file),testdata,model_g=self.model_g,clf=self.clf) - #outputs = predict('/afs/cern.ch/work/j/jpavezse/private/{0}_F0_F1.pkl'.format(self.model_file),testdata,model_g=self.model_g) - - else: - outputs = predict('{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file),testdata.reshape(testdata.shape[0],1),model_g=self.model_g,clf=self.clf) - - F1fulldist = np.array([self.evalDist(scoref,F1pdf,[xs]) for xs in outputs]) - F0fulldist = np.array([self.evalDist(scoref,F0pdf,[xs]) for xs in outputs]) - - completeRatio = getRatio(F1fulldist,F0fulldist) - if true_dist == True: - if len(testdata.shape) > 1: - F1dist = np.array([self.evalDist(x,w.pdf('F1'),xs) for xs in testdata]) - F0dist = np.array([self.evalDist(x,w.pdf('F0'),xs) for xs in testdata]) - else: - F1dist = np.array([self.evalDist(x,w.pdf('F1'),[xs]) for xs in testdata]) - F0dist = np.array([self.evalDist(x,w.pdf('F0'),[xs]) for xs in 
testdata]) - - realRatio = getRatio(F1dist,F0dist) - - decomposed_target = testtarget - complete_target = testtarget - real_target = testtarget - #Histogram F0-f0 for composed, full and true - - # Removing outliers - numtest = decomposedRatio.shape[0] - #decomposedRatio[decomposedRatio < 0.] = completeRatio[decomposedRatio < 0.] - - #decomposed_outliers = np.zeros(numtest,dtype=bool) - #complete_outliers = np.zeros(numtest,dtype=bool) - #decomposed_outliers = self.findOutliers(decomposedRatio) - #complete_outliers = self.findOutliers(completeRatio) - #decomposed_target = testtarget[decomposed_outliers] - #complete_target = testtarget[complete_outliers] - #decomposedRatio = decomposedRatio[decomposed_outliers] - #completeRatio = completeRatio[complete_outliers] - if true_dist == True: - real_outliers = np.zeros(numtest,dtype=bool) - real_outliers = self.findOutliers(realRatio) - #real_target = testtarget[real_outliers] - #realRatio = realRatio[real_outliers] - - all_ratios_plots = [] - all_names_plots = [] - bins = 70 - low = 0.6 - high = 1.2 - if use_log == True: - low = -1.0 - high = 1.0 - low = [] - high = [] - low = [] - high = [] - ratios_vars = [] - for l,name in enumerate(['sig','bkg']): - if true_dist == True: - ratios_names = ['truth','full','composed'] - ratios_vec = [realRatio, completeRatio, decomposedRatio] - target_vec = [real_target, complete_target, decomposed_target] - - minimum = min([realRatio[real_target == 1-l].min(), - completeRatio[complete_target == 1-l].min(), - decomposedRatio[decomposed_target == 1-l].min()]) - maximum = max([realRatio[real_target == 1-l].max(), - completeRatio[complete_target == 1-l].max(), - decomposedRatio[decomposed_target == 1-l].max()]) - - else: - ratios_names = ['full','composed'] - ratios_vec = [completeRatio, decomposedRatio] - target_vec = [complete_target, decomposed_target] - minimum = min([completeRatio[complete_target == 1-l].min(), - decomposedRatio[decomposed_target == 1-l].min()]) - maximum = max([completeRatio[complete_target == 1-l].max(), - decomposedRatio[decomposed_target == 1-l].max()]) - - low.append(minimum - ((maximum - minimum) / bins)*10) - high.append(maximum + ((maximum - minimum) / bins)*10) - w.factory('ratio{0}[{1},{2}]'.format(name, low[l], high[l])) - ratios_vars.append(w.var('ratio{0}'.format(name))) - for curr, curr_ratios, curr_targets in zip(ratios_names,ratios_vec,target_vec): - numtest = curr_ratios.shape[0] - for l,name in enumerate(['sig','bkg']): - hist = ROOT.TH1F('{0}_{1}hist_F0_f0'.format(curr,name),'hist',bins,low[l],high[l]) - for val in curr_ratios[curr_targets == 1-l]: - hist.Fill(val) - datahist = ROOT.RooDataHist('{0}_{1}datahist_F0_f0'.format(curr,name),'hist', - ROOT.RooArgList(ratios_vars[l]),hist) - ratios_vars[l].setBins(bins) - histpdf = ROOT.RooHistFunc('{0}_{1}histpdf_F0_f0'.format(curr,name),'hist', - ROOT.RooArgSet(ratios_vars[l]), datahist, 0) - - histpdf.specialIntegratorConfig(ROOT.kTRUE).method1D().setLabel('RooBinIntegrator') - getattr(w,'import')(hist) - getattr(w,'import')(datahist) # work around for morph = w.import(morph) - getattr(w,'import')(histpdf) # work around for morph = w.import(morph) - #print '{0} {1} {2}'.format(curr,name,hist.Integral()) - if name == 'bkg': - all_ratios_plots.append([w.function('{0}_sighistpdf_F0_f0'.format(curr)), - w.function('{0}_bkghistpdf_F0_f0'.format(curr))]) - all_names_plots.append(['sig_{0}'.format(curr),'bkg_{0}'.format(curr)]) - - all_ratios_plots = [[all_ratios_plots[j][i] for j,_ in enumerate(all_ratios_plots)] - for i,_ in 
enumerate(all_ratios_plots[0])] - all_names_plots = [[all_names_plots[j][i] for j,_ in enumerate(all_names_plots)] - for i,_ in enumerate(all_names_plots[0])] - - printMultiFrame(w,['ratiosig','ratiobkg'],all_ratios_plots, makePlotName('ratio','comparison',type='hist'+post,dir=self.dir,model_g=self.model_g,c1_g=self.c1_g),all_names_plots,setLog=True,dir=self.dir,model_g=self.model_g,y_text='Count',title='Histograms for ratios',x_text='ratio value',print_pdf=True) - - # scatter plot true ratio - composed - full ratio - - #if self.verbose_printing == True and true_dist == True: - # saveFig(completeRatio,[realRatio], makePlotName('full','train',type='scat'+post,dir=self.dir,model_g=self.model_g,c1_g=self.c1_g),scatter=True,axis=['full trained ratio','true ratio'],dir=self.dir,model_g=self.model_g) - # saveFig(decomposedRatio,[realRatio], makePlotName('comp','train',type='scat'+post,dir=self.dir, model_g=self.model_g, c1_g=self.c1_g),scatter=True, axis=['composed trained ratio','true ratio'],dir=self.dir, model_g=self.model_g) - # signal - bkg rejection plots - if use_log == True: - decomposedRatio = np.exp(decomposedRatio) - completeRatio = np.exp(completeRatio) - if true_dist == True: - realRatio = np.exp(realRatio) - if true_dist == True: - - ratios_list = [decomposedRatio/decomposedRatio.max(), - completeRatio/completeRatio.max(), - realRatio/realRatio.max()] - targets_list = [decomposed_target, complete_target, real_target] - legends_list = ['composed', 'full', 'true'] - else: - - indices = (decomposedRatio > 0.) - decomposedRatio = decomposedRatio[indices] - decomposed_target = decomposed_target[indices] - indices = (completeRatio > 0.) - completeRatio = completeRatio[indices] - complete_target = complete_target[indices] - - completeRatio = np.log(completeRatio) - decomposedRatio = np.log(decomposedRatio) - decomposedRatio = decomposedRatio + np.abs(decomposedRatio.min()) - completeRatio = completeRatio + np.abs(completeRatio.min()) - ratios_list = [decomposedRatio/decomposedRatio.max(), - completeRatio/completeRatio.max()] - targets_list = [decomposed_target, complete_target] - legends_list = ['composed','full'] - makeSigBkg(ratios_list,targets_list,makePlotName('comp','all',type='sigbkg'+post,dir=self.dir, - model_g=self.model_g,c1_g=self.c1_g),dir=self.dir,model_g=self.model_g,print_pdf=True,legends=legends_list,title='Signal-Background rejection curves') - - # Scatter plot to compare regression function and classifier score - if self.verbose_printing == True and true_dist == True: - testdata, testtarget = loadData('test',self.F0_dist,self.F1_dist,dir=self.dir,c1_g=self.c1_g) - if len(testdata.shape) > 1: - reg = np.array([self.__regFunc(x,w.pdf('F0'),w.pdf('F1'),xs) for xs in testdata]) - else: - reg = np.array([self.__regFunc(x,w.pdf('F0'),w.pdf('F1'),[xs]) for xs in testdata]) - if len(testdata.shape) > 1: - outputs = predict('{0}/model/{1}/{2}/adaptive_F0_F1.pkl'.format(self.dir,self.model_g,self.c1_g),testdata.reshape(testdata.shape[0],testdata.shape[1]),model_g=self.model_g, clf=self.clf) - else: - outputs = predict('{0}/model/{1}/{2}/adaptive_F0_F1.pkl'.format(self.dir,self.model_g,self.c1_g),testdata.reshape(testdata.shape[0],1),model_g=self.model_g, clf=self.clf) - #saveFig(outputs,[reg], makePlotName('full','train',type='scat',dir=self.dir,model_g=self.model_g,c1_g=self.c1_g),scatter=True,axis=['score','regression'],dir=self.dir,model_g=self.model_g) - - - - def evalC1Likelihood(self,w,testdata,c0,c1,c_eval=0,c_min=0.01,c_max=0.2,use_log=False,true_dist=False, vars_g=None, 
npoints=50,samples_ids=None,weights_func=None,coef_index=0): - - if true_dist == True: - vars = ROOT.TList() - for var in vars_g: - vars.Add(w.var(var)) - x = ROOT.RooArgSet(vars) - else: - x = None - - score = ROOT.RooArgSet(w.var('score')) - if use_log == True: - evaluateRatio = self.evaluateLogDecomposedRatio - post = 'log' - else: - evaluateRatio = self.evaluateDecomposedRatio - post = '' - - csarray = np.linspace(c_min,c_max,npoints) - decomposedLikelihood = np.zeros(npoints) - trueLikelihood = np.zeros(npoints) - c1s = np.zeros(c0.shape[0]) - pre_pdf = [] - pre_dist = [] - pre_pdf.extend([[],[]]) - pre_dist.extend([[],[]]) - # change this enumerates - for k in enumerate(self.nsamples): - pre_pdf[0].append([]) - pre_pdf[1].append([]) - pre_dist[0].append([]) - pre_dist[1].append([]) - for j in enumerate(self.nsamples): - index_k,index_j = (self.basis_indexes[k],self.basis_indexes[j]) - if k <> j: - f0pdf = w.function('bkghistpdf_{0}_{1}'.format(index_k,index_j)) - f1pdf = w.function('sighistpdf_{0}_{1}'.format(index_k,index_j)) - data = testdata - if self.preprocessing == True: - data = preProcessing(testdata,self.dataset_names[min(index_k,index_j)], - self.dataset_names[max(index_k,index_j)],self.scaler) - outputs = predict('{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(self.dir,self.model_g, - self.c1_g,self.model_file,index_k,index_j),data,model_g=self.model_g, clf=self.clf) - f0pdfdist = np.array([self.evalDist(score,f0pdf,[xs]) for xs in outputs]) - f1pdfdist = np.array([self.evalDist(score,f1pdf,[xs]) for xs in outputs]) - pre_pdf[0][k].append(f0pdfdist) - pre_pdf[1][k].append(f1pdfdist) + indices = (decomposedRatio > 0.) + decomposedRatio = decomposedRatio[indices] + decomposed_target = decomposed_target[indices] + indices = (completeRatio > 0.) + completeRatio = completeRatio[indices] + complete_target = complete_target[indices] + + completeRatio = np.log(completeRatio) + decomposedRatio = np.log(decomposedRatio) + decomposedRatio = decomposedRatio + np.abs(decomposedRatio.min()) + completeRatio = completeRatio + np.abs(completeRatio.min()) + ratios_list = [decomposedRatio / decomposedRatio.max(), + completeRatio / completeRatio.max()] + targets_list = [decomposed_target, complete_target] + legends_list = ['Approx. 
Decomposed', 'Approx.'] + makeSigBkg( + ratios_list, + targets_list, + makePlotName( + 'comp', + 'all', + type='sigbkg' + post, + dir=self.dir, + model_g=self.model_g, + c1_g=self.c1_g), + dir=self.dir, + model_g=self.model_g, + print_pdf=True, + legends=legends_list, + title='Signal-Background rejection curves') + + + def evalC1C2Likelihood( + self, + c0, + c1, + dir='/afs/cern.ch/user/j/jpavezse/systematics', + workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', + c1_g='', + model_g='mlp', + use_log=False, + true_dist=False, + vars_g=None, + clf=None, + verbose_printing=False): + + f = ROOT.TFile('{0}/{1}'.format(dir, workspace)) + w = f.Get('w') + f.Close() + if true_dist: + vars = ROOT.TList() + for var in vars_g: + vars.Add(w.var(var)) + x = ROOT.RooArgSet(vars) else: - pre_pdf[0][k].append(None) - pre_pdf[1][k].append(None) - if true_dist == True: - f0 = w.pdf('f{0}'.format(index_k)) - f1 = w.pdf('f{0}'.format(index_j)) - if len(testdata.shape) > 1: - f0dist = np.array([self.evalDist(x,f0,xs) for xs in testdata]) - f1dist = np.array([self.evalDist(x,f1,xs) for xs in testdata]) - else: - f0dist = np.array([self.evalDist(x,f0,[xs]) for xs in testdata]) - f1dist = np.array([self.evalDist(x,f1,[xs]) for xs in testdata]) - pre_dist[0][k].append(f0dist) - pre_dist[1][k].append(f1dist) - indices = np.ones(testdata.shape[0], dtype=bool) - ratiosList = [] - samples = [] - # This is needed for calibration of full ratios - #for i,sample in enumerate(self.dataset_names): - # samples.append(np.loadtxt('{0}/data/{1}/{2}/{3}_{4}.dat'.format(self.dir,'mlp',self.c1_g,'data',sample))) - - #cross_section = self.cross_section / np.sum(self.cross_section) - n_eff_ratio = np.zeros(csarray.shape[0]) - n_zeros = np.zeros(csarray.shape[0]) - cross_section = None - for i,cs in enumerate(csarray): - if weights_func <> None: - c1s = weights_func(cs,c1[1]) if coef_index == 0 else weights_func(c1[0],cs) - print '{0} {1}'.format(cs, c1[1]) if coef_index == 0 else '{0} {1}'.format(c1[0],cs) - print c1s - else: - c1s[:] = c1[:] - c1s[c_eval] = cs - if self.cross_section <> None: - c1s = np.multiply(c1s,self.cross_section) - #c1s = np.abs(c1s) - n_eff = c1s.sum() - n_tot = np.abs(c1s).sum() - print 'n_eff: {0}, n_tot: {1}, n_eff/n_tot: {2}'.format(n_eff, n_tot, n_eff/n_tot) - c1s = c1s/c1s.sum() - decomposedRatios,trueRatios = evaluateRatio(w,testdata,x=x, - plotting=False,roc=False,c0arr=c0,c1arr=c1s,true_dist=true_dist,pre_dist=pre_dist, - pre_evaluation=pre_pdf,cross_section=cross_section) - decomposedRatios = 1./decomposedRatios - n_eff_ratio[i] = n_eff/n_tot - n_zeros[i] = decomposedRatios[decomposedRatios < 0.].shape[0] - print decomposedRatios[decomposedRatios < 0.].shape - #calibratedRatios = self.calibrateFullRatios(w, decomposedRatios, - # c0,c1s,debug=debug,samples_data=samples,index=i) - #saveFig(decomposedRatios2, [calibratedRatios], makePlotName('calibrated_{0}'.format(i),'ratio',type='scat', - #dir=self.dir, model_g=self.model_g, c1_g=self.c1_g),scatter=True, axis=['composed ratio', - #'composed calibrated'], dir=self.dir, model_g=self.model_g) - ratiosList.append(decomposedRatios) - #indices = np.logical_and(indices, decomposedRatios > 0.) 
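# A minimal numpy sketch of the coupling-value scan performed around this point.
# `ratio_fn` is a hypothetical stand-in for the decomposed-ratio evaluation
# (already inverted, 1/r, as in the code above); the remaining steps mirror the
# ones used here: fix one coefficient, renormalize, clip negative ratios to 1,
# sum -log over events, and take the argmin over the grid.
import numpy as np

def scan_coupling(ratio_fn, testdata, c1, c_eval, c_min, c_max, npoints=50):
    csarray = np.linspace(c_min, c_max, npoints)
    nll = np.zeros(npoints)
    for i, cs in enumerate(csarray):
        c1s = np.array(c1, dtype=float)
        c1s[c_eval] = cs
        c1s = c1s / c1s.sum()           # renormalize mixture coefficients
        ratios = ratio_fn(c1s, testdata)
        ratios[ratios < 0.] = 1.0       # clip unphysical values, as above
        nll[i] = -np.log(ratios).sum()  # -ln L at this coupling value
    nll -= nll.min()
    return csarray[nll.argmin()], csarray, nll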
- for i,cs in enumerate(csarray): - decomposedRatios = ratiosList[i] - if use_log == False: - if samples_ids <> None: - ratios = decomposedRatios - ids = samples_ids - decomposedLikelihood[i] = (np.dot(np.log(ratios), - np.array([c1[x] for x in ids]))).sum() + x = None + + score = ROOT.RooArgSet(w.var('score')) + if use_log: + evaluateRatio = self.evaluateLogDecomposedRatio + post = 'log' else: - decomposedRatios[decomposedRatios < 0.] = 1.0 - decomposedLikelihood[i] = -np.log(decomposedRatios).sum() - print decomposedLikelihood[i] - - trueLikelihood[i] = -np.log(trueRatios).sum() - else: - decomposedLikelihood[i] = decomposedRatios.sum() - trueLikelihood[i] = trueRatios.sum() - decomposedLikelihood = decomposedLikelihood - decomposedLikelihood.min() - # print n_eff/n_zero relation - #saveFig(csarray,[n_eff_ratio, n_zeros/n_zeros.max()],makePlotName('eff_ratio','zeros',type=post+'plot_g2'),labels=['n_eff/n_tot','zeros/{0}'.format(n_zeros.max())],axis=['g2','values'],marker=True,dir=self.dir,marker_value=c1[0],title='#zeros and n_eff/n_tot given g2',print_pdf=True,model_g=self.model_g) - #saveFig(n_eff_ratio, [n_zeros/n_zeros.max()], makePlotName('eff_ratio','zeros',type='scat', - #dir=self.dir, model_g=self.model_g, c1_g=self.c1_g),scatter=True, axis=['n_eff/n_tot', - #'#zeros/{0}'.format(n_zeros.max())], dir=self.dir, model_g=self.model_g,title='# zeros given n_eff/n_tot ratio') - - if true_dist == True: - trueLikelihood = trueLikelihood - trueLikelihood.min() - saveFig(csarray,[decomposedLikelihood,trueLikelihood],makePlotName('comp','train',type=post+'likelihood_{0}'.format(n_sample)),labels=['decomposed','true'],axis=['c1[0]','-ln(L)'],marker=True,dir=self.dir,marker_value=c1[0],title='c1[0] Fitting',print_pdf=True) - return (csarray[trueLikelihood.argmin()], csarray[decomposedLikelihood.argmin()]) - else: - saveFig(csarray,[decomposedLikelihood],makePlotName('comp','train',type='likelihood_g2'),labels=['decomposed'],axis=['g2','-ln(L)'],marker=True,dir=self.dir,marker_value=c1[c_eval],title='g2 Fitting',print_pdf=True,model_g=self.model_g) - pdb.set_trace() - return (0.,csarray[decomposedLikelihood.argmin()]) - - def evalC1C2Likelihood(self,w,testdata,c0,c1,c_eval=0,c_min=0.01,c_max=0.2,use_log=False,true_dist=False, vars_g=None, npoints=50,samples_ids=None,weights_func=None): - - if true_dist == True: - vars = ROOT.TList() - for var in vars_g: - vars.Add(w.var(var)) - x = ROOT.RooArgSet(vars) - else: - x = None - - score = ROOT.RooArgSet(w.var('score')) - if use_log == True: - evaluateRatio = self.evaluateLogDecomposedRatio - post = 'log' - else: - evaluateRatio = self.evaluateDecomposedRatio - post = '' - - csarray = np.linspace(c_min[0],c_max[0],npoints) - csarray2 = np.linspace(c_min[1], c_max[1], npoints) - decomposedLikelihood = np.zeros((npoints,npoints)) - trueLikelihood = np.zeros((npoints,npoints)) - c1s = np.zeros(c0.shape[0]) - pre_pdf = [] - pre_dist = [] - pre_pdf.extend([[],[]]) - pre_dist.extend([[],[]]) - # change this enumerates - for k,c0_ in enumerate(c0): - pre_pdf[0].append([]) - pre_pdf[1].append([]) - pre_dist[0].append([]) - pre_dist[1].append([]) - for j,c1_ in enumerate(c0): - index_k,index_j = (self.basis_indexes[k],self.basis_indexes[j]) - if k <> j: - f0pdf = w.function('bkghistpdf_{0}_{1}'.format(index_k,index_j)) - f1pdf = w.function('sighistpdf_{0}_{1}'.format(index_k,index_j)) - data = testdata - if self.preprocessing == True: - data = preProcessing(testdata,self.dataset_names[min(index_k,index_j)], - 
self.dataset_names[max(index_k,index_j)],self.scaler) - outputs = predict('{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(self.dir,self.model_g, - self.c1_g,self.model_file,index_k,index_j),data,model_g=self.model_g, clf=self.clf) - f0pdfdist = np.array([self.evalDist(score,f0pdf,[xs]) for xs in outputs]) - f1pdfdist = np.array([self.evalDist(score,f1pdf,[xs]) for xs in outputs]) - pre_pdf[0][k].append(f0pdfdist) - pre_pdf[1][k].append(f1pdfdist) + evaluateRatio = self.evaluateDecomposedRatio + post = '' + + npoints = 25 + csarray = np.linspace(0.01, 0.2, npoints) + cs2array = np.linspace(0.1, 0.4, npoints) + testdata = np.loadtxt( + '{0}/data/{1}/{2}/{3}_{4}.dat'.format(dir, model_g, c1_g, 'test', 'F1')) + + decomposedLikelihood = np.zeros((npoints, npoints)) + trueLikelihood = np.zeros((npoints, npoints)) + c1s = np.zeros(c1.shape[0]) + c0s = np.zeros(c1.shape[0]) + pre_pdf = [] + pre_dist = [] + pre_pdf.extend([[], []]) + pre_dist.extend([[], []]) + for k, c0_ in enumerate(c0): + pre_pdf[0].append([]) + pre_pdf[1].append([]) + pre_dist[0].append([]) + pre_dist[1].append([]) + for j, c1_ in enumerate(c1): + if k != j: + f0pdf = w.function('bkghistpdf_{0}_{1}'.format(k, j)) + f1pdf = w.function('sighistpdf_{0}_{1}'.format(k, j)) + outputs = predict( + '{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format( + dir, + model_g, + c1_g, + 'adaptive', + k, + j), + testdata, + model_g=model_g, + clf=clf) + f0pdfdist = np.array( + [self.evalDist(score, f0pdf, [xs]) for xs in outputs]) + f1pdfdist = np.array( + [self.evalDist(score, f1pdf, [xs]) for xs in outputs]) + pre_pdf[0][k].append(f0pdfdist) + pre_pdf[1][k].append(f1pdfdist) + else: + pre_pdf[0][k].append(None) + pre_pdf[1][k].append(None) + if true_dist: + f0 = w.pdf('f{0}'.format(k)) + f1 = w.pdf('f{0}'.format(j)) + if len(testdata.shape) > 1: + f0dist = np.array([self.evalDist(x, f0, xs) + for xs in testdata]) + f1dist = np.array([self.evalDist(x, f1, xs) + for xs in testdata]) + else: + f0dist = np.array([self.evalDist(x, f0, [xs]) + for xs in testdata]) + f1dist = np.array([self.evalDist(x, f1, [xs]) + for xs in testdata]) + pre_dist[0][k].append(f0dist) + pre_dist[1][k].append(f1dist) + + # Evaluate Likelihood in different c1[0] and c1[1] values + self + for i, cs in enumerate(csarray): + for j, cs2 in enumerate(cs2array): + c1s[:] = c1[:] + c1s[0] = cs + c1s[1] = cs2 + c1s[2] = 1. - cs - cs2 + decomposedRatios, trueRatios = evaluateRatio(w, testdata, + x=x, roc=False, plotting=False, + c0arr=c0, c1arr=c1s, true_dist=true_dist, + pre_evaluation=pre_pdf, + pre_dist=pre_dist) + + if not use_log: + decomposedLikelihood[i, j] = np.log(decomposedRatios).sum() + trueLikelihood[i, j] = np.log(trueRatios).sum() + else: + decomposedLikelihood[i, j] = decomposedRatios.sum() + trueLikelihood[i, j] = trueRatios.sum() + decomposedLikelihood = 2. * decomposedLikelihood + if true_dist: + trueLikelihood = 2. * trueLikelihood + decomposedLikelihood = decomposedLikelihood - decomposedLikelihood.min() + X, Y = np.meshgrid(csarray, cs2array) + decMin = np.unravel_index( + decomposedLikelihood.argmin(), + decomposedLikelihood.shape) + min_value = [csarray[decMin[0]], cs2array[decMin[1]]] + if true_dist: + trueLikelihood = trueLikelihood - trueLikelihood.min() + trueMin = np.unravel_index( + trueLikelihood.argmin(), + trueLikelihood.shape) + if verbose_printing: + saveFig(X, + [Y, + decomposedLikelihood, + trueLikelihood], + makePlotName('comp', + 'train', + type='multilikelihood'), + labels=['Approx. 
Decomposed', + 'Exact'], + type='contour2', + marker=True, + dir=dir, + marker_value=(c1[0], + c1[1]), + print_pdf=True, + min_value=min_value) + return [[csarray[trueMin[0]], cs2array[trueMin[1]]], + [csarray[decMin[0]], cs2array[decMin[1]]]] else: - pre_pdf[0][k].append(None) - pre_pdf[1][k].append(None) - if true_dist == True: - f0 = w.pdf('f{0}'.format(k)) - f1 = w.pdf('f{0}'.format(j)) - if len(testdata.shape) > 1: - f0dist = np.array([self.evalDist(x,f0,xs) for xs in testdata]) - f1dist = np.array([self.evalDist(x,f1,xs) for xs in testdata]) - else: - f0dist = np.array([self.evalDist(x,f0,[xs]) for xs in testdata]) - f1dist = np.array([self.evalDist(x,f1,[xs]) for xs in testdata]) - pre_dist[0][k].append(f0dist) - pre_dist[1][k].append(f1dist) - indices = np.ones(testdata.shape[0], dtype=bool) - ratiosList = [] - samples = [] - # This is needed for calibration of full ratios - #for i,sample in enumerate(self.dataset_names): - # samples.append(np.loadtxt('{0}/data/{1}/{2}/{3}_{4}.dat'.format(self.dir,'mlp',self.c1_g,'data',sample))) - n_eff_ratio = np.zeros((csarray.shape[0], csarray2.shape[0])) - for i,cs in enumerate(csarray): - ratiosList.append([]) - for j, cs2 in enumerate(csarray2): - if weights_func <> None: - c1s = weights_func(cs,cs2) - #print '{0} {1}'.format(cs,cs2) - #print c1s + return [[0., 0.], [csarray[decMin[0]], cs2array[decMin[1]]]] + + + def fitCValues( + self, + c0, + c1, + dir='/afs/cern.ch/user/j/jpavezse/systematics', + c1_g='', + model_g='mlp', + true_dist=False, + vars_g=None, + workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', + use_log=False, + clf=None, + n_hist_c=100): + if use_log: + post = 'log' else: - c1s[:] = c1[:] - c1s[c_eval] = cs - if self.cross_section <> None: - c1s = np.multiply(c1s,self.cross_section) - n_eff = c1s.sum() - n_tot = np.abs(c1s).sum() - n_eff_ratio[i,j] = n_eff/n_tot - #print '{0} {1}'.format(i,j) - #print 'n_eff: {0}, n_tot: {1}, n_eff/n_tot: {2}'.format(n_eff, n_tot, n_eff/n_tot) - c1s = c1s/c1s.sum() - #print c1s - decomposedRatios,trueRatios = evaluateRatio(w,testdata,x=x, - plotting=False,roc=False,c0arr=c0,c1arr=c1s,true_dist=true_dist,pre_dist=pre_dist, - pre_evaluation=pre_pdf) - decomposedRatios = 1./decomposedRatios - #calibratedRatios = self.calibrateFullRatios(w, decomposedRatios, - # c0,c1s,debug=debug,samples_data=samples,index=i) - #saveFig(decomposedRatios2, [calibratedRatios], makePlotName('calibrated_{0}'.format(i),'ratio',type='scat', - #dir=self.dir, model_g=self.model_g, c1_g=self.c1_g),scatter=True, axis=['composed ratio', - #'composed calibrated'], dir=self.dir, model_g=self.model_g) - ratiosList[i].append(decomposedRatios) - #print('{0} {1} '.format(i,j)), - #print decomposedRatios[decomposedRatios < 0.].shape - #print c1s - #indices = np.logical_and(indices, decomposedRatios > 0.) - for i,cs in enumerate(csarray): - for j, cs2 in enumerate(csarray2): - decomposedRatios = ratiosList[i][j] - if use_log == False: - if samples_ids <> None: - ratios = decomposedRatios - ids = samples_ids - decomposedLikelihood[i,j] = (np.dot(np.log(ratios), - np.array([c1[x] for x in ids]))).sum() - else: - #decomposedRatios[decomposedRatios < 0.] = 0.9 - decomposedRatios[decomposedRatios < 0.] 
= 1.0 - #decomposedRatios = decomposedRatios[self.findOutliers(decomposedRatios)] - if n_eff_ratio[i,j] <= 0.5: - #TODO: Harcoded number - decomposedLikelihood[i,j] = 20000 + post = '' + keys = ['true', 'dec'] + c1_ = dict((key, np.zeros(n_hist_c)) for key in keys) + c1_values = dict((key, np.zeros(n_hist_c)) for key in keys) + c2_values = dict((key, np.zeros(n_hist_c)) for key in keys) + + fil2 = open('{0}/fitting_values_c1c2{1}.txt'.format(dir, post), 'w') + + for i in range(n_hist_c): + + makeData(vars_g, c0, c1, num_train=200000, num_test=500, no_train=True, + workspace=workspace, dir=dir, c1_g=c1_g) + + if i == 0: + verbose_printing = True else: - decomposedLikelihood[i,j] = -np.log(decomposedRatios).sum() - #print decomposedLikelihood[i,j] - #print '{0} {1} {2}'.format(i,j,decomposedLikelihood[i,j]) - trueLikelihood[i,j] = -np.log(trueRatios).sum() - else: - decomposedLikelihood[i,j] = decomposedRatios.sum() - trueLikelihood[i,j] = trueRatios.sum() - #print '\n {0}'.format(i) - decomposedLikelihood = decomposedLikelihood - decomposedLikelihood.min() - decMin = np.unravel_index(decomposedLikelihood.argmin(), decomposedLikelihood.shape) - # pixel plots - #saveFig(csarray,[csarray2,decomposedLikelihood],makePlotName('comp','train',type='likelihood_g1g2'),labels=['composed'],pixel=True,marker=True,dir=self.dir,model_g=self.model_g,marker_value=(c1[0],c1[1]),print_pdf=True,contour=True,title='Likelihood fit for g1,g2') - - #decMin = [np.sum(decomposedLikelihood,1).argmin(),np.sum(decomposedLikelihood,0).argmin()] - X,Y = np.meshgrid(csarray, csarray2) - - saveFig(X,[Y,decomposedLikelihood],makePlotName('comp','train',type='multilikelihood'),labels=['composed'],contour=True,marker=True,dir=self.dir,model_g=self.model_g,marker_value=(c1[0],c1[1]),print_pdf=True,min_value=(csarray[decMin[0]],csarray2[decMin[1]])) - #print decMin - print [csarray[decMin[0]],csarray2[decMin[1]]] - pdb.set_trace() - if true_dist == True: - trueLikelihood = trueLikelihood - trueLikelihood.min() - trueMin = np.unravel_index(trueLikelihood.argmin(), trueLikelihood.shape) - saveFig(csarray,[decomposedLikelihood,trueLikelihood],makePlotName('comp','train',type=post+'likelihood_{0}'.format(n_sample)),labels=['decomposed','true'],axis=['c1[0]','-ln(L)'],marker=True,dir=self.dir,marker_value=c1[0],title='c1[0] Fitting',print_pdf=True) - return [[csarray[trueMin[0]],csarray2[trueMin[1]]], - [csarray2[decMin[0],csarray2[decMin[1]]]]] - else: - return [[0.,0.],[csarray[decMin[0]],csarray2[decMin[1]]]] - - def fitCValues(self,c0,c1,data_file = 'test',true_dist=False,vars_g=None,use_log=False,n_hist=150,num_pseudodata=1000,weights_func=None): - if use_log == True: - post = 'log' - else: - post = '' - npoints = 15 - c_eval = 1 - c_min = [0.6,0.1] - c_max = [1.5,0.9] - c_min = [-1.1,-1.1] - c_max = [-0.1,-0.1] - - #c_min = 0.6 - #c_max = 1.4 - - f = ROOT.TFile('{0}/{1}'.format('/afs/cern.ch/work/j/jpavezse/private/',self.workspace)) - w = f.Get('w') - f.Close() - assert w - - print '{0} {1}'.format(c_min,c_max) - rng = np.random.RandomState(self.seed) - # Needed in case of working of NN with scaled features - if self.preprocessing == True: - if self.scaler == None: - self.scaler = {} - for k in range(self.nsamples): - for j in enumerate(self.nsamples): - if k < j: - self.scaler[(k,j)] = joblib.load('{0}/model/{1}/{2}/{3}_{4}_{5}.dat'.format(self.dir,'mlp',self.c1_g,'scaler',self.dataset_names[k],self.dataset_names[j])) - - testdata = 
np.loadtxt('{0}/data/{1}/{2}/{3}_{4}.dat'.format(self.dir,'mlp',self.c1_g,data_file,self.F1_dist))[:,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,16,20,24,25,26,27,28,29,30,31,36,40,42]] - print self.F1_dist - print testdata.shape - - fil1 = open('{0}/fitting_values_c1.txt'.format(self.dir),'a') - for i in range(n_hist): - indexes = rng.choice(testdata.shape[0], num_pseudodata) - dataset = testdata[indexes] - #(c1_true_1, c1_dec_1) = self.evalSingleC1Likelihood(w,dataset, c0,c1,c_eval=c_eval,c_min=c_min, - #c_max=c_max,true_dist=true_dist,vars_g=vars_g,weights_func=weights_func, - # npoints=npoints,use_log=use_log) - ((c1_true,c2_true),(c1_dec,c2_dec)) = self.evalDoubleC1C2Likelihood(w,dataset, c0,c1,c_eval=c_eval,c_min=c_min, - c_max=c_max,true_dist=true_dist,vars_g=vars_g,weights_func=weights_func, - npoints=npoints,use_log=use_log) - print '2: {0} {1} {2} {3}'.format(c1_true, c1_dec, c2_true, c2_dec) - fil1.write('{0} {1} {2} {3}\n'.format(c1_true, c1_dec, c2_true, c2_dec)) - #print '1: {0} {1}'.format(c1_true_1, c1_dec_1) - #fil1.write('{0} {1}\n'.format(c1_true_1, c1_dec_1)) - fil1.close() + verbose_printing = False + + ((c1_true, + c2_true), + (c1_dec, + c2_dec)) = self.evalC1C2Likelihood(c0, + c1, + dir=dir, + c1_g=c1_g, + model_g=model_g, + true_dist=true_dist, + vars_g=vars_g, + workspace=workspace, + use_log=use_log, + clf=clf, + verbose_printing=verbose_printing) + print '2: {0} {1} {2} {3}'.format(c1_true, c1_dec, c2_true, c2_dec) + + fil2.write( + '{0} {1} {2} {3}\n'.format( + c1_true, + c1_dec, + c2_true, + c2_dec)) + + fil2.close() \ No newline at end of file diff --git a/logistic_sgd.py b/logistic_sgd.py index ed1ef58..d8c55de 100644 --- a/logistic_sgd.py +++ b/logistic_sgd.py @@ -46,6 +46,7 @@ import theano import theano.tensor as T + class LogisticRegression(object): """Multi-class Logistic Regression Class @@ -55,7 +56,6 @@ class LogisticRegression(object): determine a class membership probability. """ - def __init__(self, input, n_in, n_out): """ Initialize the parameters of the logistic regression @@ -76,13 +76,12 @@ def __init__(self, input, n_in, n_out): # initialize with 0 the weights W as a matrix of shape (n_in, n_out) self.W = theano.shared(value=numpy.zeros((n_in, n_out), dtype=theano.config.floatX), - name='W', borrow=True) + name='W', borrow=True) # initialize the baises b as a vector of n_out 0s self.b = theano.shared(value=numpy.zeros((n_out,), dtype=theano.config.floatX), name='b', borrow=True) - out = T.dot(input, self.W) + self.b # compute vector of class-membership probabilities in symbolic form self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b) @@ -122,8 +121,8 @@ def negative_log_likelihood(self, y): # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is # the mean (across minibatch examples) of the elements in v, # i.e., the mean log-likelihood across the minibatch. 
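# An illustrative, plain-numpy view of the indexing used in the return below:
# LP[T.arange(y.shape[0]), y] selects, for each row n, the log-probability the
# model assigns to the true class y[n]; the loss is the negative mean of those
# entries (the probabilities here are made up for the example).
import numpy
LP = numpy.log(numpy.array([[0.7, 0.3],
                            [0.2, 0.8],
                            [0.9, 0.1]]))      # log class probabilities, 3 examples
y = numpy.array([0, 1, 0])                     # true labels
per_example = LP[numpy.arange(y.shape[0]), y]  # log p(y_n | x_n) per example
nll = -per_example.mean()                      # mean negative log-likelihood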
- #return -(T.mean(T.log(self.p_y_given_x[y==1])) + T.mean(T.log(1.-self.p_y_given_x[y==0]))) - #return T.mean(T.nnet.binary_crossentropy(self.p_y_given_x, y)) + # return -(T.mean(T.log(self.p_y_given_x[y==1])) + T.mean(T.log(1.-self.p_y_given_x[y==0]))) + # return T.mean(T.nnet.binary_crossentropy(self.p_y_given_x, y)) return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) def errors(self, y): @@ -139,7 +138,7 @@ def errors(self, y): # check if y has same dimension of y_pred if y.ndim != self.y_pred.ndim: raise TypeError('y should have the same shape as self.y_pred', - ('y', target.type, 'y_pred', self.y_pred.type)) + ('y', target.type, 'y_pred', self.y_pred.type)) # check if y is of the correct datatype if y.dtype.startswith('int'): # the T.neq operator returns a vector of 0s and 1s, where 1 @@ -164,7 +163,8 @@ def load_data(dataset): data_dir, data_file = os.path.split(dataset) if data_dir == "" and not os.path.isfile(dataset): # Check if dataset is in the data directory. - new_path = os.path.join(os.path.split(__file__)[0], "..", "data", dataset) + new_path = os.path.join( + os.path.split(__file__)[0], "..", "data", dataset) if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz': dataset = new_path @@ -180,12 +180,12 @@ def load_data(dataset): f = gzip.open(dataset, 'rb') train_set, valid_set, test_set = cPickle.load(f) f.close() - #train_set, valid_set, test_set format: tuple(input, target) - #input is an numpy.ndarray of 2 dimensions (a matrix) - #witch row's correspond to an example. target is a - #numpy.ndarray of 1 dimensions (vector)) that have the same length as - #the number of rows in the input. It should give the target - #target to the example with the same index in the input. + # train_set, valid_set, test_set format: tuple(input, target) + # input is an numpy.ndarray of 2 dimensions (a matrix) + # witch row's correspond to an example. target is a + # numpy.ndarray of 1 dimensions (vector)) that have the same length as + # the number of rows in the input. It should give the target + # target to the example with the same index in the input. 
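# A small, hypothetical example of the (input, target) layout described above:
# each of train/valid/test is a tuple whose first entry is a 2-D float array
# (one row per example) and whose second entry is a 1-D integer vector of
# matching length giving the target of the row with the same index.
import numpy
n_examples, n_features, n_classes = 100, 784, 10
inputs = numpy.random.rand(n_examples, n_features).astype('float32')
targets = numpy.random.randint(0, n_classes, size=n_examples)
train_set = (inputs, targets)
assert train_set[0].shape[0] == train_set[1].shape[0]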
def shared_dataset(data_xy, borrow=True): """ Function that loads the dataset into shared variables @@ -221,9 +221,11 @@ def shared_dataset(data_xy, borrow=True): return rval -def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, - dataset='/user/j/jgpavez/dbnTheano/code/mnist.pkl.gz', - batch_size=600): +def sgd_optimization_mnist( + learning_rate=0.13, + n_epochs=1000, + dataset='/user/j/jgpavez/dbnTheano/code/mnist.pkl.gz', + batch_size=600): """ Demonstrate stochastic gradient descent optimization of a log-linear model @@ -262,7 +264,7 @@ def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of - # [int] labels + # [int] labels # construct the logistic regression class # Each MNIST image has size 28*28 @@ -275,16 +277,16 @@ def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, # compiling a Theano function that computes the mistakes that are made by # the model on a minibatch test_model = theano.function(inputs=[index], - outputs=classifier.errors(y), - givens={ - x: test_set_x[index * batch_size: (index + 1) * batch_size], - y: test_set_y[index * batch_size: (index + 1) * batch_size]}) + outputs=classifier.errors(y), + givens={ + x: test_set_x[index * batch_size: (index + 1) * batch_size], + y: test_set_y[index * batch_size: (index + 1) * batch_size]}) validate_model = theano.function(inputs=[index], - outputs=classifier.errors(y), - givens={ - x: valid_set_x[index * batch_size:(index + 1) * batch_size], - y: valid_set_y[index * batch_size:(index + 1) * batch_size]}) + outputs=classifier.errors(y), + givens={ + x: valid_set_x[index * batch_size:(index + 1) * batch_size], + y: valid_set_y[index * batch_size:(index + 1) * batch_size]}) # compute the gradient of cost with respect to theta = (W,b) g_W = T.grad(cost=cost, wrt=classifier.W) @@ -299,11 +301,11 @@ def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, # the same time updates the parameter of the model based on the rules # defined in `updates` train_model = theano.function(inputs=[index], - outputs=cost, - updates=updates, - givens={ - x: train_set_x[index * batch_size:(index + 1) * batch_size], - y: train_set_y[index * batch_size:(index + 1) * batch_size]}) + outputs=cost, + updates=updates, + givens={ + x: train_set_x[index * batch_size:(index + 1) * batch_size], + y: train_set_y[index * batch_size:(index + 1) * batch_size]}) ############### # TRAIN MODEL # @@ -312,14 +314,14 @@ def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, # early-stopping parameters patience = 5000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is - # found + # found improvement_threshold = 0.995 # a relative improvement of this much is - # considered significant + # considered significant validation_frequency = min(n_train_batches, patience / 2) - # go through this many - # minibatche before checking the network - # on the validation set; in this case we - # check every epoch + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch best_params = None best_validation_loss = numpy.inf @@ -342,13 +344,13 @@ def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, for i in xrange(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) - print('epoch %i, minibatch %i/%i, validation error %f %%' % \ - 
(epoch, minibatch_index + 1, n_train_batches, - this_validation_loss * 100.)) + print('epoch %i, minibatch %i/%i, validation error %f %%' % + (epoch, minibatch_index + 1, n_train_batches, + this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: - #improve patience if loss improvement is good enough + # improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) @@ -361,9 +363,9 @@ def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of best' - ' model %f %%') % - (epoch, minibatch_index + 1, n_train_batches, - test_score * 100.)) + ' model %f %%') % + (epoch, minibatch_index + 1, n_train_batches, + test_score * 100.)) if patience <= iter: done_looping = True @@ -372,7 +374,7 @@ def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, end_time = time.clock() print(('Optimization complete with best validation score of %f %%,' 'with test performance %f %%') % - (best_validation_loss * 100., test_score * 100.)) + (best_validation_loss * 100., test_score * 100.)) print 'The code run for %d epochs, with %f epochs/sec' % ( epoch, 1. * epoch / (end_time - start_time)) print >> sys.stderr, ('The code for file ' + diff --git a/make_data.py b/make_data.py index 41e019c..cb48ed1 100644 --- a/make_data.py +++ b/make_data.py @@ -13,285 +13,449 @@ from mpl_toolkits.mplot3d import Axes3D from utils import printMultiFrame, printFrame, saveFig, loadData,\ - makeROC, makeSigBkg, makePlotName + makeROC, makeSigBkg, makePlotName ''' - Functions to make model and data for the decomposed training + Functions to make model and data for the decomposed training method ''' # Default model parameters # private coefficients -coeffs_g = [[ 0.28174199,0.46707738,0.25118062],[0.18294893,0.33386682,0.48318425],[ 0.25763285,0.28015834,0.46220881]] +coeffs_g = [[0.28174199, 0.46707738, 0.25118062], [0.18294893, + 0.33386682, 0.48318425], [0.25763285, 0.28015834, 0.46220881]] # gaussians parameters mu_g = [] cov_g = [] -mu_g.append([5.,5.,4.,3.,5.,5.,4.5,2.5,4.,3.5]) -mu_g.append([2.,4.5,0.6,5.,6.,4.5,4.2,0.2,4.1,3.3]) -mu_g.append([1.,0.5,0.3,0.5,0.6,0.4,0.1,0.2,0.1,0.3]) -meansum = [[7.6,10.,-9.],[5.,-6.,7.5],[8.2,12.2,-4.3]] - -cov_g.append([[3.,0.,5.,0.,0.,0.,0.,1.,0.,5.], - [0.,2.,0.,0.,0.,0.,0.,0.,0.,0.], - [0.,0.,14.,0.,0.,0.,4.2,0.,5.,0.], - [0.,0.,0.,6.,0.,0.,0.,3.,0.,0.], - [0.,0.,0.,0.,17.,0.,0.,2.,0.,0.], - [0.,0.,0.,0.,0.,10.,0.,0.,0.,0.], - [0.,0.,0.,0.,0.,0.,5.,0.,0.,0.], - [0.,0.,0.,0.,0.,0.,0.,1.3,1.,0.], - [0.,0.,0.,0.,0.,0.,0.,0.,1.,0.], - [0.,0.,0.,0.,0.,0.,0.,0.,0.,9.3]]) -cov_g.append([[3.5,0.,0.,4.,0.,0.,0.,0.,5.,0.], - [0.,3.5,0.,0.,0.,0.,0.,0.,0.,0.], - [0.,0.,9.5,0.,0.,2.,0.,0.5,0.,0.], - [0.,0.,0.,7.2,0.,0.,0.,0.,2.,0.], - [0.,0.,0.,0.,4.5,0.,0.,0.,0.,0.], - [0.,0.,0.,0.,0.,4.5,0.,0.,0.,0.], - [0.,0.,0.,0.,0.,0.,8.2,0.,0.,0.2], - [0.,0.,0.,0.,0.,0.,0.,9.5,3.,0.], - [0.,0.,0.,0.,0.,0.,0.,0.,3.5,0.], - [0.,0.,0.,0.,0.,0.,0.,0.,0.,4.5]]) -cov_g.append([[13.,0.,0.,0.,0.,0.,0.,0.,0.,0.], - [0.,12.,0.,0.,0.,0.2,0.,4.,0.,0.], - [0.,0.,14.,0.,0.5,0.,0.,0.,0.,3.], - [0.,0.,0.,6.,0.,0.,0.,0.,0.,0.], - [0.,0.,0.,0.,1.,2.,0.,0.,0.,0.], - [0.,0.,0.,0.,0.,10.,0.,3.,0.,0.], - [0.,0.,0.,0.,0.,0.,15.,0.,0.,4.], - [0.,0.,0.,0.,0.,0.,0.,6.3,0.,0.], - [0.,0.,0.,0.,0.,0.,0.,0.,11.,0.], - [0.,0.,0.,0.,0.,0.,0.,0.,0.,1.3]]) - -def 
makeModelPrivateND(vars_g,c0, c1, n_private=3, coeffs=coeffs_g,cov_l=cov_g, mu_l=mu_g, - workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', - dir='/afs/cern.ch/user/j/jpavezse/systematics',model_g='mlp', - c1_g='',verbose_printing=False,load_cov=False): - ''' - RooFit statistical model for the data - - ''' - # Statistical model - w = ROOT.RooWorkspace('w') - - print 'Generating initial distributions' - cov_m = [] - mu_m = [] - mu_str = [] - cov_root = [] - vec = [] - argus = ROOT.RooArgList() - - # features - for i,var in enumerate(vars_g): - w.factory('{0}[{1},{2}]'.format(var,-25,30)) - argus.add(w.var(var)) - n = len(cov_l[0][0]) - for glob in range(3): - for priv in range(n_private): - if load_cov == False: - cov_i = np.random.random((n,n)) - cov_i = cov_i + cov_i.transpose() - cov_i = cov_i + n*np.eye(n) - np.savetxt('{0}/covariance_{1}_{2}.txt'.format(dir,glob,priv), - cov_i,fmt='%f') - else: - cov_i = np.matrix(np.loadtxt('{0}/data/covariance_{1}_{2}.txt'.format( - dir,glob,priv))) - print cov_i - # generate covriance matrix - cov_m.append(cov_i) - cov_root.append(ROOT.TMatrixDSym(len(vars_g))) - for i,var1 in enumerate(vars_g): - for j,var2 in enumerate(vars_g): - if i <= j: - cov_root[-1][i][j] = cov_m[-1][i,j] - else: - cov_root[-1][i][j] = cov_m[-1][j,i] - getattr(w,'import')(cov_root[-1],'cov{0}'.format(glob*3 + priv)) - # generate mu vectors - mu_m.append(np.array(mu_l[glob]) + meansum[glob][priv]) - vec.append(ROOT.TVectorD(len(vars_g))) - for i, mu in enumerate(mu_m[-1]): - vec[-1][i] = mu - mu_str.append(','.join([str(mu) for mu in mu_m[-1]])) - # create multivariate gaussian - gaussian = ROOT.RooMultiVarGaussian('f{0}_{1}'.format(glob,priv), - 'f{0}_{1}'.format(glob,priv),argus,vec[-1],cov_root[-1]) - getattr(w,'import')(gaussian) - # create private mixture model - priv_coeffs = np.array(coeffs[glob]) - #print 'priv coef {0} {1}'.format(priv_coeffs, priv_coeffs.sum()) - sum_str = ','.join(['c_{0}_{1}[{2}]*f{0}_{1}'.format(glob,j,priv_coeffs[j]) for j in range(n_private)]) - w.factory('SUM::f{0}({1})'.format(glob,sum_str)) - #mixture model - w.factory("SUM::F0(c00[{0}]*f0,c01[{1}]*f1,f2)".format(c0[0],c0[1])) - w.factory("SUM::F1(c10[{0}]*f0,c11[{1}]*f1,f2)".format(c1[0],c1[1])) - - # Check Model - w.Print() - - w.writeToFile('{0}/{1}'.format(dir,workspace)) - if verbose_printing == True: - printFrame(w,['x0','x1','x2'],[w.pdf('f0'),w.pdf('f1'),w.pdf('f2')],'decomposed_model',['f0','f1','f2'] - ,dir=dir,model_g=model_g,range=[-15,20],title='Single distributions',x_text='x0',y_text='p(x)') - printFrame(w,['x0','x1','x2'],[w.pdf('F0'),w.pdf('F1')],'full_model',['Bkg','Bkg+Signal'], - dir=dir,model_g=model_g,range=[-15,20],title='Composed model',x_text='x0',y_text='p(x)') - printFrame(w,['x0','x1','x2'],[w.pdf('F1'),'f0'],'full_signal', ['Bkg','Signal'], - dir=dir,model_g=model_g,range=[-15,20],title='Background and signal',x_text='x0',y_text='p(x)') - - return w - -def makeModelND(vars_g,c0,c1,cov_l=cov_g,mu_l=mu_g, - workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', - dir='/afs/cern.ch/user/j/jpavezse/systematics',model_g='mlp', - c1_g='',verbose_printing=False): - ''' - RooFit statistical model for the data - - ''' - # Statistical model - w = ROOT.RooWorkspace('w') - - print 'Generating initial distributions' - cov_m = [] - mu_m = [] - mu_str = [] - cov_root = [] - vec = [] - argus = ROOT.RooArgList() - #features - for i,var in enumerate(vars_g): - w.factory('{0}[{1},{2}]'.format(var,-25,30)) - argus.add(w.var(var)) - - for glob in 
range(3): - # generate covariance matrix - cov_m.append(np.matrix(cov_l[glob])) - cov_root.append(ROOT.TMatrixDSym(len(vars_g))) - for i,var1 in enumerate(vars_g): - for j,var2 in enumerate(vars_g): - cov_root[-1][i][j] = cov_m[-1][i,j] - getattr(w,'import')(cov_root[-1],'cov{0}'.format(glob)) - # generate mu vector - mu_m.append(np.array(mu_l[glob])) - vec.append(ROOT.TVectorD(len(vars_g))) - for i, mu in enumerate(mu_m[-1]): - vec[-1][i] = mu - mu_str.append(','.join([str(mu) for mu in mu_m[-1]])) - # multivariate gaussian - gaussian = ROOT.RooMultiVarGaussian('f{0}'.format(glob), - 'f{0}'.format(glob),argus,vec[-1],cov_root[-1]) - getattr(w,'import')(gaussian) - # mixture models - w.factory("SUM::F0(c00[{0}]*f0,c01[{1}]*f1,f2)".format(c0[0],c0[1])) - w.factory("SUM::F1(c10[{0}]*f0,c11[{1}]*f1,f2)".format(c1[0],c1[1])) - - # Check Model - w.Print() - - w.writeToFile('{0}/{1}'.format(dir,workspace)) - if verbose_printing == True: - printFrame(w,['x0','x1','x2'],[w.pdf('f0'),w.pdf('f1'),w.pdf('f2')],'decomposed_model',['f0','f1','f2'] - ,dir=dir,model_g=model_g,range=[-15,20],title='Single distributions',x_text='x0',y_text='p(x)',print_pdf=True) - printFrame(w,['x0','x1','x2'],[w.pdf('F0'),w.pdf('F1')],'full_model',['Bkg','Bkg+Signal'], - dir=dir,model_g=model_g,range=[-15,20],title='Composed model',x_text='x0',y_text='p(x)',print_pdf=True) - printFrame(w,['x0','x1','x2'],[w.pdf('F1'),'f0'],'full_signal', ['Bkg','Signal'], - dir=dir,model_g=model_g,range=[-15,20],title='Background and signal',x_text='x0',y_text='p(x)',print_pdf=True) - - return w - - -def makeModel(c0,c1,cov_l=cov_g,mu_l=mu_g, - workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', - dir='/afs/cern.ch/user/j/jpavezse/systematics',model_g='mlp', - c1_g='',verbose_printing=False): - ''' - RooFit statistical model for the data - - ''' - # Statistical model - w = ROOT.RooWorkspace('w') - #w.factory("EXPR::f1('cos(x)**2 + .01',x)") - w.factory("EXPR::f2('exp(x*-1)',x[0,5])") - w.factory("EXPR::f1('0.3 + exp(-(x-5)**2/5.)',x)") - w.factory("EXPR::f0('exp(-(x-2.5)**2/1.)',x)") - #w.factory("EXPR::f2('exp(-(x-2)**2/2)',x)") - w.factory("SUM::F0(c00[{0}]*f0,c01[{1}]*f1,f2)".format(c0[0],c0[1])) - w.factory("SUM::F1(c10[{0}]*f0,c11[{1}]*f1,f2)".format(c1[0],c1[1])) - - # Check Model - w.Print() - w.writeToFile('{0}/workspace_DecomposingTestOfMixtureModelsClassifiers.root'.format(dir)) - if verbose_printing == True: - printFrame(w,['x'],[w.pdf('f0'),w.pdf('f1'),w.pdf('f2')],'decomposed_model',['f0','f1','f2'] - ,dir=dir,model_g=model_g,range=[-15,20],title='Single distributions',x_text='x0',y_text='p(x)', - print_pdf=True) - printFrame(w,['x'],[w.pdf('F0'),w.pdf('F1')],'full_model',['Bkg','Bkg+Signal'], - dir=dir,model_g=model_g,range=[-15,20],title='Composed model',x_text='x0',y_text='p(x)',print_pdf=True) - printFrame(w,['x'],[w.pdf('F1'),'f0'],'full_signal', ['Bkg','Signal'], - dir=dir,model_g=model_g,range=[-15,20],title='Background and signal',x_text='x0',y_text='p(x)', - print_pdf=True) - - -def makeData(vars_g,c0,c1, num_train=500,num_test=100,no_train=False, - workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', - dir='/afs/cern.ch/user/j/jpavezse/systematics', - c1_g='',model_g='mlp'): - # Start generating data - ''' - Each function will be discriminated pair-wise - so n*n datasets are needed (maybe they can be reused?) 
- ''' +mu_g.append([5., 5., 4., 3., 5., 5., 4.5, 2.5, 4., 3.5]) +mu_g.append([2., 4.5, 0.6, 5., 6., 4.5, 4.2, 0.2, 4.1, 3.3]) +mu_g.append([1., 0.5, 0.3, 0.5, 0.6, 0.4, 0.1, 0.2, 0.1, 0.3]) +meansum = [[7.6, 10., -9.], [5., -6., 7.5], [8.2, 12.2, -4.3]] + +cov_g.append([[3., 0., 5., 0., 0., 0., 0., 1., 0., 5.], + [0., 2., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 14., 0., 0., 0., 4.2, 0., 5., 0.], + [0., 0., 0., 6., 0., 0., 0., 3., 0., 0.], + [0., 0., 0., 0., 17., 0., 0., 2., 0., 0.], + [0., 0., 0., 0., 0., 10., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 5., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 1.3, 1., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 9.3]]) +cov_g.append([[3.5, 0., 0., 4., 0., 0., 0., 0., 5., 0.], + [0., 3.5, 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 9.5, 0., 0., 2., 0., 0.5, 0., 0.], + [0., 0., 0., 7.2, 0., 0., 0., 0., 2., 0.], + [0., 0., 0., 0., 4.5, 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 4.5, 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 8.2, 0., 0., 0.2], + [0., 0., 0., 0., 0., 0., 0., 9.5, 3., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 3.5, 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 4.5]]) +cov_g.append([[13., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 12., 0., 0., 0., 0.2, 0., 4., 0., 0.], + [0., 0., 14., 0., 0.5, 0., 0., 0., 0., 3.], + [0., 0., 0., 6., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 1., 2., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 10., 0., 3., 0., 0.], + [0., 0., 0., 0., 0., 0., 15., 0., 0., 4.], + [0., 0., 0., 0., 0., 0., 0., 6.3, 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 11., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.3]]) + + +def makeModelPrivateND( + vars_g, + c0, + c1, + n_private=3, + coeffs=coeffs_g, + cov_l=cov_g, + mu_l=mu_g, + workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', + dir='/afs/cern.ch/user/j/jpavezse/systematics', + model_g='mlp', + c1_g='', + verbose_printing=False, + load_cov=False): + ''' + RooFit statistical model for the data + + ''' + # Statistical model + w = ROOT.RooWorkspace('w') + + print 'Generating initial distributions' + cov_m = [] + mu_m = [] + mu_str = [] + cov_root = [] + vec = [] + argus = ROOT.RooArgList() + + # features + for i, var in enumerate(vars_g): + w.factory('{0}[{1},{2}]'.format(var, -25, 30)) + argus.add(w.var(var)) + n = len(cov_l[0][0]) + for glob in range(3): + for priv in range(n_private): + if not load_cov: + cov_i = np.random.random((n, n)) + cov_i = cov_i + cov_i.transpose() + cov_i = cov_i + n * np.eye(n) + np.savetxt('{0}/covariance_{1}_{2}.txt'.format(dir, + glob, priv), cov_i, fmt='%f') + else: + cov_i = np.matrix(np.loadtxt( + '{0}/data/covariance_{1}_{2}.txt'.format(dir, glob, priv))) + print cov_i + # generate covriance matrix + cov_m.append(cov_i) + cov_root.append(ROOT.TMatrixDSym(len(vars_g))) + for i, var1 in enumerate(vars_g): + for j, var2 in enumerate(vars_g): + if i <= j: + cov_root[-1][i][j] = cov_m[-1][i, j] + else: + cov_root[-1][i][j] = cov_m[-1][j, i] + getattr(w, 'import')(cov_root[-1], + 'cov{0}'.format(glob * 3 + priv)) + # generate mu vectors + mu_m.append(np.array(mu_l[glob]) + meansum[glob][priv]) + vec.append(ROOT.TVectorD(len(vars_g))) + for i, mu in enumerate(mu_m[-1]): + vec[-1][i] = mu + mu_str.append(','.join([str(mu) for mu in mu_m[-1]])) + # create multivariate gaussian + gaussian = ROOT.RooMultiVarGaussian('f{0}_{1}'.format( + glob, priv), 'f{0}_{1}'.format(glob, priv), argus, vec[-1], cov_root[-1]) + getattr(w, 'import')(gaussian) + # create private mixture model + priv_coeffs = 
np.array(coeffs[glob]) + # print 'priv coef {0} {1}'.format(priv_coeffs, priv_coeffs.sum()) + sum_str = ','.join( + ['c_{0}_{1}[{2}]*f{0}_{1}'.format(glob, j, priv_coeffs[j]) for j in range(n_private)]) + w.factory('SUM::f{0}({1})'.format(glob, sum_str)) + # mixture model + w.factory("SUM::F0(c00[{0}]*f0,c01[{1}]*f1,f2)".format(c0[0], c0[1])) + w.factory("SUM::F1(c10[{0}]*f0,c11[{1}]*f1,f2)".format(c1[0], c1[1])) + + # Check Model + w.Print() + + w.writeToFile('{0}/{1}'.format(dir, workspace)) + if verbose_printing: + printFrame(w, + ['x0', + 'x1', + 'x2'], + [w.pdf('f0'), + w.pdf('f1'), + w.pdf('f2')], + 'decomposed_model', + ['f0', + 'f1', + 'f2'], + dir=dir, + model_g=model_g, + range=[-15, + 20], + title='Single distributions', + x_text='x0', + y_text='p(x)') + printFrame(w, + ['x0', + 'x1', + 'x2'], + [w.pdf('F0'), + w.pdf('F1')], + 'full_model', + ['Bkg', + 'Bkg+Signal'], + dir=dir, + model_g=model_g, + range=[-15, + 20], + title='Composed model', + x_text='x0', + y_text='p(x)') + printFrame(w, + ['x0', + 'x1', + 'x2'], + [w.pdf('F1'), + 'f0'], + 'full_signal', + ['Bkg', + 'Signal'], + dir=dir, + model_g=model_g, + range=[-15, + 20], + title='Background and signal', + x_text='x0', + y_text='p(x)') + + return w + + +def makeModelND( + vars_g, + c0, + c1, + cov_l=cov_g, + mu_l=mu_g, + workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', + dir='/afs/cern.ch/user/j/jpavezse/systematics', + model_g='mlp', + c1_g='', + verbose_printing=False): + ''' + RooFit statistical model for the data - f = ROOT.TFile('{0}/{1}'.format(dir,workspace)) - w = f.Get('w') - f.Close() + ''' + # Statistical model + w = ROOT.RooWorkspace('w') - print 'Making Data' - # Start generating data - ''' + print 'Generating initial distributions' + cov_m = [] + mu_m = [] + mu_str = [] + cov_root = [] + vec = [] + argus = ROOT.RooArgList() + # features + for i, var in enumerate(vars_g): + w.factory('{0}[{1},{2}]'.format(var, -25, 30)) + argus.add(w.var(var)) + + for glob in range(3): + # generate covariance matrix + cov_m.append(np.matrix(cov_l[glob])) + cov_root.append(ROOT.TMatrixDSym(len(vars_g))) + for i, var1 in enumerate(vars_g): + for j, var2 in enumerate(vars_g): + cov_root[-1][i][j] = cov_m[-1][i, j] + getattr(w, 'import')(cov_root[-1], 'cov{0}'.format(glob)) + # generate mu vector + mu_m.append(np.array(mu_l[glob])) + vec.append(ROOT.TVectorD(len(vars_g))) + for i, mu in enumerate(mu_m[-1]): + vec[-1][i] = mu + mu_str.append(','.join([str(mu) for mu in mu_m[-1]])) + # multivariate gaussian + gaussian = ROOT.RooMultiVarGaussian('f{0}'.format( + glob), 'f{0}'.format(glob), argus, vec[-1], cov_root[-1]) + getattr(w, 'import')(gaussian) + # mixture models + w.factory("SUM::F0(c00[{0}]*f0,c01[{1}]*f1,f2)".format(c0[0], c0[1])) + w.factory("SUM::F1(c10[{0}]*f0,c11[{1}]*f1,f2)".format(c1[0], c1[1])) + + # Check Model + w.Print() + + w.writeToFile('{0}/{1}'.format(dir, workspace)) + if verbose_printing: + printFrame(w, + ['x0', + 'x1', + 'x2'], + [w.pdf('f0'), + w.pdf('f1'), + w.pdf('f2')], + 'decomposed_model', + ['f0', + 'f1', + 'f2'], + dir=dir, + model_g=model_g, + range=[-15, + 20], + title='Single distributions', + x_text='x0', + y_text='p(x)', + print_pdf=True) + printFrame(w, + ['x0', + 'x1', + 'x2'], + [w.pdf('F0'), + w.pdf('F1')], + 'full_model', + ['Bkg', + 'Bkg+Signal'], + dir=dir, + model_g=model_g, + range=[-15, + 20], + title='Composed model', + x_text='x0', + y_text='p(x)', + print_pdf=True) + printFrame(w, + ['x0', + 'x1', + 'x2'], + [w.pdf('F1'), + 'f0'], + 'full_signal', + ['Bkg', + 
'Signal'], + dir=dir, + model_g=model_g, + range=[-15, + 20], + title='Background and signal', + x_text='x0', + y_text='p(x)', + print_pdf=True) + + return w + + +def makeModel( + c0, + c1, + cov_l=cov_g, + mu_l=mu_g, + workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', + dir='/afs/cern.ch/user/j/jpavezse/systematics', + model_g='mlp', + c1_g='', + verbose_printing=False): + ''' + RooFit statistical model for the data + + ''' + # Statistical model + w = ROOT.RooWorkspace('w') + #w.factory("EXPR::f1('cos(x)**2 + .01',x)") + w.factory("EXPR::f2('exp(x*-1)',x[0,5])") + w.factory("EXPR::f1('0.3 + exp(-(x-5)**2/5.)',x)") + w.factory("EXPR::f0('exp(-(x-2.5)**2/1.)',x)") + # w.factory("EXPR::f2('exp(-(x-2)**2/2)',x)") + w.factory("SUM::F0(c00[{0}]*f0,c01[{1}]*f1,f2)".format(c0[0], c0[1])) + w.factory("SUM::F1(c10[{0}]*f0,c11[{1}]*f1,f2)".format(c1[0], c1[1])) + + # Check Model + w.Print() + w.writeToFile( + '{0}/workspace_DecomposingTestOfMixtureModelsClassifiers.root'.format(dir)) + if verbose_printing: + printFrame(w, + ['x'], + [w.pdf('f0'), + w.pdf('f1'), + w.pdf('f2')], + 'decomposed_model', + ['f0', + 'f1', + 'f2'], + dir=dir, + model_g=model_g, + range=[-15, + 20], + title='Single distributions', + x_text='x0', + y_text='p(x)', + print_pdf=True) + printFrame(w, + ['x'], + [w.pdf('F0'), + w.pdf('F1')], + 'full_model', + ['Bkg', + 'Bkg+Signal'], + dir=dir, + model_g=model_g, + range=[-15, + 20], + title='Composed model', + x_text='x0', + y_text='p(x)', + print_pdf=True) + printFrame(w, + ['x'], + [w.pdf('F1'), + 'f0'], + 'full_signal', + ['Bkg', + 'Signal'], + dir=dir, + model_g=model_g, + range=[-15, + 20], + title='Background and signal', + x_text='x0', + y_text='p(x)', + print_pdf=True) + + +def makeData( + vars_g, + c0, + c1, + num_train=500, + num_test=100, + no_train=False, + workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', + dir='/afs/cern.ch/user/j/jpavezse/systematics', + c1_g='', + model_g='mlp'): + # Start generating data + ''' + Each function will be discriminated pair-wise + so n*n datasets are needed (maybe they can be reused?) + ''' + + f = ROOT.TFile('{0}/{1}'.format(dir, workspace)) + w = f.Get('w') + f.Close() + + print 'Making Data' + # Start generating data + ''' Each function will be discriminated pair-wise so n*n datasets are needed (maybe they can be reused?) 
- ''' - - # make data from root pdf - def makeDataFi(x, pdf, num): - traindata = np.zeros((num,len(vars_g))) - data = pdf.generate(x,num) - traindata[:] = [[data.get(i).getRealValue(var) for var in vars_g] - for i in range(num)] - return traindata - - # features - vars = ROOT.TList() - for var in vars_g: - vars.Add(w.var(var)) - x = ROOT.RooArgSet(vars) - - # make data from pdf and save to .dat in folder - # ./data/{model}/{c1} - for k,c in enumerate(c0): - print 'Making {0}'.format(k) - if not no_train: - traindata = makeDataFi(x,w.pdf('f{0}'.format(k)), num_train) - np.savetxt('{0}/data/{1}/{2}/train_{3}.dat'.format(dir,model_g,c1_g,k), - traindata,fmt='%f') - testdata = makeDataFi(x, w.pdf('f{0}'.format(k)), num_test) - np.savetxt('{0}/data/{1}/{2}/test_{3}.dat'.format(dir,model_g,c1_g,k), - testdata,fmt='%f') - if not no_train: - traindata = makeDataFi(x,w.pdf('F0'), num_train) - np.savetxt('{0}/data/{1}/{2}/train_F0.dat'.format(dir,model_g,c1_g), - traindata,fmt='%f') - traindata = makeDataFi(x,w.pdf('F1'), num_train) - np.savetxt('{0}/data/{1}/{2}/train_F1.dat'.format(dir,model_g,c1_g), - traindata,fmt='%f') - testdata = makeDataFi(x, w.pdf('F0'), num_test) - np.savetxt('{0}/data/{1}/{2}/test_F0.dat'.format(dir,model_g,c1_g), - testdata,fmt='%f') - testdata = makeDataFi(x, w.pdf('F1'), num_test) - np.savetxt('{0}/data/{1}/{2}/test_F1.dat'.format(dir,model_g,c1_g), - testdata,fmt='%f') + ''' + # make data from root pdf + def makeDataFi(x, pdf, num): + traindata = np.zeros((num, len(vars_g))) + data = pdf.generate(x, num) + traindata[:] = [[data.get(i).getRealValue(var) for var in vars_g] + for i in range(num)] + return traindata + # features + vars = ROOT.TList() + for var in vars_g: + vars.Add(w.var(var)) + x = ROOT.RooArgSet(vars) + + # make data from pdf and save to .dat in folder + # ./data/{model}/{c1} + for k, c in enumerate(c0): + print 'Making {0}'.format(k) + if not no_train: + traindata = makeDataFi(x, w.pdf('f{0}'.format(k)), num_train) + np.savetxt('{0}/data/{1}/{2}/train_{3}.dat'.format(dir, + model_g, c1_g, k), traindata, fmt='%f') + testdata = makeDataFi(x, w.pdf('f{0}'.format(k)), num_test) + np.savetxt('{0}/data/{1}/{2}/test_{3}.dat'.format(dir, + model_g, c1_g, k), testdata, fmt='%f') + if not no_train: + traindata = makeDataFi(x, w.pdf('F0'), num_train) + np.savetxt('{0}/data/{1}/{2}/train_F0.dat'.format(dir, model_g, c1_g), + traindata, fmt='%f') + traindata = makeDataFi(x, w.pdf('F1'), num_train) + np.savetxt('{0}/data/{1}/{2}/train_F1.dat'.format(dir, model_g, c1_g), + traindata, fmt='%f') + testdata = makeDataFi(x, w.pdf('F0'), num_test) + np.savetxt('{0}/data/{1}/{2}/test_F0.dat'.format(dir, model_g, c1_g), + testdata, fmt='%f') + testdata = makeDataFi(x, w.pdf('F1'), num_test) + np.savetxt('{0}/data/{1}/{2}/test_F1.dat'.format(dir, model_g, c1_g), + testdata, fmt='%f') diff --git a/mlp.py b/mlp.py index fb95651..02fafdb 100644 --- a/mlp.py +++ b/mlp.py @@ -17,7 +17,9 @@ from logistic_sgd import LogisticRegression + class HiddenLayer(object): + def __init__(self, rng, input, n_in, n_out, W=None, b=None, activation=T.tanh): """ @@ -61,9 +63,9 @@ def __init__(self, rng, input, n_in, n_out, W=None, b=None, # tanh. if W is None: W_values = numpy.asarray(rng.uniform( - low=-numpy.sqrt(6. / (n_in + n_out)), - high=numpy.sqrt(6. / (n_in + n_out)), - size=(n_in, n_out)), dtype=theano.config.floatX) + low=-numpy.sqrt(6. / (n_in + n_out)), + high=numpy.sqrt(6. 
/ (n_in + n_out)), + size=(n_in, n_out)), dtype=theano.config.floatX) if activation == theano.tensor.nnet.sigmoid: W_values *= 4 @@ -135,12 +137,12 @@ def __init__(self, rng, input, n_in, n_hidden, n_out): # L1 norm ; one regularization option is to enforce L1 norm to # be small self.L1 = abs(self.hiddenLayer.W).sum() \ - + abs(self.logRegressionLayer.W).sum() + + abs(self.logRegressionLayer.W).sum() # square of L2 norm ; one regularization option is to enforce # square of L2 norm to be small self.L2_sqr = (self.hiddenLayer.W ** 2).sum() \ - + (self.logRegressionLayer.W ** 2).sum() + + (self.logRegressionLayer.W ** 2).sum() # negative log likelihood of the MLP is given by the negative # log likelihood of the output of the model, computed in the @@ -161,27 +163,28 @@ def save_model(self, best_params, filename="params.pkl"): ''' Save parameters of the model ''' print '...saving model' - #if not os.path.isdir(save_dir): + # if not os.path.isdir(save_dir): # os.makedirs(save_dir) #save_file= open(os.path.join(save_dir, filename),'wb') - save_file= open(filename,'wb') + save_file = open(filename, 'wb') cPickle.dump(best_params, save_file, protocol=cPickle.HIGHEST_PROTOCOL) save_file.close() def load_model(self, filename="params.pkl"): ''' Save parameters of the model ''' - #print '...loading model' + # print '...loading model' save_file = open(filename, 'r') params = cPickle.load(save_file) save_file.close() - + self.hiddenLayer.W.set_value(params[0].get_value(), borrow=True) self.hiddenLayer.b.set_value(params[1].get_value(), borrow=True) self.logRegressionLayer.W.set_value(params[2].get_value(), borrow=True) self.logRegressionLayer.b.set_value(params[3].get_value(), borrow=True) + def shared_dataset(data_xy, borrow=True): """ Function that loads the dataset into shared variables @@ -193,7 +196,7 @@ def shared_dataset(data_xy, borrow=True): """ data_x, data_y = data_xy shared_x = theano.shared(numpy.asmatrix(data_x, - dtype=theano.config.floatX), + dtype=theano.config.floatX), borrow=borrow) shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX), @@ -207,31 +210,39 @@ def shared_dataset(data_xy, borrow=True): # lets ous get around this issue return shared_x, T.cast(shared_y, 'int32') -def make_predictions(dataset, learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=10, - batch_size=20, n_hidden=40,in_size=1,out_size=2, - model_file='model/mlp/adaptive_0_1.pkl'): + +def make_predictions( + dataset, + learning_rate=0.01, + L1_reg=0.00, + L2_reg=0.0001, + n_epochs=10, + batch_size=20, + n_hidden=40, + in_size=1, + out_size=2, + model_file='model/mlp/adaptive_0_1.pkl'): test_set_x = dataset in_size = test_set_x.shape[1] if len(test_set_x.shape) > 1 else 1 if in_size == 1: - test_set_x = test_set_x.reshape(test_set_x.shape[0],1) + test_set_x = test_set_x.reshape(test_set_x.shape[0], 1) else: - test_set_x = test_set_x.reshape(test_set_x.shape[0],in_size) + test_set_x = test_set_x.reshape(test_set_x.shape[0], in_size) # quick fix to avoid more change of code, have to change it test_set_y = numpy.ones(test_set_x.shape[0]) test_set_x, test_set_y = shared_dataset((test_set_x, test_set_y)) - ###################### # BUILD ACTUAL MODEL # ###################### - #print '... building the model' + # print '... 
building the model' # allocate symbolic variables for the data x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of - # [int] labels + # [int] labels rng = numpy.random.RandomState(1234) @@ -239,19 +250,28 @@ def make_predictions(dataset, learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_ classifier = MLP(rng=rng, input=x, n_in=in_size, n_hidden=n_hidden, n_out=out_size) - classifier.load_model(filename=model_file) - test_model = theano.function([], [classifier.predictions,classifier.y_out], - givens={x: test_set_x}) + test_model = theano.function( + [], [ + classifier.predictions, classifier.y_out], givens={ + x: test_set_x}) predictions, probs = test_model() return probs -def train_mlp(datasets,learning_rate=0.01, L1_reg=0.000, L2_reg=0.0001, n_epochs=100, - batch_size=50, n_hidden=40,in_size=1,out_size=2, - save_file='model/mlp/adaptive_0_1.pkl'): +def train_mlp( + datasets, + learning_rate=0.01, + L1_reg=0.000, + L2_reg=0.0001, + n_epochs=100, + batch_size=50, + n_hidden=40, + in_size=1, + out_size=2, + save_file='model/mlp/adaptive_0_1.pkl'): train_set_x, train_set_y = datasets in_size = train_set_x.shape[1] if len(train_set_x.shape) > 1 else 1 @@ -260,10 +280,10 @@ def train_mlp(datasets,learning_rate=0.01, L1_reg=0.000, L2_reg=0.0001, n_epochs train_set_x = train_set_x[indices] train_set_y = train_set_y[indices] if in_size == 1: - train_set_x = train_set_x.reshape(train_set_x.shape[0],1) + train_set_x = train_set_x.reshape(train_set_x.shape[0], 1) else: - train_set_x = train_set_x.reshape(train_set_x.shape[0],in_size) - train_set_x , train_set_y = shared_dataset((train_set_x, train_set_y)) + train_set_x = train_set_x.reshape(train_set_x.shape[0], in_size) + train_set_x, train_set_y = shared_dataset((train_set_x, train_set_y)) # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size @@ -277,8 +297,7 @@ def train_mlp(datasets,learning_rate=0.01, L1_reg=0.000, L2_reg=0.0001, n_epochs index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of - # [int] labels - + # [int] labels # construct the MLP class classifier = MLP(rng=rng, input=x, n_in=in_size, @@ -288,8 +307,8 @@ def train_mlp(datasets,learning_rate=0.01, L1_reg=0.000, L2_reg=0.0001, n_epochs # the model plus the regularization terms (L1 and L2); cost is expressed # here symbolically cost = classifier.negative_log_likelihood(y) \ - + L1_reg * classifier.L1 \ - + L2_reg * classifier.L2_sqr + + L1_reg * classifier.L1 \ + + L2_reg * classifier.L2_sqr # compute the gradient of cost with respect to theta (sotred in params) # the resulting gradients will be stored in a list gparams @@ -312,10 +331,10 @@ def train_mlp(datasets,learning_rate=0.01, L1_reg=0.000, L2_reg=0.0001, n_epochs # in the same time updates the parameter of the model based on the rules # defined in `updates` train_model = theano.function(inputs=[index], outputs=cost, - updates=updates, - givens={ - x: train_set_x[index * batch_size:(index + 1) * batch_size], - y: train_set_y[index * batch_size:(index + 1) * batch_size]}) + updates=updates, + givens={ + x: train_set_x[index * batch_size:(index + 1) * batch_size], + y: train_set_y[index * batch_size:(index + 1) * batch_size]}) ############### # TRAIN MODEL # @@ -330,17 +349,17 @@ def train_mlp(datasets,learning_rate=0.01, L1_reg=0.000, L2_reg=0.0001, 
n_epochs epoch = 0 done_looping = False - + while (epoch < n_epochs) and (not done_looping): minibatch_avg_cost = 0. epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): - minibatch_avg_cost += train_model(minibatch_index)/batch_size + minibatch_avg_cost += train_model(minibatch_index) / batch_size # iteration number iter = (epoch - 1) * n_train_batches + minibatch_index print 'Epoch: {0}, cost: {1}'.format( - epoch, minibatch_avg_cost) + epoch, minibatch_avg_cost) best_params = copy.deepcopy(classifier.params) classifier.save_model(best_params, save_file) @@ -348,15 +367,18 @@ def train_mlp(datasets,learning_rate=0.01, L1_reg=0.000, L2_reg=0.0001, n_epochs # This is a wrapper in order to make use on decomposing test easier class MLPTrainer(): - def __init__(self,n_hidden=40, L2_reg=0.001): + + def __init__(self, n_hidden=40, L2_reg=0.001): self.n_hidden = n_hidden self.L2_reg = L2_reg - def fit(self, X, y,save_file=''): - train_mlp((X,y), + def fit(self, X, y, save_file=''): + train_mlp((X, y), save_file=save_file, - n_hidden = self.n_hidden, L2_reg = self.L2_reg) - def predict_proba(self, X, model_file='model'): - return make_predictions(dataset=X, model_file=model_file, n_hidden = self.n_hidden) + n_hidden=self.n_hidden, L2_reg=self.L2_reg) - \ No newline at end of file + def predict_proba(self, X, model_file='model'): + return make_predictions( + dataset=X, + model_file=model_file, + n_hidden=self.n_hidden) diff --git a/pyMorphWrapper.py b/pyMorphWrapper.py deleted file mode 100644 index 2920fdd..0000000 --- a/pyMorphWrapper.py +++ /dev/null @@ -1,501 +0,0 @@ -''' -Simple python Wrapper for EFT Morphing code -author: jpavezse -''' - - -from ctypes import * -import numpy as np -import pdb -import time -from itertools import combinations -import copy - -from deap import base -from deap import tools -from deap import creator -from deap import algorithms -from scoop import futures -import multiprocessing - - -#reading C++ library -lib = cdll.LoadLibrary('./RandomEFT/RandomEFT/libcMorphWrapper.so') -#lib.getWeights.restype = c_char_p -lib.getCrossSections.restype = c_char_p - - -def initInd(icls, size, samples): - pop1 = np.zeros(len(samples)) - idx1 = np.random.choice(len(samples),size,replace=False) - pop1[idx1] = 1 - pop2 = np.zeros(len(samples)) - idx2 = list(set(range(len(samples))) - set(idx1)) - idx2 = np.random.choice(idx2, size, replace=False) - pop2[idx2] = 1 - return icls(np.append(pop1,pop2).astype(int)) - - -def mutateInd(individual): - # External mutation - size = len(individual) / 2 - ind_arr = np.array(individual) - i = np.random.choice(np.where(np.logical_and(ind_arr[:size] == 0, ind_arr[size:] == 0))[0]) - base = np.random.choice((0,1)) - individual[np.random.choice(np.where(np.array(individual)[base*size:(base+1)*size] == 1)[0]) + base*size] = 0 - individual[i + size*base] = 1 - # Internal Mutation - ind_arr = np.array(individual) - i = np.random.choice(np.where(np.logical_and(ind_arr[:size] == base, ind_arr[size:] == 1-base))[0]) - j = np.random.choice(np.where(np.logical_and(ind_arr[:size] == 1-base, ind_arr[size:] == base))[0]) - individual[i],individual[i+size] = individual[i+size],individual[i] - individual[j],individual[j+size] = individual[j+size],individual[j] - return individual, - -def cxOver(ind1, ind2, section, base_size): - size = len(ind1) / 2 - assert size > section - base = np.random.choice((0,1)) - choice = np.random.choice(size-section) - mask = np.zeros(len(ind1)/2, np.bool) - mask[choice:choice+section] = 1 - - ind1_base = list() - 
ind2_base = list() - - for base in (0,1): - ind1_base.append(np.array(ind1[base*size:(base+1)*size])) - ind2_base.append(np.array(ind2[base*size:(base+1)*size])) - tmp = np.copy(ind2_base[-1][mask]) - ind2_base[-1][mask] = ind1_base[-1][mask] - ind1_base[-1][mask] = tmp - - - for inds in (ind1_base,ind2_base): - for ind in inds: - if sum(ind) > base_size: # Case in which we add 1s - ind[np.random.choice(np.where(ind == 1)[0], sum(ind) - base_size,replace=False)] = 0 - if sum(ind) < base_size: # Case in which we add 0s - ind[np.random.choice(np.where(np.logical_and(inds[0] == 0, inds[1] == 0))[0], - base_size - sum(ind),replace=False)] = 1 - - ind1[:] = list(np.append(ind1_base[0],ind1_base[1])) - ind2[:] = list(np.append(ind2_base[0],ind2_base[1])) - - return ind1, ind2 - - -class MorphingWrapper: - def __init__(self): - # Initialize the C object - lib.MorphingWrapperNew.restype = POINTER(c_char) - self.obj = lib.MorphingWrapperNew() - self.mem_values = dict() - - def setStringData(self,data): - # Set samples data in the form of string: - #'Nsamples Ncouplings types[P|D|S]*Ncouplings target sample1 sample2 ...' - # where target are the N couplings of the sample to morph and sample* - # N couplings of each sample - self.string_data = data - s_data = create_string_buffer(data) - lib.setSampleData(self.obj,s_data) - - def getStringData(self): - string_data = '{0} {1} '.format(self.nsamples,self.ncouplings) - string_data = string_data + ' '.join(str(x) for x in self.types) + ' ' - string_data = string_data + ' '.join(str(x) for x in self. morphed) + ' ' - string_data = string_data + ' '.join(str(x) for sample in self.basis for x in sample) - return string_data - - def setSampleData(self,nsamples,ncouplings,types,morphed=[1.,1.,1.],samples=None,ncomb=20,used_samples=30): - lib.getWeights.restype = POINTER(c_float*(nsamples+2)) - self.nsamples = nsamples - self.ncouplings = ncouplings - self.types = types - self.morphed = morphed - self.samples = samples - self.basis = samples[:self.nsamples] - self.ncomb = ncomb - string_data = self.getStringData() - self.setStringData(string_data) - self.used_samples = used_samples - - def resetBasis(self, sample): - self.basis = sample - string_data = self.getStringData() - self.setStringData(string_data) - - def resetTarget(self, target): - self.morphed = target - string_data = self.getStringData() - self.setStringData(string_data) - - def getWeights(self): - # Return computed weights for each sample - s = lib.getWeights(self.obj) - results = [float(x) for x in s.contents] - weights = results[:-2] - self.det = results[-2] - self.cond = results[-1] - return weights - #return [float(x) for x in s.split()] - - def getCrossSections(self): - # Return cross section for each one of the samples - # The C++ version is not working - - #s = lib.getCrossSections(obj) - return self.__computeCrossSections() - - def __computeCrossSections(self): - # Compute Cross Section using analytical formula, only for VBF 2nSM for now - # Defining constants - ca = 0.70710678 - sa = 0.70710678 - c_ = [0.1220,0.05669,0.003246,0.00006837,0.04388,0.002685,0.00007918,0.00002289] - osm = 0.1151 - - def formula(ksm,khzz,kazz): - return 4*osm*((ca**4)*((ksm**4) + c_[0]*(ksm**3)*khzz + c_[1]*(ksm**2)*(khzz**2) + c_[2]*ksm*(khzz**3) + c_[3]*(khzz**4)) +\ - (ca**2)*(sa**2)*(c_[4]*(ksm**2)*(kazz**2) + c_[5]*ksm*khzz*(kazz**2) + c_[6]*(khzz**2)*(kazz**2)) + (sa**4)*c_[7]*(kazz**4)) - - cross_sections = [] - for sample in self.basis: - ksm = sample[0] - khzz = 16.247*sample[1] - kazz = 
16.247*sample[2] - cross_sections.append(formula(ksm,khzz,kazz)) - - return cross_sections - - def computeNeff(self,basis,samples): - # TODO : Harcoded! - #self.resetTarget(self.morphed) - self.resetTarget(self.morphed) - basis = [samples[b] for b in basis] - self.resetBasis(basis) - weights = np.array(self.getWeights()) - #print weights - cross_sections = np.array(self.getCrossSections()) - n_tot = (np.abs(np.multiply(weights,cross_sections))).sum() - n_eff = (np.multiply(weights,cross_sections)).sum() - return np.abs(n_eff / n_tot) - - def evalMean(self,x,samples,cvalues_1,cvalues_2,verb=False): - val = 0. - target = self.morphed[:] - norm = len(cvalues_1) * len(cvalues_2) - result = 0. - for val1 in cvalues_1: - for val2 in cvalues_2: - target[1] = val1 - target[2] = val2 - self.resetTarget(target) - res,det,cond = self.computeStats(x,samples) - result += (1./(val1**2 + val2**2)) * res - r1,det1,cond1 = self.computeStats(x,samples) - val = result/norm - if cond1 > 100000.: - val = 0. - #else: - # print val - return val - - def computeStats(self,basis,samples): - basis = [samples[b] for b in basis] - self.resetBasis(basis) - weights = np.array(self.getWeights()) - #print weights - cross_sections = np.array(self.getCrossSections()) - n_tot = (np.abs(np.multiply(weights,cross_sections))).sum() - n_eff = (np.multiply(weights,cross_sections)).sum() - return (np.abs(n_eff / n_tot),self.det,self.cond) - - def evalMaxPair(self,x,samples,cvalues_1,cvalues_2,verb=False): - val = 0. - target = self.morphed[:] - norm = len(cvalues_1) * len(cvalues_2) - result = 0. - for val1 in cvalues_1: - for val2 in cvalues_2: - target[1] = val1 - target[2] = val2 - self.resetTarget(target) - r1 = self.computeNeff(x[0],samples) - r2 = self.computeNeff(x[1],samples) - #result += np.max((r1, r2)) - result += r1 + r2 - #result = (r1 + r2)/2. - r1,det1,cond1 = self.computeStats(x[0],samples) - r2,det2,cond2 = self.computeStats(x[1],samples) - #self.resetTarget(self.morphed) - #r1 = self.computeNeff(x[0],samples) - #r2 = self.computeNeff(x[1],samples) - #result += 2.*(r1 + r2) - val = result/(norm) - if cond1 > 100000. or cond2 > 100000.: - val = 0. 
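- # The scan below scores every candidate basis pair with evalMaxPair
- # (the two bases' n_eff summed and averaged over the grid of coupling
- # values, set to 0 when the condition number is too large), keeps the
- # best pair seen so far, and stops early once a score above 0.8 is
- # found or the user interrupts with Ctrl-C.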
- #else: - # print val - return val - - def dynamicMorphing(self,target=None,cvalues_1=None,cvalues_2=None): - # This function will work in case you have more samples that needed - rng = np.random.RandomState(1234) - #rng = np.random.RandomState(1111) - samples = self.samples[:] - indexes = np.array([samples.index(x) for x in samples if x <> self.morphed]) - # This is the sample to fit (not necessarily the morphed when fitting) - if target <> None: - indexes = np.array([samples.index(x) for x in samples if x <> target]) - - samples_indexes = indexes - #samples_indexes = indexes[rng.choice(len(indexes), self.used_samples, replace=False)] - #samples_indexes = sorted(indexes,key=lambda x: np.sqrt(np.sum((np.array(samples[x])-self.morphed)**2))) - #samples_indexes = np.array((samples_indexes[:self.used_samples])) - print len(samples_indexes) - ncomb_indexes = samples_indexes[rng.choice(len(samples_indexes),self.ncomb,replace=False)] - - print 'Starting combinations' - #Considering condition number - comb = list(combinations(ncomb_indexes,self.nsamples)) - #comb = [comb[i] for i in rng.choice(len(comb),int(len(comb)/2),replace=False)] - start = time.time() - print 'Start sorting' - comb = sorted(comb,key=lambda x: self.computeNeff(x,samples),reverse=True)[:int(len(comb)*0.1)] - #comb = sorted(comb,key=lambda x: self.computeNeff(x,samples),reverse=True) - end = time.time() - print 'Elapsed time combinations: {0}'.format(end-start) - print 'Number of combinations: {0}'.format(len(comb)) - pairs = [] - for c in comb: - print '.', - #l1 = [i for i in samples_indexes if i not in c] - l1 = np.array(list(set(samples_indexes) - set(c))) - l1 = l1[rng.choice(len(l1),self.ncomb,replace=False)] - #comb2_len = self.nsamples*2 - len(indexes) - #comb2 = list(combinations(c,self.nsamples*2 - len(indexes))) - #comb2 = np.array(comb2)[rng.choice(len(comb2),int(len(comb2)*0.05),replace=False)] - #for c2 in comb2: - # pairs.append([c,tuple(l1 + list(c2))]) - if len(l1) <> self.nsamples: - comb2 = list(combinations(l1, self.nsamples)) - comb2 = np.array(comb2)[rng.choice(len(comb2), int(len(comb2) * 0.03),replace=False)] - for c2 in comb2: - pairs.append([c,tuple(list(c2))]) - else: - pairs.append([c,tuple(l1)]) - print '' - pairs = [pairs[i] for i in rng.choice(len(pairs),int(len(pairs)*0.3),replace=False)] - print 'Number of pairs: {0}'.format(len(pairs)) - start = time.time() - best_result = 0. 
- for p in pairs: - try: - result = self.evalMaxPair(p,samples,cvalues_1,cvalues_2) - if result > best_result: - best_result = result - best = p - if result > 0.8: - best = p - best_result = result - break - except KeyboardInterrupt: - print 'KeyboardInterrupt caught' - break - end = time.time() - print 'Elapsed time 2nd basis: {0}'.format(end-start) - print 'Best Result : {0}'.format(best_result) - print 'Results on target for each basis: ' - self.mem_values.clear() - self.resetTarget(self.morphed) - result,det,cond = self.computeStats(best[0],samples) - print 'Result: {0} ,Det: {1}, Cond: {2}'.format(result, det, cond) - result,det,cond = self.computeStats(best[1],samples) - print 'Result: {0} ,Det: {1}, Cond: {2}'.format(result, det, cond) - - return best - - def dynamicMorphing_(self,target=None,cvalues_1=None,cvalues_2=None): - # This function will work in case you have more samples that needed - rng = np.random.RandomState(1234) - #rng = np.random.RandomState(1111) - samples = self.samples[:] - indexes = np.array([samples.index(x) for x in samples if x <> self.morphed]) - # This is the sample to fit (not necessarily the morphed when fitting) - if target <> None: - indexes = np.array([samples.index(x) for x in samples if x <> target]) - - samples_indexes = indexes - print len(samples_indexes) - ncomb_indexes = samples_indexes[rng.choice(len(samples_indexes),self.ncomb,replace=False)] - - print 'Starting combinations' - #Considering condition number - comb = list(combinations(ncomb_indexes,self.nsamples)) - #comb = [comb[i] for i in rng.choice(len(comb),int(len(comb)/2),replace=False)] - start = time.time() - print 'Start sorting' - comb = sorted(comb,key=lambda x: self.computeNeff(x,samples),reverse=True)[:int(len(comb)*0.1)] - #comb = sorted(comb,key=lambda x: self.computeNeff(x,samples),reverse=True) - end = time.time() - print 'Elapsed time combinations: {0}'.format(end-start) - print 'Number of combinations: {0}'.format(len(comb)) - pairs = [] - for c in comb: - print '.', - l1 = [i for i in samples_indexes if i not in c] - comb2_len = self.nsamples*2 - len(indexes) - comb2 = list(combinations(c,self.nsamples*2 - len(indexes))) - comb2 = np.array(comb2)[rng.choice(len(comb2),int(len(comb2)*0.05),replace=False)] - for c2 in comb2: - pairs.append([c,tuple(l1 + list(c2))]) - print '' - pairs = [pairs[i] for i in rng.choice(len(pairs),int(len(pairs)*0.3),replace=False)] - print 'Number of pairs: {0}'.format(len(pairs)) - start = time.time() - best_result = 0. 
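- # The scan below scores each single-basis candidate with evalMean
- # (n_eff weighted by 1/(val1^2 + val2^2) and averaged over the coupling
- # grid), or with evalMeanSingle when only one range of coupling values
- # is given, and keeps the highest-scoring basis.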
- for p in pairs: - try: - result = self.evalMaxPair(p,samples,cvalues_1,cvalues_2) - if result > best_result: - best_result = result - best = p - if result > 0.8: - best = p - best_result = result - break - except KeyboardInterrupt: - print 'KeyboardInterrupt caught' - break - end = time.time() - print 'Elapsed time 2nd basis: {0}'.format(end-start) - print 'Best Result : {0}'.format(best_result) - print 'Results on target for each basis: ' - self.mem_values.clear() - self.resetTarget(self.morphed) - result,det,cond = self.computeStats(best[0],samples) - print 'Result: {0} ,Det: {1}, Cond: {2}'.format(result, det, cond) - result,det,cond = self.computeStats(best[1],samples) - print 'Result: {0} ,Det: {1}, Cond: {2}'.format(result, det, cond) - - return best - - - def simpleMorphing(self,target=None,cvalues_1=None,cvalues_2=None,c_eval=0): - # TODO: Have to check the condition number and det - # This function will work in case you have more samples that needed - rng = np.random.RandomState(1234) - samples = self.samples[:] - indexes = np.array([samples.index(x) for x in samples if x <> self.morphed]) - # This is the sample to fit (not necessarily the morphed when fitting) - if target <> None: - indexes = np.array([samples.index(x) for x in samples if x <> target]) - #indexes = sorted(indexes,key=lambda x: np.sqrt(np.sum((np.array(samples[x])-self.morphed)**2))) - #ncomb_indexes = indexes[:self.ncomb] - ncomb_indexes = indexes[rng.choice(len(indexes),self.ncomb,replace=False)] - #Considering condition number - comb = list(combinations(ncomb_indexes,self.nsamples)) - comb = [comb[i] for i in rng.choice(len(comb),int(len(comb)/2),replace=False)] - start = time.time() - comb = sorted(comb,key=lambda x: self.computeNeff(x,samples),reverse=True)[:int(len(comb)*0.3)] - end = time.time() - print 'elapsed time : {0}'.format(end-start) - print len(comb) - start = time.time() - comb = [comb[i] for i in rng.choice(len(comb),int(len(comb)*0.1),replace=False)] - best_result = 0. 
- for p in comb: - result = self.evalMean(p,samples,cvalues_1,cvalues_2) if cvalues_2 <> None else\ - self.evalMeanSingle(p,samples,cvalues_1,c_eval) - if result > best_result: - best_result = result - best = p - end = time.time() - self.resetTarget(self.morphed) - result,det,cond = self.computeStats(best,samples) - print 'Result: {0} ,Det: {1}, Cond: {2}'.format(result, det, cond) - print 'elapsed time : {0}'.format(end-start) - #print 'n_eff: {0}'.format(self.computeNeff(best,samples)) - #self.resetBasis([samples[b] for b in best]) - - return best - - def evaluateInd(self,individual,cvalues_1, cvalues_2): - size = len(individual) / 2 - ind1,ind2 = np.array(individual[:size]),np.array(individual[size:]) - # Check that individual is a valid sample - assert sum(ind1) == 15 - assert sum(ind2) == 15 - assert sum(np.logical_and(ind1,ind2)) == 0 - p = map(list,(np.where(ind1 == 1)[0], np.where(ind2 == 1)[0])) - result = self.evalMaxPair(p,self.samples,cvalues_1,cvalues_2) - return result, - - # Workaround for parallel processing - def __call__(self, individual, cvalues_1, cvalues_2): - return self.evaluateInd(individual, cvalues_1, cvalues_2) - - def evolMorphing(self,target=None,cvalues_1=None,cvalues_2=None,c_eval=0): - - res = initInd(np.array, 15, self.samples) - - toolbox = base.Toolbox() - creator.create("FitnessMax", base.Fitness, weights=(1.0,), typecode='d') - creator.create("Individual", list, fitness=creator.FitnessMax) - toolbox.register("individual", initInd, creator.Individual, size=15, samples=self.samples) - ind1 = toolbox.individual() - ind2 = toolbox.individual() - - #toolbox.register('evaluate', self.evaluateInd, cvalues_1=cvalues_1, cvalues_2=cvalues_2) - toolbox.register('evaluate', self, cvalues_1=cvalues_1, cvalues_2=cvalues_2) - - #mutateInd(ind) - toolbox.register('mutate', mutateInd) - - toolbox.register('mate', cxOver, section=3, base_size=15) - toolbox.register("select", tools.selTournament, tournsize=3) - #toolbox.register("select", tools.selRoulette) - toolbox.register("population", tools.initRepeat, list, toolbox.individual) - #toolbox.register("map", futures.map) - pool = multiprocessing.Pool() - toolbox.register("map", pool.map) - - halloffame = tools.HallOfFame(maxsize=10) - - mu_es, lambda_es = 3,21 - - pop_stats = tools.Statistics(key=lambda ind: ind.fitness.values) - #pop_stats.register('pop', copy.deepcopy) - pop_stats.register('avg',np.mean) - pop_stats.register('max',np.max) - #toolbox.mate(ind1,ind2) - pop = toolbox.population(n=50) - start = time.time() - pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.5, ngen=500, stats=pop_stats, halloffame=halloffame) - #pop, logbook = algorithms.eaMuCommaLambda(pop, toolbox, mu=mu_es, lambda_=lambda_es, - # cxpb=0.6, mutpb=0.3, ngen=50, stats=pop_stats, halloffame=halloffame) - end = time.time() - halloffame.update(pop) - - individual = halloffame[0] - size = len(individual) / 2 - ind1,ind2 = np.array(individual[:size]),np.array(individual[size:]) - # Check that individual is a valid sample - assert sum(ind1) == 15 - assert sum(ind2) == 15 - assert sum(np.logical_and(ind1,ind2)) == 0 - p = map(list,(np.where(ind1 == 1)[0], np.where(ind2 == 1)[0])) - print self.evalMaxPair(p,self.samples,cvalues_1,cvalues_2) - - best = p - self.mem_values.clear() - self.resetTarget(self.morphed) - result,det,cond = self.computeStats(best[0],self.samples) - print 'Result: {0} ,Det: {1}, Cond: {2}'.format(result, det, cond) - result,det,cond = self.computeStats(best[1],self.samples) - print 'Result: {0} ,Det: {1}, 
Cond: {2}'.format(result, det, cond) - print 'elapsed time : {0}'.format(end-start) - return p - diff --git a/train_classifiers.py b/train_classifiers.py index 473dc03..862dbbf 100644 --- a/train_classifiers.py +++ b/train_classifiers.py @@ -6,7 +6,6 @@ import numpy as np from sklearn import svm, linear_model from sklearn.externals import joblib -from sklearn.metrics import roc_curve, auc from sklearn.ensemble import GradientBoostingClassifier from sklearn import cross_validation from xgboost_wrapper import XGBoostClassifier @@ -16,102 +15,108 @@ from os import listdir from os.path import isfile, join import os.path -import pdb from utils import printMultiFrame, printFrame, saveFig, loadData -''' +''' Train each one of the classifiers on the data ./data/c1/{train|test}_i_j.dat each model is saved on the file ./model/c1/{model_file}_i_j.pkl ''' -def trainClassifiers(clf,nsamples, - model_g='mlp',c1_g='', - dir='/afs/cern.ch/user/j/jpavezse/systematics', - model_file='adaptive', - dataset_names = None, - full_names = None, - data_file='train', - preprocessing=False, - seed=1234, - index = None, - vars_names = None - ): - ''' - Train classifiers pair-wise on - datasets - ''' - print 'Training classifier' - scaler=None - if preprocessing == True: - scaler = {} - for k in range(nsamples): - for j in range(nsamples): - if k==j or k > j: - continue - if dataset_names <> None: - name_k, name_j = (dataset_names[k], dataset_names[j]) - else: - name_k, name_j = (k,j) - print " Training Classifier on {0}/{1}".format(name_k,name_j) - traindata,targetdata = loadData(data_file,name_k,name_j,dir=dir,c1_g=c1_g, - preprocessing=preprocessing, scaler=scaler) - if model_g == 'mlp': - clf.fit(traindata, targetdata, save_file='{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(dir,model_g,c1_g,model_file,k,j)) - else: + +def trainClassifiers(clf, nsamples, + model_g='mlp', c1_g='', + dir='/afs/cern.ch/user/j/jpavezse/systematics', + model_file='adaptive', + dataset_names=None, + full_names=None, + data_file='train', + seed=1234, + index=None, + vars_names=None + ): + ''' + Train classifiers pair-wise on + datasets + ''' + print 'Training classifier' + for k in range(nsamples): + for j in range(nsamples): + if k == j or k > j: + continue + if dataset_names is not None: + name_k, name_j = (dataset_names[k], dataset_names[j]) + else: + name_k, name_j = (k, j) + print " Training Classifier on {0}/{1}".format(name_k, name_j) + traindata, targetdata = loadData(data_file, name_k, name_j, dir=dir, c1_g=c1_g) + if model_g == 'mlp': + clf.fit(traindata, + targetdata, + save_file='{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(dir, + model_g, + c1_g, + model_file, + k, + j)) + else: + rng = np.random.RandomState(seed) + indices = rng.permutation(traindata.shape[0]) + traindata = traindata[indices] + targetdata = targetdata[indices] + scores = cross_validation.cross_val_score(clf, traindata.reshape( + traindata.shape[0], traindata.shape[1]), targetdata) + print "Accuracy: {0} (+/- {1})".format(scores.mean(), scores.std() * 2) + clf.fit( + traindata.reshape( + traindata.shape[0], + traindata.shape[1]), + targetdata) + joblib.dump(clf, + '{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(dir, + model_g, + c1_g, + model_file, + k, + j)) + print " Training Classifier on F0/F1" + traindata, targetdata = loadData(data_file, 'F0' if full_names is None else full_names[0], + 'F1' if full_names is None else full_names[1], dir=dir, c1_g=c1_g) + if model_g == 'mlp': + clf.fit(traindata, + targetdata, + 
save_file='{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(dir, + model_g, + c1_g, + model_file)) + else: rng = np.random.RandomState(seed) indices = rng.permutation(traindata.shape[0]) traindata = traindata[indices] targetdata = targetdata[indices] - #scores = cross_validation.cross_val_score(clf, traindata.reshape(traindata.shape[0], - #traindata.shape[1]), targetdata) - #print "Accuracy: {0} (+/- {1})".format(scores.mean(), scores.std() * 2) - clf.fit(traindata.reshape(traindata.shape[0],traindata.shape[1]) - ,targetdata) - #joblib.dump(clf, '/afs/cern.ch/work/j/jpavezse/private/{0}_{1}_{2}.pkl'.format(model_file,k,j)) - joblib.dump(clf, '{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(dir,model_g,c1_g,model_file,k,j)) - print " Training Classifier on F0/F1" - traindata,targetdata = loadData(data_file,'F0' if full_names == None else full_names[0], - 'F1' if full_names == None else full_names[1],dir=dir,c1_g=c1_g) - if model_g == 'mlp': - clf.fit(traindata, targetdata, save_file='{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(dir,model_g,c1_g,model_file)) - else: - rng = np.random.RandomState(seed) - indices = rng.permutation(traindata.shape[0]) - traindata = traindata[indices] - targetdata = targetdata[indices] - #clf = svm.NuSVC(probability=True) #Why use a SVR?? - scores = cross_validation.cross_val_score(clf, traindata, targetdata) - print "Accuracy: {0} (+/- {1})".format(scores.mean(), scores.std() * 2) - clf.fit(traindata,targetdata) - #clf.plot_importance_matrix(vars_names) - #joblib.dump(clf, '/afs/cern.ch/work/j/jpavezse/private/{0}_F0_F1.pkl'.format(model_file)) - joblib.dump(clf, '{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(dir,model_g,c1_g,model_file)) - - - return scaler + scores = cross_validation.cross_val_score(clf, traindata, targetdata) + print "Accuracy: {0} (+/- {1})".format(scores.mean(), scores.std() * 2) + clf.fit(traindata, targetdata) + joblib.dump( + clf, '{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(dir, model_g, c1_g, model_file)) -def predict(filename, traindata,model_g='mlp', sig=1, clf=None): - sfilename,k,j = filename.split('/')[-1].split('_') - sfilename = '/'.join(filename.split('/')[:-1]) + '/' + sfilename - j = j.split('.')[0] - sig = 1 - if k <> 'F0': - k = int(k) - j = int(j) - sig = 1 if k < j else 0 - filename = '{0}_{1}_{2}.pkl'.format(sfilename,min(k,j),max(k,j)) - if clf <> None: - return clf.predict_proba(traindata, model_file=filename)[:,sig] - else: - clf = joblib.load(filename) - if clf.__class__.__name__ == 'NuSVR': - output = clf.predict(traindata) - return np.clip(output,0.,1.) +def predict(filename, traindata, model_g='mlp', sig=1, clf=None): + sfilename, k, j = filename.split('/')[-1].split('_') + sfilename = '/'.join(filename.split('/')[:-1]) + '/' + sfilename + j = j.split('.')[0] + sig = 1 + if k != 'F0': + k = int(k) + j = int(j) + sig = 1 if k < j else 0 + filename = '{0}_{1}_{2}.pkl'.format(sfilename, min(k, j), max(k, j)) + if clf is not None: + return clf.predict_proba(traindata, model_file=filename)[:, sig] else: - return clf.predict_proba(traindata)[:,sig] - - - - + clf = joblib.load(filename) + if clf.__class__.__name__ == 'NuSVR': + output = clf.predict(traindata) + return np.clip(output, 0., 1.) 
+ else: + return clf.predict_proba(traindata)[:, sig] diff --git a/utils.py b/utils.py index abcd7ab..f4ce6e9 100644 --- a/utils.py +++ b/utils.py @@ -12,650 +12,604 @@ import os.path import pdb -from matplotlib.colors import LogNorm import matplotlib.pyplot as plt import matplotlib.cm as cm -from mpl_toolkits.mplot3d import Axes3D from sklearn import preprocessing -''' +''' Some usefull functions to print and load data ''' -def makePlotName(full, truth, f0 = None, f1 = None, type=None, - dir='/afs/cern.ch/user/j/jpavezse/systematics', - c1_g='',model_g='mlp'): - if full == 'dec': - return '{0}_{1}_f{2}_f{3}_{4}_{5}'.format(full, truth, f0, f1, model_g,type) - else: - return '{0}_{1}_{2}_{3}'.format(full, truth, model_g,type) - -def preProcessing(traindata,k,j, scaler): - assert scaler <> None - if (k,j) not in scaler: - scaler[(k,j)] = preprocessing.StandardScaler().fit(traindata) - traindata = scaler[(k,j)].transform(traindata) - return traindata - -def loadData(type,k,j,folder=None,dir='',c1_g='',preprocessing=False,scaler=None, persist=False, - model_g='mlp'): - if folder <> None: - fk = np.loadtxt('{0}/{1}_{2}.dat'.format(folder,type,k)) - fj = np.loadtxt('{0}/{1}_{2}.dat'.format(folder,type,j)) - else: - fk = np.loadtxt('{0}/data/{1}/{2}/{3}_{4}.dat'.format(dir,model_g,c1_g,type,k)) - fj = np.loadtxt('{0}/data/{1}/{2}/{3}_{4}.dat'.format(dir,model_g,c1_g,type,j)) - rng = np.random.RandomState(1111) - # HARCODED RIGHT NOW - - #data_num = 19900 - #data_num = 5000 - #indices = rng.choice(fk.shape[0],data_num,replace=False) - #fk = fk[indices] - #indices = rng.choice(fj.shape[0],data_num,replace=False) - #fj = fj[indices] - - #fk = fk[:,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,16,20,24,25,26,27,28,29,30,31,36,40,42]] - #fj = fj[:,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,16,20,24,25,26,27,28,29,30,31,36,40,42]] - - num1 = fj.shape[0] - num0 = fk.shape[0] - traindata = np.zeros((num0+num1,fk.shape[1])) if len(fk.shape) > 1 else \ - np.zeros(num0+num1) - targetdata = np.zeros(num0+num1) - traindata[:num1] = fj[:] - traindata[num1:] = fk[:] - targetdata[:num1].fill(1) - targetdata[num1:].fill(0) - if preprocessing == True: - traindata = preProcessing(traindata,k,j, scaler) - # this should be in the preProcessing? - if persist == True: - joblib.dump(scaler[(k,j)],'{0}/model/{1}/{2}/{3}_{4}_{5}.dat'.format(dir,'mlp',c1_g,'scaler',k,j)) - return (traindata, targetdata) - - -def printMultiFrame(w,obs,all_pdfs,name,legends, - dir='/afs/cern.ch/user/j/jpavezse/systematics', - model_g='mlp',setLog=False,y_text='',print_pdf=False,title='', - x_text='x'): - ''' - This just print a bunch of pdfs - in a Canvas - ''' - - # Preliminaries - ROOT.gROOT.SetStyle('Plain') - #ROOT.gStyle.SetOptTitle(0) - ROOT.gStyle.SetOptStat(0) - ROOT.gStyle.SetOptFit(1) - ROOT.gStyle.SetPalette(1) - - ROOT.gStyle.SetTitleX(0.5) - ROOT.gStyle.SetTitleAlign(23) - ROOT.gStyle.SetTitleBorderSize(0) - - # Hope I don't need more colors ... 
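- # Line colors and styles below are assigned by the position of each pdf
- # within the current subplot (blue, red, black, green, yellow; the third
- # and fourth entries drawn dashed).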
- colors = [ROOT.kBlue,ROOT.kRed,ROOT.kBlack,ROOT.kGreen, - ROOT.kYellow] - style = [ROOT.kSolid,ROOT.kSolid,ROOT.kDashed,ROOT.kDashed] - can = ROOT.TCanvas('c1') - can.Divide(1,len(all_pdfs)) - legs = [] - frames = [] - for curr,pdf in enumerate(all_pdfs): - x = w.var(obs[curr]) - can.cd(curr+1) - if curr <> len(pdf) - 1: - #ROOT.gPad.SetBottomMargin(0.01) - if curr == 0: - ROOT.gPad.SetTopMargin(0.09) - else: - ROOT.gPad.SetTopMargin(0.01) - ROOT.gPad.SetRightMargin(0.01) + +def makePlotName(full, truth, f0=None, f1=None, type=None, + dir='/afs/cern.ch/user/j/jpavezse/systematics', + c1_g='', model_g='mlp'): + if full == 'dec': + return '{0}_{1}_f{2}_f{3}_{4}_{5}'.format( + full, truth, f0, f1, model_g, type) else: - ROOT.gPad.SetTopMargin(0.01) - ROOT.gPad.SetRightMargin(0.01) - ROOT.gPad.SetLeftMargin(0.04) - if setLog == True: - ROOT.gPad.SetLogy(1) + return '{0}_{1}_{2}_{3}'.format(full, truth, model_g, type) + + +def loadData(type, k, j, folder=None, dir='', c1_g='', model_g='mlp'): + if folder is not None: + fk = np.loadtxt('{0}/{1}_{2}.dat'.format(folder, type, k)) + fj = np.loadtxt('{0}/{1}_{2}.dat'.format(folder, type, j)) + else: + fk = np.loadtxt( + '{0}/data/{1}/{2}/{3}_{4}.dat'.format(dir, model_g, c1_g, type, k)) + fj = np.loadtxt( + '{0}/data/{1}/{2}/{3}_{4}.dat'.format(dir, model_g, c1_g, type, j)) + rng = np.random.RandomState(1111) + + num1 = fj.shape[0] + num0 = fk.shape[0] + traindata = np.zeros( + (num0 + num1, + fk.shape[1])) if len( + fk.shape) > 1 else np.zeros( + num0 + num1) + targetdata = np.zeros(num0 + num1) + traindata[:num1] = fj[:] + traindata[num1:] = fk[:] + targetdata[:num1].fill(1) + targetdata[num1:].fill(0) + + return (traindata, targetdata) + +def makePreliminaries(): + # Preliminaries + ROOT.gROOT.SetStyle('Plain') + ROOT.gStyle.SetOptStat(0) + ROOT.gStyle.SetOptFit(1) + ROOT.gStyle.SetPalette(1) + + ROOT.gStyle.SetTitleX(0.5) + ROOT.gStyle.SetTitleAlign(23) + ROOT.gStyle.SetTitleBorderSize(0) + +def printMultiFrame( + w, + obs, + all_pdfs, + name, + legends, + dir='/afs/cern.ch/user/j/jpavezse/systematics', + model_g='mlp', + setLog=False, + y_text='', + print_pdf=False, + title='', + x_text='x'): + ''' + This just print a bunch of pdfs + in a Canvas + ''' + + makePreliminaries() + colors = [ROOT.kBlue, ROOT.kRed, ROOT.kBlack, ROOT.kGreen, + ROOT.kYellow] + style = [ROOT.kSolid, ROOT.kSolid, ROOT.kDashed, ROOT.kDashed] + can = ROOT.TCanvas('c1') + can.Divide(1, len(all_pdfs)) + legs = [] + frames = [] + for curr, pdf in enumerate(all_pdfs): + x = w.var(obs[curr]) + can.cd(curr + 1) + if curr != len(pdf) - 1: + if curr == 0: + ROOT.gPad.SetTopMargin(0.09) + else: + ROOT.gPad.SetTopMargin(0.01) + ROOT.gPad.SetRightMargin(0.01) + else: + ROOT.gPad.SetTopMargin(0.01) + ROOT.gPad.SetRightMargin(0.01) + ROOT.gPad.SetLeftMargin(0.04) + if setLog: + ROOT.gPad.SetLogy(1) + funcs = [] + line_colors = [] + line_styles = [] + for i, p in enumerate(pdf): + funcs.append(p) + line_colors.append(ROOT.RooFit.LineColor(colors[i])) + line_styles.append(ROOT.RooFit.LineStyle(style[i])) + frames.append(x.frame(ROOT.RooFit.Name(legends[curr][0]), ROOT.RooFit. 
+ Title(legends[curr][0].split('_')[0]))) + for i, f in enumerate(funcs): + if isinstance(f, str): + funcs[0].plotOn(frames[-1], + ROOT.RooFit.Components(f), + ROOT.RooFit.Name(legends[curr][i]), + line_colors[i], + line_styles[i]) + else: + f.plotOn(frames[-1], + ROOT.RooFit.Name(legends[curr][i]), + line_colors[i], + line_styles[i]) + legs.append(ROOT.TLegend(0.79, 0.73, 0.90, 0.87)) + for i, l in enumerate(legends[curr]): + if i == 0: + legs[-1].AddEntry(frames[-1].findObject(legends[curr] + [i]), l.split('_')[1], 'l') + else: + legs[-1].AddEntry(frames[-1].findObject(legends[curr] + [i]), l.split('_')[1], 'l') + legs[-1].SetFillColor(0) + legs[-1].SetBorderSize(0) + legs[-1].SetTextSize(0.06) + legs[-1].SetFillColor(0) + legs[-1].SetBorderSize(0) + frames[-1].SetTitleSize(0.06, "Y") + frames[-1].SetTitleSize(0.06, "X") + frames[-1].GetYaxis().CenterTitle(1) + frames[-1].GetYaxis().SetTitleOffset(0.35) + if curr == 0: + frames[-1].SetTitle("{0};;{1}".format(title, y_text)) + elif curr == len(all_pdfs) - 1: + frames[-1].SetTitle(";{0};{1}".format(x_text, y_text)) + frames[-1].GetXaxis().SetTitleOffset(0.25) + else: + frames[-1].SetTitle(";;{0}".format(y_text)) + + frames[-1].Draw() + legs[-1].Draw() + ROOT.gPad.Update() + can.Modified() + can.Update() + can.SaveAs('{0}/plots/{1}/{2}.png'.format(dir, model_g, name)) + if print_pdf: + can.SaveAs('{0}/plots/{1}/{2}.pdf'.format(dir, model_g, name)) + + +def printFrame(w, obs, pdf, name, legends, + dir='/afs/cern.ch/user/j/jpavezse/systematics', model_g='mlp', + title='', y_text='', x_text='', range=None, print_pdf=False + ): + ''' + This just print a bunch of pdfs + in a Canvas + ''' + # Preliminaries + makePreliminaries() + if len(obs) > 1: + ROOT.gStyle.SetOptTitle(0) + + colors = [ROOT.kBlue, ROOT.kRed, ROOT.kGreen, ROOT.kBlack, + ROOT.kYellow] + x = [] + for var in obs: + x.append(w.var(var)) funcs = [] line_colors = [] - line_styles = [] - for i,p in enumerate(pdf): - funcs.append(p) - line_colors.append(ROOT.RooFit.LineColor(colors[i])) - line_styles.append(ROOT.RooFit.LineStyle(style[i])) - frames.append(x.frame(ROOT.RooFit.Name(legends[curr][0]),ROOT.RooFit. 
- Title(legends[curr][0].split('_')[0]))) - for i,f in enumerate(funcs): - if isinstance(f,str): - funcs[0].plotOn(frames[-1], ROOT.RooFit.Components(f),ROOT.RooFit.Name(legends[curr][i]), line_colors[i], - line_styles[i]) - else: - f.plotOn(frames[-1],ROOT.RooFit.Name(legends[curr][i]),line_colors[i],line_styles[i]) - legs.append(ROOT.TLegend(0.79, 0.73, 0.90, 0.87)) - #leg.SetFillColor(ROOT.kWhite) - #leg.SetLineColor(ROOT.kWhite) - # TODO This is just a quick fix because is now working how it should - for i,l in enumerate(legends[curr]): - if i == 0: - legs[-1].AddEntry(frames[-1].findObject(legends[curr][i]), l.split('_')[1], 'l') - else: - legs[-1].AddEntry(frames[-1].findObject(legends[curr][i]), l.split('_')[1], 'l') - legs[-1].SetFillColor(0) - legs[-1].SetBorderSize(0) - legs[-1].SetTextSize(0.06) - legs[-1].SetFillColor(0) - legs[-1].SetBorderSize(0) - frames[-1].SetTitleSize(0.06,"Y") - frames[-1].SetTitleSize(0.06,"X") - frames[-1].GetYaxis().CenterTitle(1) - frames[-1].GetYaxis().SetTitleOffset(0.35) - if curr == 0: - frames[-1].SetTitle("{0};;{1}".format(title,y_text)) - elif curr == len(all_pdfs)-1: - frames[-1].SetTitle(";{0};{1}".format(x_text,y_text)) - frames[-1].GetXaxis().SetTitleOffset(0.25) - else: - frames[-1].SetTitle(";;{0}".format(y_text)) - - frames[-1].Draw() - legs[-1].Draw() - ROOT.gPad.Update() - can.Modified() - can.Update() - can.SaveAs('{0}/plots/{1}/{2}.png'.format(dir,model_g,name)) - if print_pdf == True: - can.SaveAs('{0}/plots/{1}/{2}.pdf'.format(dir,model_g,name)) - -def printFrame(w,obs,pdf,name,legends, - dir='/afs/cern.ch/user/j/jpavezse/systematics',model_g='mlp', - title='',y_text='',x_text='',range=None,print_pdf=False - ): - ''' - This just print a bunch of pdfs - in a Canvas - ''' - # Preliminaries - ROOT.gROOT.SetStyle('Plain') - if len(obs) > 1: - ROOT.gStyle.SetOptTitle(0) - ROOT.gStyle.SetOptStat(0) - ROOT.gStyle.SetOptFit(1) - ROOT.gStyle.SetPalette(1) - - ROOT.gStyle.SetTitleX(0.5) - ROOT.gStyle.SetTitleAlign(23) - ROOT.gStyle.SetTitleBorderSize(0) - - # Hope I don't need more colors ... 
- colors = [ROOT.kBlue,ROOT.kRed,ROOT.kGreen,ROOT.kBlack, - ROOT.kYellow] - x = [] - for var in obs: - x.append(w.var(var)) - funcs = [] - line_colors = [] - for i,p in enumerate(pdf): - funcs.append(p) - line_colors.append(ROOT.RooFit.LineColor(colors[i])) - - can = ROOT.TCanvas('c1') - can.Divide(1,len(obs)) - frame = [] - for var in x: - frame.append(var.frame()) - legs = [] - for j,fra in enumerate(frame): - can.cd(j+1) - if len(obs) == 1: - ROOT.gPad.SetRightMargin(0.01) - else: - if j <> len(obs) - 1: - #ROOT.gPad.SetBottomMargin(0.01) - if j == 0: - ROOT.gPad.SetTopMargin(0.09) + for i, p in enumerate(pdf): + funcs.append(p) + line_colors.append(ROOT.RooFit.LineColor(colors[i])) + + can = ROOT.TCanvas('c1') + can.Divide(1, len(obs)) + frame = [] + for var in x: + frame.append(var.frame()) + legs = [] + for j, fra in enumerate(frame): + can.cd(j + 1) + if len(obs) == 1: + ROOT.gPad.SetRightMargin(0.01) else: - ROOT.gPad.SetTopMargin(0.01) - ROOT.gPad.SetRightMargin(0.01) - else: - ROOT.gPad.SetTopMargin(0.01) - ROOT.gPad.SetRightMargin(0.01) - ROOT.gPad.SetLeftMargin(0.04) - for i,f in enumerate(funcs): - if isinstance(f,str): - funcs[0].plotOn(fra, ROOT.RooFit.Components(f),ROOT.RooFit.Name(legends[i]), line_colors[i]) + if j != len(obs) - 1: + # ROOT.gPad.SetBottomMargin(0.01) + if j == 0: + ROOT.gPad.SetTopMargin(0.09) + else: + ROOT.gPad.SetTopMargin(0.01) + ROOT.gPad.SetRightMargin(0.01) + else: + ROOT.gPad.SetTopMargin(0.01) + ROOT.gPad.SetRightMargin(0.01) + ROOT.gPad.SetLeftMargin(0.04) + for i, f in enumerate(funcs): + if isinstance(f, str): + funcs[0].plotOn( + fra, ROOT.RooFit.Components(f), ROOT.RooFit.Name( + legends[i]), line_colors[i]) + else: + f.plotOn(fra, ROOT.RooFit.Name(legends[i]), line_colors[i]) + legs.append(ROOT.TLegend(0.69, 0.73, 0.89, 0.87)) + + for i, l in enumerate(legends): + if i == 0: + legs[-1].AddEntry(fra.findObject(legends[i]), l, 'l') + else: + legs[-1].AddEntry(fra.findObject(legends[i]), l, 'l') + legs[-1].SetFillColor(0) + legs[-1].SetBorderSize(0) + legs[-1].SetFillColor(0) + legs[-1].SetBorderSize(0) + + fra.GetYaxis().CenterTitle(1) + + fra.SetTitle(" ") + if range is not None: + fra.GetXaxis().SetRangeUser(range[0], range[1]) + fra.Draw() + legs[-1].Draw() + can.SaveAs('{0}/plots/{1}/{2}.png'.format(dir, model_g, name)) + if print_pdf: + can.SaveAs('{0}/plots/{1}/{2}.pdf'.format(dir, model_g, name)) + + +def saveFig( + x, + y, + file, + type='', + labels=None, + axis=None, + dir='/afs/cern.ch/user/j/jpavezse/systematics', + model_g='mlp', + marker=False, + marker_value=None, + x_range=None, + title='', + multi=False, + print_pdf=False, + min_value=None): + ''' + Helper to print a lof of different plots + ''' + + plt.ioff() + fig, ax = plt.subplots() + colors = ['b-', 'r-', 'k-'] + colors_rgb = ['blue', 'red', 'black'] + if type=='contour2': + cs1 = plt.contour( + x, y[0], y[1].transpose(), [ + 0., 0.1, 0.5, 1., 5., 10., 20., 50.], label=labels[0]) + cs2 = plt.contour( + x, y[0], y[2].transpose(), [ + 0., 0.1, 0.5, 1., 5., 10., 20., 50.], linestyles="dashed", label=labels[1]) + plt.clabel(cs1, inline=1, fontsize=10) + lines = [cs1.collections[0], cs2.collections[0]] + plt.legend(lines, labels, frameon=False, fontsize=11) + + ax.set_xlabel('signal weight', fontsize=11) + ax.set_ylabel('background weight', fontsize=11) + ax.annotate( + 'min=({0:.2f},{1:.2f})'.format( + min_value[0], + min_value[1]), + xy=( + min_value[0] + 0.001, + min_value[1] + 0.001), + xytext=( + min_value[0] + 0.03, + min_value[1] + 0.03), + arrowprops=dict( + 
facecolor='red')) + if marker: + plt.axvline(marker_value[0], color='black') + plt.axhline(marker_value[1], color='black') + plt.legend(loc='upper left', frameon=False, numpoints=1) + elif type=='contour': + levels = [2.3,6.] + cs1 = plt.contour( + x, + y[0], + y[1].transpose(), + levels, + origin='lower', + alpha=.5, + colors=( + '#ff0000', + '#ff9900', + '#999900', + 'w', + '#009999', + '#0099ff', + '#0000ff', + '#00ffff')) + plt.clabel(cs1, inline=1, fontsize=10) + lines = [cs1.collections[0]] + ax.set_title('-2ln(Q), target=({0:.2f},{1:.2f})'. + format(marker_value[0],marker_value[1])) + ax.set_xlabel('Kazz',fontsize=11) + ax.set_ylabel('Khazz',fontsize=11) + ax.set_xlabel('signal weight', fontsize=11) + ax.set_ylabel('background weight', fontsize=11) + ax.annotate( + 'min=({0:.2f},{1:.2f})'.format( + min_value[0], + min_value[1]), + xy=( + min_value[0] + 0.001, + min_value[1] + 0.001), + xytext=( + min_value[0] + 0.03, + min_value[1] + 0.03), + arrowprops=dict( + facecolor='red')) + if marker: + plt.axvline(marker_value[0], color='black') + plt.axhline(marker_value[1], color='black') + + labels = ['95% C.L','68.27% C.L'] + for i in range(len(labels)): + cs1.collections[i].set_label(labels[i]) + plt.legend(loc='upper left', frameon=False, numpoints=1) + elif type=='pixel': + vals = np.flipud(y[1]) + im = plt.imshow( + vals, + extent=( + y[0].min(), + y[0].max(), + x.min(), + x.max()), + interpolation='nearest', + cmap=cm.gist_rainbow_r) + + CB = plt.colorbar(im, shrink=0.8, extend='both') + ax.set_title(title) + ax.set_xlabel('g2', fontsize=11) + ax.set_ylabel('g1', fontsize=11) + elif type=='scatter': + if len(y) == 1: + ax.scatter(x, y[0], marker='*', color='g') + ax.set_xlabel(axis[0]) + ax.set_ylabel(axis[1]) else: - f.plotOn(fra,ROOT.RooFit.Name(legends[i]),line_colors[i]) - legs.append(ROOT.TLegend(0.69, 0.73, 0.89, 0.87)) - #leg.SetFillColor(ROOT.kWhite) - #leg.SetLineColor(ROOT.kWhite) - for i,l in enumerate(legends): - if i == 0: - legs[-1].AddEntry(fra.findObject(legends[i]), l, 'l') - else: - legs[-1].AddEntry(fra.findObject(legends[i]), l, 'l') - legs[-1].SetFillColor(0) - legs[-1].SetBorderSize(0) - #legs[-1].SetTextSize(0.06) - legs[-1].SetFillColor(0) - legs[-1].SetBorderSize(0) - #fra.SetTitleSize(0.06,"Y") - #fra.SetTitleSize(0.06,"X") - fra.GetYaxis().CenterTitle(1) - #fra.GetYaxis().SetTitleOffset(0.35) - if len(obs) == 1: - fra.SetTitle("{0};{1};{2}".format(title,x_text,y_text)) - else: - fra.SetTitle(";;{0}".format(y_text)) - if range <> None: - fra.GetXaxis().SetRangeUser(range[0],range[1]) - fra.Draw() - legs[-1].Draw() - can.SaveAs('{0}/plots/{1}/{2}.png'.format(dir,model_g,name)) - if print_pdf == True: - can.SaveAs('{0}/plots/{1}/{2}.pdf'.format(dir,model_g,name)) - -def saveFig(x,y,file,labels=None,scatter=False,contour=False,axis=None, - dir='/afs/cern.ch/user/j/jpavezse/systematics', - model_g='mlp',marker=False, hist=False, marker_value=None, x_range=None,title='',multi=False,print_pdf=False,hist2D=False,pixel=False,min_value=None): - fig,ax = plt.subplots() - colors = ['b-','r-','k-'] - colors_rgb = ['blue','red','black'] - if contour == True: - if pixel == True: - vals = np.flipud(y[1]) - #vals = y[1] - im = plt.imshow(vals, extent=(y[0].min(), y[0].max(), x.min(),x.max()), - interpolation='nearest', cmap=cm.gist_rainbow_r) - - CB = plt.colorbar(im, shrink=0.8, extend='both') - #plt.legend(lines,labels,frameon=False,fontsize=11) - ax.set_title(title) - ax.set_xlabel('g2',fontsize=11) - ax.set_ylabel('g1',fontsize=11) - else: - print min_value - #levels = 
[2.,10.,40.,60.,80.,100.,200.,400.,600.] - #levels = [2.3,6.] - #levels = [-800.,-600.,-400.,-200.,0.,200,1000.,10000.,50000.] - #im = plt.imshow(y[1], interpolation='bilinear', origin='lower', - # cmap=cm.gray, extent=(-3,3,-2,2)) - cs1 = plt.contour(x,y[0],y[1].transpose(),[0.,0.1,0.5,1.,5.,10.,50.,100.]) - #cs1 = plt.contour(x,y[0],y[1].transpose(),levels,origin='lower',alpha=.5, colors=('#ff0000', '#ff9900', '#999900', 'w', '#009999', '#0099ff', '#0000ff','#00ffff')) - cs2 = plt.contour(x,y[0],y[2].transpose(),[0.,0.1,0.5,1.,5.,10.,50.,100.],linestyles="dashed") - plt.clabel(cs1, inline=1, fontsize=10) - #plt.xlim(0.8,1.2) - #plt.ylim(0.3,0.7) - #CB = plt.colorbar(cs1, shrink=0.8, extend='both') - lines = [cs1.collections[0],cs2.collections[0]] - #lines = [cs1.collections[0]] - plt.legend(lines,labels,frameon=False,fontsize=11) - ax.set_title('-ln($\lambda$), target=({0:.2f},{1:.2f})'. - format(marker_value[0],marker_value[1])) - #ax.set_title('-2ln(Q), target=({0:.2f},{1:.2f})'. - # format(marker_value[0],marker_value[1])) - #ax.set_xlabel('Kazz',fontsize=11) - #ax.set_ylabel('Khazz',fontsize=11) - ax.set_xlabel('signal weight',fontsize=11) - ax.set_ylabel('background weight',fontsize=11) - ax.annotate('min=({0:.2f},{1:.2f})'.format(min_value[0],min_value[1]), xy=(min_value[0]+0.001 - ,min_value[1]+0.001),xytext=(min_value[0]+0.03,min_value[1]+0.03),arrowprops=dict(facecolor='red')) - if marker == True: - plt.axvline(marker_value[0], color='black') - plt.axhline(marker_value[1], color='black') - - #labels = ['95% C.L','68.27% C.L'] - #for i in range(len(labels)): - # cs1.collections[i].set_label(labels[i]) - #ax.plot([marker_value[0]],[marker_value[1]],'+',markersize=15,label='True') - #ax.plot([min_value[0]],[min_value[1]],'+',color='red',markersize=15,label='Fit') - plt.legend(loc='upper left',frameon=False,numpoints=1) - #ax.annotate('min',xy=(c1[0],c1[1]),xytext=(0.,0.)) - else: - if scatter == True: - if len(y) == 1: - ax.scatter(x,y[0],marker='*',color='g') + sc1 = ax.scatter(x, y[0], color='black') + sc2 = ax.scatter(x, y[1], color='red') + ax.legend((sc1, sc2), (labels[0], labels[1])) + ax.set_xlabel('x') + ax.set_ylabel('regression(score)') + elif type=='hist2D': + H, xedges, yedges, img = plt.hist2d( + y[0][:450], y[1][:450], bins=12) + extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]] + fig = plt.figure() + plt.xlim([xedges[0], xedges[-1]]) + plt.ylim([1.2, yedges[-1]]) + ax = fig.add_subplot(1, 1, 1) ax.set_xlabel(axis[0]) ax.set_ylabel(axis[1]) - else: - sc1 = ax.scatter(x,y[0],color='black') - sc2 = ax.scatter(x,y[1],color='red') - ax.legend((sc1,sc2),(labels[0],labels[1])) - ax.set_xlabel('x') - ax.set_ylabel('regression(score)') - else: - if hist == True: - if hist2D == True: - H, xedges, yedges, img = plt.hist2d(y[0][:450], y[1][:450],bins=12) - pdb.set_trace() - extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]] - fig = plt.figure() - plt.xlim([xedges[0],xedges[-1]]) - plt.ylim([1.2,yedges[-1]]) - ax = fig.add_subplot(1, 1, 1) - ax.set_xlabel(axis[0]) - ax.set_ylabel(axis[1]) - im = ax.imshow(H, cmap=plt.cm.jet,interpolation='nearest',extent=extent) - mean1,mean2 = (y[0][:450].mean(),y[1][:450].mean()) - std1, std2 = (y[0][:450].std(),y[1][:450].std()) - ax.annotate('mean=[{0:.2f},{1:.2f}]\nstd=[{2:.2f},{3:.2f}]'.format(mean1,mean2,std1,std2) - ,xy=(mean1,mean2),xytext=(mean1-0.01,mean2-0.1),arrowprops=dict(facecolor='red'),color='white') - fig.colorbar(im, ax=ax) - if marker == True: + im = ax.imshow( + H, + cmap=plt.cm.jet, + interpolation='nearest', + 
extent=extent) + mean1, mean2 = (y[0][:450].mean(), y[1][:450].mean()) + std1, std2 = (y[0][:450].std(), y[1][:450].std()) + ax.annotate( + 'mean=[{0:.2f},{1:.2f}]\nstd=[{2:.2f},{3:.2f}]'.format( + mean1, mean2, std1, std2), xy=( + mean1, mean2), xytext=( + mean1 - 0.01, mean2 - 0.1), arrowprops=dict( + facecolor='red'), color='white') + fig.colorbar(im, ax=ax) + if marker: plt.axvline(marker_value[0], color='black') plt.axhline(marker_value[1], color='black') - else: - if len(y) == 1: - if x_range <> None: - ax.hist(y[0],color='red', label=labels[0],bins=16, range=[x_range[0], x_range[1]], - histtype='step', alpha=0.5) + elif type=='hist': + if len(y) == 1: + if x_range is not None: + ax.hist( + y[0], color='red', label=labels[0], bins=16, range=[ + x_range[0], x_range[1]], histtype='step', alpha=0.5) else: - ax.hist(y[0],color='red', label=labels[0],bins=20, - histtype='step', alpha=0.5) - else: - #Just supporting two plots for now - if x_range <> None: - for i,ys in enumerate(y): - ax.hist(ys,color=colors_rgb[i],label=labels[i],bins=15, range=[x_range[0],x_range[1]],histtype='step',normed=1, alpha=0.5) + ax.hist( + y[0], + color='red', + label=labels[0], + bins=20, + histtype='step', + alpha=0.5) + else: + # Just supporting two plots for now + if x_range is not None: + for i, ys in enumerate(y): + ax.hist(ys, color=colors_rgb[i], label=labels[i], bins=15, range=[ + x_range[0], x_range[1]], histtype='step', normed=1, alpha=0.5) else: - for i,ys in enumerate(y): - ax.hist(ys,color=colors_rgb[i],label=labels[i],bins=15,histtype='step',normed=1, alpha=0.5) - ax.legend(frameon=False,fontsize=11) - if axis <> None: - ax.set_xlabel(axis[0]) - else: + for i, ys in enumerate(y): + ax.hist( + ys, + color=colors_rgb[i], + label=labels[i], + bins=15, + histtype='step', + normed=1, + alpha=0.5) + ax.legend(frameon=False, fontsize=11) + if axis is not None: + ax.set_xlabel(axis[0]) + else: ax.set_xlabel('x') - ax.set_ylabel('Count') - if marker == True: + ax.set_ylabel('count') + if marker: plt.axvline(marker_value, color='black') - else: + elif type=='': if len(y) == 1: - ax.plot(x,y[0],'b') - ax.annotate('fit min 1.5285', xy=(1.52857,0),xytext=(1.8,200.),arrowprops=dict(facecolor='red')) + ax.plot(x, y[0], 'b') + ax.annotate( + 'fit min 1.5285', xy=( + 1.52857, 0), xytext=( + 1.8, 200.), arrowprops=dict( + facecolor='red')) else: - #Just supporting two plots for now - linestyles=['--','--'] - markers = ['+','x'] - for k,ys in enumerate(y): - #ax.plot(x,ys,colors[k],label=labels[k],linestyle=linestyles[k],marker=markers[k]) - ax.plot(x,ys,colors[k],label=labels[k]) - ax.legend(frameon=False,fontsize=11) - if axis <> None: - ax.set_ylabel(axis[1]) - ax.set_xlabel(axis[0]) + # Just supporting two plots for now + linestyles = ['--', '--'] + markers = ['+', 'x'] + for k, ys in enumerate(y): + ax.plot(x, ys, colors[k], label=labels[k]) + ax.legend(frameon=False, fontsize=11) + if axis is not None: + ax.set_ylabel(axis[1]) + ax.set_xlabel(axis[0]) else: - ax.set_ylabel('LR') - ax.set_xlabel('x') - if marker == True: - plt.axvline(marker_value, color='black') - ax.set_title(title) - #if (len(y) > 1): - # This breaks the naming convention for plots, I will solve - # it later - # for i,l in enumerate(labels): - # np.savetxt('{0}/plots/{1}/results/{2}_{3}.txt'.format(dir,model_g,file,l),y[i]) - #else: - # np.savetxt('{0}/plots/{1}/results/{2}.txt'.format(dir,model_g,file),y[0]) - if print_pdf == True: - fig.savefig('{0}/plots/{1}/{2}.pdf'.format(dir,model_g,file)) - 
fig.savefig('{0}/plots/{1}/{2}.png'.format(dir,model_g,file)) - plt.close(fig) - plt.clf() + ax.set_ylabel('LR') + ax.set_xlabel('x') + if marker: + plt.axvline(marker_value, color='black') + + plt.tight_layout() + if print_pdf: + fig.savefig('{0}/plots/{1}/{2}.pdf'.format(dir, model_g, file)) + fig.savefig('{0}/plots/{1}/{2}.png'.format(dir, model_g, file)) + plt.close(fig) + plt.clf() def saveMultiFig(x,y,file,labels=None, dir='/afs/cern.ch/user/j/jpavezse/systematics', model_g='mlp',title='',print_pdf=False): - plot_num = 311 - for k,ys in enumerate(y): - # Fix later - plt.subplot(plot_num) - plt.plot(x,ys[0],'b-',label=labels[k][0]) - plt.plot(x,ys[1],'r-',label=labels[k][1]) - if k == len(y)-1: - plt.xlabel('x',fontsize=11) - plt.ylabel('Ratio',fontsize=11) - plt.tick_params(axis='both', labelsize=10) - plt.legend(frameon=False, fontsize=11) - if plot_num == 311: - plt.title('{0}'.format(title)) - plot_num = plot_num + 1 - if print_pdf == True: - plt.savefig('{0}/plots/{1}/{2}.pdf'.format(dir,model_g,file)) - plt.savefig('{0}/plots/{1}/{2}.png'.format(dir,model_g,file)) - plt.close() - plt.clf() - - -def saveFig3D(x,y,z,file,labels=None,scatter=False, - dir='/afs/cern.ch/user/j/jpavezse/systematics', - model_g='mlp',axis=None): - fig = plt.figure() - ax = fig.add_subplot(111, projection='3d') - if scatter == True: - if len(z) == 1: - ax.scatter(x,y,z[0],s=2) - ax.set_xlabel(axis[0]) - ax.set_ylabel(axis[1]) - ax.set_zlabel(axis[2]) - else: - sc1 = ax.scatter(x,y,z[0],color='black') - sc2 = ax.scatter(x,y,z[1],color='red') - ax.legend((sc1,sc2),(labels[0],labels[1])) - ax.set_xlabel('x') - ax.set_ylabel('y') - ax.set_zlabel('regression(score)') - else: - if len(z) == 1: - ax.plot_wireframe(x,y,z[0],color='red') - else: - #Just supporting two plots for now - ax.plot_wireframe(x,y,z[0],color='red',label=labels[0]) - ax.plot_wireframe(x,y,z[1],color='blue',label=labels[1]) - ax.legend() - ax.set_zlabel('LR') - ax.set_ylabel('y') - ax.set_xlabel('x') - ax.set_title(file) - if (len(z) > 1): - # This breaks the naming convention for plots, I will solve - # it later - for i,l in enumerate(labels): - np.savetxt('{0}/plots/{1}/{2}_{3}.txt'.format(dir,model_g,file,l),z[i]) - else: - np.savetxt('{0}/plots/{1}/{2}.txt'.format(dir,model_g,file),z[0]) - fig.savefig('{0}/plots/{1}/{2}.png'.format(dir,model_g,file)) - plt.close(fig) - plt.clf() - - -def makeMultiROC(all_outputs, targets, label, - dir='/afs/cern.ch/user/j/jpavezse/systematics',model_g='mlp', - true_score=None,print_pdf=False,title='',pos=[(0,1),(0,2),(1,2)]): - ''' - make plots for ROC curve of classifier and - test data. 
- ''' - plot_n = 311 - fprs = [] - tprs = [] - fpr_trues = [] - tpr_trues = [] - roc_aucs = [] - roc_auc_trues = [] - for k,(outputs,target) in enumerate(zip(all_outputs,targets)): - fpr, tpr, _ = roc_curve(target.ravel(),outputs.ravel()) - fprs.append(fpr) - tprs.append(tpr) + plot_num = 311 + for k,ys in enumerate(y): + plt.subplot(plot_num) + plt.plot(x,ys[0],'b-',label=labels[k][0]) + plt.plot(x,ys[1],'r-',label=labels[k][1]) + if k == len(y)-1: + plt.xlabel('x',fontsize=11) + plt.ylabel('Ratio',fontsize=11) + plt.tick_params(axis='both', labelsize=10) + plt.legend(frameon=False, fontsize=11) + if plot_num == 311: + plt.title('{0}'.format(title)) + plot_num = plot_num + 1 + if print_pdf == True: + plt.savefig('{0}/plots/{1}/{2}.pdf'.format(dir,model_g,file)) + plt.savefig('{0}/plots/{1}/{2}.png'.format(dir,model_g,file)) + plt.close() + plt.clf() + +def makeMultiROC( + all_outputs, + targets, + label, + dir='/afs/cern.ch/user/j/jpavezse/systematics', + model_g='mlp', + true_score=None, + print_pdf=False, + title='', + pos=[ + (0, + 1), + (0, + 2), + (1, + 2)]): + ''' + make plots for ROC curve of classifier and + test data. + ''' + plot_n = 311 + fprs = [] + tprs = [] + fpr_trues = [] + tpr_trues = [] + roc_aucs = [] + roc_auc_trues = [] + for k, (outputs, target) in enumerate(zip(all_outputs, targets)): + fpr, tpr, _ = roc_curve(target.ravel(), outputs.ravel()) + fprs.append(fpr) + tprs.append(tpr) + roc_auc = auc(fpr, tpr) + roc_aucs.append(roc_auc) + if true_score is not None: + fpr_true, tpr_true, _ = roc_curve( + target.ravel(), true_score[k].ravel()) + fpr_trues.append(fpr_true) + tpr_trues.append(tpr_true) + roc_auc_true = auc(fpr_true, tpr_true) + roc_auc_trues.append(roc_auc_true) + plt.subplot(plot_n) + plt.plot(fprs[-1], tprs[-1], 'b-', + label='ROC curve trained (area = %0.2f)' % roc_aucs[-1]) + if true_score is not None: + plt.plot(fpr_trues[-1], tpr_trues[-1], 'r-', + label='ROC curve true (area = %0.2f)' % roc_auc_trues[-1]) + plt.plot([0, 1], [0, 1], 'k--') + plt.xlim([0.0, 1.0]) + plt.ylim([0.0, 1.05]) + if k == len(targets) - 1: + plt.xlabel('Sensitivity', fontsize=11) + plt.ylabel('1-Specificity', fontsize=11) + plt.tick_params(axis='both', labelsize=10) + if plot_n == 311: + plt.title('{0}'.format(title)) + plt.legend(loc="lower right", frameon=False, fontsize=11) + plt.text(0.62, 0.42, 'f{0}-f{1}'.format(pos[k][0], pos[k][1])) + plot_n = plot_n + 1 + plt.savefig('{0}/plots/{1}/{2}.png'.format(dir, model_g, label)) + if print_pdf: + plt.savefig('{0}/plots/{1}/{2}.pdf'.format(dir, model_g, label)) + plt.close() + plt.clf() + + +def makeROC(outputs, target, label, + dir='/afs/cern.ch/user/j/jpavezse/systematics', model_g='mlp'): + ''' + make plots for ROC curve of classifier and + test data. 
+ ''' + fpr, tpr, _ = roc_curve(target.ravel(), outputs.ravel()) roc_auc = auc(fpr, tpr) - roc_aucs.append(roc_auc) - if true_score <> None: - fpr_true, tpr_true, _ = roc_curve(target.ravel(),true_score[k].ravel()) - fpr_trues.append(fpr_true) - tpr_trues.append(tpr_true) - roc_auc_true = auc(fpr_true, tpr_true) - roc_auc_trues.append(roc_auc_true) - plt.subplot(plot_n) - plt.plot(fprs[-1], tprs[-1],'b-', label='ROC curve trained (area = %0.2f)' % roc_aucs[-1]) - if true_score <> None: - plt.plot(fpr_trues[-1], tpr_trues[-1],'r-', label='ROC curve true (area = %0.2f)' % roc_auc_trues[-1]) + fig = plt.figure() + plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) plt.plot([0, 1], [0, 1], 'k--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) - if k == len(targets)-1: - plt.xlabel('Sensitivity',fontsize=11) - plt.ylabel('1-Specificity',fontsize=11) - plt.tick_params(axis='both', labelsize=10) - if plot_n == 311: - plt.title('{0}'.format(title)) - plt.legend(loc="lower right",frameon=False,fontsize=11) - plt.text(0.62,0.42,'f{0}-f{1}'.format(pos[k][0],pos[k][1])) - plot_n = plot_n + 1 - #np.savetxt('{0}/plots/{1}/results/{2}.txt'.format(dir,model_g,label),np.column_stack((fpr,tpr))) - plt.savefig('{0}/plots/{1}/{2}.png'.format(dir,model_g,label)) - if print_pdf == True: - plt.savefig('{0}/plots/{1}/{2}.pdf'.format(dir,model_g,label)) - plt.close() - plt.clf() - -def makeROC(outputs, target, label, - dir='/afs/cern.ch/user/j/jpavezse/systematics',model_g='mlp'): - ''' - make plots for ROC curve of classifier and - test data. - ''' - fpr, tpr, _ = roc_curve(target.ravel(),outputs.ravel()) - roc_auc = auc(fpr, tpr) - fig = plt.figure() - plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) - plt.plot([0, 1], [0, 1], 'k--') - plt.xlim([0.0, 1.0]) - plt.ylim([0.0, 1.05]) - plt.xlabel('False Positive Rate') - plt.ylabel('True Positive Rate') - plt.title('{0}'.format(label)) - plt.legend(loc="lower right") - plt.savefig('{0}/plots/{1}/{2}.png'.format(dir,model_g,label)) - plt.plot() - #plt.close(fig) - #plt.clf() + plt.xlabel('False Positive Rate') + plt.ylabel('True Positive Rate') + plt.title('{0}'.format(label)) + plt.legend(loc="lower right") + plt.savefig('{0}/plots/{1}/{2}.png'.format(dir, model_g, label)) + plt.plot() def makeSigBkg(all_outputs, targets, label, - dir='/afs/cern.ch/user/j/jpavezse/systematics',model_g='mlp', - print_pdf=False,legends=None, title=''): - ''' - make plots for ROC curve of classifier and - test data. 
- ''' - print 'Signal-Background plots' - tprs = [] - fnrs = [] - aucs = [] - thresholds = np.linspace(0,1.0,150) - fig = plt.figure() - for k,(outputs,target) in enumerate(zip(all_outputs,targets)): - fnrs.append(np.array([float(np.sum((outputs > tr) * (target == 0)))/float(np.sum(target == 0)) for tr in thresholds])) - fnrs[-1] = fnrs[-1].ravel() - tprs.append(np.array([float(np.sum((outputs < tr) * (target == 1)))/float(np.sum(target == 1)) for tr in thresholds])) - tprs[-1] = tprs[-1].ravel() - aucs.append(auc(tprs[-1],fnrs[-1])) - plt.plot(tprs[-1], fnrs[-1], label='{0} (area = {1:.2f})'.format( - legends[k],aucs[-1])) - plt.xlim([0.0, 1.0]) - plt.ylim([0.0, 1.05]) - plt.xlabel('Signal Efficiency',fontsize=11) - plt.ylabel('Background Rejection',fontsize=11) - plt.tick_params(axis='both', labelsize=10) - plt.title('{0}'.format(title)) - plt.legend(loc="lower left",frameon=False, fontsize=11) - #np.savetxt('{0}/plots/{1}/results/{2}.txt'.format(dir,model_g,label),np.column_stack((tpr,fnr))) - plt.savefig('{0}/plots/{1}/{2}.png'.format(dir,model_g,label)) - if print_pdf == True: - plt.savefig('{0}/plots/{1}/{2}.pdf'.format(dir,model_g,label)) - plt.close(fig) - plt.clf() - - -def plotCValues(c0,c1,dir='/afs/cern.ch/user/j/jpavezse/systematics', - c1_g='',model_g='mlp',true_dist=False,vars_g=None, - workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', - use_log=False, n_hist=150,c_eval=0, range_min=-1.0,range_max=0.): - - ''' - Plot histogram of fitted values - ''' - if use_log == True: - post = 'log' - else: - post = '' - - keys = ['true','dec'] - c1_ = dict((key,np.zeros(n_hist)) for key in keys) - c1_1 = np.loadtxt('{0}/fitting_values_c1.txt'.format(dir)) - c1_['true'] = c1_1[:,0] - c1_['dec'] = c1_1[:,1] - if true_dist == True: - vals = [c1_['true'],c1_['dec']] - labels = ['true','dec'] - else: - vals = c1_['dec'] - vals1 = c1_1[:,3] - labels = ['dec'] - - size = min(vals.shape[0],vals1.shape[0]) - # 1D - #saveFig([],[vals1], - # makePlotName('g2','train',type='hist_g1g2'),hist=True, - # axis=['g2'],marker=True,marker_value=c1[c_eval], - # labels=labels,x_range=[range_min,range_max],dir=dir, - # model_g=model_g,title='Histogram for fitted g2', print_pdf=True) - # 2D - saveFig([],[vals,vals1], - makePlotName('g1g2','train',type='hist'),hist=True,hist2D=True, - axis=['g1','g2'],marker=True,marker_value=c1, - labels=labels,dir=dir,model_g=model_g,title='2D Histogram for fitted g1,g2', print_pdf=True, - x_range=[[0.5,1.4],[1.1,1.9]]) - - - -def getWeights(g_1=1.,g_2=1.5): - #Set the basis - basis_g1 = np.array((1.,1.,1.,1.,0.)) - basis_g2 = np.array((0.,2.,1.,3.,1.)) - - #basis_g1 = np.array((0.,1.,1.,1.,1.)) - #basis_g2 = np.array((1.,0.,1.,2.,3.)) - - #define the formulae - g1_t4 = lambda x,y: x*x*x*x + 0*y - g1_t2_g2_t2 = lambda x,y: x*x*y*y - g1_t3_g2 = lambda x,y: x*x*x*y - g1_g2_t3 = lambda x,y: y*y*y*x - g2_t4 = lambda x,y: y*y*y*y + 0*x - - a = np.zeros((5,5)) - - for i in range(5): - a[0,i] = g1_t4(basis_g1[i],basis_g2[i]) - a[2,i] = g1_t2_g2_t2(basis_g1[i],basis_g2[i]) - a[1,i] = g1_t3_g2(basis_g1[i],basis_g2[i]) - a[3,i] = g1_g2_t3(basis_g1[i],basis_g2[i]) - a[4,i] = g2_t4(basis_g1[i],basis_g2[i]) - - #print a - - b = np.linalg.inv(a) - #print b - - weight = np.zeros(5) - #identify the weight - - for j in range(5): - weight[j] = b[j,0]*g1_t4(g_1,g_2) + b[j,1]*g1_t3_g2(g_1,g_2) + b[j,2]*g1_t2_g2_t2(g_1,g_2) + b[j,3]*g1_g2_t3(g_1,g_2) + b[j,4]*g2_t4(g_1,g_2) - - return weight + dir='/afs/cern.ch/user/j/jpavezse/systematics', model_g='mlp', + print_pdf=False, 
legends=None, title=''): + ''' + make plots for ROC curve of classifier and + test data. + ''' + print 'Signal-Background plots' + tprs = [] + fnrs = [] + aucs = [] + thresholds = np.linspace(0, 1.0, 150) + fig = plt.figure() + for k, (outputs, target) in enumerate(zip(all_outputs, targets)): + fnrs.append(np.array([float(np.sum((outputs > tr) * + (target == 0))) / + float(np.sum(target == 0)) for tr in thresholds])) + fnrs[-1] = fnrs[-1].ravel() + tprs.append(np.array([float(np.sum((outputs < tr) * + (target == 1))) / + float(np.sum(target == 1)) for tr in thresholds])) + tprs[-1] = tprs[-1].ravel() + aucs.append(auc(tprs[-1], fnrs[-1])) + plt.plot(tprs[-1], fnrs[-1], label='{0} (area = {1:.2f})'.format( + legends[k], aucs[-1])) + plt.xlim([0.0, 1.0]) + plt.ylim([0.0, 1.05]) + plt.xlabel('signal efficiency', fontsize=11) + plt.ylabel('background rejection', fontsize=11) + plt.tick_params(axis='both', labelsize=10) + plt.legend(loc="lower left", frameon=False, fontsize=11) + plt.tight_layout() + plt.savefig('{0}/plots/{1}/{2}.png'.format(dir, model_g, label)) + if print_pdf: + plt.savefig('{0}/plots/{1}/{2}.pdf'.format(dir, model_g, label)) + plt.close(fig) + plt.clf() + diff --git a/xgboost_wrapper.py b/xgboost_wrapper.py index f5a7a11..9e6028c 100644 --- a/xgboost_wrapper.py +++ b/xgboost_wrapper.py @@ -1,7 +1,7 @@ - + import sys import math - + import numpy as np @@ -11,38 +11,47 @@ import xgboost as xgb import pdb - + + def logloss(y_true, Y_pred): - return -1 * sum(math.log(y[int(label)]) if y[int(label)] > 0 else -np.inf for y, label in zip(Y_pred, y_true)) / len(Y_pred) + return -1 * sum(math.log(y[int(label)]) if y[int(label)] > 0 else - + np.inf for y, label in zip(Y_pred, y_true)) / len(Y_pred) + class XGBoostClassifier(): - def __init__(self, num_boost_round=10, **params): + + def __init__(self, num_boost_round=10, missing=0., **params): self.clf = None + self.missing = missing self.num_boost_round = num_boost_round self.params = params self.params.update({'objective': 'multi:softprob'}) - - def fit(self, X, y, num_boost_round=None): + + def fit(self, X, y, num_boost_round=None, weight=None): num_boost_round = num_boost_round or self.num_boost_round - dtrain = xgb.DMatrix(X, label=y) - self.clf = xgb.train(params=self.params, dtrain=dtrain, num_boost_round=num_boost_round) - + dtrain = xgb.DMatrix(X, label=y, missing=self.missing, weight=weight) + self.clf = xgb.train( + params=self.params, + dtrain=dtrain, + num_boost_round=num_boost_round, + verbose_eval=True) + def predict(self, X): Y = self.predict_proba(X) y = np.argmax(Y, axis=1) return np.array(y) - + def predict_proba(self, X): - dtest = xgb.DMatrix(X) + dtest = xgb.DMatrix(X, missing=self.missing) return self.clf.predict(dtest) - + def score(self, X, y): Y = self.predict_proba(X) return 1 / logloss(y, Y) - + def get_params(self, deep=True): return self.params - + def set_params(self, **params): if 'num_boost_round' in params: self.num_boost_round = params.pop('num_boost_round') @@ -50,4 +59,7 @@ def set_params(self, **params): del params['objective'] self.params.update(params) return self - + + def plot_importance_matrix(self, vars_names): + pdb.set_trace() + xgb.plot_importance(self.clf)
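
Usage note (not part of the patch): a minimal sketch of how the XGBoostClassifier
wrapper above is expected to be driven after this change. The toy data, the
per-event weights and the booster parameters below are illustrative assumptions,
not values taken from the repository; 'num_class' is passed because the wrapper
forces the 'multi:softprob' objective, which requires it.

    import numpy as np
    from xgboost_wrapper import XGBoostClassifier

    X = np.random.rand(1000, 10)           # toy 10D feature vectors
    y = np.random.randint(0, 2, 1000)      # binary targets (0 = background, 1 = signal)
    w = np.ones(1000)                      # optional per-event weights, new in fit()

    clf = XGBoostClassifier(num_boost_round=50, missing=-999.,
                            eta=0.1, max_depth=4, num_class=2)
    clf.fit(X, y, weight=w)
    proba = clf.predict_proba(X)[:, 1]     # multi:softprob returns one column per class
    labels = clf.predict(X)                # argmax over the class probabilities

The 'missing' value set in the constructor is forwarded to both the training and
the prediction DMatrix, so the same sentinel should be used when the feature
arrays are produced.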