diff --git a/Decrease_cadence.ipynb b/Decrease_cadence.ipynb
new file mode 100644
index 0000000..1cd2c5e
--- /dev/null
+++ b/Decrease_cadence.ipynb
@@ -0,0 +1,735 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Neural networks not available in this version of scikit-learn. Neural networks are available from development version 0.18.\n"
+     ]
+    }
+   ],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload #Use this to reload modules if they are changed on disk while the notebook is running\n",
+    "from __future__ import division\n",
+    "from snmachine import sndata, snfeatures, snclassifier, tsne_plot\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import time, os, pywt, subprocess\n",
+    "from sklearn.decomposition import PCA\n",
+    "from astropy.table import Table, join, vstack\n",
+    "from astropy.io import fits\n",
+    "import sklearn.metrics\n",
+    "import sklearn.preprocessing #needed explicitly for StandardScaler in AUC_from_mock_data_set\n",
+    "from functools import partial\n",
+    "from multiprocessing import Pool\n",
+    "import sncosmo\n",
+    "%matplotlib nbagg\n",
+    "\n",
+    "from astropy.table import Column\n",
+    "#WARNING: MultiNest uses a hardcoded character limit for the output file names (believed to be 100 characters),\n",
+    "#so avoid making this file path too long if using nested sampling, or the MultiNest output file names will be truncated\n",
+    "\n",
+    "#Change outdir to somewhere on your computer if you like\n",
+    "dataset='spcc'\n",
+    "outdir=os.path.join('output_%s_no_z' %dataset,'')\n",
+    "out_features=os.path.join(outdir,'features') #Where we save the extracted features to\n",
+    "out_class=os.path.join(outdir,'classifications') #Where we save the classification probabilities and ROC curves\n",
+    "out_int=os.path.join(outdir,'int') #Any intermediate files (such as multinest chains or GP fits)\n",
+    "\n",
+    "subprocess.call(['mkdir',outdir])\n",
+    "subprocess.call(['mkdir',out_features])\n",
+    "subprocess.call(['mkdir',out_class])\n",
+    "subprocess.call(['mkdir',out_int])\n",
+    "\n",
+    "read_from_file=False #True #We can use this flag to quickly rerun from saved features\n",
+    "run_name=os.path.join(out_features,'%s_all' %dataset)\n",
+    "rt=os.path.join('SPCC_SUBSET','')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def make_new_data(filename, table, output, sntype):\n",
+    "    #Copy the header metadata of an existing SNANA file onto `table` and write it out as a new mock light curve\n",
+    "    with open(filename) as f:\n",
+    "        data = f.read().split(\"\\n\")\n",
+    "    filters = data[5].split()\n",
+    "    survey = data[0].split()\n",
+    "    stuffRA = data[6].split()\n",
+    "    stuffDec = data[7].split()\n",
+    "    MWEBV = data[11].split()\n",
+    "    if sntype==1:\n",
+    "        typestring='SN Type = Ia , MODEL = mlcs2k2.SNchallenge'\n",
+    "    elif sntype==2:\n",
+    "        typestring='SN Type = II , MODEL = SDSS-017564'\n",
+    "    elif sntype==3:\n",
+    "        typestring='SN Type = Ic , MODEL = SDSS-014475'\n",
+    "    else:\n",
+    "        typestring='SOMETHING WENT HORRIBLY WRONG'\n",
+    "    table.meta = {survey[0][:-1]: survey[1], stuffRA[0][:-1]: stuffRA[1], stuffDec[0][:-1]: stuffDec[1], filters[0][:-1]: filters[1],\n",
+    "                  MWEBV[0][:-1]: MWEBV[1], 'SNTYPE': -9, 'SIM_COMMENT': typestring }\n",
+    "    #table.rename_column('mjd', 'MJD')\n",
+    "    #table.rename_column('filter ', 'FLT')\n",
+    "    #table.rename_column('flux', 'FLUXCAL')\n",
+    "    #table.rename_column('flux_error', 'FLUXCALERR')\n",
+    "    sncosmo.write_lc(table, 'new_mocks/%s' %output, pedantic=True, format='snana')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "6"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "prototypes_Ia=[ 'DES_SN002542.DAT', 'DES_SN013866.DAT', 'DES_SN023940.DAT', 'DES_SN024734.DAT', 'DES_SN030701.DAT', 'DES_SN045040.DAT']\n",
+    "prototypes_II=[ 'DES_SN002457.DAT', 'DES_SN005519.DAT', 'DES_SN006381.DAT', 'DES_SN008569.DAT', 'DES_SN013360.DAT', 'DES_SN013481.DAT']\n",
+    "prototypes_Ibc=['DES_SN005399.DAT', 'DES_SN013863.DAT', 'DES_SN027266.DAT', 'DES_SN030183.DAT', 'DES_SN065493.DAT', 'DES_SN078241.DAT']\n",
+    "len(prototypes_II)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['DES_SN002457.DAT',\n",
+       " 'DES_SN005519.DAT',\n",
+       " 'DES_SN006381.DAT',\n",
+       " 'DES_SN008569.DAT',\n",
+       " 'DES_SN013360.DAT',\n",
+       " 'DES_SN013481.DAT']"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "prototypes_II"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "def produce_mock_data_set(degrading_factor, realsperlc=100, prototypes_Ia=['DES_SN002542.DAT'], prototypes_II=['DES_SN002457.DAT'], prototypes_Ibc=['DES_SN005399.DAT']):\n",
+    "    #Data root\n",
+    "    dat=sndata.Dataset(rt)\n",
+    "\n",
+    "    types=dat.get_types()\n",
+    "    types['Type'][np.floor(types['Type']/10)==2]=2\n",
+    "    types['Type'][np.floor(types['Type']/10)==3]=3\n",
+    "\n",
+    "    logfile=open('new_mocks/new_mocks.LIST', 'w')\n",
+    "\n",
+    "    for prot in range(len(prototypes_II)):\n",
+    "        type_II = []\n",
+    "        for i in range(len(dat.data[prototypes_II[prot]]['flux'])):\n",
+    "            type_II.append(np.random.normal(dat.data[prototypes_II[prot]]['flux'][i], dat.data[prototypes_II[prot]]['flux_error'][i]*degrading_factor, realsperlc))\n",
+    "        type_II = np.array(type_II)\n",
+    "        filename_II = 'SPCC_SUBSET/'+prototypes_II[prot]\n",
+    "        test_table_II = dat.data[prototypes_II[prot]]\n",
+    "        test_table_II.rename_column('flux', 'FLUXCAL')\n",
+    "        test_table_II.rename_column('flux_error', 'FLUXCALERR')\n",
+    "        col_II = Table.Column(name='field', data=np.zeros(len(test_table_II)))\n",
+    "        test_table_II.add_column(col_II, index=2)\n",
+    "        #Inflate the error bars once, outside the realisation loop, so the factor is not compounded at every iteration\n",
+    "        test_table_II.replace_column('FLUXCALERR', test_table_II['FLUXCALERR']*degrading_factor)\n",
+    "        for i in range(realsperlc):\n",
+    "            test_table_II.replace_column('FLUXCAL', type_II[:,i])\n",
+    "            make_new_data(filename_II, test_table_II, 'II_%s_%s'%(i,prototypes_II[prot]), 2)\n",
+    "            logfile.write('II_'+str(i)+'_'+prototypes_II[prot]+'\\n')\n",
+    "\n",
+    "    for prot in range(len(prototypes_Ia)):\n",
+    "        type_Ia = []\n",
+    "        for i in range(len(dat.data[prototypes_Ia[prot]]['flux'])):\n",
+    "            type_Ia.append(np.random.normal(dat.data[prototypes_Ia[prot]]['flux'][i], dat.data[prototypes_Ia[prot]]['flux_error'][i]*degrading_factor, realsperlc))\n",
+    "        type_Ia = np.array(type_Ia)\n",
+    "        filename_Ia = 'SPCC_SUBSET/'+prototypes_Ia[prot]\n",
+    "        test_table_Ia = dat.data[prototypes_Ia[prot]]\n",
+    "        test_table_Ia.rename_column('flux', 'FLUXCAL')\n",
+    "        test_table_Ia.rename_column('flux_error', 'FLUXCALERR')\n",
+    "        col_Ia = Table.Column(name='field', data=np.zeros(len(test_table_Ia)))\n",
+    "        test_table_Ia.add_column(col_Ia, index=2)\n",
+    "        test_table_Ia.replace_column('FLUXCALERR', test_table_Ia['FLUXCALERR']*degrading_factor)\n",
+    "        for i in range(realsperlc):\n",
+    "            test_table_Ia.replace_column('FLUXCAL', type_Ia[:,i])\n",
+    "            make_new_data(filename_Ia, test_table_Ia, 'Ia_%s_%s'%(i,prototypes_Ia[prot]), 1)\n",
+    "            logfile.write('Ia_'+str(i)+'_'+prototypes_Ia[prot]+'\\n')\n",
+    "\n",
+    "    for prot in range(len(prototypes_Ibc)):\n",
+    "        type_Ibc = []\n",
+    "        for i in range(len(dat.data[prototypes_Ibc[prot]]['flux'])):\n",
+    "            type_Ibc.append(np.random.normal(dat.data[prototypes_Ibc[prot]]['flux'][i], dat.data[prototypes_Ibc[prot]]['flux_error'][i]*degrading_factor, realsperlc))\n",
+    "        type_Ibc = np.array(type_Ibc)\n",
+    "        filename_Ibc = 'SPCC_SUBSET/'+prototypes_Ibc[prot]\n",
+    "        test_table_Ibc = dat.data[prototypes_Ibc[prot]]\n",
+    "        test_table_Ibc.rename_column('flux', 'FLUXCAL')\n",
+    "        test_table_Ibc.rename_column('flux_error', 'FLUXCALERR')\n",
+    "        col_Ibc = Table.Column(name='field', data=np.zeros(len(test_table_Ibc)))\n",
+    "        #keep the 'field' column in the same position for all three types\n",
+    "        test_table_Ibc.add_column(col_Ibc, index=2)\n",
+    "        test_table_Ibc.replace_column('FLUXCALERR', test_table_Ibc['FLUXCALERR']*degrading_factor)\n",
+    "        for i in range(realsperlc):\n",
+    "            test_table_Ibc.replace_column('FLUXCAL', type_Ibc[:,i])\n",
+    "            make_new_data(filename_Ibc, test_table_Ibc, 'Ibc_%s_%s'%(i,prototypes_Ibc[prot]), 3)\n",
+    "            logfile.write('Ibc_'+str(i)+'_'+prototypes_Ibc[prot]+'\\n')\n",
+    "\n",
+    "    logfile.close()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "#produce_mock_data_set(1.1, 111, prototypes_Ia, prototypes_II, prototypes_Ibc)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "def AUC_from_mock_data_set(classifiers=['nb','knn','svm','neural_network','boost_dt']):\n",
+    "    rt1=os.path.join('new_mocks','')\n",
+    "    dat1=sndata.Dataset(rt1)\n",
+    "    for obj in dat1.object_names:\n",
+    "        for i in range(len(dat1.data[obj])):\n",
+    "            dat1.data[obj]['filter'][i]=dat1.data[obj]['filter'][i][3:7]\n",
+    "    types=dat1.get_types()\n",
+    "    types['Type'][np.floor(types['Type']/10)==2]=2\n",
+    "    types['Type'][np.floor(types['Type']/10)==3]=3\n",
+    "\n",
+    "    mod1Feats=snfeatures.ParametricFeatures('newling',sampler='leastsq')\n",
+    "    mod1_features=mod1Feats.extract_features(dat1,nprocesses=4,chain_directory=out_int)\n",
+    "    mod1_features.write('%s_newling.dat' %run_name, format='ascii')\n",
+    "    #Unfortunately, sometimes the fitting methods return NaN for some parameters for these models.\n",
+    "    for c in mod1_features.colnames[1:]:\n",
+    "        mod1_features[c][np.isnan(mod1_features[c])]=0\n",
+    "    dat1.set_model(mod1Feats.fit_sn,mod1_features)\n",
+    "\n",
+    "    AUC=[]\n",
+    "    nprocesses=4\n",
+    "    return_classifier=False\n",
+    "    columns=[]\n",
+    "    training_set=0.7\n",
+    "    param_dict={}\n",
+    "    scale=True\n",
+    "\n",
+    "    if isinstance(mod1_features,Table):\n",
+    "        #The features are in astropy table format and must be converted to a numpy array before passing to\n",
sklearn\n", + "\n", + " #We need to make sure we match the correct Y values to X values. The safest way to do this is to make types an\n", + " #astropy table as well.\n", + "\n", + " if not isinstance(types,Table):\n", + " types=Table(data=[mod1_features['Object'],types],names=['Object','Type'])\n", + " feats=join(mod1_features,types,'Object')\n", + "\n", + " if len(columns)==0:\n", + " columns=feats.columns[1:-1]\n", + "\n", + " #Split into training and validation sets\n", + " if np.isscalar(training_set):\n", + " objs=feats['Object']\n", + " objs=np.random.permutation(objs)\n", + " training_set=objs[:(int)(training_set*len(objs))]\n", + "\n", + " #Otherwise a training set has already been provided as a list of Object names and we can continue\n", + " feats_train=feats[np.in1d(feats['Object'],training_set)]\n", + " feats_test=feats[~np.in1d(feats['Object'],training_set)]\n", + "\n", + " X_train=np.array([feats_train[c] for c in columns]).T\n", + " y_train=np.array(feats_train['Type'])\n", + " X_test=np.array([feats_test[c] for c in columns]).T\n", + " y_test=np.array(feats_test['Type'])\n", + "\n", + " else:\n", + " #Otherwise the features are already in the form of a numpy array\n", + " if np.isscalar(training_set):\n", + " inds=np.random.permutation(range(len(features)))\n", + " train_inds=inds[:(int)(len(inds)*training_set)]\n", + " test_inds=inds[(int)(len(inds)*training_set):]\n", + "\n", + " else:\n", + " #We assume the training set has been provided as indices\n", + " train_inds=training_set\n", + " test_inds=range(len(types))[~np.in1d(range(len(types)),training_set)]\n", + "\n", + " X_train=mod1_features[train_inds]\n", + " y_train=types[train_inds]\n", + " X_test=mod1_features[test_inds]\n", + " y_test=types[test_inds]\n", + "\n", + "\n", + " #Rescale the data (highly recommended)\n", + " if scale:\n", + " scaler = sklearn.preprocessing.StandardScaler()\n", + " scaler.fit(np.vstack((X_train, X_test)))\n", + " X_train = scaler.transform(X_train)\n", + " X_test = scaler.transform(X_test)\n", + "\n", + " probabilities={}\n", + " classifier_objects={}\n", + "\n", + " if nprocesses>1 and return_classifier:\n", + " print \"Due to limitations with python's multiprocessing module, classifier objects cannot be returned if \" \\\n", + " \"multiple processors are used. 
Continuing serially...\"\n", + " print\n", + "\n", + " if nprocesses>1 and not return_classifier:\n", + " partial_func=partial(snclassifier.__call_classifier,X_train=X_train, y_train=y_train, X_test=X_test,\n", + " param_dict=param_dict,return_classifier=False)\n", + " p=Pool(nprocesses)\n", + " result=p.map(partial_func,classifiers)\n", + "\n", + " for i in range(len(result)):\n", + " cls=classifiers[i]\n", + " probabilities[cls]=result[i]\n", + "\n", + " else:\n", + " for cls in classifiers:\n", + " probs,clf=snclassifier.__call_classifier(cls, X_train, y_train, X_test, param_dict,return_classifier)\n", + " probabilities[cls]=probs\n", + " if return_classifier:\n", + " classifier_objects[cls]=clf\n", + "\n", + " for i in range(len(classifiers)):\n", + " cls=classifiers[i]\n", + " probs=probabilities[cls]\n", + " fpr, tpr, auc=snclassifier.roc(probs, y_test, true_class=1)\n", + " AUC.append(auc)\n", + " return AUC\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading data...\n", + "23 objects read into memory.\n", + "Reading data...\n", + "1998 objects read into memory.\n" + ] + } + ], + "source": [ + "classifiers=['nb','svm','boost_dt']\n", + "#replace model\n", + "#run with full size data set\n", + "#put classifier names into plot\n", + "auc_allclass={}\n", + "\n", + "for cl in classifiers:\n", + " auc_grid=[]\n", + " for i in range(11):\n", + " produce_mock_data_set(1.+2.*i/10., 111, prototypes_Ia, prototypes_II, prototypes_Ibc)\n", + " auc_grid=np.append(auc_grid, AUC_from_mock_data_set([cl]));\n", + " auc_allclass[cl]=auc_grid" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "auc_allclass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "for cl in classifiers:\n", + " plt.plot(np.linspace(1., 3., 11), auc_allclass[cl])\n", + "plt.legend(auc_allclass)\n", + "plt.xlim([1.,3.])\n", + "plt.show()\n", + "plt.savefig('blah.png')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dat1.plot_all()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.figure()\n", + "tsne_plot.plot(mod1_features,join(mod1_features,types)['Type'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.figure()\n", + 
"clss=snclassifier.run_pipeline(mod1_features,types,output_name=os.path.join(out_class,'model1'),\n", + " classifiers=['nb','svm','boost_dt'], nprocesses=4)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "collapsed": true + }, + "source": [ + "waveFeats=snfeatures.WaveletFeatures()" + ] + }, + { + "cell_type": "raw", + "metadata": { + "collapsed": false + }, + "source": [ + "%%capture --no-stdout\n", + "if read_from_file:\n", + " wave_features=Table.read('%s_wavelets.dat' %run_name, format='ascii')\n", + " #Crucial for this format of id's\n", + " blah=wave_features['Object'].astype(str)\n", + " wave_features.replace_column('Object', blah)\n", + " PCA_vals=np.loadtxt('%s_wavelets_PCA_vals.dat' %run_name)\n", + " PCA_vec=np.loadtxt('%s_wavelets_PCA_vec.dat' %run_name)\n", + " PCA_mean=np.loadtxt('%s_wavelets_PCA_mean.dat' %run_name)\n", + "else:\n", + " wave_features=waveFeats.extract_features(dat,nprocesses=6,output_root=out_int,save_output='all')\n", + " wave_features.write('%s_wavelets.dat' %run_name, format='ascii')\n", + " np.savetxt('%s_wavelets_PCA_vals.dat' %run_name,waveFeats.PCA_eigenvals)\n", + " np.savetxt('%s_wavelets_PCA_vec.dat' %run_name,waveFeats.PCA_eigenvectors)\n", + " np.savetxt('%s_wavelets_PCA_mean.dat' %run_name,waveFeats.PCA_mean)\n", + " \n", + " PCA_vals=waveFeats.PCA_eigenvals\n", + " PCA_vec=waveFeats.PCA_eigenvectors\n", + " PCA_mean=waveFeats.PCA_mean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.sort(dat.object_names)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dat.set_model(waveFeats.fit_sn,wave_features,PCA_vec,PCA_mean,0,dat.get_max_length(),dat.filter_set)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#dat.plot_all()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#wave_features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#types" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#join(wave_features, types)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#plt.figure()\n", + "#tsne_plot.plot(wave_features,join(wave_features,types)['Type'])" + ] + }, + { + "cell_type": "raw", + "metadata": { + "collapsed": false + }, + "source": [ + "#This is a Features object, not yet the extracted features\n", + "#We need to register the LSST bands with sncosmo (which is why lsst_bands=True)\n", + "salt2Feats=snfeatures.TemplateFeatures(sampler='leastsq') \n", + "\n", + "#This performs the actual feature extraction. 
All feature extraction methods are parallelised with multiprocessing, \n", + "#just set nprocesses>1 for parallelisation.\n", + "if read_from_file:\n", + " salt2_features=Table.read('%s_templates.dat' %run_name, format='ascii')\n", + " blah=salt2_features['Object'].astype(str)\n", + " salt2_features.replace_column('Object', blah)\n", + "else:\n", + " salt2_features=salt2Feats.extract_features(dat,use_redshift=True,nprocesses=6,chain_directory=out_int)\n", + " salt2_features.write('%s_templates.dat' %run_name, format='ascii')\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#This code takes the fitted parameters and generates the model light curve for plotting purposes.\n", + "#dat.set_model(salt2Feats.fit_sn,salt2_features)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#dat.plot_all()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#dat.data[dat.object_names[5]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/Inflate_errorbars.py b/Inflate_errorbars.py new file mode 100755 index 0000000..cdd2516 --- /dev/null +++ b/Inflate_errorbars.py @@ -0,0 +1,293 @@ + +# coding: utf-8 + +# In[1]: + +from __future__ import division +from snmachine import sndata, snfeatures, snclassifier, tsne_plot +import numpy as np +import matplotlib.pyplot as plt +import time, os, pywt,subprocess +from sklearn.decomposition import PCA +from astropy.table import Table,join,vstack +from astropy.io import fits +import sklearn.metrics +from functools import partial +from multiprocessing import Pool +import sncosmo + +from astropy.table import Column +# WARNING... +#Multinest uses a hardcoded character limit for the output file names. 
+#so avoid making this file path too long if using nested sampling, or the MultiNest output file names will be truncated
+
+#Change outdir to somewhere on your computer if you like
+dataset='spcc'
+outdir=os.path.join('output_%s_no_z' %dataset,'')
+out_features=os.path.join(outdir,'features') #Where we save the extracted features to
+out_class=os.path.join(outdir,'classifications') #Where we save the classification probabilities and ROC curves
+out_int=os.path.join(outdir,'int') #Any intermediate files (such as multinest chains or GP fits)
+
+subprocess.call(['mkdir',outdir])
+subprocess.call(['mkdir',out_features])
+subprocess.call(['mkdir',out_class])
+subprocess.call(['mkdir',out_int])
+
+read_from_file=False #True #We can use this flag to quickly rerun from saved features
+run_name=os.path.join(out_features,'%s_all' %dataset)
+rt=os.path.join('SPCC_SUBSET','')
+
+
+# In[ ]:
+
+
+
+
+# In[2]:
+
+def make_new_data(filename, table, output, sntype):
+    #Copy the header metadata of an existing SNANA file onto `table` and write it out as a new mock light curve
+    with open(filename) as f:
+        data = f.read().split("\n")
+    filters = data[5].split()
+    survey = data[0].split()
+    stuffRA = data[6].split()
+    stuffDec = data[7].split()
+    MWEBV = data[11].split()
+    if sntype==1:
+        typestring='SN Type = Ia , MODEL = mlcs2k2.SNchallenge'
+    elif sntype==2:
+        typestring='SN Type = II , MODEL = SDSS-017564'
+    elif sntype==3:
+        typestring='SN Type = Ic , MODEL = SDSS-014475'
+    else:
+        typestring='SOMETHING WENT HORRIBLY WRONG'
+    table.meta = {survey[0][:-1]: survey[1], stuffRA[0][:-1]: stuffRA[1], stuffDec[0][:-1]: stuffDec[1], filters[0][:-1]: filters[1],
+                  MWEBV[0][:-1]: MWEBV[1], 'SNTYPE': -9, 'SIM_COMMENT': typestring }
+    #table.rename_column('mjd', 'MJD')
+    #table.rename_column('filter ', 'FLT')
+    #table.rename_column('flux', 'FLUXCAL')
+    #table.rename_column('flux_error', 'FLUXCALERR')
+    sncosmo.write_lc(table, 'new_mocks/%s' %output, pedantic=True, format='snana')
+
+
+# In[3]:
+
+prototypes_Ia=[ 'DES_SN002542.DAT', 'DES_SN013866.DAT', 'DES_SN023940.DAT', 'DES_SN024734.DAT', 'DES_SN030701.DAT', 'DES_SN045040.DAT']
+prototypes_II=[ 'DES_SN002457.DAT', 'DES_SN005519.DAT', 'DES_SN006381.DAT', 'DES_SN008569.DAT', 'DES_SN013360.DAT', 'DES_SN013481.DAT']
+prototypes_Ibc=['DES_SN005399.DAT', 'DES_SN013863.DAT', 'DES_SN027266.DAT', 'DES_SN030183.DAT', 'DES_SN065493.DAT', 'DES_SN078241.DAT']
+len(prototypes_II)
+
+
+# In[4]:
+
+prototypes_II
+
+
+# In[5]:
+
+def produce_mock_data_set(degrading_factor, realsperlc=100, prototypes_Ia=['DES_SN002542.DAT'], prototypes_II=['DES_SN002457.DAT'], prototypes_Ibc=['DES_SN005399.DAT']):
+    #Data root
+    dat=sndata.Dataset(rt)
+
+    types=dat.get_types()
+    types['Type'][np.floor(types['Type']/10)==2]=2
+    types['Type'][np.floor(types['Type']/10)==3]=3
+
+    logfile=open('new_mocks/new_mocks.LIST', 'w')
+
+    for prot in range(len(prototypes_II)):
+        type_II = []
+        for i in range(len(dat.data[prototypes_II[prot]]['flux'])):
+            type_II.append(np.random.normal(dat.data[prototypes_II[prot]]['flux'][i], dat.data[prototypes_II[prot]]['flux_error'][i]*degrading_factor, realsperlc))
+        type_II = np.array(type_II)
+        filename_II = 'SPCC_SUBSET/'+prototypes_II[prot]
+        test_table_II = dat.data[prototypes_II[prot]]
+        test_table_II.rename_column('flux', 'FLUXCAL')
+        test_table_II.rename_column('flux_error', 'FLUXCALERR')
+        col_II = Table.Column(name='field', data=np.zeros(len(test_table_II)))
+        test_table_II.add_column(col_II, index=2)
+        #Inflate the error bars once, outside the realisation loop, so the factor is not compounded at every iteration
+        test_table_II.replace_column('FLUXCALERR', test_table_II['FLUXCALERR']*degrading_factor)
+        for i in range(realsperlc):
+            test_table_II.replace_column('FLUXCAL', type_II[:,i])
+            make_new_data(filename_II, test_table_II, 'II_%s_%s'%(i,prototypes_II[prot]), 2)
+            logfile.write('II_'+str(i)+'_'+prototypes_II[prot]+'\n')
+
+    for prot in range(len(prototypes_Ia)):
+        type_Ia = []
+        for i in range(len(dat.data[prototypes_Ia[prot]]['flux'])):
+            type_Ia.append(np.random.normal(dat.data[prototypes_Ia[prot]]['flux'][i], dat.data[prototypes_Ia[prot]]['flux_error'][i]*degrading_factor, realsperlc))
+        type_Ia = np.array(type_Ia)
+        filename_Ia = 'SPCC_SUBSET/'+prototypes_Ia[prot]
+        test_table_Ia = dat.data[prototypes_Ia[prot]]
+        test_table_Ia.rename_column('flux', 'FLUXCAL')
+        test_table_Ia.rename_column('flux_error', 'FLUXCALERR')
+        col_Ia = Table.Column(name='field', data=np.zeros(len(test_table_Ia)))
+        test_table_Ia.add_column(col_Ia, index=2)
+        test_table_Ia.replace_column('FLUXCALERR', test_table_Ia['FLUXCALERR']*degrading_factor)
+        for i in range(realsperlc):
+            test_table_Ia.replace_column('FLUXCAL', type_Ia[:,i])
+            make_new_data(filename_Ia, test_table_Ia, 'Ia_%s_%s'%(i,prototypes_Ia[prot]), 1)
+            logfile.write('Ia_'+str(i)+'_'+prototypes_Ia[prot]+'\n')
+
+    for prot in range(len(prototypes_Ibc)):
+        type_Ibc = []
+        for i in range(len(dat.data[prototypes_Ibc[prot]]['flux'])):
+            type_Ibc.append(np.random.normal(dat.data[prototypes_Ibc[prot]]['flux'][i], dat.data[prototypes_Ibc[prot]]['flux_error'][i]*degrading_factor, realsperlc))
+        type_Ibc = np.array(type_Ibc)
+        filename_Ibc = 'SPCC_SUBSET/'+prototypes_Ibc[prot]
+        test_table_Ibc = dat.data[prototypes_Ibc[prot]]
+        test_table_Ibc.rename_column('flux', 'FLUXCAL')
+        test_table_Ibc.rename_column('flux_error', 'FLUXCALERR')
+        col_Ibc = Table.Column(name='field', data=np.zeros(len(test_table_Ibc)))
+        #keep the 'field' column in the same position for all three types
+        test_table_Ibc.add_column(col_Ibc, index=2)
+        test_table_Ibc.replace_column('FLUXCALERR', test_table_Ibc['FLUXCALERR']*degrading_factor)
+        for i in range(realsperlc):
+            test_table_Ibc.replace_column('FLUXCAL', type_Ibc[:,i])
+            make_new_data(filename_Ibc, test_table_Ibc, 'Ibc_%s_%s'%(i,prototypes_Ibc[prot]), 3)
+            logfile.write('Ibc_'+str(i)+'_'+prototypes_Ibc[prot]+'\n')
+
+    logfile.close()
+
+
+# In[6]:
+
+#produce_mock_data_set(1.1, 111, prototypes_Ia, prototypes_II, prototypes_Ibc)
+
+
+# In[ ]:
+
+
+
+
+# In[ ]:
+
+
+
+
+# In[7]:
+
+def AUC_from_mock_data_set(classifiers=['nb','knn','svm','neural_network','boost_dt']):
+    rt1=os.path.join('new_mocks','')
+    dat1=sndata.Dataset(rt1)
+    for obj in dat1.object_names:
+        for i in range(len(dat1.data[obj])):
+            dat1.data[obj]['filter'][i]=dat1.data[obj]['filter'][i][3:7]
+    types=dat1.get_types()
+    types['Type'][np.floor(types['Type']/10)==2]=2
+    types['Type'][np.floor(types['Type']/10)==3]=3
+
+    mod1Feats=snfeatures.ParametricFeatures('newling',sampler='leastsq')
+    mod1_features=mod1Feats.extract_features(dat1,nprocesses=4,chain_directory=out_int)
+    mod1_features.write('%s_newling.dat' %run_name, format='ascii')
+    #Unfortunately, sometimes the fitting methods return NaN for some parameters for these models.
+    for c in mod1_features.colnames[1:]:
+        mod1_features[c][np.isnan(mod1_features[c])]=0
+    dat1.set_model(mod1Feats.fit_sn,mod1_features)
+
+    AUC=[]
+    nprocesses=4
+    return_classifier=False
+    columns=[]
+    training_set=0.7
+    param_dict={}
+    scale=True
+
+    if isinstance(mod1_features,Table):
+        #The features are in astropy table format and must be converted to a numpy array before passing to sklearn
+
+        #We need to make sure we match the correct Y values to X values; the safest way to do this is to make
+        #types an astropy table as well.
+
+        if not isinstance(types,Table):
+            types=Table(data=[mod1_features['Object'],types],names=['Object','Type'])
+        feats=join(mod1_features,types,'Object')
+
+        if len(columns)==0:
+            columns=feats.columns[1:-1]
+
+        #Split into training and validation sets
+        if np.isscalar(training_set):
+            objs=feats['Object']
+            objs=np.random.permutation(objs)
+            training_set=objs[:(int)(training_set*len(objs))]
+
+        #Otherwise a training set has already been provided as a list of Object names and we can continue
+        feats_train=feats[np.in1d(feats['Object'],training_set)]
+        feats_test=feats[~np.in1d(feats['Object'],training_set)]
+
+        X_train=np.array([feats_train[c] for c in columns]).T
+        y_train=np.array(feats_train['Type'])
+        X_test=np.array([feats_test[c] for c in columns]).T
+        y_test=np.array(feats_test['Type'])
+
+    else:
+        #Otherwise the features are already in the form of a numpy array
+        if np.isscalar(training_set):
+            inds=np.random.permutation(range(len(mod1_features)))
+            train_inds=inds[:(int)(len(inds)*training_set)]
+            test_inds=inds[(int)(len(inds)*training_set):]
+
+        else:
+            #We assume the training set has been provided as indices
+            train_inds=training_set
+            test_inds=range(len(types))[~np.in1d(range(len(types)),training_set)]
+
+        X_train=mod1_features[train_inds]
+        y_train=types[train_inds]
+        X_test=mod1_features[test_inds]
+        y_test=types[test_inds]
+
+    #Rescale the data (highly recommended)
+    if scale:
+        scaler = sklearn.preprocessing.StandardScaler()
+        scaler.fit(np.vstack((X_train, X_test)))
+        X_train = scaler.transform(X_train)
+        X_test = scaler.transform(X_test)
+
+    probabilities={}
+    classifier_objects={}
+
+    if nprocesses>1 and not return_classifier:
+        partial_func=partial(snclassifier.__call_classifier,X_train=X_train, y_train=y_train, X_test=X_test,
+                             param_dict=param_dict,return_classifier=False)
+        p=Pool(nprocesses)
+        result=p.map(partial_func,classifiers)
+
+        for i in range(len(result)):
+            cls=classifiers[i]
+            probabilities[cls]=result[i]
+
+    else:
+        for cls in classifiers:
+            probs,clf=snclassifier.__call_classifier(cls, X_train, y_train, X_test, param_dict,return_classifier)
+            probabilities[cls]=probs
+            if return_classifier:
+                classifier_objects[cls]=clf
+
+    for i in range(len(classifiers)):
+        cls=classifiers[i]
+        probs=probabilities[cls]
+        fpr, tpr, auc=snclassifier.roc(probs, y_test, true_class=1)
+        AUC.append(auc)
+    return AUC
+
+
+# In[ ]:
+
+classifiers=['nb','svm','boost_dt']
+#TODO: replace model
+#TODO: run with full size data set
+#TODO: put classifier names into plot
+auc_allclass={}
+
+for cl in classifiers:
+    auc_grid=[]
+    for i in range(11):
+        produce_mock_data_set(1.+2.*i/10., 22, prototypes_Ia, prototypes_II, prototypes_Ibc)
+        auc_grid=np.append(auc_grid, AUC_from_mock_data_set([cl]))
+    auc_allclass[cl]=auc_grid
+
+#np.save pickles the dict; use a .npy name so numpy does not append a second suffix
+np.save('allclassifiers.npy', auc_allclass)
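
The error-bar inflation itself can be sanity-checked in isolation, without going through snmachine or writing any mock files: draw the realisations for a toy light curve and confirm that their scatter recovers the inflated errors. This is a minimal sketch of the same resampling idea using plain numpy arrays; the names inflate_and_resample, flux and flux_err are illustrative and not part of snmachine.

import numpy as np

def inflate_and_resample(flux, flux_err, degrading_factor, realsperlc=100, seed=None):
    #Inflate the error bars once (not per realisation, which would compound
    #the factor) and draw realsperlc Gaussian realisations, one per column.
    rng = np.random.RandomState(seed)
    inflated_err = flux_err * degrading_factor
    reals = rng.normal(flux[:, None], inflated_err[:, None],
                       size=(len(flux), realsperlc))
    return reals, inflated_err

#Toy 5-epoch light curve with the error bars doubled: the scatter of the
#realisations at each epoch should be about degrading_factor times the
#original errors.
flux = np.array([1., 3., 5., 3., 1.])
flux_err = np.array([0.1, 0.2, 0.3, 0.2, 0.1])
reals, err = inflate_and_resample(flux, flux_err, 2., realsperlc=10000, seed=0)
print(reals.std(axis=1) / flux_err)  #each entry should be close to 2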
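
Once the sweep has run, the saved AUC grids can be inspected without recomputing anything. A minimal sketch, assuming the np.save call above (the dict round-trips through a pickled 0-d object array, hence .item()); the output filename auc_vs_degrading_factor.png is illustrative.

import numpy as np
import matplotlib.pyplot as plt

#np.save stores the dict as a pickled 0-d object array; .item() unwraps it
auc_allclass = np.load('allclassifiers.npy').item()
factors = np.linspace(1., 3., 11)  #the grid 1.+2.*i/10. for i in range(11)
plt.figure()
for cl in ['nb', 'svm', 'boost_dt']:
    plt.plot(factors, auc_allclass[cl], label=cl)
plt.xlabel('error-bar degrading factor')
plt.ylabel('AUC')
plt.legend()
plt.savefig('auc_vs_degrading_factor.png')  #save before any plt.show()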