diff --git a/AD_Invert.py b/AD_Invert.py new file mode 100644 index 0000000..09b0fb0 --- /dev/null +++ b/AD_Invert.py @@ -0,0 +1,151 @@ +import tensorflow as tf +import numpy as np +import pdb +import json +from mod_core_rnn_cell_impl import LSTMCell # modified to allow initializing bias in lstm + +import data_utils +import plotting +import model +import mmd +import utils +import eval +import DR_discriminator + +from differential_privacy.dp_sgd.dp_optimizer import dp_optimizer +from differential_privacy.dp_sgd.dp_optimizer import sanitizer +from differential_privacy.privacy_accountant.tf import accountant + +# --- get settings --- # +# parse command line arguments, or use defaults +parser = utils.rgan_options_parser() +settings = vars(parser.parse_args()) +# if a settings file is specified, it overrides command line arguments/defaults +if settings['settings_file']: settings = utils.load_settings_from_file(settings) + +# --- get data, split --- # +data_path = './experiments/data/' + settings['data_load_from'] + '.data.npy' +print('Loading data from', data_path) +samples, labels, index = data_utils.get_data(settings["data"], settings["seq_length"], settings["seq_step"], + settings["num_signals"], settings["sub_id"], settings["eval_single"], + settings["eval_an"], data_path) +# --- save settings, data --- # +# no need +print('Ready to run with settings:') +for (k, v) in settings.items(): print(v, '\t', k) +# add the settings to local environment +# WARNING: at this point a lot of variables appear +locals().update(settings) +json.dump(settings, open('./experiments/settings/' + identifier + '.txt', 'w'), indent=0) + +class myADclass(): + def __init__(self, epoch, settings=settings, samples=samples, labels=labels, index=index): + self.epoch = epoch + self.settings = settings + self.samples = samples + self.labels = labels + self.index = index + def ADfunc(self): + num_samples_t = self.samples.shape[0] + t_size = 500 + T_index = np.random.choice(num_samples_t, size=t_size, replace=False) + print('sample_shape:', self.samples.shape[0]) + print('num_samples_t', num_samples_t) + + # -- only discriminate one batch for one time -- # + D_test = np.empty([t_size, self.settings['seq_length'], 1]) + DL_test = np.empty([t_size, self.settings['seq_length'], 1]) + GG = np.empty([t_size, self.settings['seq_length'], self.settings['num_signals']]) + T_samples = np.empty([t_size, self.settings['seq_length'], self.settings['num_signals']]) + L_mb = np.empty([t_size, self.settings['seq_length'], 1]) + I_mb = np.empty([t_size, self.settings['seq_length'], 1]) + for batch_idx in range(0, t_size): + # print('epoch:{}'.format(self.epoch)) + # print('batch_idx:{}'.format(batch_idx)) + # display batch progress + model.display_batch_progression(batch_idx, t_size) + T_mb = self.samples[T_index[batch_idx], :, :] + L_mmb = self.labels[T_index[batch_idx], :, :] + I_mmb = self.index[T_index[batch_idx], :, :] + para_path = './experiments/parameters/' + self.settings['sub_id'] + '_' + str( + self.settings['seq_length']) + '_' + str(self.epoch) + '.npy' + D_t, L_t = DR_discriminator.dis_D_model(self.settings, T_mb, para_path) + Gs, Zs, error_per_sample, heuristic_sigma = DR_discriminator.invert(self.settings, T_mb, para_path, + g_tolerance=None, + e_tolerance=0.1, n_iter=None, + max_iter=1000, + heuristic_sigma=None) + GG[batch_idx, :, :] = Gs + T_samples[batch_idx, :, :] = T_mb + D_test[batch_idx, :, :] = D_t + DL_test[batch_idx, :, :] = L_t + L_mb[batch_idx, :, :] = L_mmb + I_mb[batch_idx, :, :] = I_mmb + + # -- use 
self-defined evaluation functions -- # + # -- test different tao values for the detection function -- # + results = np.empty([5, 5]) + # for i in range(2, 8): + # tao = 0.1 * i + tao = 0.5 + lam = 0.8 + Accu1, Pre1, Rec1, F11, FPR1, D_L1 = DR_discriminator.detection_D_I(DL_test, L_mb, I_mb, self.settings['seq_step'], tao) + print('seq_length:', self.settings['seq_length']) + print('D:Comb-logits-based-Epoch: {}; tao={:.1}; Accu: {:.4}; Pre: {:.4}; Rec: {:.4}; F1: {:.4}; FPR: {:.4}' + .format(self.epoch, tao, Accu1, Pre1, Rec1, F11, FPR1)) + results[0, :] = [Accu1, Pre1, Rec1, F11, FPR1] + + Accu2, Pre2, Rec2, F12, FPR2, D_L2 = DR_discriminator.detection_D_I(D_test, L_mb, I_mb, self.settings['seq_step'], tao) + print('seq_length:', self.settings['seq_length']) + print('D:Comb-statistic-based-Epoch: {}; tao={:.1}; Accu: {:.4}; Pre: {:.4}; Rec: {:.4}; F1: {:.4}; FPR: {:.4}' + .format(self.epoch, tao, Accu2, Pre2, Rec2, F12, FPR2)) + results[1, :] = [Accu2, Pre2, Rec2, F12, FPR2] + + Accu3, Pre3, Rec3, F13, FPR3, D_L3 = DR_discriminator.detection_R_D_I(DL_test, GG, T_samples, L_mb, self.settings['seq_step'], tao, lam) + print('seq_length:', self.settings['seq_length']) + print('RD:Comb-logits_based-Epoch: {}; tao={:.1}; Accu: {:.4}; Pre: {:.4}; Rec: {:.4}; F1: {:.4}; FPR: {:.4}' + .format(self.epoch, tao, Accu3, Pre3, Rec3, F13, FPR3)) + results[2, :] = [Accu3, Pre3, Rec3, F13, FPR3] + + Accu4, Pre4, Rec4, F14, FPR4, D_L4 = DR_discriminator.detection_R_D_I(D_test, GG, T_samples, L_mb, self.settings['seq_step'], tao, lam) + print('seq_length:', self.settings['seq_length']) + print('RD:Comb-statistic-based-Epoch: {}; tao={:.1}; Accu: {:.4}; Pre: {:.4}; Rec: {:.4}; F1: {:.4}; FPR: {:.4}' + .format(self.epoch, tao, Accu4, Pre4, Rec4, F14, FPR4)) + results[3, :] = [Accu4, Pre4, Rec4, F14, FPR4] + + Accu5, Pre5, Rec5, F15, FPR5, D_L5 = DR_discriminator.detection_R_I(GG, T_samples, L_mb, self.settings['seq_step'],tao) + print('seq_length:', self.settings['seq_length']) + print('G:Comb-sample-based-Epoch: {}; tao={:.1}; Accu: {:.4}; Pre: {:.4}; Rec: {:.4}; F1: {:.4}; FPR: {:.4}' + .format(self.epoch, tao, Accu5, Pre5, Rec5, F15, FPR5)) + results[4, :] = [Accu5, Pre5, Rec5, F15, FPR5] + + return results, GG, D_test, DL_test + + + +if __name__ == "__main__": + print('Main Starting...') + + Results = np.empty([settings['num_epochs'], 5, 5]) + + t_size = 500 + D_test = np.empty([settings['num_epochs'], t_size, settings['seq_length'], 1]) + DL_test = np.empty([settings['num_epochs'], t_size, settings['seq_length'], 1]) + GG = np.empty([settings['num_epochs'], t_size, settings['seq_length'], settings['num_signals']]) + + for epoch in range(settings['num_epochs']): + # for epoch in range(1): + ob = myADclass(epoch) + Results[epoch, :, :], GG[epoch, :, :, :], D_test[epoch, :, :, :], DL_test[epoch, :, :, :] = ob.ADfunc() + + res_path = './experiments/plots/Results_Invert' + '_' + settings['sub_id'] + '_' + str( + settings['seq_length']) + '.npy' + np.save(res_path, Results) + + dg_path = './experiments/plots/DG_Invert' + '_' + settings['sub_id'] + '_' + str( + settings['seq_length']) + '_' + np.save(dg_path + 'D_test.npy', D_test) + np.save(dg_path + 'DL_test.npy', DL_test) + np.save(dg_path + 'GG.npy', DL_test) + + print('Main Terminating...') \ No newline at end of file diff --git a/DR_discriminator.py b/DR_discriminator.py new file mode 100644 index 0000000..c943ab8 --- /dev/null +++ b/DR_discriminator.py @@ -0,0 +1,794 @@ +import numpy as np +import tensorflow as tf +import matplotlib.pyplot as plt +from 
matplotlib.colors import hsv_to_rgb +import model +import mmd +from mod_core_rnn_cell_impl import LSTMCell +from sklearn.metrics import precision_recall_fscore_support + +def anomaly_detection_plot(D_test, T_mb, L_mb, D_L, epoch, identifier): + + aa = D_test.shape[0] + bb = D_test.shape[1] + D_L = D_L.reshape([aa, bb, -1]) + + x_points = np.arange(bb) + + fig, ax = plt.subplots(4, 4, sharex=True) + for m in range(4): + for n in range(4): + D = D_test[n * 4 + m, :, :] + T = T_mb[n * 4 + m, :, :] + L = L_mb[n * 4 + m, :, :] + DL = D_L[n * 4 + m, :, :] + ax[m, n].plot(x_points, D, '--g', label='Pro') + ax[m, n].plot(x_points, T, 'b', label='Data') + ax[m, n].plot(x_points, L, 'k', label='Label') + ax[m, n].plot(x_points, DL, 'r', label='Label') + ax[m, n].set_ylim(-1, 1) + for n in range(4): + ax[-1, n].xaxis.set_ticks(range(0, bb, int(bb/6))) + fig.suptitle(epoch) + fig.subplots_adjust(hspace=0.15) + fig.savefig("./experiments/plots/DR_dis/" + identifier + "_epoch" + str(epoch).zfill(4) + ".png") + plt.clf() + plt.close() + + return True + +def detection_Comb(Label_test, L_mb, I_mb, seq_step, tao): + aa = Label_test.shape[0] + bb = Label_test.shape[1] + + LL = (aa-1)*seq_step+bb + + Label_test = abs(Label_test.reshape([aa, bb])) + L_mb = L_mb .reshape([aa, bb]) + I_mb = I_mb .reshape([aa, bb]) + D_L = np.zeros([LL, 1]) + L_L = np.zeros([LL, 1]) + Count = np.zeros([LL, 1]) + for i in range(0, aa): + for j in range(0, bb): + # print('index:', i*10+j) + D_L[i*seq_step+j] += Label_test[i, j] + L_L[i * seq_step + j] += L_mb[i, j] + Count[i * seq_step + j] += 1 + + D_L /= Count + L_L /= Count + + TP, TN, FP, FN = 0, 0, 0, 0 + + for i in range(LL): + if D_L[i] > tao: + # true/negative + D_L[i] = 0 + else: + # false/positive + D_L[i] = 1 + + cc = (D_L == L_L) + # print('D_L:', D_L) + # print('L_L:', L_L) + cc = list(cc.reshape([-1])) + N = cc.count(True) + + print('N:', N) + + Accu = float((N / LL) * 100) + + precision, recall, f1, _ = precision_recall_fscore_support(L_L, D_L, average='binary') + + return Accu, precision, recall, f1, + + +def detection_logits_I(DL_test, L_mb, I_mb, seq_step, tao): + aa = DL_test.shape[0] + bb = DL_test.shape[1] + + LL = (aa-1)*seq_step+bb + + DL_test = abs(DL_test.reshape([aa, bb])) + L_mb = L_mb .reshape([aa, bb]) + I_mb = I_mb .reshape([aa, bb]) + D_L = np.zeros([LL, 1]) + L_L = np.zeros([LL, 1]) + Count = np.zeros([LL, 1]) + for i in range(0, aa): + for j in range(0, bb): + # print('index:', i*10+j) + D_L[i*seq_step+j] += DL_test[i, j] + L_L[i * seq_step + j] += L_mb[i, j] + Count[i * seq_step + j] += 1 + + D_L /= Count + L_L /= Count + + TP, TN, FP, FN = 0, 0, 0, 0 + + for i in range(LL): + if D_L[i] > tao: + # true/negative + D_L[i] = 0 + else: + # false/positive + D_L[i] = 1 + + A = D_L[i] + B = L_L[i] + if A == 1 and B == 1: + TP += 1 + elif A == 1 and B == 0: + FP += 1 + elif A == 0 and B == 0: + TN += 1 + elif A == 0 and B == 1: + FN += 1 + + + cc = (D_L == L_L) + # print('D_L:', D_L) + # print('L_L:', L_L) + cc = list(cc.reshape([-1])) + N = cc.count(True) + + print('N:', N) + + Accu = float((N / LL) * 100) + + precision, recall, f1, _ = precision_recall_fscore_support(L_L, D_L, average='binary') + + # true positive among all the detected positive + # Pre = (100 * TP) / (TP + FP + 1) + # # true positive among all the real positive + # Rec = (100 * TP) / (TP + FN + 1) + # # The F1 score is the harmonic average of the precision and recall, + # # where an F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0. 
+ # F1 = (2 * Pre * Rec) / (100 * (Pre + Rec + 1)) + # False positive rate--false alarm rate + FPR = (100 * FP) / (FP + TN+1) + + return Accu, precision, recall, f1, FPR, D_L + +def detection_statistic_I(D_test, L_mb, I_mb, seq_step, tao): + # point-wise detection for one dimension + + aa = D_test.shape[0] + bb = D_test.shape[1] + + LL = (aa-1) * seq_step + bb + # print('aa:', aa) + # print('bb:', bb) + # print('LL:', LL) + + D_test = D_test.reshape([aa, bb]) + L_mb = L_mb.reshape([aa, bb]) + I_mb = I_mb.reshape([aa, bb]) + D_L = np.zeros([LL, 1]) + L_L = np.zeros([LL, 1]) + Count = np.zeros([LL, 1]) + for i in range(0, aa): + for j in range(0, bb): + # print('index:', i * 10 + j) + D_L[i * seq_step + j] += D_test[i, j] + L_L[i * seq_step + j] += L_mb[i, j] + Count[i * seq_step + j] += 1 + + D_L /= Count + L_L /= Count + + TP, TN, FP, FN = 0, 0, 0, 0 + + for i in range(LL): + if D_L[i] > tao: + # true/negative + D_L[i] = 0 + else: + # false/positive + D_L[i] = 1 + + A = D_L[i] + B = L_L[i] + if A == 1 and B == 1: + TP += 1 + elif A == 1 and B == 0: + FP += 1 + elif A == 0 and B == 0: + TN += 1 + elif A == 0 and B == 1: + FN += 1 + + cc = (D_L == L_L) + cc = list(cc.reshape([-1])) + N = cc.count(True) + Accu = float((N / LL) * 100) + + precision, recall, f1, _ = precision_recall_fscore_support(L_L, D_L, average='binary') + + # true positive among all the detected positive + # Pre = (100 * TP) / (TP + FP + 1) + # # true positive among all the real positive + # Rec = (100 * TP) / (TP + FN + 1) + # # The F1 score is the harmonic average of the precision and recall, + # # where an F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0. + # F1 = (2 * Pre * Rec) / (100 * (Pre + Rec + 1)) + # False positive rate--false alarm rate + FPR = (100 * FP) / (FP + TN) + + return Accu, precision, recall, f1, FPR, D_L + +def detection_D_I(DD, L_mb, I_mb, seq_step, tao): + # point-wise detection for one dimension + + aa = DD.shape[0] + bb = DD.shape[1] + + LL = (aa-1)*seq_step+bb + + DD = abs(DD.reshape([aa, bb])) + L_mb = L_mb .reshape([aa, bb]) + I_mb = I_mb .reshape([aa, bb]) + D_L = np.zeros([LL, 1]) + L_L = np.zeros([LL, 1]) + Count = np.zeros([LL, 1]) + for i in range(0, aa): + for j in range(0, bb): + # print('index:', i*10+j) + D_L[i*10+j] += DD[i, j] + L_L[i * 10 + j] += L_mb[i, j] + Count[i * 10 + j] += 1 + + D_L /= Count + L_L /= Count + + TP, TN, FP, FN = 0, 0, 0, 0 + + for i in range(LL): + if D_L[i] > tao: + # true/negative + D_L[i] = 0 + else: + # false/positive + D_L[i] = 1 + + A = D_L[i] + B = L_L[i] + if A == 1 and B == 1: + TP += 1 + elif A == 1 and B == 0: + FP += 1 + elif A == 0 and B == 0: + TN += 1 + elif A == 0 and B == 1: + FN += 1 + + + cc = (D_L == L_L) + # print('D_L:', D_L) + # print('L_L:', L_L) + cc = list(cc.reshape([-1])) + N = cc.count(True) + + print('N:', N) + + Accu = float((N / LL) * 100) + + # true positive among all the detected positive + Pre = (100 * TP) / (TP + FP + 1) + # true positive among all the real positive + Rec = (100 * TP) / (TP + FN + 1) + # The F1 score is the harmonic average of the precision and recall, + # where an F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0. + F1 = (2 * Pre * Rec) / (100 * (Pre + Rec + 1)) + # False positive rate--false alarm rate + FPR = (100 * FP) / (FP + TN+1) + + return Accu, Pre, Rec, F1, FPR, D_L + +def detection_R_D_I(DD, Gs, T_mb, L_mb, seq_step, tao, lam): + # point-wise detection for one dimension + # (1-lambda)*R(x)+lambda*D(x) + # lambda=0.5? 
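+    # Combined score per reconstructed point: s = (1 - lam) * R + lam * (1 - D),
+    # where R is the mean absolute reconstruction residual across signals and D is
+    # the (absolute) discriminator output; a point is flagged anomalous when s > tao.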
+ # D_test, Gs, T_mb, L_mb are of same size + + R = np.absolute(Gs - T_mb) + R = np.mean(R, axis=2) + aa = DD.shape[0] + bb = DD.shape[1] + + LL = (aa - 1) * seq_step + bb + + DD = abs(DD.reshape([aa, bb])) + DD = 1-DD + L_mb = L_mb.reshape([aa, bb]) + R = R.reshape([aa, bb]) + + D_L = np.zeros([LL, 1]) + R_L = np.zeros([LL, 1]) + L_L = np.zeros([LL, 1]) + L_pre = np.zeros([LL, 1]) + Count = np.zeros([LL, 1]) + for i in range(0, aa): + for j in range(0, bb): + # print('index:', i*10+j) + D_L[i * 10 + j] += DD[i, j] + L_L[i * 10 + j] += L_mb[i, j] + R_L[i * 10 + j] += R[i, j] + Count[i * 10 + j] += 1 + D_L /= Count + L_L /= Count + R_L /= Count + + TP, TN, FP, FN = 0, 0, 0, 0 + + for i in range(LL): + if (1-lam)*R_L[i] + lam*D_L[i] > tao: + # false + L_pre[i] = 1 + else: + # true + L_pre[i] = 0 + + A = L_pre[i] + # print('A:', A) + B = L_L[i] + # print('B:', B) + if A == 1 and B == 1: + TP += 1 + elif A == 1 and B == 0: + FP += 1 + elif A == 0 and B == 0: + TN += 1 + elif A == 0 and B == 1: + FN += 1 + + cc = (L_pre == L_L) + cc = list(cc.reshape([-1])) + N = cc.count(True) + Accu = float((N / (aa*bb)) * 100) + + # true positive among all the detected positive + Pre = (100 * TP) / (TP + FP + 1) + # true positive among all the real positive + Rec = (100 * TP) / (TP + FN + 1) + # The F1 score is the harmonic average of the precision and recall, + # where an F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0. + F1 = (2 * Pre * Rec) / (100 * (Pre + Rec + 1)) + # False positive rate + FPR = (100 * FP) / (FP + TN+1) + + return Accu, Pre, Rec, F1, FPR, L_pre + +def detection_R_I(Gs, T_mb, L_mb, seq_step, tao): + # point-wise detection for one dimension + # (1-lambda)*R(x)+lambda*D(x) + # lambda=0.5? + # D_test, Gs, T_mb, L_mb are of same size + + R = np.absolute(Gs - T_mb) + R = np.mean(R, axis=2) + aa = R.shape[0] + bb = R.shape[1] + + LL = (aa - 1) * seq_step + bb + + L_mb = L_mb.reshape([aa, bb]) + R = R.reshape([aa, bb]) + + L_L = np.zeros([LL, 1]) + R_L = np.zeros([LL, 1]) + L_pre = np.zeros([LL, 1]) + Count = np.zeros([LL, 1]) + for i in range(0, aa): + for j in range(0, bb): + # print('index:', i*10+j) + L_L[i * 10 + j] += L_mb[i, j] + R_L[i * 10 + j] += R[i, j] + Count[i * 10 + j] += 1 + L_L /= Count + R_L /= Count + + TP, TN, FP, FN = 0, 0, 0, 0 + + for i in range(LL): + if R_L[i] > tao: + # false + L_pre[i] = 1 + else: + # true + L_pre[i] = 0 + + A = L_pre[i] + B = L_L[i] + if A == 1 and B == 1: + TP += 1 + elif A == 1 and B == 0: + FP += 1 + elif A == 0 and B == 0: + TN += 1 + elif A == 0 and B == 1: + FN += 1 + + cc = (L_pre == L_L) + cc = list(cc.reshape([-1])) + N = cc.count(True) + Accu = float((N / (aa*bb)) * 100) + + # true positive among all the detected positive + Pre = (100 * TP) / (TP + FP + 1) + # true positive among all the real positive + Rec = (100 * TP) / (TP + FN + 1) + # The F1 score is the harmonic average of the precision and recall, + # where an F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0. 
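+    # The +1 terms below guard against zero denominators; Pre, Rec and FPR are
+    # reported on a 0-100 scale, while the extra factor of 100 rescales F1 to [0, 1].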
+ F1 = (2 * Pre * Rec) / (100 * (Pre + Rec + 1)) + # False positive rate + FPR = (100 * FP) / (FP + TN+1) + + return Accu, Pre, Rec, F1, FPR, L_pre + + +def sample_detection(D_test, L_mb, tao): + # sample-wise detection for one dimension + + aa = D_test.shape[0] + bb = D_test.shape[1] + + D_test = D_test.reshape([aa, bb]) + L_mb = L_mb.reshape([aa, bb]) + L = np.sum(L_mb, 1) + # NN = 0-10 + L[L > 0] = 1 + + D_L = np.empty([aa, ]) + + for i in range(aa): + if np.mean(D_test[i, :]) > tao: + # true/negative + D_L[i] = 0 + else: + # false/positive + D_L[i] = 1 + + cc = (D_L == L) + # cc = list(cc) + N = list(cc).count(True) + Accu = float((N / (aa)) * 100) + + precision, recall, f1, _ = precision_recall_fscore_support(L, D_L, average='binary') + + return Accu, precision, recall, f1 + + +def CUSUM_det(spe_n, spe_a, labels): + + mu = np.mean(spe_n) + sigma = np.std(spe_n) + + kk = 3*sigma + H = 15*sigma + print('H:', H) + + tar = np.mean(spe_a) + + mm = spe_a.shape[0] + + SH = np.empty([mm, ]) + SL = np.empty([mm, ]) + + for i in range(mm): + SH[-1] = 0 + SL[-1] = 0 + SH[i] = max(0, SH[i-1]+spe_a[i]-(tar+kk)) + SL[i] = min(0, SL[i-1]+spe_a[i]-(tar-kk)) + + + count = np.empty([mm, ]) + TP = 0 + TN = 0 + FP = 0 + FN = 0 + for i in range(mm): + A = SH[i] + B = SL[i] + AA = H + BB = -H + if A <= AA and B >= BB: + count[i] = 0 + else: + count[i] = 1 + + C = count[i] + D = labels[i] + if C == 1 and D == 1: + TP += 1 + elif C == 1 and D == 0: + FP += 1 + elif C == 0 and D == 0: + TN += 1 + elif C == 0 and D == 1: + FN += 1 + + cc = (count == labels) + # cc = list(cc) + N = list(cc).count(True) + Accu = float((N / (mm)) * 100) + + # true positive among all the detected positive + Pre = (100 * TP) / (TP + FP + 1) + # true positive among all the real positive + Rec = (100 * TP) / (TP + FN) + # The F1 score is the harmonic average of the precision and recall, + # where an F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0. + F1 = (2 * Pre * Rec) / (100 * (Pre + Rec + 1)) + # False positive rate + FPR = (100 * FP) / (FP + TN) + + return Accu, Pre, Rec, F1, FPR + + +def SPE(X, pc): + a = X.shape[0] + b = X.shape[1] + + spe = np.empty([a]) + # Square Prediction Error (square of residual distance) + # spe = X'(I-PP')X + I = np.identity(b, float) - np.matmul(pc.transpose(1, 0), pc) + # I = np.matmul(I, I) + for i in range(a): + x = X[i, :].reshape([b, 1]) + y = np.matmul(x.transpose(1, 0), I) + spe[i] = np.matmul(y, x) + + return spe + + + +def generator_o(z, hidden_units_g, seq_length, batch_size, num_generated_features, reuse=False, parameters=None, learn_scale=True): + """ + If parameters are supplied, initialise as such + """ + # It is important to specify different variable scopes for the LSTM cells. 
+ with tf.variable_scope("generator_o") as scope: + + W_out_G_initializer = tf.constant_initializer(value=parameters['generator/W_out_G:0']) + b_out_G_initializer = tf.constant_initializer(value=parameters['generator/b_out_G:0']) + try: + scale_out_G_initializer = tf.constant_initializer(value=parameters['generator/scale_out_G:0']) + except KeyError: + scale_out_G_initializer = tf.constant_initializer(value=1) + assert learn_scale + lstm_initializer = tf.constant_initializer(value=parameters['generator/rnn/lstm_cell/weights:0']) + bias_start = parameters['generator/rnn/lstm_cell/biases:0'] + + W_out_G = tf.get_variable(name='W_out_G', shape=[hidden_units_g, num_generated_features], initializer=W_out_G_initializer) + b_out_G = tf.get_variable(name='b_out_G', shape=num_generated_features, initializer=b_out_G_initializer) + scale_out_G = tf.get_variable(name='scale_out_G', shape=1, initializer=scale_out_G_initializer, trainable=False) + + inputs = z + + cell = LSTMCell(num_units=hidden_units_g, + state_is_tuple=True, + initializer=lstm_initializer, + bias_start=bias_start, + reuse=reuse) + rnn_outputs, rnn_states = tf.nn.dynamic_rnn( + cell=cell, + dtype=tf.float32, + sequence_length=[seq_length] * batch_size, + inputs=inputs) + rnn_outputs_2d = tf.reshape(rnn_outputs, [-1, hidden_units_g]) + logits_2d = tf.matmul(rnn_outputs_2d, W_out_G) + b_out_G #out put weighted sum + output_2d = tf.nn.tanh(logits_2d) # logits operation [-1, 1] + output_3d = tf.reshape(output_2d, [-1, seq_length, num_generated_features]) + return output_3d + + +def discriminator_o(x, hidden_units_d, reuse=False, parameters=None): + + with tf.variable_scope("discriminator_0") as scope: + + W_out_D_initializer = tf.constant_initializer(value=parameters['discriminator/W_out_D:0']) + b_out_D_initializer = tf.constant_initializer(value=parameters['discriminator/b_out_D:0']) + + W_out_D = tf.get_variable(name='W_out_D', shape=[hidden_units_d, 1], initializer=W_out_D_initializer) + b_out_D = tf.get_variable(name='b_out_D', shape=1, initializer=b_out_D_initializer) + + + inputs = x + + cell = tf.contrib.rnn.LSTMCell(num_units=hidden_units_d, state_is_tuple=True, reuse=reuse) + + rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell=cell, dtype=tf.float32, inputs=inputs) + + + logits = tf.einsum('ijk,km', rnn_outputs, W_out_D) + b_out_D # output weighted sum + + output = tf.nn.sigmoid(logits) # y = 1 / (1 + exp(-x)). output activation [0, 1]. Probability?? + # sigmoid output ([0,1]), Probability? 
+ + return output, logits + + +def invert(settings, samples, para_path, g_tolerance=None, e_tolerance=0.1, + n_iter=None, max_iter=10000, heuristic_sigma=None): + """ + Return the latent space points corresponding to a set of a samples + ( from gradient descent ) + Note: this function is designed for ONE sample generation + """ + # num_samples = samples.shape[0] + # cast samples to float32 + + samples = np.float32(samples) + + # get the model + # if settings is a string, assume it's an identifier and load + if type(settings) == str: + settings = json.load(open('./experiments/settings/' + settings + '.txt', 'r')) + + + + # print('Inverting', 1, 'samples using model', settings['identifier'], 'at epoch', epoch,) + # if not g_tolerance is None: + # print('until gradient norm is below', g_tolerance) + # else: + # print('until error is below', e_tolerance) + + + # get parameters + parameters = model.load_parameters(para_path) + # # assertions + # assert samples.shape[2] == settings['num_generated_features'] + # create VARIABLE Z + Z = tf.get_variable(name='Z', shape=[1, settings['seq_length'], + settings['latent_dim']], + initializer=tf.random_normal_initializer()) + # create outputs + + G_samples = generator_o(Z, settings['hidden_units_g'], settings['seq_length'], + 1, settings['num_generated_features'], + reuse=False, parameters=parameters) + # generator_vars = ['hidden_units_g', 'seq_length', 'batch_size', 'num_generated_features', 'cond_dim', 'learn_scale'] + # generator_settings = dict((k, settings[k]) for k in generator_vars) + # G_samples = model.generator(Z, **generator_settings, reuse=True) + + fd = None + + # define loss mmd-based loss + if heuristic_sigma is None: + heuristic_sigma = mmd.median_pairwise_distance_o(samples) # this is noisy + print('heuristic_sigma:', heuristic_sigma) + samples = tf.reshape(samples, [1, settings['seq_length'], settings['num_generated_features']]) + Kxx, Kxy, Kyy, wts = mmd._mix_rbf_kernel(G_samples, samples, sigmas=tf.constant(value=heuristic_sigma, shape=(1, 1))) + similarity_per_sample = tf.diag_part(Kxy) + reconstruction_error_per_sample = 1 - similarity_per_sample + # reconstruction_error_per_sample = tf.reduce_sum((tf.nn.l2_normalize(G_samples, dim=1) - tf.nn.l2_normalize(samples, dim=1))**2, axis=[1,2]) + similarity = tf.reduce_mean(similarity_per_sample) + reconstruction_error = 1 - similarity + # updater + # solver = tf.train.AdamOptimizer().minimize(reconstruction_error_per_sample, var_list=[Z]) + # solver = tf.train.RMSPropOptimizer(learning_rate=500).minimize(reconstruction_error, var_list=[Z]) + solver = tf.train.RMSPropOptimizer(learning_rate=0.1).minimize(reconstruction_error_per_sample, var_list=[Z]) + # solver = tf.train.MomentumOptimizer(learning_rate=0.1, momentum=0.9).minimize(reconstruction_error_per_sample, var_list=[Z]) + + grad_Z = tf.gradients(reconstruction_error_per_sample, Z)[0] + grad_per_Z = tf.norm(grad_Z, axis=(1, 2)) + grad_norm = tf.reduce_mean(grad_per_Z) + # solver = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(reconstruction_error, var_list=[Z]) + print('Finding latent state corresponding to samples...') + + sess = tf.Session() + sess.run(tf.global_variables_initializer()) + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + error = sess.run(reconstruction_error, feed_dict=fd) + g_n = sess.run(grad_norm, feed_dict=fd) + # print(g_n) + i = 0 + if not n_iter is None: + while i < n_iter: + _ = sess.run(solver, feed_dict=fd) + error = sess.run(reconstruction_error, feed_dict=fd) + 
i += 1 + else: + if not g_tolerance is None: + while g_n > g_tolerance: + _ = sess.run(solver, feed_dict=fd) + error, g_n = sess.run([reconstruction_error, grad_norm], feed_dict=fd) + i += 1 + print(error, g_n) + if i > max_iter: + break + else: + while np.abs(error) > e_tolerance: + _ = sess.run(solver, feed_dict=fd) + error = sess.run(reconstruction_error, feed_dict=fd) + i += 1 + # print(error) + if i > max_iter: + break + Zs = sess.run(Z, feed_dict=fd) + Gs = sess.run(G_samples, feed_dict={Z: Zs}) + error_per_sample = sess.run(reconstruction_error_per_sample, feed_dict=fd) + print('Z found in', i, 'iterations with final reconstruction error of', error) + tf.reset_default_graph() + + return Gs, Zs, error_per_sample, heuristic_sigma + + +def dis_trained_model(settings, samples, para_path): + """ + Return the discrimination results of num_samples testing samples from a trained model described by settings dict + Note: this function is designed for ONE sample discrimination + """ + + # if settings is a string, assume it's an identifier and load + if type(settings) == str: + settings = json.load(open('./experiments/settings/' + settings + '.txt', 'r')) + + num_samples = samples.shape[0] + samples = np.float32(samples) + num_variables = samples.shape[2] + # samples = np.reshape(samples, [1, settings['seq_length'], settings['num_generated_features']]) + + # get the parameters, get other variables + # parameters = model.load_parameters(settings['sub_id'] + '_' + str(settings['seq_length']) + '_' + str(epoch)) + parameters = model.load_parameters(para_path) + # settings['sub_id'] + '_' + str(settings['seq_length']) + '_' + str(epoch) + + # create placeholder, T samples + # T = tf.placeholder(tf.float32, [settings['batch_size'], settings['seq_length'], settings['num_generated_features']]) + + T = tf.placeholder(tf.float32, [num_samples, settings['seq_length'], num_variables]) + + # create the discriminator (GAN) + # normal GAN + D_t, L_t = discriminator_o(T, settings['hidden_units_d'], reuse=False, parameters=parameters) + + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + # with tf.device('/gpu:1'): + with tf.Session(config=config) as sess: + sess.run(tf.global_variables_initializer()) + D_t, L_t = sess.run([D_t, L_t], feed_dict={T: samples}) + + tf.reset_default_graph() + return D_t, L_t + +def dis_D_model(settings, samples, para_path): + """ + Return the discrimination results of num_samples testing samples from a trained model described by settings dict + Note: this function is designed for ONE sample discrimination + """ + + # if settings is a string, assume it's an identifier and load + if type(settings) == str: + settings = json.load(open('./experiments/settings/' + settings + '.txt', 'r')) + + # num_samples = samples.shape[0] + samples = np.float32(samples) + samples = np.reshape(samples, [1, settings['seq_length'], settings['num_generated_features']]) + + # get the parameters, get other variables + parameters = model.load_parameters(para_path) + # create placeholder, T samples + + T = tf.placeholder(tf.float32, [1, settings['seq_length'], settings['num_generated_features']]) + + # create the discriminator (GAN or CGAN) + # normal GAN + D_t, L_t = discriminator_o(T, settings['hidden_units_d'], reuse=False, parameters=parameters) + # D_t, L_t = model.discriminator(T, settings['hidden_units_d'], settings['seq_length'], num_samples, reuse=False, + # parameters=parameters, cond_dim=0, c=None, batch_mean=False) + + config = tf.ConfigProto() + config.gpu_options.allow_growth 
= True + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + D_t, L_t = sess.run([D_t, L_t], feed_dict={T: samples}) + + tf.reset_default_graph() + return D_t, L_t \ No newline at end of file diff --git a/data_utils.py b/data_utils.py new file mode 100644 index 0000000..5122361 --- /dev/null +++ b/data_utils.py @@ -0,0 +1,639 @@ +import numpy as np +import pandas as pd +import pdb +import re +from time import time +import json +import random + +import model + +from scipy.spatial.distance import pdist, squareform +from scipy.stats import multivariate_normal, invgamma, mode +from scipy.special import gamma +from scipy.misc.pilutil import imresize +from functools import partial +from math import ceil + +from sklearn.metrics.pairwise import rbf_kernel +from sklearn.preprocessing import MinMaxScaler +from sklearn import preprocessing + + +# --- deal with the SWaT data --- # +def swat(seq_length, seq_step, num_signals, randomize=False): + """ Load and serialise """ + # train = np.load('./data/swat.npy') + # print('Loaded swat from .npy') + train = np.loadtxt(open('./data/swat.csv'), delimiter=',') + print('Loaded swat from .csv') + m, n = train.shape # m=496800, n=52 + for i in range(n - 1): + A = max(train[:, i]) + if A != 0: + train[:, i] /= max(train[:, i]) + # scale from -1 to 1 + train[:, i] = 2 * train[:, i] - 1 + else: + train[:, i] = train[:, i] + + samples = train[21600:, 0:n-1] + labels = train[21600:, n-1] # the last colummn is label + ############################# + # -- choose variable for uni-variate GAN-AD -- # + # samples = samples[:, [1, 8, 18, 28]] + ############################ + # -- apply PCA dimension reduction for multi-variate GAN-AD -- # + from sklearn.decomposition import PCA + # ALL SENSORS IDX + # XS = [0, 1, 5, 6, 7, 8, 16, 17, 18, 25, 26, 27, 28, 33, 34, 35, 36, 37, 38, 39, 40, 41, 44, 45, 46, 47] + # X_n = samples[:, XS] + # X_a = samples_a[:, XS] + # All VARIABLES + X_n = samples + #################################### + ################################### + # -- the best PC dimension is chosen pc=5 -- # + n_components = num_signals + pca = PCA(n_components, svd_solver='full') + pca.fit(X_n) + ex_var = pca.explained_variance_ratio_ + pc = pca.components_ + + # projected values on the principal component + T_n = np.matmul(X_n, pc.transpose(1, 0)) + samples = T_n + + # # only for one-dimensional + # samples = T_n.reshape([samples.shape[0], ]) + ########################################### + ########################################### + # seq_length = 7200 + num_samples = (samples.shape[0]-seq_length)//seq_step + print("num_samples:", num_samples) + print("num_signals:", num_signals) + aa = np.empty([num_samples, seq_length, num_signals]) + bb = np.empty([num_samples, seq_length, 1]) + + for j in range(num_samples): + bb[j, :, :] = np.reshape(labels[(j * seq_step):(j * seq_step + seq_length)], [-1,1]) + for i in range(num_signals): + aa[j, :, i] = samples[(j * seq_step):(j*seq_step + seq_length), i] + + # samples = aa[:, 0:7200:200, :] + # labels = bb[:, 0:7200:200, :] + samples = aa + labels = bb + + return samples, labels + +def swat_birgan(seq_length, seq_step, num_signals, randomize=False): + """ Load and serialise """ + # train = np.load('./data/swat.npy') + # print('Loaded swat from .npy') + train = np.loadtxt(open('./data/swat.csv'), delimiter=',') + print('Loaded swat from .csv') + m, n = train.shape # m=496800, n=52 + for i in range(n - 1): + A = max(train[:, i]) + if A != 0: + train[:, i] /= max(train[:, i]) + # scale from -1 to 1 + 
train[:, i] = 2 * train[:, i] - 1 + else: + train[:, i] = train[:, i] + + samples = train[21600:, 0:n-1] + labels = train[21600:, n-1] # the last colummn is label + ############################# + # # -- choose variable for uni-variate GAN-AD -- # + # # samples = samples[:, [1, 8, 18, 28]] + ########################################### + ########################################### + nn = samples.shape[1] + num_samples = (samples.shape[0]-seq_length)//seq_step + aa = np.empty([num_samples, nn, nn]) + AA = np.empty([seq_length, nn]) + bb = np.empty([num_samples, seq_length, 1]) + + print('Pre-process training data...') + for j in range(num_samples): + # display batch progress + model_bigan.display_batch_progression(j, num_samples) + bb[j, :, :] = np.reshape(labels[(j * seq_step):(j * seq_step + seq_length)], [-1,1]) + for i in range(nn): + AA[:, i] = samples[(j * seq_step):(j * seq_step + seq_length), i] + aa[j, :, :] = np.cov(AA.T) + + samples = aa + labels = bb + + return samples, labels + +def swat_test(seq_length, seq_step, num_signals, randomize=False): + """ Load and serialise """ + # test = np.load('./data/swat_a.npy') + # print('Loaded swat_a from .npy') + test = np.loadtxt(open('./data/swat_a.csv'), delimiter=',') + print('Loaded swat_a from .csv') + m, n = test.shape # m1=449919, n1=52 + for i in range(n - 1): + B = max(test[:, i]) + if B != 0: + test[:, i] /= max(test[:, i]) + # scale from -1 to 1 + test[:, i] = 2 * test[:, i] - 1 + else: + test[:, i] = test[:, i] + + samples = test[:, 0:n - 1] + labels = test[:, n - 1] + idx = np.asarray(list(range(0, m))) # record the idx of each point + ############################# + # -- choose variable for uni-variate GAN-AD -- # + # samples = samples[:, [1,2,3,4]] + # samples_a = samples_a[:, [1,2,3,4]] + ############################ + ############################ + # -- apply PCA dimension reduction for multi-variate GAN-AD -- # + from sklearn.decomposition import PCA + import DR_discriminator as dr + # ALL SENSORS IDX + # XS = [0, 1, 5, 6, 7, 8, 16, 17, 18, 25, 26, 27, 28, 33, 34, 35, 36, 37, 38, 39, 40, 41, 44, 45, 46, 47] + # X_n = samples[:, XS] + # X_a = samples_a[:, XS] + # All VARIABLES + X_a = samples + #################################### + ################################### + # -- the best PC dimension is chosen pc=5 -- # + n_components = num_signals + pca_a = PCA(n_components, svd_solver='full') + pca_a.fit(X_a) + pc_a = pca_a.components_ + # projected values on the principal component + T_a = np.matmul(X_a, pc_a.transpose(1, 0)) + + samples = T_a + # # only for one-dimensional + # samples = T_a.reshape([samples.shape[0], ]) + ########################################### + ########################################### + num_samples_t = (samples.shape[0] - seq_length) // seq_step + aa = np.empty([num_samples_t, seq_length, num_signals]) + bb = np.empty([num_samples_t, seq_length, 1]) + bbb = np.empty([num_samples_t, seq_length, 1]) + + for j in range(num_samples_t): + bb[j, :, :] = np.reshape(labels[(j * seq_step):(j * seq_step + seq_length)], [-1, 1]) + bbb[j, :, :] = np.reshape(idx[(j * seq_step):(j * seq_step + seq_length)], [-1, 1]) + for i in range(num_signals): + aa[j, :, i] = samples[(j * seq_step):(j * seq_step + seq_length), i] + + samples = aa + labels = bb + index = bbb + + return samples, labels, index + + +def swat_birgan_test(seq_length, seq_step, num_signals, randomize=False): + """ Load and serialise """ + # train = np.load('./data/swat.npy') + # print('Loaded swat from .npy') + test = 
np.loadtxt(open('./data/swat_a.csv'), delimiter=',') + print('Loaded swat_a from .csv') + m, n = test.shape # m1=449919, n1=52 + for i in range(n - 1): + B = max(test[:, i]) + if B != 0: + test[:, i] /= max(test[:, i]) + # scale from -1 to 1 + test[:, i] = 2 * test[:, i] - 1 + else: + test[:, i] = test[:, i] + + samples = test[:, 0:n - 1] + labels = test[:, n - 1] + # idx = np.asarray(list(range(0, m))) # record the idx of each point + ############################# + # # -- choose variable for uni-variate GAN-AD -- # + # # samples = samples[:, [1, 8, 18, 28]] + ########################################### + ########################################### + nn = samples.shape[1] + num_samples = (samples.shape[0]-seq_length)//seq_step + aa = np.empty([num_samples, nn, nn]) + AA = np.empty([seq_length, nn]) + bb = np.empty([num_samples, seq_length, 1]) + + print('Pre-process testing data...') + for j in range(num_samples): + # display batch progress + model_bigan.display_batch_progression(j, num_samples) + bb[j, :, :] = np.reshape(labels[(j * seq_step):(j * seq_step + seq_length)], [-1,1]) + for i in range(nn): + AA[:, i] = samples[(j * seq_step):(j * seq_step + seq_length), i] + aa[j, :, :] = np.cov(AA.T) + + samples = aa + labels = bb + + return samples, labels + + +def wadi(seq_length, seq_step, num_signals, randomize=False): + train = np.load('./data/wadi.npy') + print('Loaded wadi from .npy') + m, n = train.shape # m=1048571, n=119 + for i in range(n-1): + A = max(train[:, i]) + if A != 0: + train[:, i] /= max(train[:, i]) + # scale from -1 to 1 + train[:, i] = 2 * train[:, i] - 1 + else: + train[:, i] = train[:, i] + + samples = train[259200:, 0:n-1] # normal + labels = train[259200:, n-1] + ############################# + samples = samples[:, [0, 3, 6, 17]] + # samples = samples[:, 0] + ############################ + # # -- apply PCA dimension reduction for multi-variate GAN-AD -- # + # from sklearn.decomposition import PCA + # import DR_discriminator as dr + # X_n = samples + # #################################### + # ################################### + # # -- the best PC dimension is chosen pc=8 -- # + # n_components = num_signals + # pca = PCA(n_components, svd_solver='full') + # pca.fit(X_n) + # pc = pca.components_ + # # projected values on the principal component + # T_n = np.matmul(X_n, pc.transpose(1, 0)) + # + # samples = T_n + # # # only for one-dimensional + # # samples = T_n.reshape([samples.shape[0], ]) + ########################################### + ########################################### + seq_length = 10800 + num_samples = (samples.shape[0] - seq_length) // seq_step + print("num_samples:", num_samples) + print("num_signals:", num_signals) + aa = np.empty([num_samples, seq_length, num_signals]) + bb = np.empty([num_samples, seq_length, 1]) + + for j in range(num_samples): + bb[j, :, :] = np.reshape(labels[(j * seq_step):(j * seq_step + seq_length)], [-1, 1]) + # aa[j, :, :] = np.reshape(samples[(j * seq_step):(j * seq_step + seq_length)], [-1, 1]) + for i in range(num_signals): + aa[j, :, i] = samples[(j * seq_step):(j * seq_step + seq_length), i] + + samples = aa[:, 0:10800:300, :] + labels = bb[:, 0:10800:300, :] + + return samples, labels + + +def wadi_test(seq_length, seq_step, num_signals, randomize=False): + test = np.load('./data/wadi_a.npy') + print('Loaded wadi_a from .npy') + m, n = test.shape # m1=172801, n1=119 + + for i in range(n - 1): + B = max(test[:, i]) + if B != 0: + test[:, i] /= max(test[:, i]) + # scale from -1 to 1 + test[:, i] = 2 * test[:, i] - 
1 + else: + test[:, i] = test[:, i] + + samples = test[:, 0:n - 1] + labels = test[:, n - 1] + idx = np.asarray(list(range(0, m))) # record the idx of each point + ############################# + ############################ + # -- apply PCA dimension reduction for multi-variate GAN-AD -- # + from sklearn.decomposition import PCA + import DR_discriminator as dr + X_a = samples + #################################### + ################################### + # -- the best PC dimension is chosen pc=8 -- # + n_components = num_signals + pca_a = PCA(n_components, svd_solver='full') + pca_a.fit(X_a) + pc_a = pca_a.components_ + # projected values on the principal component + T_a = np.matmul(X_a, pc_a.transpose(1, 0)) + + samples = T_a + # # only for one-dimensional + # samples = T_a.reshape([samples.shape[0], ]) + ########################################### + ########################################### + num_samples_t = (samples.shape[0] - seq_length) // seq_step + aa = np.empty([num_samples_t, seq_length, num_signals]) + bb = np.empty([num_samples_t, seq_length, 1]) + bbb = np.empty([num_samples_t, seq_length, 1]) + + for j in range(num_samples_t): + bb[j, :, :] = np.reshape(labels[(j * 10):(j * seq_step + seq_length)], [-1, 1]) + bbb[j, :, :] = np.reshape(idx[(j * seq_step):(j * seq_step + seq_length)], [-1, 1]) + for i in range(num_signals): + aa[j, :, i] = samples[(j * seq_step):(j * seq_step + seq_length), i] + + samples = aa + labels = bb + index = bbb + + return samples, labels, index + +def kdd99(seq_length, seq_step, num_signals): + train = np.load('./data/kdd99_train.npy') + print('load kdd99_train from .npy') + m, n = train.shape # m=562387, n=35 + # normalization + for i in range(n - 1): + # print('i=', i) + A = max(train[:, i]) + # print('A=', A) + if A != 0: + train[:, i] /= max(train[:, i]) + # scale from -1 to 1 + train[:, i] = 2 * train[:, i] - 1 + else: + train[:, i] = train[:, i] + + samples = train[:, 0:n - 1] + labels = train[:, n - 1] # the last colummn is label + ############################# + ############################ + # -- apply PCA dimension reduction for multi-variate GAN-AD -- # + from sklearn.decomposition import PCA + X_n = samples + #################################### + ################################### + # -- the best PC dimension is chosen pc=6 -- # + n_components = num_signals + pca = PCA(n_components, svd_solver='full') + pca.fit(X_n) + ex_var = pca.explained_variance_ratio_ + pc = pca.components_ + # projected values on the principal component + T_n = np.matmul(X_n, pc.transpose(1, 0)) + samples = T_n + # # only for one-dimensional + # samples = T_n.reshape([samples.shape[0], ]) + ########################################### + ########################################### + num_samples = (samples.shape[0] - seq_length) // seq_step + aa = np.empty([num_samples, seq_length, num_signals]) + bb = np.empty([num_samples, seq_length, 1]) + + for j in range(num_samples): + bb[j, :, :] = np.reshape(labels[(j * seq_step):(j * seq_step + seq_length)], [-1, 1]) + for i in range(num_signals): + aa[j, :, i] = samples[(j * seq_step):(j * seq_step + seq_length), i] + + samples = aa + labels = bb + + return samples, labels + +def kdd99_test(seq_length, seq_step, num_signals): + test = np.load('./data/kdd99_test.npy') + print('load kdd99_test from .npy') + + m, n = test.shape # m1=494021, n1=35 + + for i in range(n - 1): + B = max(test[:, i]) + if B != 0: + test[:, i] /= max(test[:, i]) + # scale from -1 to 1 + test[:, i] = 2 * test[:, i] - 1 + else: + test[:, i] = test[:, 
i] + + samples = test[:, 0:n - 1] + labels = test[:, n - 1] + idx = np.asarray(list(range(0, m))) # record the idx of each point + ############################# + ############################ + # -- apply PCA dimension reduction for multi-variate GAN-AD -- # + from sklearn.decomposition import PCA + import DR_discriminator as dr + X_a = samples + #################################### + ################################### + # -- the best PC dimension is chosen pc=6 -- # + n_components = num_signals + pca_a = PCA(n_components, svd_solver='full') + pca_a.fit(X_a) + pc_a = pca_a.components_ + # projected values on the principal component + T_a = np.matmul(X_a, pc_a.transpose(1, 0)) + samples = T_a + # # only for one-dimensional + # samples = T_a.reshape([samples.shape[0], ]) + ########################################### + ########################################### + num_samples_t = (samples.shape[0] - seq_length) // seq_step + aa = np.empty([num_samples_t, seq_length, num_signals]) + bb = np.empty([num_samples_t, seq_length, 1]) + bbb = np.empty([num_samples_t, seq_length, 1]) + + for j in range(num_samples_t): + bb[j, :, :] = np.reshape(labels[(j * seq_step):(j * seq_step + seq_length)], [-1, 1]) + bbb[j, :, :] = np.reshape(idx[(j * seq_step):(j * seq_step + seq_length)], [-1, 1]) + for i in range(num_signals): + aa[j, :, i] = samples[(j * seq_step):(j * seq_step + seq_length), i] + + samples = aa + labels = bb + index = bbb + + return samples, labels, index + + +# ############################ data pre-processing ################################# +# --- to do with loading --- # +# --- to do with loading --- # +def get_samples_and_labels(settings): + """ + Parse settings options to load or generate correct type of data, + perform test/train split as necessary, and reform into 'samples' and 'labels' + dictionaries. 
+ """ + if settings['data_load_from']: + data_path = './experiments/data/' + settings['data_load_from'] + '.data.npy' + print('Loading data from', data_path) + samples, pdf, labels = get_data('load', data_path) + train, vali, test = samples['train'], samples['vali'], samples['test'] + train_labels, vali_labels, test_labels = labels['train'], labels['vali'], labels['test'] + del samples, labels + else: + # generate the data + data_vars = ['num_samples', 'num_samples_t','seq_length', 'seq_step', 'num_signals', 'freq_low', + 'freq_high', 'amplitude_low', 'amplitude_high', 'scale', 'full_mnist'] + data_settings = dict((k, settings[k]) for k in data_vars if k in settings.keys()) + samples, pdf, labels = get_data(settings['data'], settings['seq_length'], settings['seq_step'], settings['num_signals'], settings['sub_id']) + if 'multivariate_mnist' in settings and settings['multivariate_mnist']: + seq_length = samples.shape[1] + samples = samples.reshape(-1, int(np.sqrt(seq_length)), int(np.sqrt(seq_length))) + if 'normalise' in settings and settings['normalise']: # TODO this is a mess, fix + print(settings['normalise']) + norm = True + else: + norm = False + if labels is None: + train, vali, test = split(samples, [0.6, 0.2, 0.2], normalise=norm) + train_labels, vali_labels, test_labels = None, None, None + else: + train, vali, test, labels_list = split(samples, [0.6, 0.2, 0.2], normalise=norm, labels=labels) + train_labels, vali_labels, test_labels = labels_list + + labels = dict() + labels['train'], labels['vali'], labels['test'] = train_labels, vali_labels, test_labels + + samples = dict() + samples['train'], samples['vali'], samples['test'] = train, vali, test + + # futz around with labels + # TODO refactor cause this is messy + if 'one_hot' in settings and settings['one_hot'] and not settings['data_load_from']: + if len(labels['train'].shape) == 1: + # ASSUME labels go from 0 to max_val inclusive, find max-val + max_val = int(np.max([labels['train'].max(), labels['test'].max(), labels['vali'].max()])) + # now we have max_val + 1 dimensions + print('Setting cond_dim to', max_val + 1, 'from', settings['cond_dim']) + settings['cond_dim'] = max_val + 1 + print('Setting max_val to 1 from', settings['max_val']) + settings['max_val'] = 1 + + labels_oh = dict() + for (k, v) in labels.items(): + A = np.zeros(shape=(len(v), settings['cond_dim'])) + A[np.arange(len(v)), (v).astype(int)] = 1 + labels_oh[k] = A + labels = labels_oh + else: + assert settings['max_val'] == 1 + # this is already one-hot! + + if 'predict_labels' in settings and settings['predict_labels']: + samples, labels = data_utils.make_predict_labels(samples, labels) + print('Setting cond_dim to 0 from', settings['cond_dim']) + settings['cond_dim'] = 0 + + # update the settings dictionary to update erroneous settings + # (mostly about the sequence length etc. - it gets set by the data!) + settings['seq_length'] = samples['train'].shape[1] + settings['num_samples'] = samples['train'].shape[0] + samples['vali'].shape[0] + samples['test'].shape[0] + settings['num_signals'] = samples['train'].shape[2] + + return samples, pdf, labels + + +def get_data(data_type, seq_length, seq_step, num_signals, sub_id, eval_single, eval_an, data_options=None): + """ + Helper/wrapper function to get the requested data. 
+ """ + print('data_type') + labels = None + index = None + if data_type == 'load': + data_dict = np.load(data_options).item() + samples = data_dict['samples'] + pdf = data_dict['pdf'] + labels = data_dict['labels'] + elif data_type == 'swat': + samples, labels = swat(seq_length, seq_step, num_signals) + elif data_type == 'swat_test': + samples, labels, index = swat_test(seq_length, seq_step, num_signals) + elif data_type == 'kdd99': + samples, labels = kdd99(seq_length, seq_step, num_signals) + elif data_type == 'kdd99_test': + samples, labels, index = kdd99_test(seq_length, seq_step, num_signals) + elif data_type == 'wadi': + samples, labels = wadi(seq_length, seq_step, num_signals) + elif data_type == 'wadi_test': + samples, labels, index = wadi_test(seq_length, seq_step, num_signals) + else: + raise ValueError(data_type) + print('Generated/loaded', len(samples), 'samples from data-type', data_type) + return samples, labels, index + + +def get_batch(samples, batch_size, batch_idx, labels=None): + start_pos = batch_idx * batch_size + end_pos = start_pos + batch_size + if labels is None: + return samples[start_pos:end_pos], None + else: + if type(labels) == tuple: # two sets of labels + assert len(labels) == 2 + return samples[start_pos:end_pos], labels[0][start_pos:end_pos], labels[1][start_pos:end_pos] + else: + assert type(labels) == np.ndarray + return samples[start_pos:end_pos], labels[start_pos:end_pos] + + + +def split(samples, proportions, normalise=False, scale=False, labels=None, random_seed=None): + """ + Return train/validation/test split. + """ + if random_seed != None: + random.seed(random_seed) + np.random.seed(random_seed) + assert np.sum(proportions) == 1 + n_total = samples.shape[0] + n_train = ceil(n_total * proportions[0]) + n_test = ceil(n_total * proportions[2]) + n_vali = n_total - (n_train + n_test) + # permutation to shuffle the samples + shuff = np.random.permutation(n_total) + train_indices = shuff[:n_train] + vali_indices = shuff[n_train:(n_train + n_vali)] + test_indices = shuff[(n_train + n_vali):] + # TODO when we want to scale we can just return the indices + assert len(set(train_indices).intersection(vali_indices)) == 0 + assert len(set(train_indices).intersection(test_indices)) == 0 + assert len(set(vali_indices).intersection(test_indices)) == 0 + # split up the samples + train = samples[train_indices] + vali = samples[vali_indices] + test = samples[test_indices] + # apply the same normalisation scheme to all parts of the split + if normalise: + if scale: raise ValueError(normalise, scale) # mutually exclusive + train, vali, test = normalise_data(train, vali, test) + elif scale: + train, vali, test = scale_data(train, vali, test) + if labels is None: + return train, vali, test + else: + print('Splitting labels...') + if type(labels) == np.ndarray: + train_labels = labels[train_indices] + vali_labels = labels[vali_indices] + test_labels = labels[test_indices] + labels_split = [train_labels, vali_labels, test_labels] + elif type(labels) == dict: + # more than one set of labels! 
(weird case) + labels_split = dict() + for (label_name, label_set) in labels.items(): + train_labels = label_set[train_indices] + vali_labels = label_set[vali_indices] + test_labels = label_set[test_indices] + labels_split[label_name] = [train_labels, vali_labels, test_labels] + else: + raise ValueError(type(labels)) + return train, vali, test, labels_split diff --git a/eugenium_mmd.py b/eugenium_mmd.py new file mode 100644 index 0000000..30dbd75 --- /dev/null +++ b/eugenium_mmd.py @@ -0,0 +1,240 @@ +''' +Code taken from: https://github.com/eugenium/mmd +(modified slightly for efficiency/PEP by Stephanie Hyland) + +Python implementation of MMD and Covariance estimates for Relative MMD + +Some code is based on code from Vincent Van Asch +which is based on matlab code from Arthur Gretton + + +Eugene Belilovsky +eugene.belilovsky@inria.fr +''' +import numpy as np +import scipy as sp +from numpy import sqrt +from sklearn.metrics.pairwise import rbf_kernel +from functools import partial +import pdb + +def my_kernel(X, Y, sigma): + gamma = 1 / (2 * sigma**2) + if len(X.shape) == 2: + X_sqnorms = np.einsum('...i,...i', X, X) + Y_sqnorms = np.einsum('...i,...i', Y, Y) + XY = np.einsum('ia,ja', X, Y) + elif len(X.shape) == 3: + X_sqnorms = np.einsum('...ij,...ij', X, X) + Y_sqnorms = np.einsum('...ij,...ij', Y, Y) + XY = np.einsum('iab,jab', X, Y) + else: + pdb.set_trace() + Kxy = np.exp(-gamma*(X_sqnorms.reshape(-1, 1) - 2*XY + Y_sqnorms.reshape(1, -1))) + return Kxy + +def MMD_3_Sample_Test(X, Y, Z, sigma=-1, SelectSigma=True, computeMMDs=False): + '''Performs the relative MMD test which returns a test statistic for whether Y is closer to X or than Z. + See http://arxiv.org/pdf/1511.04581.pdf + The bandwith heuristic is based on the median heuristic (see Smola,Gretton). + ''' + if(sigma<0): + #Similar heuristics + if SelectSigma: + siz=np.min((1000, X.shape[0])) + sigma1=kernelwidthPair(X[0:siz], Y[0:siz]); + sigma2=kernelwidthPair(X[0:siz], Z[0:siz]); + sigma=(sigma1+sigma2)/2. 
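+            # With SelectSigma, the bandwidth is the average of the two pairwise
+            # median-heuristic widths for (X, Y) and (X, Z); the else-branch below
+            # instead estimates a single width from a pooled subsample of X, Y and Z.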
+ else: + siz=np.min((1000, X.shape[0]*3)) + Zem=np.r_[X[0:siz/3], Y[0:siz/3], Z[0:siz/3]] + sigma=kernelwidth(Zem); + + #kernel = partial(rbf_kernel, gamma=1.0/(sigma**2)) + kernel = partial(my_kernel, sigma=sigma) + #kernel = partial(grbf, sigma=sigma) + + Kyy = kernel(Y, Y) + Kzz = kernel(Z, Z) + Kxy = kernel(X, Y) + Kxz = kernel(X, Z) + + Kyynd = Kyy-np.diag(np.diagonal(Kyy)) + Kzznd = Kzz-np.diag(np.diagonal(Kzz)) + m = Kxy.shape[0]; + n = Kyy.shape[0]; + r = Kzz.shape[0]; + + + u_yy=np.sum(Kyynd)*( 1./(n*(n-1)) ) + u_zz=np.sum(Kzznd)*( 1./(r*(r-1)) ) + u_xy=np.sum(Kxy)/(m*n) + u_xz=np.sum(Kxz)/(m*r) + #Compute the test statistic + t=u_yy - 2.*u_xy - (u_zz-2.*u_xz) + Diff_Var, Diff_Var_z2, data=MMD_Diff_Var(Kyy, Kzz, Kxy, Kxz) + + pvalue=sp.stats.norm.cdf(-t/np.sqrt((Diff_Var))) + # pvalue_z2=sp.stats.norm.cdf(-t/np.sqrt((Diff_Var_z2))) + tstat=t/sqrt(Diff_Var) + + if(computeMMDs): + Kxx = kernel(X, X) + Kxxnd = Kxx-np.diag(np.diagonal(Kxx)) + u_xx=np.sum(Kxxnd)*( 1./(m*(m-1)) ) + MMDXY=u_xx+u_yy-2.*u_xy + MMDXZ=u_xx+u_zz-2.*u_xz + else: + MMDXY=None + MMDXZ=None + return pvalue, tstat, sigma, MMDXY, MMDXZ + +def MMD_Diff_Var(Kyy, Kzz, Kxy, Kxz): + ''' + Compute the variance of the difference statistic MMDXY-MMDXZ + See http://arxiv.org/pdf/1511.04581.pdf Appendix for derivations + ''' + m = Kxy.shape[0]; + n = Kyy.shape[0]; + r = Kzz.shape[0]; + + + Kyynd = Kyy-np.diag(np.diagonal(Kyy)); + Kzznd = Kzz-np.diag(np.diagonal(Kzz)); + + u_yy=np.sum(Kyynd)*( 1./(n*(n-1)) ); + u_zz=np.sum(Kzznd)*( 1./(r*(r-1)) ); + u_xy=np.sum(Kxy)/(m*n); + u_xz=np.sum(Kxz)/(m*r); + + #compute zeta1 + t1=(1./n**3)*np.sum(Kyynd.T.dot(Kyynd))-u_yy**2; + t2=(1./(n**2*m))*np.sum(Kxy.T.dot(Kxy))-u_xy**2; + t3=(1./(n*m**2))*np.sum(Kxy.dot(Kxy.T))-u_xy**2; + t4=(1./r**3)*np.sum(Kzznd.T.dot(Kzznd))-u_zz**2; + t5=(1./(r*m**2))*np.sum(Kxz.dot(Kxz.T))-u_xz**2; + t6=(1./(r**2*m))*np.sum(Kxz.T.dot(Kxz))-u_xz**2; + t7=(1./(n**2*m))*np.sum(Kyynd.dot(Kxy.T))-u_yy*u_xy; + t8=(1./(n*m*r))*np.sum(Kxy.T.dot(Kxz))-u_xz*u_xy; + t9=(1./(r**2*m))*np.sum(Kzznd.dot(Kxz.T))-u_zz*u_xz; + + zeta1=(t1+t2+t3+t4+t5+t6-2.*(t7+t8+t9)); + + zeta2=(1/m/(m-1))*np.sum((Kyynd-Kzznd-Kxy.T-Kxy+Kxz+Kxz.T)**2)-(u_yy - 2.*u_xy - (u_zz-2.*u_xz))**2; + + + data=dict({'t1':t1, + 't2':t2, + 't3':t3, + 't4':t4, + 't5':t5, + 't6':t6, + 't7':t7, + 't8':t8, + 't9':t9, + 'zeta1':zeta1, + 'zeta2':zeta2, + }) + #TODO more precise version for zeta2 + # xx=(1/m^2)*sum(sum(Kxxnd.*Kxxnd))-u_xx^2; + # yy=(1/n^2)*sum(sum(Kyynd.*Kyynd))-u_yy^2; + #xy=(1/(n*m))*sum(sum(Kxy.*Kxy))-u_xy^2; + #xxy=(1/(n*m^2))*sum(sum(Kxxnd*Kxy))-u_xx*u_xy; + #yyx=(1/(n^2*m))*sum(sum(Kyynd*Kxy'))-u_yy*u_xy; + #zeta2=(xx+yy+xy+xy-2*(xxy+xxy +yyx+yyx)) + + + Var=(4.*(m-2)/(m*(m-1)))*zeta1; + Var_z2=Var+(2./(m*(m-1)))*zeta2; + + return Var, Var_z2, data +def grbf(x1, x2, sigma): + '''Calculates the Gaussian radial base function kernel''' + n, nfeatures = x1.shape + m, mfeatures = x2.shape + + k1 = np.sum((x1*x1), 1) + q = np.tile(k1, (m, 1)).transpose() + del k1 + + k2 = np.sum((x2*x2), 1) + r = np.tile(k2.T, (n, 1)) + del k2 + + h = q + r + del q, r + + # The norm + h = h - 2*np.dot(x1, x2.transpose()) + h = np.array(h, dtype=float) + + return np.exp(-1.*h/(2.*pow(sigma, 2))) + + +def kernelwidthPair(x1, x2): + '''Implementation of the median heuristic. See Gretton 2012 + Pick sigma such that the exponent of exp(- ||x-y|| / (2*sigma2)), + in other words ||x-y|| / (2*sigma2), equals 1 for the median distance x + and y of all distances between points from both data sets X and Y. 
+ ''' + n, nfeatures = x1.shape + m, mfeatures = x2.shape + + k1 = np.sum((x1*x1), 1) + q = np.tile(k1, (m, 1)).transpose() + del k1 + + k2 = np.sum((x2*x2), 1) + r = np.tile(k2, (n, 1)) + del k2 + + h= q + r + del q, r + + # The norm + h = h - 2*np.dot(x1, x2.transpose()) + h = np.array(h, dtype=float) + + mdist = np.median([i for i in h.flat if i]) + + sigma = sqrt(mdist/2.0) + if not sigma: sigma = 1 + + return sigma +def kernelwidth(Zmed): + '''Alternative median heuristic when we cant partition the points + ''' + m= Zmed.shape[0] + k1 = np.expand_dims(np.sum((Zmed*Zmed), axis=1), 1) + q = np.kron(np.ones((1, m)), k1) + r = np.kron(np.ones((m, 1)), k1.T) + del k1 + + h= q + r + del q, r + + # The norm + h = h - 2.*Zmed.dot(Zmed.T) + h = np.array(h, dtype=float) + + mdist = np.median([i for i in h.flat if i]) + + sigma = sqrt(mdist/2.0) + if not sigma: sigma = 1 + + return sigma + + + +def MMD_unbiased(Kxx, Kyy, Kxy): +#The estimate when distribution of x is not equal to y + m = Kxx.shape[0] + n = Kyy.shape[0] + + t1 = (1./(m*(m-1)))*np.sum(Kxx - np.diag(np.diagonal(Kxx))) + t2 = (2./(m*n)) * np.sum(Kxy) + t3 = (1./(n*(n-1)))* np.sum(Kyy - np.diag(np.diagonal(Kyy))) + + MMDsquared = (t1-t2+t3) + + return MMDsquared diff --git a/eval.py b/eval.py new file mode 100644 index 0000000..be41e7d --- /dev/null +++ b/eval.py @@ -0,0 +1,728 @@ +#!/usr/bin/env ipython +# Evaluation of models +# + +import json +import pdb +import numpy as np +import pandas as pd +from eugenium_mmd import MMD_3_Sample_Test +from scipy.stats import ks_2samp +import mmd +from sklearn.svm import SVC +from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score, roc_auc_score, average_precision_score +from sklearn.ensemble import RandomForestClassifier +import sklearn +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt + +# for keras +import keras +from keras.models import Sequential +from keras.layers import Dense, Dropout, Flatten +from keras.layers import Conv2D, MaxPooling2D +from keras.backend import clear_session + +import model +import data_utils +import plotting + +import pickle + +def assert_same_data(A, B): + # case 0, both loaded + if A['data'] == 'load' and B['data'] == 'load': + assert A['data_load_from'] == B['data_load_from'] + data_path = './experiments/data/' + A['data_load_from'] + elif A['data'] == 'load' and (not B['data'] == 'load'): + assert A['data_load_from'] == B['identifier'] + data_path = './experiments/data/' + A['data_load_from'] + elif (not A['data'] == 'load') and B['data'] == 'load': + assert B['data_load_from'] == A['identifier'] + data_path = './experiments/data/' + A['identifier'] + else: + raise ValueError(A['data'], B['data']) + return data_path + +def model_memorisation(identifier, epoch, max_samples=2000, tstr=False): + """ + Compare samples from a model against training set and validation set in mmd + """ + if tstr: + print('Loading data from TSTR experiment (not sampling from model)') + # load pre-generated samples + synth_data = np.load('./experiments/tstr/' + identifier + '_' + str(epoch) + '.data.npy').item() + model_samples = synth_data['samples'] + synth_labels = synth_data['labels'] + # load real data used in that experiment + real_data = np.load('./experiments/data/' + identifier + '.data.npy').item() + real_samples = real_data['samples'] + train = real_samples['train'] + test = real_samples['test'] + n_samples = test.shape[0] + if model_samples.shape[0] > n_samples: + model_samples = 
np.random.permutation(model_samples)[:n_samples] + print('Data loaded successfully!') + else: + if identifier == 'cristobal_eICU': + model_samples = pickle.load(open('REDACTED', 'rb')) + samples, labels = data_utils.eICU_task() + train = samples['train'].reshape(-1,16,4) + vali = samples['vali'].reshape(-1,16,4) + test = samples['test'].reshape(-1,16,4) + #train_targets = labels['train'] + #vali_targets = labels['vali'] + #test_targets = labels['test'] + train, vali, test = data_utils.scale_data(train, vali, test) + n_samples = test.shape[0] + if n_samples > max_samples: + n_samples = max_samples + test = np.random.permutation(test)[:n_samples] + if model_samples.shape[0] > n_samples: + model_samples = np.random.permutation(model_samples)[:n_samples] + elif identifier == 'cristobal_MNIST': + the_dir = 'REDACTED' + # pick a random one + which = np.random.choice(['NEW_OK_', '_r4', '_r5', '_r6', '_r7']) + model_samples, model_labels = pickle.load(open(the_dir + 'synth_mnist_minist_cdgan_1_2_100_multivar_14_nolr_rdim3_0_2_' + which + '_190.pk', 'rb')) + # get test and train... + # (generated with fixed seed...) + mnist_resized_dim = 14 + samples, labels = data_utils.load_resized_mnist(mnist_resized_dim) + proportions = [0.6, 0.2, 0.2] + train, vali, test, labels_split = data_utils.split(samples, labels=labels, random_seed=1, proportions=proportions) + np.random.seed() + train = train.reshape(-1, 14, 14) + test = test.reshape(-1, 14, 14) + vali = vali.reshape(-1, 14, 14) + n_samples = test.shape[0] + if n_samples > max_samples: + n_samples = max_samples + test = np.random.permutation(test)[:n_samples] + if model_samples.shape[0] > n_samples: + model_samples = np.random.permutation(model_samples)[:n_samples] + else: + settings = json.load(open('./experiments/settings/' + identifier + '.txt', 'r')) + # get the test, train sets + data = np.load('./experiments/data/' + identifier + '.data.npy').item() + train = data['samples']['train'] + test = data['samples']['test'] + n_samples = test.shape[0] + if n_samples > max_samples: + n_samples = max_samples + test = np.random.permutation(test)[:n_samples] + model_samples = model.sample_trained_model(settings, epoch, n_samples) + all_samples = np.vstack([train, test, model_samples]) + heuristic_sigma = mmd.median_pairwise_distance(all_samples) + print('heuristic sigma:', heuristic_sigma) + pvalue, tstat, sigma, MMDXY, MMDXZ = MMD_3_Sample_Test(model_samples, test, np.random.permutation(train)[:n_samples], sigma=heuristic_sigma, computeMMDs=False) + #pvalue, tstat, sigma, MMDXY, MMDXZ = MMD_3_Sample_Test(model_samples, np.random.permutation(train)[:n_samples], test, sigma=heuristic_sigma, computeMMDs=False) +# if pvalue < 0.05: +# print('At confidence level 0.05, we reject the null hypothesis that MMDXY <= MMDXZ, and conclude that the test data has a smaller MMD with the true data than the generated data') + # the function takes (X, Y, Z) as its first arguments, it's testing if MMDXY (i.e. 
MMD between model and train) is less than MMDXZ (MMd between model and test) +# else: +# print('We have failed to reject the null hypothesis that MMDXY <= MMDXZ, and cannot conclu#de that the test data has a smaller MMD with the true data than the generated data') + return pvalue, tstat, sigma + +def model_comparison(identifier_A, identifier_B, epoch_A=99, epoch_B=99): + """ + Compare two models using relative MMD test + """ + # make sure they used the same data + settings_A = json.load(open('./experiments/settings/' + identifier_A + '.txt', 'r')) + settings_B = json.load(open('./experiments/settings/' + identifier_B + '.txt', 'r')) + data_path = assert_same_data(settings_A, settings_B) + # now load the data + data = np.load(data_path + '.data.npy').item()['samples']['vali'] + n_samples = data.shape[0] + A_samples = model.sample_trained_model(settings_A, epoch_A, n_samples) + B_samples = model.sample_trained_model(settings_B, epoch_B, n_samples) + # do the comparison + # TODO: support multiple signals + ## some notes about this test: + ## MMD_3_Sample_Test(X, Y, Z) tests the hypothesis that Px is closer to Pz than Py + ## that is, test the null hypothesis H0: + ## MMD(F, Px, Py) <= MMD(F, Px, Pz) + ## versus the alternate hypothesis: + ## MMD(F, Px, Py) > MMD(F, Px, Pz) + ## at significance level that we select later (just the threshold on the p-value) + pvalue, tstat, sigma, MMDXY, MMDXZ = MMD_3_Sample_Test(data[:, :, 0], A_samples[:, :, 0], B_samples[:, :, 0], computeMMDs=True) + print(pvalue, tstat, sigma) + if pvalue < 0.05: + print('At confidence level 0.05, we reject the null hypothesis that MMDXY <= MMDXZ, and conclude that', identifier_B, 'has a smaller MMD with the true data than', identifier_A) + else: + print('We have failed to reject the null hypothesis that MMDXY <= MMDXZ, and cannot conclude that', identifier_B, 'has a smaller MMD with the true data than', identifier_A) + return pvalue, tstat, sigma, MMDXY, MMDXZ + +# --- to do with reconstruction --- # + +def get_reconstruction_errors(identifier, epoch, g_tolerance=0.05, max_samples=1000, rerun=False, tstr=False): + """ + Get the reconstruction error of every point in the training set of a given + experiment. 
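+
+ Arguments as used below: g_tolerance is forwarded to model.invert (via
+ error_per_sample); max_samples caps how many train/test samples are inverted;
+ rerun=True ignores any cached .reconstruction_errors.npy file; tstr=True loads
+ pre-generated TSTR samples instead of sampling from the trained model.
+
+ Hedged usage sketch (assumes the corresponding settings/data files exist;
+ 'my_experiment' is a placeholder identifier):
+ get_reconstruction_errors('my_experiment', epoch=99, g_tolerance=0.05)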
+ """ + settings = json.load(open('./experiments/settings/' + identifier + '.txt', 'r')) + if settings['data_load_from']: + data_dict = np.load('./experiments/data/' + settings['data_load_from'] + '.data.npy').item() + else: + data_dict = np.load('./experiments/data/' + identifier + '.data.npy').item() + samples = data_dict['samples'] + train = samples['train'] + vali = samples['vali'] + test = samples['test'] + labels = data_dict['labels'] + train_labels, test_labels, synth_labels, vali_labels = None, None, None, None + try: + if rerun: + raise FileNotFoundError + errors = np.load('./experiments/eval/' + identifier + '_' + str(epoch) + '_' + str(g_tolerance) + '.reconstruction_errors.npy').item() + train_errors = errors['train'] + test_errors = errors['test'] + generated_errors = errors['generated'] + noisy_errors = errors['noisy'] + print('Loaded precomputed errors') + except FileNotFoundError: + if tstr: + synth_data = np.load('./experiments/tstr/' + identifier + '_' + str(epoch) + '.data.npy').item() + generated = synth_data['samples'] + synth_labels = synth_data['labels'] + train_labels = labels['train'] + test_labels = labels['test'] + vali_labels = labels['vali'] + else: + # generate new data + n_eval = 500 + # generate "easy" samples from the distribution + generated = model.sample_trained_model(settings, epoch, n_eval) + # generate "hard' random samples, not from train/test distribution + # TODO: use original validation examples, add noise etc. + ## random_samples = np.random.normal(size=generated.shape) + # random_samples -= np.mean(random_samples, axis=0) + # random_samples += np.mean(vali, axis=0) + # random_samples /= np.std(random_samples, axis=0) + # random_samples *= np.std(vali, axis=0) + + # get all the errors + print('Getting reconstruction errors on train set') + if train.shape[0] > max_samples: + index_subset = np.random.permutation(train.shape[0])[:max_samples] + train = train[index_subset] + if train_labels is not None: + train_labels = train_labels[index_subset] + train_errors = error_per_sample(identifier, epoch, train, n_rep=5, g_tolerance=g_tolerance, C_samples=train_labels) + print('Getting reconstruction errors on test set') + if test.shape[0] > max_samples: + index_subset = np.random.permutation(test.shape[0])[:max_samples] + test = test[index_subset] + if test_labels is not None: + test_labels = test_labels[index_subset] + test_errors = error_per_sample(identifier, epoch, test, n_rep=5, g_tolerance=g_tolerance, C_samples=test_labels) + D_test, p_test = ks_2samp(train_errors, test_errors) + print('KS statistic and p-value for train v. test erors:', D_test, p_test) + pdb.set_trace() + print('Getting reconstruction errors on generated set') + generated_errors = error_per_sample(identifier, epoch, generated, n_rep=5, g_tolerance=g_tolerance, C_samples=synth_labels) + D_gen, p_gen = ks_2samp(generated_errors, train_errors) + print('KS statistic and p-value for train v. gen erors:', D_gen, p_gen) + D_gentest, p_gentest = ks_2samp(generated_errors, test_errors) + print('KS statistic and p-value for gen v. test erors:', D_gentest, p_gentest) +# print('Getting reconstruction errors on noisy set') +# alpha = 0.5 +# noisy_samples = alpha*vali + (1-alpha)*np.random.permutation(vali) +# noisy_errors = error_per_sample(identifier, epoch, noisy_samples, n_rep=5, g_tolerance=g_tolerance, C_samples=vali_labels) + noisy_errors = None + # save! 
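+ # Cache the per-sample errors so that later calls with rerun=False can load
+ # them from ./experiments/eval/ instead of repeating the inversion.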
+ errors = {'train': train_errors, 'test': test_errors, 'generated': generated_errors, 'noisy': noisy_errors} + np.save('./experiments/eval/' + identifier + '_' + str(epoch) + '_' + str(g_tolerance) + '.reconstruction_errors.npy', errors) + # do two-sample Kolomogorov-Smirnov test for equality + D_test, p_test = ks_2samp(train_errors, test_errors) + print('KS statistic and p-value for train v. test erors:', D_test, p_test) + D_gen, p_gen = ks_2samp(generated_errors, train_errors) + print('KS statistic and p-value for train v. gen erors:', D_gen, p_gen) + D_gentest, p_gentest = ks_2samp(generated_errors, test_errors) + print('KS statistic and p-value for gen v. test erors:', D_gentest, p_gentest) + # visualise distribution of errors for train and test + plotting.reconstruction_errors(identifier + '_' + str(epoch) + '_' + str(g_tolerance), train_errors, test_errors, generated_errors, noisy_errors) + # visualise the "hardest" and "easiest" samples from train + ranking_train = np.argsort(train_errors) + easiest_train = ranking_train[:6] + hardest_train = ranking_train[-6:] + plotting.save_plot_sample(train[easiest_train], epoch, identifier + '_easytrain', n_samples=6, num_epochs=None, ncol=2) + plotting.save_plot_sample(train[hardest_train], epoch, identifier + '_hardtrain', n_samples=6, num_epochs=None, ncol=2) + # visualise the "hardest" and "easiest" samples from random +# ranking_random = np.argsort(noisy_errors) +# easiest_random = ranking_random[:6] +# hardest_random = ranking_random[-6:] +# plotting.save_plot_sample(random_samples[easiest_random], epoch, identifier + '_easyrandom', n_samples=6, num_epochs=None, ncol=2) +# plotting.save_plot_sample(random_samples[hardest_random], epoch, identifier + '_hardrandom', n_samples=6, num_epochs=None, ncol=2) + return True + +def error_per_sample(identifier, epoch, samples, n_rep=3, n_iter=None, g_tolerance=0.025, use_min=True, C_samples=None): + """ + Get (average over a few runs) of the reconstruction error per sample + """ + n_samples = samples.shape[0] + heuristic_sigma = np.float32(mmd.median_pairwise_distance(samples)) + errors = np.zeros(shape=(n_samples, n_rep)) + for rep in range(n_rep): + Z, rep_errors, sigma = model.invert(identifier, epoch, samples, n_iter=n_iter, heuristic_sigma=heuristic_sigma, g_tolerance=g_tolerance, C_samples=C_samples) + errors[:, rep] = rep_errors + # return min, or average? 
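+ # errors has shape (n_samples, n_rep); reduce over the repetitions so that each
+ # input sample gets a single reconstruction error, e.g.
+ # [[0.3, 0.1], [0.5, 0.4]] -> min: [0.1, 0.4], mean: [0.2, 0.45].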
+ if use_min: + errors = np.min(errors, axis=1) + else: + # use mean + errors = np.mean(errors, axis=1) + return errors + +# --- visualisation evaluation --- # + +def view_digit(identifier, epoch, digit, n_samples=6): + """ + Generate a bunch of MNIST digits from a CGAN, view them + """ + settings = json.load(open('./experiments/settings/' + identifier + '.txt', 'r')) + if settings['one_hot']: + assert settings['max_val'] == 1 + assert digit <= settings['cond_dim'] + C_samples = np.zeros(shape=(n_samples, settings['cond_dim'])) + C_samples[:, digit] = 1 + else: + assert settings['cond_dim'] == 1 + assert digit <= settings['max_val'] + C_samples = np.array([digit]*n_samples).reshape(-1, 1) + digit_samples = model.sample_trained_model(settings, epoch, n_samples, Z_samples=None, cond_dim=settings['cond_dim'], C_samples=C_samples) + digit_samples = digit_samples.reshape(n_samples, -1, 1) + # visualise + plotting.save_mnist_plot_sample(digit_samples, digit, identifier + '_' + str(epoch) + '_digit_', n_samples) + return True + +def view_interpolation(identifier, epoch, n_steps=6, input_samples=None, e_tolerance=0.01, sigma=3.29286853021): + """ + If samples: generate interpolation between real points + Else: + Sample two points in the latent space, view a linear interpolation between them. + """ + settings = json.load(open('./experiments/settings/' + identifier + '.txt', 'r')) + if input_samples is None: + # grab two trainng examples + data = np.load('./experiments/data/' + identifier + '.data.npy').item() + train = data['samples']['train'] + input_samples = np.random.permutation(train)[:2] +# Z_sampleA, Z_sampleB = model.sample_Z(2, settings['seq_length'], settings['latent_dim'], +# settings['use_time']) + if sigma is None: + ## gotta get a sigma somehow + sigma = mmd.median_pairwise_distance(train) + print('Calcualted heuristic sigma from training data:', sigma) + Zs, error, _ = model.invert(settings, epoch, input_samples, e_tolerance=e_tolerance) + Z_sampleA, Z_sampleB = Zs + Z_samples = plotting.interpolate(Z_sampleA, Z_sampleB, n_steps=n_steps) + samples = model.sample_trained_model(settings, epoch, Z_samples.shape[0], Z_samples) + # get distances from generated samples to target samples + d_A, d_B = [], [] + for sample in samples: + d_A.append(sample_distance(sample, samples[0], sigma)) + d_B.append(sample_distance(sample, samples[-1], sigma)) + distances = pd.DataFrame({'dA': d_A, 'dB': d_B}) + plotting.save_plot_interpolate(input_samples, samples, epoch, settings['identifier'] + '_epoch' + str(epoch), distances=distances, sigma=sigma) + return True + +def view_latent_vary(identifier, epoch, n_steps=6): + settings = json.load(open('./experiments/settings/' + identifier + '.txt', 'r')) + Z_sample = model.sample_Z(1, settings['seq_length'], settings['latent_dim'], + settings['use_time'])[0] + samples_dim = [] + for dim in range(settings['latent_dim']): + Z_samples_dim = plotting.vary_latent_dimension(Z_sample, dim, n_steps) + samples_dim.append(model.sample_trained_model(settings, epoch, Z_samples_dim.shape[0], Z_samples_dim)) + plotting.save_plot_vary_dimension(samples_dim, epoch, settings['identifier'] + '_varydim', n_dim=settings['latent_dim']) + return True + +def view_reconstruction(identifier, epoch, real_samples, tolerance=1): + """ + Given a set of real samples, find the "closest" latent space points + corresponding to them, generate samples from these, visualise! 
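+ The recovered latent sequences of the first two samples are also visualised
+ (via plotting.visualise_latent) before the reconstructions themselves are plotted.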
+ """ + settings = json.load(open('./experiments/settings/' + identifier + '.txt', 'r')) + Zs, error, sigma = model.invert(settings, epoch, real_samples, tolerance=tolerance) + plotting.visualise_latent(Zs[0], identifier+'_' + str(epoch) + '_0') + plotting.visualise_latent(Zs[1], identifier+'_' + str(epoch) + '_1') + model_samples = model.sample_trained_model(settings, epoch, Zs.shape[0], Zs) + plotting.save_plot_reconstruct(real_samples, model_samples, settings['identifier']) + return True + +def view_fixed(identifier, epoch, n_samples=6, dim=None): + """ What happens when we give the same point at each time step? """ + settings = json.load(open('./experiments/settings/' + identifier + '.txt', 'r')) + Z_samples = model.sample_Z(n_samples, settings['seq_length'], settings['latent_dim'], + settings['use_time']) + # now, propagate forward the value at time 0 (which time doesn't matter) + for i in range(1, settings['seq_length']): + if dim is None: + Z_samples[:, i, :] = Z_samples[:, 0, :] + else: + Z_samples[:, i, dim] = Z_samples[:, 0, dim] + # now generate + samples = model.sample_trained_model(settings, epoch, n_samples, Z_samples) + # now visualise + plotting.save_plot_sample(samples, epoch, identifier + '_fixed', n_samples) + return True + +def view_params(identifier, epoch): + """ Visualise weight matrices in the GAN """ + settings = json.load(open('./experiments/settings/' + identifier + '.txt', 'r')) + parameters = model.load_parameters(identifier + '_' + str(epoch)) + plotting.plot_parameters(parameters, identifier + '_' + str(epoch)) + return True + +# --- to do with samples --- # + +def sample_distance(sampleA, sampleB, sigma): + """ + I know this isn't the best distance measure, alright. + """ + # RBF! + gamma = 1 / (2 * sigma**2) + similarity = np.exp(-gamma*(np.linalg.norm(sampleA - sampleB)**2)) + distance = 1 - similarity + return distance + +### --- TSTR ---- ### + +def train_CNN(train_X, train_Y, vali_X, vali_Y, test_X): + """ + Train a CNN (code copied/adapted from Cristobal's mnist_keras_trts_0_2) + (ONLY MNIST, ONLY 14x14) + (ONLY DIGITS UP TO 3) + """ + print('Training CNN!') + input_shape = (14,14,1) + batch_size = 128 + num_classes = 3 + epochs = 1000 + + m = Sequential() + m.add(Conv2D(16, kernel_size=(3, 3), + activation='relu', + input_shape=input_shape)) + m.add(Conv2D(32, (3, 3), activation='relu')) + m.add(MaxPooling2D(pool_size=(2, 2))) + m.add(Dropout(0.25)) + m.add(Flatten()) + m.add(Dense(128, activation='relu')) + m.add(Dropout(0.5)) + m.add(Dense(num_classes, activation='softmax')) + + m.compile(loss=keras.losses.categorical_crossentropy, + optimizer=keras.optimizers.Adadelta(), + metrics=['accuracy']) + + earlyStopping=keras.callbacks.EarlyStopping(monitor='val_loss', patience=0, verbose=1, mode='auto') + m.fit(np.expand_dims(train_X, axis=-1), train_Y, + batch_size=batch_size, + epochs=epochs, + verbose=1, + validation_data=(np.expand_dims(vali_X, axis=-1), vali_Y), + callbacks=[earlyStopping]) + test_predictions = m.predict(np.expand_dims(test_X, axis=-1)) + return test_predictions + +def TSTR_mnist(identifier, epoch, generate=True, duplicate_synth=1, vali=True, CNN=False, reverse=False): + """ + Either load or generate synthetic training, real test data... 
+ Load synthetic training, real test data, do multi-class SVM + (basically just this: http://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html) + + If reverse = True: do TRTS + """ + print('Running TSTR on', identifier, 'at epoch', epoch) + if vali: + test_set = 'vali' + else: + test_set = 'test' + if generate: + data = np.load('./experiments/data/' + identifier + '.data.npy').item() + samples = data['samples'] + train_X = samples['train'] + test_X = samples[test_set] + labels = data['labels'] + train_Y = labels['train'] + test_Y = labels[test_set] + # now sample from the model + synth_Y = np.tile(train_Y, [duplicate_synth, 1]) + synth_X = model.sample_trained_model(identifier, epoch, num_samples=synth_Y.shape[0], C_samples=synth_Y) + # for use in TRTS + synth_testX = model.sample_trained_model(identifier, epoch, num_samples=test_Y.shape[0], C_samples=test_Y) + synth_data = {'samples': synth_X, 'labels': synth_Y, 'test_samples': synth_testX, 'test_labels': test_Y} + np.save('./experiments/tstr/' + identifier + '_' + str(epoch) + '.data.npy', synth_data) + else: + print('Loading synthetic data from pre-sampled model') + exp_data = np.load('./experiments/tstr/' + identifier + '_' + str(epoch) + '.data.npy').item() + test_X, test_Y = exp_data['test_data'], exp_data['test_labels'] + train_X, train_Y = exp_data['train_data'], exp_data['train_labels'] + synth_X, synth_Y = exp_data['synth_data'], exp_data['synth_labels'] + if reverse: + which_setting = 'trts' + print('Swapping synthetic test set in for real, to do TRTS!') + test_X = synth_testX + else: + print('Doing normal TSTR') + which_setting = 'tstr' + # make classifier + if not CNN: + model_choice = 'RF' + # if multivariate, reshape + if len(test_X.shape) == 3: + test_X = test_X.reshape(test_X.shape[0], -1) + if len(train_X.shape) == 3: + train_X = train_X.reshape(train_X.shape[0], -1) + if len(synth_X.shape) == 3: + synth_X = synth_X.reshape(synth_X.shape[0], -1) + # if one hot, fix + if len(synth_Y.shape) > 1 and not synth_Y.shape[1] == 1: + synth_Y = np.argmax(synth_Y, axis=1) + train_Y = np.argmax(train_Y, axis=1) + test_Y = np.argmax(test_Y, axis=1) + # random forest + #synth_classifier = SVC(gamma=0.001) + #real_classifier = SVC(gamma=0.001) + synth_classifier = RandomForestClassifier(n_estimators=500) + real_classifier = RandomForestClassifier(n_estimators=500) + # fit + real_classifier.fit(train_X, train_Y) + synth_classifier.fit(synth_X, synth_Y) + # test on real + synth_predY = synth_classifier.predict(test_X) + real_predY = real_classifier.predict(test_X) + else: + model_choice = 'CNN' + synth_predY = train_CNN(synth_X, synth_Y, samples['vali'], labels['vali'], test_X) + clear_session() + real_predY = train_CNN(train_X, train_Y, samples['vali'], labels['vali'], test_X) + clear_session() + # CNN setting is all 'one-hot' + test_Y = np.argmax(test_Y, axis=1) + synth_predY = np.argmax(synth_predY, axis=1) + real_predY = np.argmax(real_predY, axis=1) + + # report on results + synth_prec, synth_recall, synth_f1, synth_support = precision_recall_fscore_support(test_Y, synth_predY, average='weighted') + synth_accuracy = accuracy_score(test_Y, synth_predY) + synth_auprc = 'NaN' + synth_auroc = 'NaN' + synth_scores = [synth_prec, synth_recall, synth_f1, synth_accuracy, synth_auprc, synth_auroc] + real_prec, real_recall, real_f1, real_support = precision_recall_fscore_support(test_Y, real_predY, average='weighted') + real_accuracy = accuracy_score(test_Y, real_predY) + real_auprc = 'NaN' + real_auroc = 
'NaN' + real_scores = [real_prec, real_recall, real_f1, real_accuracy, real_auprc, real_auroc] + + all_scores = synth_scores + real_scores + + if vali: + report_file = open('./experiments/tstr/vali.' + which_setting + '_report.v3.csv', 'a') + report_file.write('mnist,' + identifier + ',' + model_choice + ',' + str(epoch) + ',' + ','.join(map(str, all_scores)) + '\n') + report_file.close() + else: + report_file = open('./experiments/tstr/' + which_setting + '_report.v3.csv', 'a') + report_file.write('mnist,' + identifier + ',' + model_choice + ',' + str(epoch) + ',' + ','.join(map(str, all_scores)) + '\n') + report_file.close() + # visualise results + try: + plotting.view_mnist_eval(identifier + '_' + str(epoch), train_X, train_Y, synth_X, synth_Y, test_X, test_Y, synth_predY, real_predY) + except ValueError: + print('PLOTTING ERROR') + pdb.set_trace() + print(classification_report(test_Y, synth_predY)) + print(classification_report(test_Y, real_predY)) + return synth_f1, real_f1 + +def TSTR_eICU(identifier, epoch, generate=True, vali=True, CNN=False, do_OR=False, duplicate_synth=1, reverse=False): + """ + """ + if vali: + test_set = 'vali' + else: + test_set = 'test' + data = np.load('./experiments/data/' + identifier + '.data.npy').item() + samples = data['samples'] + train_X = samples['train'] + test_X = samples[test_set] + labels = data['labels'] + train_Y = labels['train'] + test_Y = labels[test_set] + if generate: + # now sample from the model + synth_Y = np.tile(train_Y, [duplicate_synth, 1]) + synth_X = model.sample_trained_model(identifier, epoch, num_samples=synth_Y.shape[0], C_samples=synth_Y) + # for use in TRTS + synth_testX = model.sample_trained_model(identifier, epoch, num_samples=test_Y.shape[0], C_samples=test_Y) + synth_data = {'samples': synth_X, 'labels': synth_Y, 'test_samples': synth_testX, 'test_labels': test_Y} + np.save('./experiments/tstr/' + identifier + '_' + str(epoch) + '.data.npy', synth_data) + else: + print('Loading pre-generated data') + print('WARNING: not implemented for TRTS') + # get "train" data + exp_data = np.load('./experiments/tstr/' + identifier + '_' + str(epoch) + '.data.npy').item() + synth_X = exp_data['samples'] + synth_Y = exp_data['labels'] + n_synth = synth_X.shape[0] + synth_X = synth_X.reshape(n_synth, -1) + # pdb.set_trace() + # # ALERT ALERT MODIFYING + # synth_X = 2*(synth_X > 0) - 1 + orig_data = np.load('/cluster/home/hyland/eICU_task_data.npy').item() + if reverse: + which_setting = 'trts' + # visualise distribution of errors for train and test + print('Swapping synthetic test set in for real, to do TRTS!') + test_X = synth_testX + else: + print('Doing normal TSTR') + which_setting = 'tstr' +# # get test data +# test_X = data['test_X'] +# test_Y = data['test_Y'] + if not CNN: + model_choice = 'RF' + # if multivariate, reshape + if len(test_X.shape) == 3: + test_X = test_X.reshape(test_X.shape[0], -1) + if len(train_X.shape) == 3: + train_X = train_X.reshape(train_X.shape[0], -1) + if len(synth_X.shape) == 3: + synth_X = synth_X.reshape(synth_X.shape[0], -1) + else: + raise ValueError(CNN) + model_choice = 'CNN' + # we will select the best validation set epoch based on F1 score, take average across all the tasks + score_list = [] + for label in range(synth_Y.shape[1]): + task = orig_data['Y_columns'][label] + if vali: + if not task in ['low_sao2', 'high_heartrate', 'low_respiration']: + print('Skipping task', task, 'because validation evaluation.') + continue + print('Evaluating on task:', task) + #print('(', np.mean(synth_Y[:, 
label]), 'positive in train, ', np.mean(test_Y[:, label]), 'in test)') + #m = RandomForestClassifier(n_estimators=50).fit(synth_X, synth_Y[:, label]) + #m = SVC(gamma=0.001).fit(synth_X, synth_Y[:, label]) + synth_classifier = RandomForestClassifier(n_estimators=100).fit(synth_X, synth_Y[:, label]) + synth_predY = synth_classifier.predict(test_X) + synth_predY_prob = synth_classifier.predict_proba(test_X)[:, 1] + real_classifier = RandomForestClassifier(n_estimators=100).fit(train_X, train_Y[:, label]) + real_predY = real_classifier.predict(test_X) + real_predY_prob = real_classifier.predict_proba(test_X)[:, 1] + #print('(predicted', np.mean(predict), 'positive labels)') + + synth_prec, synth_recall, synth_f1, synth_support = precision_recall_fscore_support(test_Y[:, label], synth_predY, average='weighted') + synth_accuracy = accuracy_score(test_Y[:, label], synth_predY) + synth_auprc = average_precision_score(test_Y[:, label], synth_predY_prob) + synth_auroc = roc_auc_score(test_Y[:, label], synth_predY_prob) + synth_scores = [synth_prec, synth_recall, synth_f1, synth_accuracy, synth_auprc, synth_auroc] + + real_prec, real_recall, real_f1, real_support = precision_recall_fscore_support(test_Y[:, label], real_predY, average='weighted') + real_accuracy = accuracy_score(test_Y[:, label], real_predY) + real_auprc = average_precision_score(test_Y[:, label], real_predY_prob) + real_auroc = roc_auc_score(test_Y[:, label], real_predY_prob) + real_scores = [real_prec, real_recall, real_f1, real_accuracy, real_auprc, real_auroc] + + all_scores = synth_scores + real_scores + + if vali: + report_file = open('./experiments/tstr/vali.' + which_setting + '_report.v3.csv', 'a') + report_file.write('eICU_' + task + ',' + identifier + ',' + model_choice + ',' + str(epoch) + ',' + ','.join(map(str, all_scores)) + '\n') + report_file.close() + else: + report_file = open('./experiments/tstr/' + which_setting + '_report.v3.csv', 'a') + report_file.write('eICU_' + task + ',' + identifier + ',' + model_choice + ',' + str(epoch) + ',' + ','.join(map(str, all_scores)) + '\n') + report_file.close() + + print(classification_report(test_Y[:, label], synth_predY)) + print(classification_report(test_Y[:, label], real_predY)) + if task in ['low_sao2', 'high_heartrate', 'low_respiration']: + score_list.append(synth_auprc + synth_auroc) + + if do_OR: + raise NotImplementedError + # do the OR task + extreme_heartrate_test = test_Y[:, 1] + test_Y[:, 4] + extreme_respiration_test = test_Y[:, 2] + test_Y[:, 5] + extreme_systemicmean_test = test_Y[:, 3] + test_Y[:, 6] + Y_OR_test = np.vstack([extreme_heartrate_test, extreme_respiration_test, extreme_systemicmean_test]).T + Y_OR_test = (Y_OR_test > 0)*1 + + extreme_heartrate_synth = synth_Y[:, 1] + synth_Y[:, 4] + extreme_respiration_synth = synth_Y[:, 2] + synth_Y[:, 5] + extreme_systemicmean_synth = synth_Y[:, 3] + synth_Y[:, 6] + Y_OR_synth = np.vstack([extreme_heartrate_synth, extreme_respiration_synth, extreme_systemicmean_synth]).T + Y_OR_synth = (Y_OR_synth > 0)*1 + + OR_names = ['extreme heartrate', 'extreme respiration', 'extreme MAP'] + OR_results = [] + for label in range(Y_OR_synth.shape[1]): + print('task:', OR_names[label]) + print('(', np.mean(Y_OR_synth[:, label]), 'positive in train, ', np.mean(Y_OR_test[:, label]), 'in test)') + m = RandomForestClassifier(n_estimators=500).fit(synth_X, Y_OR_synth[:, label]) + predict = m.predict(X_test) + print('(predicted', np.mean(predict), 'positive labels)') + accuracy = accuracy_score(Y_OR_test[:, label], predict) + 
precision = sklearn.metrics.precision_score(Y_OR_test[:, label], predict) + recall = sklearn.metrics.recall_score(Y_OR_test[:, label], predict) + print(accuracy, precision, recall) + OR_results.append([accuracy, precision, recall]) + else: + OR_results = [] + + score_across_tasks = np.mean(np.array(score_list)) + return score_across_tasks + +def NIPS_toy_plot(identifier_rbf, epoch_rbf, identifier_sine, epoch_sine, identifier_mnist, epoch_mnist): + """ + for each experiment: + - plot a bunch of train examples + - sample a bunch of generated examples + - plot all in separate PDFs so i can merge in illustrator + + for sine and rbf, grey background + MNIST is just MNIST (square though) + """ + n_samples = 15 + # settings + settings_rbf = json.load(open('./experiments/settings/' + identifier_rbf + '.txt', 'r')) + settings_sine = json.load(open('./experiments/settings/' + identifier_sine + '.txt', 'r')) + settings_mnist = json.load(open('./experiments/settings/' + identifier_mnist + '.txt', 'r')) + # data + data_rbf = np.load('./experiments/data/' + identifier_rbf + '.data.npy').item() + data_sine = np.load('./experiments/data/' + identifier_sine + '.data.npy').item() + data_mnist = np.load('./experiments/data/' + identifier_mnist + '.data.npy').item() + train_rbf = data_rbf['samples']['train'] + train_sine = data_sine['samples']['train'] + train_mnist = data_mnist['samples']['train'] + # sample + samples_rbf = model.sample_trained_model(settings_rbf, epoch_rbf, n_samples) + samples_sine = model.sample_trained_model(settings_sine, epoch_sine, n_samples) + samples_mnist = model.sample_trained_model(settings_mnist, epoch_mnist, n_samples) + # plot them all + index = 0 + #for sample in np.random.permutation(train_rbf)[:n_samples]: + # plotting.nips_plot_rbf(sample, index, 'train') + # index += 1 + #for sample in samples_rbf: + # plotting.nips_plot_rbf(sample, index, 'GAN') + # index += 1 + #for sample in np.random.permutation(train_sine)[:n_samples]: + # plotting.nips_plot_sine(sample, index, 'train') + # index += 1 + #for sample in samples_sine: + # plotting.nips_plot_sine(sample, index, 'GAN') + # index += 1 + for sample in np.random.permutation(train_mnist)[:n_samples]: + plotting.nips_plot_mnist(sample, index, 'train') + index += 1 + for sample in samples_mnist: + plotting.nips_plot_mnist(sample, index, 'GAN') + index += 1 + return True diff --git a/mmd.py b/mmd.py new file mode 100644 index 0000000..c2fa363 --- /dev/null +++ b/mmd.py @@ -0,0 +1,233 @@ +''' +MMD functions implemented in tensorflow. +(from https://github.com/dougalsutherland/opt-mmd/blob/master/gan/mmd.py) +''' +from __future__ import division + +import tensorflow as tf + +from tf_ops import dot, sq_sum + +from scipy.spatial.distance import pdist +from numpy import median, vstack, einsum +import pdb +import numpy as np + +_eps=1e-8 + +################################################################################ +### Quadratic-time MMD with Gaussian RBF kernel + +def _mix_rbf_kernel(X, Y, sigmas, wts=None): + """ + """ + if wts is None: + wts = [1.0] * sigmas.get_shape()[0] + + # debug! 
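+ # For 2-D inputs the Gram matrices are plain matrix products; for 3-D
+ # (batch x time x features) inputs, tensordot over the last two axes gives the
+ # Frobenius inner product between whole sequences, i.e. each sequence is
+ # treated as a flattened vector inside the RBF kernel.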
+ if len(X.shape) == 2: + # matrix + XX = tf.matmul(X, X, transpose_b=True) + XY = tf.matmul(X, Y, transpose_b=True) + YY = tf.matmul(Y, Y, transpose_b=True) + elif len(X.shape) == 3: + # tensor -- this is computing the Frobenius norm + XX = tf.tensordot(X, X, axes=[[1, 2], [1, 2]]) + XY = tf.tensordot(X, Y, axes=[[1, 2], [1, 2]]) + YY = tf.tensordot(Y, Y, axes=[[1, 2], [1, 2]]) + else: + raise ValueError(X) + + X_sqnorms = tf.diag_part(XX) + Y_sqnorms = tf.diag_part(YY) + + r = lambda x: tf.expand_dims(x, 0) + c = lambda x: tf.expand_dims(x, 1) + + K_XX, K_XY, K_YY = 0, 0, 0 + for sigma, wt in zip(tf.unstack(sigmas, axis=0), wts): + gamma = 1 / (2 * sigma**2) + K_XX += wt * tf.exp(-gamma * (-2 * XX + c(X_sqnorms) + r(X_sqnorms))) + K_XY += wt * tf.exp(-gamma * (-2 * XY + c(X_sqnorms) + r(Y_sqnorms))) + K_YY += wt * tf.exp(-gamma * (-2 * YY + c(Y_sqnorms) + r(Y_sqnorms))) + + return K_XX, K_XY, K_YY, tf.reduce_sum(wts) + + +def rbf_mmd2(X, Y, sigma=1, biased=True): + return mix_rbf_mmd2(X, Y, sigmas=[sigma], biased=biased) + + +def mix_rbf_mmd2(X, Y, sigmas=(1,), wts=None, biased=True): + K_XX, K_XY, K_YY, d = _mix_rbf_kernel(X, Y, sigmas, wts) + return _mmd2(K_XX, K_XY, K_YY, const_diagonal=d, biased=biased) + + +def rbf_mmd2_and_ratio(X, Y, sigma=1, biased=True): + return mix_rbf_mmd2_and_ratio(X, Y, sigmas=[sigma], biased=biased) + + +def mix_rbf_mmd2_and_ratio(X, Y, sigmas=(1,), wts=None, biased=True): + K_XX, K_XY, K_YY, d = _mix_rbf_kernel(X, Y, sigmas, wts) + return _mmd2_and_ratio(K_XX, K_XY, K_YY, const_diagonal=d, biased=biased) + + +################################################################################ +### Helper functions to compute variances based on kernel matrices + + +def _mmd2(K_XX, K_XY, K_YY, const_diagonal=False, biased=False): + m = tf.cast(K_XX.get_shape()[0], tf.float32) + n = tf.cast(K_YY.get_shape()[0], tf.float32) + + if biased: + mmd2 = (tf.reduce_sum(K_XX) / (m * m) + + tf.reduce_sum(K_YY) / (n * n) + - 2 * tf.reduce_sum(K_XY) / (m * n)) + else: + if const_diagonal is not False: + trace_X = m * const_diagonal + trace_Y = n * const_diagonal + else: + trace_X = tf.trace(K_XX) + trace_Y = tf.trace(K_YY) + + mmd2 = ((tf.reduce_sum(K_XX) - trace_X) / (m * (m - 1)) + + (tf.reduce_sum(K_YY) - trace_Y) / (n * (n - 1)) + - 2 * tf.reduce_sum(K_XY) / (m * n)) + + return mmd2 + + +def _mmd2_and_ratio(K_XX, K_XY, K_YY, const_diagonal=False, biased=False, + min_var_est=_eps): + mmd2, var_est = _mmd2_and_variance( + K_XX, K_XY, K_YY, const_diagonal=const_diagonal, biased=biased) + ratio = mmd2 / tf.sqrt(tf.maximum(var_est, min_var_est)) + return mmd2, ratio + + +def _mmd2_and_variance(K_XX, K_XY, K_YY, const_diagonal=False, biased=False): + m = tf.cast(K_XX.get_shape()[0], tf.float32) # Assumes X, Y are same shape + + ### Get the various sums of kernels that we'll use + # Kts drop the diagonal, but we don't need to compute them explicitly + if const_diagonal is not False: + const_diagonal = tf.cast(const_diagonal, tf.float32) + diag_X = diag_Y = const_diagonal + sum_diag_X = sum_diag_Y = m * const_diagonal + sum_diag2_X = sum_diag2_Y = m * const_diagonal**2 + else: + diag_X = tf.diag_part(K_XX) + diag_Y = tf.diag_part(K_YY) + + sum_diag_X = tf.reduce_sum(diag_X) + sum_diag_Y = tf.reduce_sum(diag_Y) + + sum_diag2_X = sq_sum(diag_X) + sum_diag2_Y = sq_sum(diag_Y) + + Kt_XX_sums = tf.reduce_sum(K_XX, 1) - diag_X + Kt_YY_sums = tf.reduce_sum(K_YY, 1) - diag_Y + K_XY_sums_0 = tf.reduce_sum(K_XY, 0) + K_XY_sums_1 = tf.reduce_sum(K_XY, 1) + + Kt_XX_sum = 
tf.reduce_sum(Kt_XX_sums) + Kt_YY_sum = tf.reduce_sum(Kt_YY_sums) + K_XY_sum = tf.reduce_sum(K_XY_sums_0) + + Kt_XX_2_sum = sq_sum(K_XX) - sum_diag2_X + Kt_YY_2_sum = sq_sum(K_YY) - sum_diag2_Y + K_XY_2_sum = sq_sum(K_XY) + + if biased: + mmd2 = ((Kt_XX_sum + sum_diag_X) / (m * m) + + (Kt_YY_sum + sum_diag_Y) / (m * m) + - 2 * K_XY_sum / (m * m)) + else: + mmd2 = ((Kt_XX_sum + sum_diag_X) / (m * (m-1)) + + (Kt_YY_sum + sum_diag_Y) / (m * (m-1)) + - 2 * K_XY_sum / (m * m)) + + var_est = ( + 2 / (m**2 * (m-1)**2) * ( + 2 * sq_sum(Kt_XX_sums) - Kt_XX_2_sum + + 2 * sq_sum(Kt_YY_sums) - Kt_YY_2_sum) + - (4*m-6) / (m**3 * (m-1)**3) * (Kt_XX_sum**2 + Kt_YY_sum**2) + + 4*(m-2) / (m**3 * (m-1)**2) * ( + sq_sum(K_XY_sums_1) + sq_sum(K_XY_sums_0)) + - 4 * (m-3) / (m**3 * (m-1)**2) * K_XY_2_sum + - (8*m - 12) / (m**5 * (m-1)) * K_XY_sum**2 + + 8 / (m**3 * (m-1)) * ( + 1/m * (Kt_XX_sum + Kt_YY_sum) * K_XY_sum + - dot(Kt_XX_sums, K_XY_sums_1) + - dot(Kt_YY_sums, K_XY_sums_0)) + ) + + return mmd2, var_est + + +### additions from stephanie, for convenience + +def median_pairwise_distance(X, Y=None): + """ + Heuristic for bandwidth of the RBF. Median pairwise distance of joint data. + If Y is missing, just calculate it from X: + this is so that, during training, as Y changes, we can use a fixed + bandwidth (and save recalculating this each time we evaluated the mmd) + At the end of training, we do the heuristic "correctly" by including + both X and Y. + + Note: most of this code is assuming tensorflow, but X and Y are just ndarrays + """ + if Y is None: + Y = X # this is horrendously inefficient, sorry + + if len(X.shape) == 2: + # matrix + X_sqnorms = einsum('...i,...i', X, X) + Y_sqnorms = einsum('...i,...i', Y, Y) + XY = einsum('ia,ja', X, Y) + elif len(X.shape) == 3: + # tensor -- this is computing the Frobenius norm + X_sqnorms = einsum('...ij,...ij', X, X) + Y_sqnorms = einsum('...ij,...ij', Y, Y) + XY = einsum('iab,jab', X, Y) + else: + raise ValueError(X) + + distances = np.sqrt(X_sqnorms.reshape(-1, 1) - 2*XY + Y_sqnorms.reshape(1, -1)) + return median(distances) + + +def median_pairwise_distance_o(X, Y=None): + """ + Heuristic for bandwidth of the RBF. Median pairwise distance of joint data. + If Y is missing, just calculate it from X: + this is so that, during training, as Y changes, we can use a fixed + bandwidth (and save recalculating this each time we evaluated the mmd) + At the end of training, we do the heuristic "correctly" by including + both X and Y. + + Note: most of this code is assuming tensorflow, but X and Y are just ndarrays + """ + if Y is None: + Y = X # this is horrendously inefficient, sorry + + if len(X.shape) == 2: + # matrix + X_sqnorms = np.einsum('...i,...i', X, X) + Y_sqnorms = np.einsum('...i,...i', Y, Y) + XY = np.einsum('ia,ja', X, Y) + elif len(X.shape) == 3: + # tensor -- this is computing the Frobenius norm + X_sqnorms = np.einsum('...ij,...ij', X, X) # reduce the tensor shape + Y_sqnorms = np.einsum('...ij,...ij', Y, Y) + XY = np.einsum('iab,jab', X, Y) # X*Y^T?? + else: + raise ValueError(X) + + distances = np.sqrt(X_sqnorms.reshape(-1, 1) - 2 * XY + Y_sqnorms.reshape(1, -1)) + distances = distances.reshape(-1, 1) + distances = distances[~np.isnan(distances)] + return np.median(distances) \ No newline at end of file diff --git a/mod_core_rnn_cell_impl.py b/mod_core_rnn_cell_impl.py new file mode 100644 index 0000000..265538e --- /dev/null +++ b/mod_core_rnn_cell_impl.py @@ -0,0 +1,1063 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#modified by Stephanie (@corcra) to enable initializing the bias term in lstm """ +# ============================================================================== + +"""Module implementing RNN Cells.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import contextlib +import hashlib +import math +import numbers + +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import tensor_util +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import clip_ops +from tensorflow.python.ops import embedding_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn_ops +from tensorflow.python.ops import partitioned_variables +from tensorflow.python.ops import random_ops +from tensorflow.python.ops import variable_scope as vs + +from tensorflow.python.ops.math_ops import sigmoid +from tensorflow.python.ops.math_ops import tanh +#from tensorflow.python.ops.rnn_cell_impl import _RNNCell as RNNCell +from tensorflow.python.ops.rnn_cell_impl import RNNCell + +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.util import nest + + +_BIAS_VARIABLE_NAME = "biases" +_WEIGHTS_VARIABLE_NAME = "weights" + + +@contextlib.contextmanager +def _checked_scope(cell, scope, reuse=None, **kwargs): + if reuse is not None: + kwargs["reuse"] = reuse + with vs.variable_scope(scope, **kwargs) as checking_scope: + scope_name = checking_scope.name + if hasattr(cell, "_scope"): + cell_scope = cell._scope # pylint: disable=protected-access + if cell_scope.name != checking_scope.name: + raise ValueError( + "Attempt to reuse RNNCell %s with a different variable scope than " + "its first use. First use of cell was with scope '%s', this " + "attempt is with scope '%s'. Please create a new instance of the " + "cell if you would like it to use a different set of weights. " + "If before you were using: MultiRNNCell([%s(...)] * num_layers), " + "change to: MultiRNNCell([%s(...) for _ in range(num_layers)]). " + "If before you were using the same cell instance as both the " + "forward and reverse cell of a bidirectional RNN, simply create " + "two instances (one for forward, one for reverse). 
" + "In May 2017, we will start transitioning this cell's behavior " + "to use existing stored weights, if any, when it is called " + "with scope=None (which can lead to silent model degradation, so " + "this error will remain until then.)" + % (cell, cell_scope.name, scope_name, type(cell).__name__, + type(cell).__name__)) + else: + weights_found = False + try: + with vs.variable_scope(checking_scope, reuse=True): + vs.get_variable(_WEIGHTS_VARIABLE_NAME) + weights_found = True + except ValueError: + pass + if weights_found and reuse is None: + raise ValueError( + "Attempt to have a second RNNCell use the weights of a variable " + "scope that already has weights: '%s'; and the cell was not " + "constructed as %s(..., reuse=True). " + "To share the weights of an RNNCell, simply " + "reuse it in your second calculation, or create a new one with " + "the argument reuse=True." % (scope_name, type(cell).__name__)) + + # Everything is OK. Update the cell's scope and yield it. + cell._scope = checking_scope # pylint: disable=protected-access + yield checking_scope + + +class BasicRNNCell(RNNCell): + """The most basic RNN cell.""" + + def __init__(self, num_units, input_size=None, activation=tanh, reuse=None): + if input_size is not None: + logging.warn("%s: The input_size parameter is deprecated.", self) + self._num_units = num_units + self._activation = activation + self._reuse = reuse + + @property + def state_size(self): + return self._num_units + + @property + def output_size(self): + return self._num_units + + def __call__(self, inputs, state, scope=None): + """Most basic RNN: output = new_state = act(W * input + U * state + B).""" + with _checked_scope(self, scope or "basic_rnn_cell", reuse=self._reuse): + output = self._activation( + _linear([inputs, state], self._num_units, True)) + return output, output + + +class GRUCell(RNNCell): + """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078).""" + + def __init__(self, num_units, input_size=None, activation=tanh, reuse=None): + if input_size is not None: + logging.warn("%s: The input_size parameter is deprecated.", self) + self._num_units = num_units + self._activation = activation + self._reuse = reuse + + @property + def state_size(self): + return self._num_units + + @property + def output_size(self): + return self._num_units + + def __call__(self, inputs, state, scope=None): + """Gated recurrent unit (GRU) with nunits cells.""" + with _checked_scope(self, scope or "gru_cell", reuse=self._reuse): + with vs.variable_scope("gates"): # Reset gate and update gate. + # We start with bias of 1.0 to not reset and not update. + value = sigmoid(_linear( + [inputs, state], 2 * self._num_units, True, 1.0)) + r, u = array_ops.split( + value=value, + num_or_size_splits=2, + axis=1) + with vs.variable_scope("candidate"): + c = self._activation(_linear([inputs, r * state], + self._num_units, True)) + new_h = u * state + (1 - u) * c + return new_h, new_h + + +_LSTMStateTuple = collections.namedtuple("LSTMStateTuple", ("c", "h")) + + +class LSTMStateTuple(_LSTMStateTuple): + """Tuple used by LSTM Cells for `state_size`, `zero_state`, and output state. + + Stores two elements: `(c, h)`, in that order. + + Only used when `state_is_tuple=True`. + """ + __slots__ = () + + @property + def dtype(self): + (c, h) = self + if not c.dtype == h.dtype: + raise TypeError("Inconsistent internal state: %s vs %s" % + (str(c.dtype), str(h.dtype))) + return c.dtype + + +class BasicLSTMCell(RNNCell): + """Basic LSTM recurrent network cell. 
+ + The implementation is based on: http://arxiv.org/abs/1409.2329. + + We add forget_bias (default: 1) to the biases of the forget gate in order to + reduce the scale of forgetting in the beginning of the training. + + It does not allow cell clipping, a projection layer, and does not + use peep-hole connections: it is the basic baseline. + + For advanced models, please use the full LSTMCell that follows. + """ + + def __init__(self, num_units, forget_bias=1.0, input_size=None, + state_is_tuple=True, activation=tanh, reuse=None): + """Initialize the basic LSTM cell. + + Args: + num_units: int, The number of units in the LSTM cell. + forget_bias: float, The bias added to forget gates (see above). + input_size: Deprecated and unused. + state_is_tuple: If True, accepted and returned states are 2-tuples of + the `c_state` and `m_state`. If False, they are concatenated + along the column axis. The latter behavior will soon be deprecated. + activation: Activation function of the inner states. + reuse: (optional) Python boolean describing whether to reuse variables + in an existing scope. If not `True`, and the existing scope already has + the given variables, an error is raised. + """ + if not state_is_tuple: + logging.warn("%s: Using a concatenated state is slower and will soon be " + "deprecated. Use state_is_tuple=True.", self) + if input_size is not None: + logging.warn("%s: The input_size parameter is deprecated.", self) + self._num_units = num_units + self._forget_bias = forget_bias + self._state_is_tuple = state_is_tuple + self._activation = activation + self._reuse = reuse + + @property + def state_size(self): + return (LSTMStateTuple(self._num_units, self._num_units) + if self._state_is_tuple else 2 * self._num_units) + + @property + def output_size(self): + return self._num_units + + def __call__(self, inputs, state, scope=None): + """Long short-term memory cell (LSTM).""" + with _checked_scope(self, scope or "basic_lstm_cell", reuse=self._reuse): + # Parameters of gates are concatenated into one multiply for efficiency. + if self._state_is_tuple: + c, h = state + else: + c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1) + concat = _linear([inputs, h], 4 * self._num_units, True) + + # i = input_gate, j = new_input, f = forget_gate, o = output_gate + i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1) + + new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) * + self._activation(j)) + new_h = self._activation(new_c) * sigmoid(o) + + if self._state_is_tuple: + new_state = LSTMStateTuple(new_c, new_h) + else: + new_state = array_ops.concat([new_c, new_h], 1) + return new_h, new_state + + +class LSTMCell(RNNCell): + """Long short-term memory unit (LSTM) recurrent network cell. + + The default non-peephole implementation is based on: + + http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf + + S. Hochreiter and J. Schmidhuber. + "Long Short-Term Memory". Neural Computation, 9(8):1735-1780, 1997. + + The peephole implementation is based on: + + https://research.google.com/pubs/archive/43905.pdf + + Hasim Sak, Andrew Senior, and Francoise Beaufays. + "Long short-term memory recurrent neural network architectures for + large scale acoustic modeling." INTERSPEECH, 2014. + + The class uses optional peep-hole connections, optional cell clipping, and + an optional projection layer. 
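+
+ This version is additionally modified to accept a `bias_start` value, which is
+ forwarded to the internal _linear call so that the concatenated gate biases can
+ be initialised to a chosen value rather than zero.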
+ """ + + def __init__(self, num_units, input_size=None, + use_peepholes=False, cell_clip=None, + initializer=None, bias_start=0.0, num_proj=None, proj_clip=None, + num_unit_shards=None, num_proj_shards=None, + forget_bias=1.0, state_is_tuple=True, + activation=tanh, reuse=None): + """Initialize the parameters for an LSTM cell. + + Args: + num_units: int, The number of units in the LSTM cell + input_size: Deprecated and unused. + use_peepholes: bool, set True to enable diagonal/peephole connections. + cell_clip: (optional) A float value, if provided the cell state is clipped + by this value prior to the cell output activation. + initializer: (optional) The initializer to use for the weight and + projection matrices. + bias_start: (optional) The VALUE to initialize the bias to, in + the linear call + num_proj: (optional) int, The output dimensionality for the projection + matrices. If None, no projection is performed. + proj_clip: (optional) A float value. If `num_proj > 0` and `proj_clip` is + provided, then the projected values are clipped elementwise to within + `[-proj_clip, proj_clip]`. + num_unit_shards: Deprecated, will be removed by Jan. 2017. + Use a variable_scope partitioner instead. + num_proj_shards: Deprecated, will be removed by Jan. 2017. + Use a variable_scope partitioner instead. + forget_bias: Biases of the forget gate are initialized by default to 1 + in order to reduce the scale of forgetting at the beginning of + the training. + state_is_tuple: If True, accepted and returned states are 2-tuples of + the `c_state` and `m_state`. If False, they are concatenated + along the column axis. This latter behavior will soon be deprecated. + activation: Activation function of the inner states. + reuse: (optional) Python boolean describing whether to reuse variables + in an existing scope. If not `True`, and the existing scope already has + the given variables, an error is raised. + """ + if not state_is_tuple: + logging.warn("%s: Using a concatenated state is slower and will soon be " + "deprecated. Use state_is_tuple=True.", self) + if input_size is not None: + logging.warn("%s: The input_size parameter is deprecated.", self) + if num_unit_shards is not None or num_proj_shards is not None: + logging.warn( + "%s: The num_unit_shards and proj_unit_shards parameters are " + "deprecated and will be removed in Jan 2017. " + "Use a variable scope with a partitioner instead.", self) + + self._num_units = num_units + self._use_peepholes = use_peepholes + self._cell_clip = cell_clip + self._initializer = initializer + self._bias_start = bias_start + self._num_proj = num_proj + self._proj_clip = proj_clip + self._num_unit_shards = num_unit_shards + self._num_proj_shards = num_proj_shards + self._forget_bias = forget_bias + self._state_is_tuple = state_is_tuple + self._activation = activation + self._reuse = reuse + + if num_proj: + self._state_size = ( + LSTMStateTuple(num_units, num_proj) + if state_is_tuple else num_units + num_proj) + self._output_size = num_proj + else: + self._state_size = ( + LSTMStateTuple(num_units, num_units) + if state_is_tuple else 2 * num_units) + self._output_size = num_units + + @property + def state_size(self): + return self._state_size + + @property + def output_size(self): + return self._output_size + + def __call__(self, inputs, state, scope=None): + """Run one step of LSTM. + + Args: + inputs: input Tensor, 2D, batch x num_units. + state: if `state_is_tuple` is False, this must be a state Tensor, + `2-D, batch x state_size`. 
If `state_is_tuple` is True, this must be a + tuple of state Tensors, both `2-D`, with column sizes `c_state` and + `m_state`. + scope: VariableScope for the created subgraph; defaults to "lstm_cell". + + Returns: + A tuple containing: + + - A `2-D, [batch x output_dim]`, Tensor representing the output of the + LSTM after reading `inputs` when previous state was `state`. + Here output_dim is: + num_proj if num_proj was set, + num_units otherwise. + - Tensor(s) representing the new state of LSTM after reading `inputs` when + the previous state was `state`. Same type and shape(s) as `state`. + + Raises: + ValueError: If input size cannot be inferred from inputs via + static shape inference. + """ + num_proj = self._num_units if self._num_proj is None else self._num_proj + + if self._state_is_tuple: + (c_prev, m_prev) = state + else: + c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units]) + m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj]) + + dtype = inputs.dtype + input_size = inputs.get_shape().with_rank(2)[1] + if input_size.value is None: + raise ValueError("Could not infer input size from inputs.get_shape()[-1]") + with _checked_scope(self, scope or "lstm_cell", + initializer=self._initializer, + reuse=self._reuse) as unit_scope: + if self._num_unit_shards is not None: + unit_scope.set_partitioner( + partitioned_variables.fixed_size_partitioner( + self._num_unit_shards)) + # i = input_gate, j = new_input, f = forget_gate, o = output_gate + lstm_matrix = _linear([inputs, m_prev], 4 * self._num_units, bias=True, bias_start=self._bias_start) + i, j, f, o = array_ops.split( + value=lstm_matrix, num_or_size_splits=4, axis=1) + # Diagonal connections + if self._use_peepholes: + with vs.variable_scope(unit_scope) as projection_scope: + if self._num_unit_shards is not None: + projection_scope.set_partitioner(None) + w_f_diag = vs.get_variable( + "w_f_diag", shape=[self._num_units], dtype=dtype) + w_i_diag = vs.get_variable( + "w_i_diag", shape=[self._num_units], dtype=dtype) + w_o_diag = vs.get_variable( + "w_o_diag", shape=[self._num_units], dtype=dtype) + + if self._use_peepholes: + c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev + + sigmoid(i + w_i_diag * c_prev) * self._activation(j)) + else: + c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) * + self._activation(j)) + + if self._cell_clip is not None: + # pylint: disable=invalid-unary-operand-type + c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip) + # pylint: enable=invalid-unary-operand-type + if self._use_peepholes: + m = sigmoid(o + w_o_diag * c) * self._activation(c) + else: + m = sigmoid(o) * self._activation(c) + + if self._num_proj is not None: + with vs.variable_scope("projection") as proj_scope: + if self._num_proj_shards is not None: + proj_scope.set_partitioner( + partitioned_variables.fixed_size_partitioner( + self._num_proj_shards)) + m = _linear(m, self._num_proj, bias=False) + + if self._proj_clip is not None: + # pylint: disable=invalid-unary-operand-type + m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip) + # pylint: enable=invalid-unary-operand-type + + new_state = (LSTMStateTuple(c, m) if self._state_is_tuple else + array_ops.concat([c, m], 1)) + return m, new_state + + +class OutputProjectionWrapper(RNNCell): + """Operator adding an output projection to the given cell. 
+ + Note: in many cases it may be more efficient to not use this wrapper, + but instead concatenate the whole sequence of your outputs in time, + do the projection on this batch-concatenated sequence, then split it + if needed or directly feed into a softmax. + """ + + def __init__(self, cell, output_size, reuse=None): + """Create a cell with output projection. + + Args: + cell: an RNNCell, a projection to output_size is added to it. + output_size: integer, the size of the output after projection. + reuse: (optional) Python boolean describing whether to reuse variables + in an existing scope. If not `True`, and the existing scope already has + the given variables, an error is raised. + + Raises: + TypeError: if cell is not an RNNCell. + ValueError: if output_size is not positive. + """ + if not isinstance(cell, RNNCell): + raise TypeError("The parameter cell is not RNNCell.") + if output_size < 1: + raise ValueError("Parameter output_size must be > 0: %d." % output_size) + self._cell = cell + self._output_size = output_size + self._reuse = reuse + + @property + def state_size(self): + return self._cell.state_size + + @property + def output_size(self): + return self._output_size + + def zero_state(self, batch_size, dtype): + with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): + return self._cell.zero_state(batch_size, dtype) + + def __call__(self, inputs, state, scope=None): + """Run the cell and output projection on inputs, starting from state.""" + output, res_state = self._cell(inputs, state) + # Default scope: "OutputProjectionWrapper" + with _checked_scope(self, scope or "output_projection_wrapper", + reuse=self._reuse): + projected = _linear(output, self._output_size, True) + return projected, res_state + + +class InputProjectionWrapper(RNNCell): + """Operator adding an input projection to the given cell. + + Note: in many cases it may be more efficient to not use this wrapper, + but instead concatenate the whole sequence of your inputs in time, + do the projection on this batch-concatenated sequence, then split it. + """ + + def __init__(self, cell, num_proj, input_size=None): + """Create a cell with input projection. + + Args: + cell: an RNNCell, a projection of inputs is added before it. + num_proj: Python integer. The dimension to project to. + input_size: Deprecated and unused. + + Raises: + TypeError: if cell is not an RNNCell. 
+ """ + if input_size is not None: + logging.warn("%s: The input_size parameter is deprecated.", self) + if not isinstance(cell, RNNCell): + raise TypeError("The parameter cell is not RNNCell.") + self._cell = cell + self._num_proj = num_proj + + @property + def state_size(self): + return self._cell.state_size + + @property + def output_size(self): + return self._cell.output_size + + def zero_state(self, batch_size, dtype): + with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): + return self._cell.zero_state(batch_size, dtype) + + def __call__(self, inputs, state, scope=None): + """Run the input projection and then the cell.""" + # Default scope: "InputProjectionWrapper" + with vs.variable_scope(scope or "input_projection_wrapper"): + projected = _linear(inputs, self._num_proj, True) + return self._cell(projected, state) + + +def _enumerated_map_structure(map_fn, *args, **kwargs): + ix = [0] + def enumerated_fn(*inner_args, **inner_kwargs): + r = map_fn(ix[0], *inner_args, **inner_kwargs) + ix[0] += 1 + return r + return nest.map_structure(enumerated_fn, *args, **kwargs) + + +class DropoutWrapper(RNNCell): + """Operator adding dropout to inputs and outputs of the given cell.""" + + def __init__(self, cell, input_keep_prob=1.0, output_keep_prob=1.0, + state_keep_prob=1.0, variational_recurrent=False, + input_size=None, dtype=None, seed=None): + """Create a cell with added input, state, and/or output dropout. + + If `variational_recurrent` is set to `True` (**NOT** the default behavior), + then the the same dropout mask is applied at every step, as described in: + + Y. Gal, Z Ghahramani. "A Theoretically Grounded Application of Dropout in + Recurrent Neural Networks". https://arxiv.org/abs/1512.05287 + + Otherwise a different dropout mask is applied at every time step. + + Args: + cell: an RNNCell, a projection to output_size is added to it. + input_keep_prob: unit Tensor or float between 0 and 1, input keep + probability; if it is constant and 1, no input dropout will be added. + output_keep_prob: unit Tensor or float between 0 and 1, output keep + probability; if it is constant and 1, no output dropout will be added. + state_keep_prob: unit Tensor or float between 0 and 1, output keep + probability; if it is constant and 1, no output dropout will be added. + State dropout is performed on the *output* states of the cell. + variational_recurrent: Python bool. If `True`, then the same + dropout pattern is applied across all time steps per run call. + If this parameter is set, `input_size` **must** be provided. + input_size: (optional) (possibly nested tuple of) `TensorShape` objects + containing the depth(s) of the input tensors expected to be passed in to + the `DropoutWrapper`. Required and used **iff** + `variational_recurrent = True` and `input_keep_prob < 1`. + dtype: (optional) The `dtype` of the input, state, and output tensors. + Required and used **iff** `variational_recurrent = True`. + seed: (optional) integer, the randomness seed. + + Raises: + TypeError: if cell is not an RNNCell. + ValueError: if any of the keep_probs are not between 0 and 1. 
+ """ + if not isinstance(cell, RNNCell): + raise TypeError("The parameter cell is not a RNNCell.") + with ops.name_scope("DropoutWrapperInit"): + def tensor_and_const_value(v): + tensor_value = ops.convert_to_tensor(v) + const_value = tensor_util.constant_value(tensor_value) + return (tensor_value, const_value) + for prob, attr in [(input_keep_prob, "input_keep_prob"), + (state_keep_prob, "state_keep_prob"), + (output_keep_prob, "output_keep_prob")]: + tensor_prob, const_prob = tensor_and_const_value(prob) + if const_prob is not None: + if const_prob < 0 or const_prob > 1: + raise ValueError("Parameter %s must be between 0 and 1: %d" + % (attr, const_prob)) + setattr(self, "_%s" % attr, float(const_prob)) + else: + setattr(self, "_%s" % attr, tensor_prob) + + # Set cell, variational_recurrent, seed before running the code below + self._cell = cell + self._variational_recurrent = variational_recurrent + self._seed = seed + + self._recurrent_input_noise = None + self._recurrent_state_noise = None + self._recurrent_output_noise = None + + if variational_recurrent: + if dtype is None: + raise ValueError( + "When variational_recurrent=True, dtype must be provided") + + def convert_to_batch_shape(s): + # Prepend a 1 for the batch dimension; for recurrent + # variational dropout we use the same dropout mask for all + # batch elements. + return array_ops.concat( + ([1], tensor_shape.TensorShape(s).as_list()), 0) + + def batch_noise(s, inner_seed): + shape = convert_to_batch_shape(s) + return random_ops.random_uniform(shape, seed=inner_seed, dtype=dtype) + + if (not isinstance(self._input_keep_prob, numbers.Real) or + self._input_keep_prob < 1.0): + if input_size is None: + raise ValueError( + "When variational_recurrent=True and input_keep_prob < 1.0 or " + "is unknown, input_size must be provided") + self._recurrent_input_noise = _enumerated_map_structure( + lambda i, s: batch_noise(s, inner_seed=self._gen_seed("input", i)), + input_size) + self._recurrent_state_noise = _enumerated_map_structure( + lambda i, s: batch_noise(s, inner_seed=self._gen_seed("state", i)), + cell.state_size) + self._recurrent_output_noise = _enumerated_map_structure( + lambda i, s: batch_noise(s, inner_seed=self._gen_seed("output", i)), + cell.output_size) + + def _gen_seed(self, salt_prefix, index): + if self._seed is None: + return None + salt = "%s_%d" % (salt_prefix, index) + string = (str(self._seed) + salt).encode("utf-8") + return int(hashlib.md5(string).hexdigest()[:8], 16) & 0x7FFFFFFF + + @property + def state_size(self): + return self._cell.state_size + + @property + def output_size(self): + return self._cell.output_size + + def zero_state(self, batch_size, dtype): + with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): + return self._cell.zero_state(batch_size, dtype) + + def _variational_recurrent_dropout_value( + self, index, value, noise, keep_prob): + """Performs dropout given the pre-calculated noise tensor.""" + # uniform [keep_prob, 1.0 + keep_prob) + random_tensor = keep_prob + noise + + # 0. if [keep_prob, 1.0) and 1. 
if [1.0, 1.0 + keep_prob) + binary_tensor = math_ops.floor(random_tensor) + ret = math_ops.div(value, keep_prob) * binary_tensor + ret.set_shape(value.get_shape()) + return ret + + def _dropout(self, values, salt_prefix, recurrent_noise, keep_prob): + """Decides whether to perform standard dropout or recurrent dropout.""" + if not self._variational_recurrent: + def dropout(i, v): + return nn_ops.dropout( + v, keep_prob=keep_prob, seed=self._gen_seed(salt_prefix, i)) + return _enumerated_map_structure(dropout, values) + else: + def dropout(i, v, n): + return self._variational_recurrent_dropout_value(i, v, n, keep_prob) + return _enumerated_map_structure(dropout, values, recurrent_noise) + + def __call__(self, inputs, state, scope=None): + """Run the cell with the declared dropouts.""" + def _should_dropout(p): + return (not isinstance(p, float)) or p < 1 + + if _should_dropout(self._input_keep_prob): + inputs = self._dropout(inputs, "input", + self._recurrent_input_noise, + self._input_keep_prob) + output, new_state = self._cell(inputs, state, scope) + if _should_dropout(self._state_keep_prob): + new_state = self._dropout(new_state, "state", + self._recurrent_state_noise, + self._state_keep_prob) + if _should_dropout(self._output_keep_prob): + output = self._dropout(output, "output", + self._recurrent_output_noise, + self._output_keep_prob) + return output, new_state + + +class ResidualWrapper(RNNCell): + """RNNCell wrapper that ensures cell inputs are added to the outputs.""" + + def __init__(self, cell): + """Constructs a `ResidualWrapper` for `cell`. + + Args: + cell: An instance of `RNNCell`. + """ + self._cell = cell + + @property + def state_size(self): + return self._cell.state_size + + @property + def output_size(self): + return self._cell.output_size + + def zero_state(self, batch_size, dtype): + with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): + return self._cell.zero_state(batch_size, dtype) + + def __call__(self, inputs, state, scope=None): + """Run the cell and add its inputs to its outputs. + + Args: + inputs: cell inputs. + state: cell state. + scope: optional cell scope. + + Returns: + Tuple of cell outputs and new state. + + Raises: + TypeError: If cell inputs and outputs have different structure (type). + ValueError: If cell inputs and outputs have different structure (value). + """ + outputs, new_state = self._cell(inputs, state, scope=scope) + nest.assert_same_structure(inputs, outputs) + # Ensure shapes match + def assert_shape_match(inp, out): + inp.get_shape().assert_is_compatible_with(out.get_shape()) + nest.map_structure(assert_shape_match, inputs, outputs) + res_outputs = nest.map_structure( + lambda inp, out: inp + out, inputs, outputs) + return (res_outputs, new_state) + + +class DeviceWrapper(RNNCell): + """Operator that ensures an RNNCell runs on a particular device.""" + + def __init__(self, cell, device): + """Construct a `DeviceWrapper` for `cell` with device `device`. + + Ensures the wrapped `cell` is called with `tf.device(device)`. + + Args: + cell: An instance of `RNNCell`. + device: A device string or function, for passing to `tf.device`. 
+ """ + self._cell = cell + self._device = device + + @property + def state_size(self): + return self._cell.state_size + + @property + def output_size(self): + return self._cell.output_size + + def zero_state(self, batch_size, dtype): + with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): + return self._cell.zero_state(batch_size, dtype) + + def __call__(self, inputs, state, scope=None): + """Run the cell on specified device.""" + with ops.device(self._device): + return self._cell(inputs, state, scope=scope) + + +class EmbeddingWrapper(RNNCell): + """Operator adding input embedding to the given cell. + + Note: in many cases it may be more efficient to not use this wrapper, + but instead concatenate the whole sequence of your inputs in time, + do the embedding on this batch-concatenated sequence, then split it and + feed into your RNN. + """ + + def __init__(self, cell, embedding_classes, embedding_size, initializer=None, + reuse=None): + """Create a cell with an added input embedding. + + Args: + cell: an RNNCell, an embedding will be put before its inputs. + embedding_classes: integer, how many symbols will be embedded. + embedding_size: integer, the size of the vectors we embed into. + initializer: an initializer to use when creating the embedding; + if None, the initializer from variable scope or a default one is used. + reuse: (optional) Python boolean describing whether to reuse variables + in an existing scope. If not `True`, and the existing scope already has + the given variables, an error is raised. + + Raises: + TypeError: if cell is not an RNNCell. + ValueError: if embedding_classes is not positive. + """ + if not isinstance(cell, RNNCell): + raise TypeError("The parameter cell is not RNNCell.") + if embedding_classes <= 0 or embedding_size <= 0: + raise ValueError("Both embedding_classes and embedding_size must be > 0: " + "%d, %d." % (embedding_classes, embedding_size)) + self._cell = cell + self._embedding_classes = embedding_classes + self._embedding_size = embedding_size + self._initializer = initializer + self._reuse = reuse + + @property + def state_size(self): + return self._cell.state_size + + @property + def output_size(self): + return self._cell.output_size + + def zero_state(self, batch_size, dtype): + with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): + return self._cell.zero_state(batch_size, dtype) + + def __call__(self, inputs, state, scope=None): + """Run the cell on embedded inputs.""" + with _checked_scope(self, scope or "embedding_wrapper", reuse=self._reuse): + with ops.device("/cpu:0"): + if self._initializer: + initializer = self._initializer + elif vs.get_variable_scope().initializer: + initializer = vs.get_variable_scope().initializer + else: + # Default initializer for embeddings should have variance=1. + sqrt3 = math.sqrt(3) # Uniform(-sqrt(3), sqrt(3)) has variance=1. 
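+          # Var(Uniform(-a, a)) = (2a)**2 / 12 = a**2 / 3, so a = sqrt(3) gives unit variance.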
+ initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3) + + if type(state) is tuple: + data_type = state[0].dtype + else: + data_type = state.dtype + + embedding = vs.get_variable( + "embedding", [self._embedding_classes, self._embedding_size], + initializer=initializer, + dtype=data_type) + embedded = embedding_ops.embedding_lookup( + embedding, array_ops.reshape(inputs, [-1])) + return self._cell(embedded, state) + + +class MultiRNNCell(RNNCell): + """RNN cell composed sequentially of multiple simple cells.""" + + def __init__(self, cells, state_is_tuple=True): + """Create a RNN cell composed sequentially of a number of RNNCells. + + Args: + cells: list of RNNCells that will be composed in this order. + state_is_tuple: If True, accepted and returned states are n-tuples, where + `n = len(cells)`. If False, the states are all + concatenated along the column axis. This latter behavior will soon be + deprecated. + + Raises: + ValueError: if cells is empty (not allowed), or at least one of the cells + returns a state tuple but the flag `state_is_tuple` is `False`. + """ + if not cells: + raise ValueError("Must specify at least one cell for MultiRNNCell.") + if not nest.is_sequence(cells): + raise TypeError( + "cells must be a list or tuple, but saw: %s." % cells) + + self._cells = cells + self._state_is_tuple = state_is_tuple + if not state_is_tuple: + if any(nest.is_sequence(c.state_size) for c in self._cells): + raise ValueError("Some cells return tuples of states, but the flag " + "state_is_tuple is not set. State sizes are: %s" + % str([c.state_size for c in self._cells])) + + @property + def state_size(self): + if self._state_is_tuple: + return tuple(cell.state_size for cell in self._cells) + else: + return sum([cell.state_size for cell in self._cells]) + + @property + def output_size(self): + return self._cells[-1].output_size + + def zero_state(self, batch_size, dtype): + with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): + if self._state_is_tuple: + return tuple(cell.zero_state(batch_size, dtype) for cell in self._cells) + else: + # We know here that state_size of each cell is not a tuple and + # presumably does not contain TensorArrays or anything else fancy + return super(MultiRNNCell, self).zero_state(batch_size, dtype) + + def __call__(self, inputs, state, scope=None): + """Run this multi-layer cell on inputs, starting from state.""" + with vs.variable_scope(scope or "multi_rnn_cell"): + cur_state_pos = 0 + cur_inp = inputs + new_states = [] + for i, cell in enumerate(self._cells): + with vs.variable_scope("cell_%d" % i): + if self._state_is_tuple: + if not nest.is_sequence(state): + raise ValueError( + "Expected state to be a tuple of length %d, but received: %s" + % (len(self.state_size), state)) + cur_state = state[i] + else: + cur_state = array_ops.slice( + state, [0, cur_state_pos], [-1, cell.state_size]) + cur_state_pos += cell.state_size + cur_inp, new_state = cell(cur_inp, cur_state) + new_states.append(new_state) + new_states = (tuple(new_states) if self._state_is_tuple else + array_ops.concat(new_states, 1)) + return cur_inp, new_states + + +class _SlimRNNCell(RNNCell): + """A simple wrapper for slim.rnn_cells.""" + + def __init__(self, cell_fn): + """Create a SlimRNNCell from a cell_fn. + + Args: + cell_fn: a function which takes (inputs, state, scope) and produces the + outputs and the new_state. Additionally when called with inputs=None and + state=None it should return (initial_outputs, initial_state). 
+ + Raises: + TypeError: if cell_fn is not callable + ValueError: if cell_fn cannot produce a valid initial state. + """ + if not callable(cell_fn): + raise TypeError("cell_fn %s needs to be callable", cell_fn) + self._cell_fn = cell_fn + self._cell_name = cell_fn.func.__name__ + init_output, init_state = self._cell_fn(None, None) + output_shape = init_output.get_shape() + state_shape = init_state.get_shape() + self._output_size = output_shape.with_rank(2)[1].value + self._state_size = state_shape.with_rank(2)[1].value + if self._output_size is None: + raise ValueError("Initial output created by %s has invalid shape %s" % + (self._cell_name, output_shape)) + if self._state_size is None: + raise ValueError("Initial state created by %s has invalid shape %s" % + (self._cell_name, state_shape)) + + @property + def state_size(self): + return self._state_size + + @property + def output_size(self): + return self._output_size + + def __call__(self, inputs, state, scope=None): + scope = scope or self._cell_name + output, state = self._cell_fn(inputs, state, scope=scope) + return output, state + + +def _linear(args, output_size, bias, bias_start=0.0, scope=None): + """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. + + Args: + args: a 2D Tensor or a list of 2D, batch x n, Tensors. + output_size: int, second dimension of W[i]. + bias: boolean, whether to add a bias term or not. + bias_start: starting value to initialize the bias; 0 by default. + + Returns: + A 2D Tensor with shape [batch x output_size] equal to + sum_i(args[i] * W[i]), where W[i]s are newly created matrices. + + Raises: + ValueError: if some of the arguments has unspecified or wrong shape. + """ + if args is None or (nest.is_sequence(args) and not args): + raise ValueError("`args` must be specified") + if not nest.is_sequence(args): + args = [args] + + # Calculate the total size of arguments on dimension 1. + total_arg_size = 0 + shapes = [a.get_shape() for a in args] + for shape in shapes: + if shape.ndims != 2: + raise ValueError("linear is expecting 2D arguments: %s" % shapes) + if shape[1].value is None: + raise ValueError("linear expects shape[1] to be provided for shape %s, " + "but saw %s" % (shape, shape[1])) + else: + total_arg_size += shape[1].value + + dtype = [a.dtype for a in args][0] + + # Now the computation. 
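+  # A single weight matrix of shape [total_arg_size, output_size] is multiplied
+  # into `args` (their axis-1 concatenation when more than one tensor is given).
+  # When `bias` is True, the bias vector is initialized to the constant
+  # `bias_start`, which is how LSTMCell's `bias_start` argument lets a saved
+  # LSTM bias be restored.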
+ scope = vs.get_variable_scope() + with vs.variable_scope(scope) as outer_scope: + weights = vs.get_variable( + _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size], dtype=dtype) + if len(args) == 1: + res = math_ops.matmul(args[0], weights) + else: + res = math_ops.matmul(array_ops.concat(args, 1), weights) + if not bias: + return res + with vs.variable_scope(outer_scope) as inner_scope: + inner_scope.set_partitioner(None) + biases = vs.get_variable( + _BIAS_VARIABLE_NAME, [output_size], + dtype=dtype, + initializer=init_ops.constant_initializer(bias_start, dtype=dtype)) + return nn_ops.bias_add(res, biases) diff --git a/model.py b/model.py new file mode 100644 index 0000000..5c5c069 --- /dev/null +++ b/model.py @@ -0,0 +1,289 @@ +import tensorflow as tf +import numpy as np +# from data_utils import get_batch +import data_utils +import pdb +import json +import sys +from mod_core_rnn_cell_impl import LSTMCell # modified to allow initializing bias in lstm + +# from tensorflow.contrib.rnn import LSTMCell +tf.logging.set_verbosity(tf.logging.ERROR) +import mmd + +from differential_privacy.dp_sgd.dp_optimizer import dp_optimizer +from differential_privacy.dp_sgd.dp_optimizer import sanitizer +from differential_privacy.privacy_accountant.tf import accountant + +# ------------------------------- # +""" +Most of the models are copied from https://github.com/ratschlab/RGAN +""" + +# --- to do with latent space --- # + +def sample_Z(batch_size, seq_length, latent_dim, use_time=False, use_noisy_time=False): + sample = np.float32(np.random.normal(size=[batch_size, seq_length, latent_dim])) + if use_time: + print('WARNING: use_time has different semantics') + sample[:, :, 0] = np.linspace(0, 1.0 / seq_length, num=seq_length) + return sample + +# --- samples for testing ---# + +def sample_T(batch_size, batch_idx): + samples_aaa = np.load('./data/samples_aa.npy') + num_samples_t = samples_aaa.shape[0] + labels_aaa = np.load('./data/labels_aa.npy') + idx_aaa = np.load('./data/idx_aa.npy') + start_pos = batch_idx * batch_size + end_pos = start_pos + batch_size + T_mb = samples_aaa[start_pos:end_pos, :, :] + L_mb = labels_aaa[start_pos:end_pos, :, :] + I_mb = idx_aaa[start_pos:end_pos, :, :] + return T_mb, L_mb, I_mb, num_samples_t + +def sample_TT(batch_size): + samples_aaa = np.load('./data/samples_aa.npy') + labels_aaa = np.load('./data/labels_aa.npy') + idx_aaa = np.load('./data/idx_aa.npy') + T_indices = np.random.choice(len(samples_aaa), size=batch_size, replace=False) + T_mb = samples_aaa[T_indices, :, :] + L_mb = labels_aaa[T_indices, :, :] + I_mb = idx_aaa[T_indices, :, :] + return T_mb, L_mb, I_mb + +# --- to do with training --- # +def train_epoch(epoch, samples, labels, sess, Z, X, D_loss, G_loss, D_solver, G_solver, + batch_size, use_time, D_rounds, G_rounds, seq_length, + latent_dim, num_signals): + """ + Train generator and discriminator for one epoch. 
+ """ + # for batch_idx in range(0, int(len(samples) / batch_size) - (D_rounds + (cond_dim > 0) * G_rounds), D_rounds + (cond_dim > 0) * G_rounds): + for batch_idx in range(0, int(len(samples) / batch_size) - (D_rounds + G_rounds), D_rounds + G_rounds): + # update the discriminator + X_mb, Y_mb = data_utils.get_batch(samples, batch_size, batch_idx, labels) + Z_mb = sample_Z(batch_size, seq_length, latent_dim, use_time) + for d in range(D_rounds): + # run the discriminator solver + _ = sess.run(D_solver, feed_dict={X: X_mb, Z: Z_mb}) + + # update the generator + for g in range(G_rounds): + # run the generator solver + _ = sess.run(G_solver, feed_dict={Z: sample_Z(batch_size, seq_length, latent_dim, use_time=use_time)}) + + # at the end, get the loss + D_loss_curr, G_loss_curr = sess.run([D_loss, G_loss], feed_dict={X: X_mb, + Z: sample_Z(batch_size, seq_length, latent_dim, + use_time=use_time)}) + D_loss_curr = np.mean(D_loss_curr) + G_loss_curr = np.mean(G_loss_curr) + + + return D_loss_curr, G_loss_curr + + +def GAN_loss(Z, X, generator_settings, discriminator_settings): + + # normal GAN + G_sample = generator(Z, **generator_settings) + + D_real, D_logit_real = discriminator(X, **discriminator_settings) + + D_fake, D_logit_fake = discriminator(G_sample, reuse=True, **discriminator_settings) + + # Measures the probability error in discrete classification tasks in which each class is independent + # and not mutually exclusive. + # logits: predicted labels?? + + D_loss_real = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=D_logit_real, labels=tf.ones_like(D_logit_real)), 1) + D_loss_fake = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=D_logit_fake, labels=tf.zeros_like(D_logit_fake)), 1) + + D_loss = D_loss_real + D_loss_fake + + + # G_loss = tf.reduce_mean(tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=D_logit_fake, labels=tf.ones_like(D_logit_fake)), axis=1)) + G_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=D_logit_fake, labels=tf.ones_like(D_logit_fake)), 1) + + + return D_loss, G_loss + + +def GAN_solvers(D_loss, G_loss, learning_rate, batch_size, total_examples, l2norm_bound, batches_per_lot, sigma, dp=False): + """ + Optimizers + """ + discriminator_vars = [v for v in tf.trainable_variables() if v.name.startswith('discriminator')] + generator_vars = [v for v in tf.trainable_variables() if v.name.startswith('generator')] + if dp: + print('Using differentially private SGD to train discriminator!') + eps = tf.placeholder(tf.float32) + delta = tf.placeholder(tf.float32) + priv_accountant = accountant.GaussianMomentsAccountant(total_examples) + clip = True + l2norm_bound = l2norm_bound / batch_size + batches_per_lot = 1 + gaussian_sanitizer = sanitizer.AmortizedGaussianSanitizer( + priv_accountant, + [l2norm_bound, clip]) + + # the trick is that we need to calculate the gradient with respect to + # each example in the batch, during the DP SGD step + D_solver = dp_optimizer.DPGradientDescentOptimizer(learning_rate, + [eps, delta], + sanitizer=gaussian_sanitizer, + sigma=sigma, + batches_per_lot=batches_per_lot).minimize(D_loss, var_list=discriminator_vars) + else: + D_loss_mean_over_batch = tf.reduce_mean(D_loss) + D_solver = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(D_loss_mean_over_batch, var_list=discriminator_vars) + priv_accountant = None + G_loss_mean_over_batch = tf.reduce_mean(G_loss) + G_solver = tf.train.AdamOptimizer().minimize(G_loss_mean_over_batch, var_list=generator_vars) + 
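+    # Note: the discriminator is trained with (optionally differentially private)
+    # gradient descent, while the generator always uses Adam with default settings.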
return D_solver, G_solver, priv_accountant + + +# --- to do with the model --- # + +def create_placeholders(batch_size, seq_length, latent_dim, num_signals): + + Z = tf.placeholder(tf.float32, [batch_size, seq_length, latent_dim]) + X = tf.placeholder(tf.float32, [batch_size, seq_length, num_signals]) + T = tf.placeholder(tf.float32, [batch_size, seq_length, num_signals]) + return Z, X, T + +def generator(z, hidden_units_g, seq_length, batch_size, num_signals, reuse=False, parameters=None, learn_scale=True): + + """ + If parameters are supplied, initialise as such + """ + with tf.variable_scope("generator") as scope: + if reuse: + scope.reuse_variables() + if parameters is None: + W_out_G_initializer = tf.truncated_normal_initializer() + b_out_G_initializer = tf.truncated_normal_initializer() + scale_out_G_initializer = tf.constant_initializer(value=1.0) + lstm_initializer = None + bias_start = 1.0 + else: + W_out_G_initializer = tf.constant_initializer(value=parameters['generator/W_out_G:0']) + b_out_G_initializer = tf.constant_initializer(value=parameters['generator/b_out_G:0']) + try: + scale_out_G_initializer = tf.constant_initializer(value=parameters['generator/scale_out_G:0']) + except KeyError: + scale_out_G_initializer = tf.constant_initializer(value=1) + assert learn_scale + lstm_initializer = tf.constant_initializer(value=parameters['generator/rnn/lstm_cell/weights:0']) + bias_start = parameters['generator/rnn/lstm_cell/biases:0'] + + W_out_G = tf.get_variable(name='W_out_G', shape=[hidden_units_g, num_signals], + initializer=W_out_G_initializer) + b_out_G = tf.get_variable(name='b_out_G', shape=num_signals, initializer=b_out_G_initializer) + scale_out_G = tf.get_variable(name='scale_out_G', shape=1, initializer=scale_out_G_initializer, + trainable=learn_scale) + # inputs + inputs = z + + cell = LSTMCell(num_units=hidden_units_g, + state_is_tuple=True, + initializer=lstm_initializer, + bias_start=bias_start, + reuse=reuse) + rnn_outputs, rnn_states = tf.nn.dynamic_rnn( + cell=cell, + dtype=tf.float32, + sequence_length=[seq_length] * batch_size, + inputs=inputs) + rnn_outputs_2d = tf.reshape(rnn_outputs, [-1, hidden_units_g]) + logits_2d = tf.matmul(rnn_outputs_2d, W_out_G) + b_out_G #out put weighted sum + # output_2d = tf.multiply(tf.nn.tanh(logits_2d), scale_out_G) + output_2d = tf.nn.tanh(logits_2d) # logits operation [-1, 1] + output_3d = tf.reshape(output_2d, [-1, seq_length, num_signals]) + + return output_3d + + +def discriminator(x, hidden_units_d, seq_length, batch_size, reuse=False, parameters=None, batch_mean=False): + with tf.variable_scope("discriminator") as scope: + if reuse: + scope.reuse_variables() + if parameters is None: + W_out_D = tf.get_variable(name='W_out_D', shape=[hidden_units_d, 1], + initializer=tf.truncated_normal_initializer()) + b_out_D = tf.get_variable(name='b_out_D', shape=1, + initializer=tf.truncated_normal_initializer()) + + else: + W_out_D = tf.constant_initializer(value=parameters['discriminator/W_out_D:0']) + b_out_D = tf.constant_initializer(value=parameters['discriminator/b_out_D:0']) + + # inputs + inputs = x + + # add the average of the inputs to the inputs (mode collapse? 
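+        # i.e. tile the per-feature batch mean and concatenate it along the signal
+        # axis, so the discriminator sees minibatch statistics (a common heuristic
+        # against mode collapse).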
+ if batch_mean: + mean_over_batch = tf.stack([tf.reduce_mean(x, axis=0)] * batch_size, axis=0) + inputs = tf.concat([x, mean_over_batch], axis=2) + + cell = tf.contrib.rnn.LSTMCell(num_units=hidden_units_d, + state_is_tuple=True, + reuse=reuse) + rnn_outputs, rnn_states = tf.nn.dynamic_rnn( + cell=cell, + dtype=tf.float32, + inputs=inputs) + # logit_final = tf.matmul(rnn_outputs[:, -1], W_final_D) + b_final_D + logits = tf.einsum('ijk,km', rnn_outputs, W_out_D) + b_out_D # output weighted sum + # real logits or actual output layer? + # logit is a function that maps probabilities ([0,1]) to ([-inf,inf]) ? + + output = tf.nn.sigmoid(logits) # y = 1 / (1 + exp(-x)). output activation [0, 1]. Probability?? + # sigmoid output ([0,1]), Probability? + + return output, logits + + +# --- display ----# +def display_batch_progression(j, id_max): + ''' + See epoch progression + ''' + batch_progression = int((j / id_max) * 100) + sys.stdout.write(str(batch_progression) + ' % epoch' + chr(13)) + _ = sys.stdout.flush + + +# --- to do with saving/loading --- # + +def dump_parameters(identifier, sess): + """ + Save model parmaters to a numpy file + """ + # dump_path = './experiments/parameters/' + identifier + '.npy' + dump_path = './experiments/parameters/' + identifier + '.npy' + model_parameters = dict() + for v in tf.trainable_variables(): + model_parameters[v.name] = sess.run(v) + np.save(dump_path, model_parameters) + print('Recorded', len(model_parameters), 'parameters to', dump_path) + return True + + +def load_parameters(identifier): + """ + Load parameters from a numpy file + """ + # load_path = './experiments/plots/parameters/' + identifier + '.npy' + # load_path = './experiments/plots/parameters/parameters_60/' + identifier + '.npy' + # load_path = './experiments/parameters/' + identifier + '.npy' + # load_path = './experiments/parameters/' + identifier + '.npy' + model_parameters = np.load(identifier).item() + return model_parameters + + + + + diff --git a/plotting.py b/plotting.py new file mode 100644 index 0000000..1ad0cdc --- /dev/null +++ b/plotting.py @@ -0,0 +1,606 @@ +import numpy as np +import matplotlib as mpl +mpl.use('Agg') +import matplotlib.pyplot as plt +import pdb +from time import time +from matplotlib.colors import hsv_to_rgb +from pandas import read_table, read_hdf + +def plot_label(label, id): + + fig, ax = plt.subplots(1, 1) + ax.plot(label) + fig.savefig("./Figs/Label_" + str(id).zfill(2) + ".png") + plt.clf() + plt.close(fig) + + return + +def visualise_at_epoch(vis_sample, data, predict_labels, epoch, + identifier, num_epochs, resample_rate_in_min, multivariate_mnist, + seq_length, labels): + # TODO: what's with all these arguments + if data == 'mnist': + if predict_labels: + n_labels = 1 + if one_hot: + n_labels = 6 + lab_votes = np.argmax(vis_sample[:, :, -n_labels:], axis=2) + else: + lab_votes = vis_sample[:, :, -n_labels:] + labs, _ = mode(lab_votes, axis=1) + samps = vis_sample[:, :, :-n_labels] + else: + labs = labels + samps = vis_sample + if multivariate_mnist: + save_mnist_plot_sample(samps.reshape(-1, seq_length**2, 1), epoch, identifier, n_samples=6, labels=labs) + else: + save_mnist_plot_sample(samps, epoch, identifier, n_samples=6, labels=labs) + else: + save_plot_sample(vis_sample, epoch, identifier, n_samples=6, + num_epochs=num_epochs) + + return True + + +def save_plot_sample(samples, idx, identifier, n_samples=16, num_epochs=None, ncol=4): + assert n_samples <= samples.shape[0] + assert n_samples % ncol == 0 + sample_length = samples.shape[1] + + if 
not num_epochs is None: + col = hsv_to_rgb((1, 1.0*(idx)/num_epochs, 0.8)) #convert hsv values in a numpy array to rgb values all values assumed to be in range [0, 1]. + else: + col = 'grey' + + x_points = np.arange(sample_length) #Return evenly spaced values within a given interval. + + nrow = int(n_samples/ncol) + fig, axarr = plt.subplots(nrow, ncol, sharex=True, figsize=(6, 6)) + for m in range(nrow): + for n in range(ncol): + # first column + sample = samples[n*nrow + m, :, :] + axarr[m, n].plot(x_points, sample, color=col) + # axarr[m, n].set_ylim(-1, 1) + for n in range(ncol): + axarr[-1, n].xaxis.set_ticks(range(0, sample_length, int(sample_length/4))) + fig.suptitle(idx) + fig.subplots_adjust(hspace = 0.15) + fig.savefig("./experiments/plots/gs/" + identifier + "_epoch" + str(idx).zfill(4) + ".png") + plt.clf() + plt.close() + return + +def save_plot_interpolate(input_samples, samples, idx, identifier, num_epochs=None, distances=None, sigma=1): + """ very boilerplate, unsure how to make nicer """ + n_samples = samples.shape[0] + sample_length = samples.shape[1] + + if not num_epochs is None: + col = hsv_to_rgb((1, 1.0*(idx)/num_epochs, 0.8)) + else: + col = 'grey' + + x_points = np.arange(sample_length) + if distances is None: + nrow = n_samples + else: + nrow = n_samples + 1 + ncol = 1 + fig, axarr = plt.subplots(nrow, ncol, figsize=(3, 9)) + if distances is None: + startat = 0 + else: + startat = 1 + axarr[0].plot(distances.dA, color='green', label='distance from A', linestyle='--', marker='o', markersize=4) + axarr[0].plot(distances.dB, color='orange', label='distance from B', linestyle='dotted', marker='o', markersize=4) + axarr[0].get_xaxis().set_visible(False) + axarr[0].set_title('distance from endpoints') + for m in range(startat, nrow): + sample = samples[m-startat, :, 0] + axarr[m].plot(x_points, sample, color=col) + for m in range(startat, nrow): + axarr[m].set_ylim(-1.1, 1.1) + axarr[m].set_xlim(0, sample_length) + axarr[m].spines["top"].set_visible(False) + axarr[m].spines["bottom"].set_visible(False) + axarr[m].spines["right"].set_visible(False) + axarr[m].spines["left"].set_visible(False) + axarr[m].tick_params(bottom='off', left='off') + axarr[m].get_xaxis().set_visible(False) + axarr[m].get_yaxis().set_visible(False) + axarr[m].set_facecolor((0.96, 0.96, 0.96)) + if not input_samples is None: + # now do the real samples + axarr[startat].plot(x_points, input_samples[0], color='green', linestyle='--') + axarr[-1].plot(x_points, input_samples[1], color='green', linestyle='--') + + axarr[-1].xaxis.set_ticks(range(0, sample_length, int(sample_length/4))) + fig.suptitle(idx) + fig.subplots_adjust(hspace = 0.2) + fig.savefig("./experiments/plots/" + identifier + "_interpolate.png") + fig.savefig("./experiments/plots/" + identifier + "_interpolate.pdf") + plt.clf() + plt.close() + return + +def reconstruction_errors(identifier, train_errors, vali_errors, + generated_errors, random_errors): + """ + Plot two histogram of the reconstruction errors. 
+ """ + print(identifier) + fig, axarr = plt.subplots(4, 1, sharex=True, figsize=(4, 8)) + axarr[0].hist(train_errors, normed=1, color='green', bins=50) + axarr[0].set_title("train reconstruction errors") + axarr[1].hist(vali_errors, normed=1, color='blue', bins=50) + axarr[1].set_title('vali reconstruction errors') + axarr[2].hist(generated_errors, normed=1, color='pink', bins=50) + axarr[2].set_title('generated reconstruction errors') + axarr[3].hist(random_errors, normed=1, color='grey', bins=50) + axarr[3].set_title('random reconstruction errors') + for ax in axarr: + ax.spines["top"].set_visible(False) + ax.spines["bottom"].set_visible(False) + ax.spines["right"].set_visible(False) + ax.spines["left"].set_visible(False) + ax.tick_params(bottom='off', left='off') + ax.get_xaxis().set_visible(False) + ax.get_yaxis().set_visible(False) + axarr[3].set_xlim(0, 0.05) + plt.tight_layout() + plt.savefig('./experiments/plots/' + identifier + '_reconstruction_errors.png') + return True + +def save_plot_reconstruct(real_samples, model_samples, identifier): + assert real_samples.shape == model_samples.shape + sample_length = real_samples.shape[1] + x_points = np.arange(sample_length) + nrow = real_samples.shape[0] + ncol = 2 + fig, axarr = plt.subplots(nrow, ncol, sharex=True, figsize=(6, 6)) + for m in range(nrow): + real_sample = real_samples[m, :, 0] + model_sample = model_samples[m, :, 0] + axarr[m, 0].plot(x_points, real_sample, color='green') + axarr[m, 1].plot(x_points, model_sample, color='red') + axarr[-1, 0].xaxis.set_ticks(range(0, sample_length, int(sample_length/4))) + axarr[-1, 1].xaxis.set_ticks(range(0, sample_length, int(sample_length/4))) + axarr[0, 0].set_title('real') + axarr[0, 1].set_title('reconstructed') + fig.subplots_adjust(hspace = 0.15) + fig.savefig("./experiments/plots/" + identifier + "_reconstruct.png") + plt.clf() + plt.close() + return + +def save_plot_vary_dimension(samples_list, idx, identifier, n_dim): + """ + """ + assert len(samples_list) == n_dim + sample_length = samples_list[0].shape[1] + + x_points = np.arange(sample_length) + + nrow = samples_list[0].shape[0] + sidelength = n_dim*1.5 + fig, axarr = plt.subplots(nrow, n_dim, sharex=True, sharey=True, figsize=(sidelength, sidelength)) + for dim in range(n_dim): + sample_dim = samples_list[dim] + axarr[0, dim].set_title(dim) + h = dim*1.0/n_dim # hue + for n in range(nrow): + sample = sample_dim[n, :, 0] + axarr[n, dim].plot(x_points, sample, color='black') + axarr[n, dim].spines["top"].set_visible(False) + axarr[n, dim].spines["bottom"].set_visible(False) + axarr[n, dim].spines["right"].set_visible(False) + axarr[n, dim].spines["left"].set_visible(False) + axarr[n, dim].tick_params(bottom='off', left='off') + axarr[n, dim].get_xaxis().set_visible(False) + axarr[n, dim].set_facecolor(hsv_to_rgb((h, 0 + 0.25*n/nrow, 0.96))) + axarr[-1, dim].xaxis.set_ticks(range(0, sample_length, int(sample_length/4))) + fig.suptitle(idx) + fig.subplots_adjust(hspace = 0.11, wspace=0.11) + fig.savefig("./experiments/plots/" + identifier + "_epoch" + str(idx).zfill(4) + ".png") + plt.clf() + plt.close() + return True + +def interpolate(sampleA, sampleB=None, n_steps=6): + """ + Plot the linear interpolation between two latent space points. 
+ """ + weights = np.linspace(0, 1, n_steps) + if sampleB is None: + # do it "close by" + sampleB = sampleA + np.random.normal(size=sampleA.shape, scale=0.05) + samples = np.array([w*sampleB + (1-w)*sampleA for w in weights]) + return samples + +def vary_latent_dimension(sample, dimension, n_steps=6): + """ + """ + assert dimension <= sample.shape[1] + scale = np.mean(np.abs(sample[:, dimension])) + deviations = np.linspace(0, 2*scale, n_steps) + samples = np.array([sample[:, :]]*n_steps) + for n in range(n_steps): + samples[n, :, dimension] += deviations[n] + return samples + +def plot_sine_evaluation(real_samples, fake_samples, idx, identifier): + """ + Create histogram of fake (generated) samples frequency, amplitude distribution. + Also for real samples. + """ + ### frequency + seq_length = len(real_samples[0]) # assumes samples are all the same length + frate = seq_length + freqs_hz = np.fft.rfftfreq(seq_length)*frate # this is for labelling the plot + # TODO, just taking axis 0 for now... + w_real = np.mean(np.abs(np.fft.rfft(real_samples[:, :, 0])), axis=0) + w_fake = np.mean(np.abs(np.fft.rfft(fake_samples[:, :, 0])), axis=0) + ### amplitude + A_real = np.max(np.abs(real_samples[:, :, 0]), axis=1) + A_fake = np.max(np.abs(fake_samples[:, :, 0]), axis=1) + ### now plot + nrow = 2 + ncol = 2 + fig, axarr = plt.subplots(nrow, ncol, sharex='col', figsize=(6, 6)) + # freq + axarr[0, 0].vlines(freqs_hz, ymin=np.minimum(np.zeros_like(w_real), w_real), ymax=np.maximum(np.zeros_like(w_real), w_real), color='#30ba50') + axarr[0, 0].set_title("frequency", fontsize=16) + axarr[0, 0].set_ylabel("real", fontsize=16) + axarr[1, 0].vlines(freqs_hz, ymin=np.minimum(np.zeros_like(w_fake), w_fake), ymax=np.maximum(np.zeros_like(w_fake), w_fake), color='#ba4730') + axarr[1, 0].set_ylabel("generated", fontsize=16) + # amplitude + axarr[0, 1].hist(A_real, normed=True, color='#30ba50', bins=30) + axarr[0, 1].set_title("amplitude", fontsize=16) + axarr[1, 1].hist(A_fake, normed=True, color='#ba4730', bins=30) + + fig.savefig('./experiments/plots/' + identifier + '_eval' + str(idx).zfill(4) +'.png') + plt.clf() + plt.close() + return True + +def plot_trace(identifier, xmax=250, final=False, dp=False): + """ + """ + + trace_path = './experiments/traces/' + identifier + '.trace.txt' + da = read_table(trace_path, sep=' ') + nrow = 3 + if dp: + trace_dp_path = './experiments/traces/' + identifier + '.dptrace.txt' + da_dp = read_table(trace_dp_path, sep=' ') + nrow += 1 + + ncol=1 + fig, axarr = plt.subplots(nrow, ncol, sharex='col', figsize=(6, 6)) + + # D_loss + d_handle, = axarr[0].plot(da.epoch, da.D_loss, color='red', label='discriminator') + axarr[0].set_ylabel('D loss') +# axarr[0].set_ylim(0.9, 1.6) + if final: + #D_ticks = [1.0, 1.2, 1.5] + D_ticks = [0.5, 1.0, 1.5] + axarr[0].get_yaxis().set_ticks(D_ticks) + for tick in D_ticks: + axarr[0].plot((-10, xmax+10), (tick, tick), ls='dotted', lw=0.5, color='black', alpha=0.4, zorder=0) + # G loss + ax_G = axarr[0].twinx() + g_handle, = ax_G.plot(da.epoch, da.G_loss, color='green', ls='dashed', label='generator') + ax_G.set_ylabel('G loss') + if final: + G_ticks = [2.5, 5] + ax_G.get_yaxis().set_ticks(G_ticks) +# for tick in G_ticks: +# axarr[0].plot((-10, xmax+10), (tick, tick), ls='dotted', lw=0.5, color='green', alpha=1.0, zorder=0) + + ax_G.spines["top"].set_visible(False) + ax_G.spines["bottom"].set_visible(False) + ax_G.spines["right"].set_visible(False) + ax_G.spines["left"].set_visible(False) + ax_G.tick_params(bottom='off', right='off') + 
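+    # single legend on the top panel, combining the line handles from both y-axes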
axarr[0].legend(handles=[d_handle, g_handle], labels=['discriminator', 'generator']) + + # mmd + da_mmd = da.loc[:, ['epoch', 'mmd2']].dropna() + axarr[1].plot(da_mmd.epoch, da_mmd.mmd2, color='purple') + axarr[1].set_ylabel('MMD$^2$') + #axarr[1].set_ylim(0.0, 0.04) + + #ax_that = axarr[1].twinx() + #ax_that.plot(da.that) + #ax_that.set_ylabel('$\hat{t}$') + #ax_that.set_ylim(0, 50) + if final: + mmd_ticks = [0.01, 0.02, 0.03] + axarr[1].get_yaxis().set_ticks(mmd_ticks) + for tick in mmd_ticks: + axarr[1].plot((-10, xmax+10), (tick, tick), ls='dotted', lw=0.5, color='black', alpha=0.4, zorder=0) + + # log likelihood + da_ll = da.loc[:, ['epoch', 'll', 'real_ll']].dropna() + axarr[2].plot(da_ll.epoch, da_ll.ll, color='orange') + axarr[2].plot(da_ll.epoch, da_ll.real_ll, color='orange', alpha=0.5) + axarr[2].set_ylabel('likelihood') + axarr[2].set_xlabel('epoch') + axarr[2].set_ylim(-750, 100) + #axarr[2].set_ylim(-10000000, 500) + if final: +# ll_ticks = [-1.0*1e7, -0.5*1e7, 0] + ll_ticks = [-500 ,-250, 0] + axarr[2].get_yaxis().set_ticks(ll_ticks) + for tick in ll_ticks: + axarr[2].plot((-10, xmax+10), (tick, tick), ls='dotted', lw=0.5, color='black', alpha=0.4, zorder=0) + + if dp: + assert da_dp.columns[0] == 'epoch' + epochs = da_dp['epoch'] + eps_values = da_dp.columns[1:] + for eps_string in eps_values: + if 'eps' in eps_string: + eps = eps_string[3:] + else: + eps = eps_string + deltas = da_dp[eps_string] + axarr[3].plot(epochs, deltas, label=eps) + axarr[3].set_ylabel('delta') + axarr[3].set_xlabel('epoch') + axarr[3].legend() + + # beautify + for ax in axarr: + #ax.spines["top"].set_visible(True) + ax.spines["top"].set_color((0, 0, 0, 0.3)) + #ax.spines["bottom"].set_visible(False) + ax.spines["bottom"].set_color((0, 0, 0, 0.3)) + #ax.spines["right"].set_visible(False) + ax.spines["right"].set_color((0, 0, 0, 0.3)) + #ax.spines["left"].set_visible(False) + ax.spines["left"].set_color((0, 0, 0, 0.3)) + ax.tick_params(bottom='off', left='off') + # make background grey + # ax.set_facecolor((0.96, 0.96, 0.96)) + ymin, ymax = ax.get_ylim() + for x in np.arange(0, xmax+10, 10): + ax.plot((x, x), (ymin, ymax), ls='dotted', lw=0.5, color='black', alpha=0.40, zorder=0) + ax.set_xlim(-5, xmax) + ax.get_yaxis().set_label_coords(-0.11,0.5) + + # bottom one + + fig.savefig('./experiments/traces/' + identifier + '_trace.png') + fig.savefig('./experiments/traces/' + identifier + '_trace.pdf') + plt.clf() + plt.close() + return True + + +def save_samples(vis_sample, identifier, epoch): + + np.save('./experiments/plots/gs/' + identifier + '_gs_%s.npy' % epoch, vis_sample) + + return True + +def save_samples_real(vis_real, identifier): + + np.save('./experiments/plots/gs/' + identifier + '_gs_real.npy', vis_real) + + return True + +def save_mnist_plot_sample(samples, idx, identifier, n_samples, labels=None): + """ + Generates a grid showing mnist digits. 
+ + """ + assert n_samples <= samples.shape[0] + if not labels is None: + assert n_samples <= len(labels) + if len(labels.shape) > 1 and not labels.shape[1] == 1: + # one-hot + label_titles = np.argmax(labels, axis=1) + else: + label_titles = labels + else: + label_titles = ['NA']*n_samples + assert n_samples % 2 == 0 + img_size = int(np.sqrt(samples.shape[1])) + + nrow = int(n_samples/2) + ncol = 2 + fig, axarr = plt.subplots(nrow, ncol, sharex=True, figsize=(8, 8)) + for m in range(nrow): + # first column + sample = samples[m, :, 0] + axarr[m, 0].imshow(sample.reshape([img_size,img_size]), cmap='gray') + axarr[m, 0].set_title(str(label_titles[m])) + # second column + sample = samples[nrow + m, :, 0] + axarr[m, 1].imshow(sample.reshape([img_size,img_size]), cmap='gray') + axarr[m, 1].set_title(str(label_titles[m + nrow])) + fig.suptitle(idx) + fig.suptitle(idx) + fig.subplots_adjust(hspace = 0.15) + fig.savefig("./experiments/plots/" + identifier + "_epoch" + str(idx).zfill(4) + ".png") + plt.clf() + plt.close() + return + +def visualise_latent(Z, identifier): + """ + visualise a SINGLE point in the latent space + """ + seq_length = Z.shape[0] + latent_dim = Z.shape[1] + if latent_dim > 2: + print('WARNING: Only visualising first two dimensions of latent space.') + h = np.random.random() + colours = np.array([hsv_to_rgb((h, i/seq_length, 0.96)) for i in range(seq_length)]) +# plt.plot(Z[:, 0], Z[:, 1], c='grey', alpha=0.5) + for i in range(seq_length): + plt.scatter(Z[i, 0], Z[i, 1], marker='o', c=colours[i]) + plt.savefig('./experiments/plots/' + identifier + '_Z.png') + plt.clf() + plt.close() + return True + + +# --- to do with the model --- # +def plot_parameters(parameters, identifier): + """ + visualise the parameters of a GAN + """ + generator_out = parameters['generator/W_out_G:0'] + generator_weights = parameters['generator/rnn/lstm_cell/weights:0'] # split this into four + generator_matrices = np.split(generator_weights, 4, 1) + fig, axarr = plt.subplots(5, 1, sharex=True, + gridspec_kw = {'height_ratios':[0.2, 1, 1, 1, 1]}, figsize=(3,13)) + + axarr[0].matshow(generator_out.T, extent=[0,100,0,100]) + axarr[0].set_title('W_out_G') + axarr[1].matshow(generator_matrices[0]) + axarr[1].set_title('LSTM weights (1)') + axarr[2].matshow(generator_matrices[1]) + axarr[2].set_title('LSTM weights (2)') + axarr[3].matshow(generator_matrices[2]) + axarr[3].set_title('LSTM weights (3)') + axarr[4].matshow(generator_matrices[3]) + axarr[4].set_title('LSTM weights (4)') + for a in axarr: + a.set_xlim(0, 100) + a.set_ylim(0, 100) + a.spines["top"].set_visible(False) + a.spines["bottom"].set_visible(False) + a.spines["right"].set_visible(False) + a.spines["left"].set_visible(False) + a.get_xaxis().set_visible(False) + a.get_yaxis().set_visible(False) +# a.tick_params(bottom='off', left='off', top='off') + plt.tight_layout() + plt.savefig('./experiments/plots/' + identifier + '_weights.png') + return True + +### TSTR ### +def view_mnist_eval(identifier, train_X, train_Y, synth_X, synth_Y, test_X, test_Y, synth_predY, real_predY): + """ + Basically just + http://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html + """ + # resize everything + side_length = int(np.sqrt(train_X.shape[1])) + train_X = train_X.reshape(-1, side_length, side_length) + synth_X = synth_X.reshape(-1, side_length, side_length) + test_X = test_X.reshape(-1, side_length, side_length) + # remember, they're wrecked in the outer function thanks to python + synth_images_and_labels = 
list(zip(synth_X, synth_Y)) + for index, (image, label) in enumerate(synth_images_and_labels[:4]): + plt.subplot(4, 4, index + 1) + plt.axis('off') + plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') + if index == 0: + plt.title('synth train: %i' % label) + else: + plt.title('%i' % label) + train_images_and_labels = list(zip(train_X, train_Y)) + for index, (image, label) in enumerate(train_images_and_labels[:4]): + plt.subplot(4, 4, index + 5) + plt.axis('off') + plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') + if index == 0: + plt.title('real train: %i' % label) + else: + plt.title('%i' % label) + images_and_synthpreds = list(zip(test_X, synth_predY)) + for index, (image, prediction) in enumerate(images_and_synthpreds[:4]): + plt.subplot(4, 4, index + 9) + plt.axis('off') + plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') + if index == 0: + plt.title('synth pred: %i' % prediction) + else: + plt.title('%i' % prediction) + images_and_realpreds = list(zip(test_X, real_predY)) + for index, (image, prediction) in enumerate(images_and_realpreds[:4]): + plt.subplot(4, 4, index + 13) + plt.axis('off') + plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') + if index == 0: + plt.title('real pred: %i' % prediction) + else: + plt.title('%i' % prediction) + plt.tight_layout() + plt.title(identifier) + plt.savefig('./experiments/tstr/' + identifier + '_preds.png') + return True + +# --- nips !!! --- # +def nips_plot_rbf(sample, index, which='train'): + if which == 'train': +# col = '#167ea0' + col = '#13af5f' + else: + col = 'black' + sample_length = len(sample) + sample = sample.reshape(sample_length) + x_points = np.arange(sample_length) + fig, axarr = plt.subplots(1, 1, figsize=(2, 2)) + axarr.set_facecolor((0.95, 0.96, 0.96)) + axarr.plot(x_points, sample, color=col) + axarr.set_ylim(-1.5, 1.5) + axarr.get_xaxis().set_visible(False) + axarr.get_yaxis().set_visible(False) + axarr.spines["top"].set_visible(False) + axarr.spines["bottom"].set_visible(False) + axarr.spines["right"].set_visible(False) + axarr.spines["left"].set_visible(False) + axarr.tick_params(bottom='off', left='off') + plt.savefig('./plots/NIPS_rbf_' + which + '_' + str(index) + '.png') + plt.savefig('./plots/NIPS_rbf_' + which + '_' + str(index) + '.pdf') + plt.clf() + plt.close() + return True + +def nips_plot_sine(sample, index, which='train'): + if which == 'train': + #col = '#167ea0' + #col = '#13af5f' + col = '#1188ad' + else: + col = 'black' + sample_length = len(sample) + sample = sample.reshape(sample_length) + sample_length = len(sample) + sample = sample.reshape(sample_length) + x_points = np.arange(sample_length) + fig, axarr = plt.subplots(1, 1, figsize=(2, 2)) + axarr.set_facecolor((0.95, 0.96, 0.96)) + axarr.plot(x_points, sample, color=col) + axarr.set_ylim(-1.1, 1.1) + axarr.get_xaxis().set_visible(False) + axarr.get_yaxis().set_visible(False) + axarr.spines["top"].set_visible(False) + axarr.spines["bottom"].set_visible(False) + axarr.spines["right"].set_visible(False) + axarr.spines["left"].set_visible(False) + axarr.tick_params(bottom='off', left='off') + plt.savefig('./plots/NIPS_sine_' + which + '_' + str(index) + '.png') + plt.savefig('./plots/NIPS_sine_' + which + '_' + str(index) + '.pdf') + plt.clf() + plt.close() + return True + +def nips_plot_mnist(sample, index, which='train'): + plt.axis('off') + plt.imshow(sample, cmap=plt.cm.gray, interpolation='nearest') + plt.savefig('./plots/NIPS_mnist_' + which + '_' + str(index) + '.png') + 
plt.savefig('./plots/NIPS_mnist_' + which + '_' + str(index) + '.pdf') + plt.clf() + plt.close() + return True
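+
+
+# --- usage sketch (illustrative only) --- #
+# Minimal, hedged example of the input shapes the helpers above expect; the
+# random data, the 'demo' identifier and the output location are assumptions
+# made for this sketch, not part of the original pipeline.
+if __name__ == "__main__":
+    import os
+    os.makedirs('./experiments/plots/gs/', exist_ok=True)
+    rng = np.random.RandomState(0)
+    # 16 fake series, length 30, one signal: [n_samples, seq_length, num_signals]
+    fake_samples = rng.normal(size=(16, 30, 1)).astype(np.float32)
+    # writes ./experiments/plots/gs/demo_epoch0000.png
+    save_plot_sample(fake_samples, idx=0, identifier='demo', n_samples=16, num_epochs=10)
+    # linear interpolation from one latent point towards a nearby perturbation
+    Z_A = rng.normal(size=(30, 5))  # [seq_length, latent_dim]
+    path = interpolate(Z_A, n_steps=6)
+    print(path.shape)  # (6, 30, 5)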