update

jeiros · May 24, 2018 · cc0f6ad · cc0f6ad
1 parent 4c2040c
commit cc0f6ad
Show file tree

Hide file tree

Showing 10 changed files with 2,024 additions and 995 deletions.
diff --git a/.ipynb_checkpoints/Untitled1-checkpoint.ipynb b/.ipynb_checkpoints/Untitled1-checkpoint.ipynb
diff --git a/.ipynb_checkpoints/ala-gan_2-checkpoint.ipynb b/.ipynb_checkpoints/ala-gan_2-checkpoint.ipynb
diff --git a/.ipynb_checkpoints/dialanine-example-checkpoint.ipynb b/.ipynb_checkpoints/dialanine-example-checkpoint.ipynb
diff --git a/README.md b/README.md
@@ -1,12 +1,31 @@
-# MD-GAN
+# MDGAN
+In this repo you will find some experiments I have done with a GAN to generate three-dimensional conformations
+for a given protein.
 
+The GAN is trained using conformations obtained from MD simulations. The Generator aims to get
+better at faking conformations that look like the conformations that are seen during the simulations.
+The Discriminator tries to discern if a given conformation comes from a simulation (real) or from the
+Generator (fake).
 
-## installation
+
+## Installation
 
 
 ```bash
 conda install -c omnia msmbuilder mdtraj msmexplorer
-pip install tensorflow-gpu
+pip install tensorflow-gpu  # or tensorflow if no GPU available
 pip install keras
 ```
 
+## Example
+
+```python
+from msmbuilder.example_datasets import AlanineDipeptide
+from utils import make_trajectory_trainable
+from mdgan import MDGAN
+
+trjs = AlanineDipeptide().get().trajectories
+data, sc = make_trajectory_trainable(trjs)  # sc is the MinMaxScaler; we'll need it later
+gan = MDGAN(n_atoms=22)
+losses = gan.train(data, num_epochs=10)  # That's it
+```
diff --git a/__pycache__/mdgan.cpython-36.pyc b/__pycache__/mdgan.cpython-36.pyc
diff --git a/__pycache__/utils.cpython-36.pyc b/__pycache__/utils.cpython-36.pyc
diff --git a/ala-gan.ipynb b/ala-gan.ipynb
diff --git a/dialanine-example.ipynb b/dialanine-example.ipynb
diff --git a/mdgan.py b/mdgan.py
@@ -0,0 +1,117 @@
+"""
+MDGAN generates three-dimensional conformations that resemble the ones provided
+as training data (MD simulations).
+"""
+import numpy as np
+from keras.layers import Dense, Reshape, Flatten, Dropout
+from keras.layers import BatchNormalization
+from keras.layers.advanced_activations import LeakyReLU
+from keras.layers.convolutional import Conv2D
+from keras.models import Sequential
+from keras.optimizers import Adam
+from utils import make_trainable, make_latent_samples, make_labels
+from sklearn.model_selection import train_test_split
+
+
+class MDGAN():
+    """
+    Generative adversarial network for three-dimensional conformations.
+
+    A dense generator maps latent noise vectors to arrays of shape
+    (n_atoms, 3, 1) and a convolutional discriminator scores arrays of that
+    shape with a single sigmoid output. Both are stacked into `self.gan`,
+    which is the model used to train the generator.
+
+    Parameters
+    ----------
+    n_atoms: int
+        Number of atoms per conformation.
+    noise_dim: int, default=100
+        Length of the latent noise vector fed to the generator.
+    gen_lr: float, default=2e-4
+        Adam learning rate used to compile the generator.
+    disc_lr: float, default=1e-3
+        Adam learning rate used to compile the discriminator.
+    gan_lr: float, default=2e-4
+        Adam learning rate used to compile the stacked GAN.
+    """
+
+    def __init__(self, n_atoms, noise_dim=100, gen_lr=2e-4, disc_lr=1e-3,
+                 gan_lr=2e-4):
+        self.n_atoms = n_atoms
+        self.noise_dim = noise_dim
+        self.generator = self.build_generator(lr=gen_lr)
+        self.discriminator = self.build_discriminator(lr=disc_lr)
+        # Must come last: the stacked model is built from the two above.
+        self.gan = self.build_GAN(lr=gan_lr)
+
+    def __repr__(self):
+        """Return the Keras summary of the stacked GAN as a string."""
+        # summary() prints and returns None, which would make repr() raise a
+        # TypeError; collect the printed lines and return them instead.
+        lines = []
+        self.gan.summary(print_fn=lines.append)
+        return '\n'.join(lines)
+
+    def build_generator(self, lr):
+        """
+        Build and compile the generator.
+
+        The network is Dense -> BatchNormalization -> LeakyReLU(0.2) -> Dense,
+        reshaped to (n_atoms, 3, 1).
+
+        Parameters
+        ----------
+        lr: float, learning rate of the Adam optimizer.
+
+        Returns
+        -------
+        g: keras Sequential model named 'generator'.
+        """
+        g = Sequential([
+            Dense(2 * 2 * self.noise_dim, input_dim=self.noise_dim),
+            BatchNormalization(),
+            LeakyReLU(0.2),
+            Dense(self.n_atoms * 3, input_dim=self.noise_dim * 2 * 2),
+            Reshape((self.n_atoms, 3, 1))
+        ], name='generator')
+        adam = Adam(lr=lr)
+        g.compile(adam, loss='binary_crossentropy')
+        return g
+
+    def build_discriminator(self, lr):
+        """
+        Build and compile the discriminator.
+
+        Three Conv2D blocks (32, 64 and 128 filters), each followed by
+        LeakyReLU(0.2) and Dropout(0.3), then a flattened single-unit sigmoid
+        output.
+
+        Parameters
+        ----------
+        lr: float, learning rate of the Adam optimizer.
+
+        Returns
+        -------
+        d: keras Sequential model named 'discriminator'.
+        """
+        d = Sequential([
+            Conv2D(32, 3, padding='same', strides=2, input_shape=(self.n_atoms, 3, 1)),
+            LeakyReLU(0.2),
+            Dropout(0.3),
+
+            Conv2D(64, 3, padding='same', strides=1),
+            LeakyReLU(0.2),
+            Dropout(0.3),
+
+            Conv2D(128, 3, padding='same', strides=1),
+            LeakyReLU(0.2),
+            Dropout(0.3),
+
+            Flatten(),
+            Dense(1, activation='sigmoid')
+
+        ], name='discriminator')
+        adam = Adam(lr=lr)
+        d.compile(adam, 'binary_crossentropy')
+        return d
+
+    def build_GAN(self, lr):
+        """
+        Stack generator and discriminator into the model that trains the
+        generator.
+
+        Parameters
+        ----------
+        lr: float, learning rate of the Adam optimizer.
+
+        Returns
+        -------
+        gan: keras Sequential model made of [generator, discriminator].
+        """
+        # Freeze the discriminator *before* compiling the stacked model so
+        # that generator updates cannot also move the discriminator weights.
+        # The discriminator was compiled on its own while still trainable.
+        # NOTE(review): Keras documents that changes to `trainable` need a
+        # compile() to take effect -- confirm with the Keras version in use.
+        make_trainable(self.discriminator, False)
+        gan = Sequential([self.generator, self.discriminator])
+        adam = Adam(lr=lr)
+        gan.compile(adam, 'binary_crossentropy')
+        return gan
+
+    def train(self, data, batch_size=250, num_epochs=25, eval_size=200):
+        """
+        Train the discriminator and the generator in alternation.
+
+        Parameters
+        ----------
+        data: np.array
+            Conformations; every batch is reshaped to (-1, n_atoms, 3, 1).
+            The array is split with sklearn's train_test_split.
+        batch_size: int, default=250
+            Conformations per training step. Only full batches are used.
+        num_epochs: int, default=25
+            Number of passes over the train split.
+        eval_size: int, default=200
+            Conformations drawn from the test split after each epoch. Capped
+            at the size of the test split.
+
+        Returns
+        -------
+        losses: list of (d_loss, g_loss) tuples, one per epoch.
+        """
+        losses = []
+        train, test = train_test_split(data)
+        # Never ask for more evaluation samples than the test split holds;
+        # np.random.choice(..., replace=False) raises otherwise.
+        eval_size = min(eval_size, len(test))
+        # The labels only depend on the batch size, so build them once.
+        real_confs_labels, fake_confs_labels = make_labels(batch_size)
+        for epoch in range(num_epochs):
+            for i in range(len(train) // batch_size):
+                # -------------------
+                # Train Discriminator
+                # -------------------
+                make_trainable(self.discriminator, True)
+                # Get some real conformations from the train data
+                real_confs = train[i * batch_size:(i + 1) * batch_size]
+                real_confs = real_confs.reshape(-1, self.n_atoms, 3, 1)
+
+                # Sample high dimensional noise and generate fake conformations
+                noise = make_latent_samples(batch_size, self.noise_dim)
+                fake_confs = self.generator.predict_on_batch(noise)
+
+                self.discriminator.train_on_batch(real_confs, real_confs_labels)
+                self.discriminator.train_on_batch(fake_confs, fake_confs_labels)
+
+                # ---------------------------------------------------
+                #  Train Generator via GAN (switch off discriminator)
+                # ---------------------------------------------------
+                noise = make_latent_samples(batch_size, self.noise_dim)
+                make_trainable(self.discriminator, False)
+                # Fakes are labelled as real: the generator is rewarded for
+                # fooling the discriminator.
+                self.gan.train_on_batch(noise, real_confs_labels)
+
+            # Evaluate performance after epoch
+            conf_eval_real = test[np.random.choice(len(test), eval_size, replace=False)]
+            conf_eval_real = conf_eval_real.reshape(-1, self.n_atoms, 3, 1)
+            noise = make_latent_samples(eval_size, self.noise_dim)
+            conf_eval_fake = self.generator.predict_on_batch(noise)
+
+            eval_real_labels, eval_fake_labels = make_labels(eval_size)
+
+            d_loss_r = self.discriminator.test_on_batch(conf_eval_real, eval_real_labels)
+            d_loss_f = self.discriminator.test_on_batch(conf_eval_fake, eval_fake_labels)
+            d_loss = (d_loss_r + d_loss_f) / 2
+
+            # we want the fake to be realistic!
+            g_loss = self.gan.test_on_batch(noise, eval_real_labels)
+
+            print("Epoch: {:>3}/{} Discriminator Loss: {:>6.4f} Generator Loss: {:>6.4f}".format(epoch + 1, num_epochs, d_loss, g_loss))
+
+            losses.append((d_loss, g_loss))
+        return losses
diff --git a/utils.py b/utils.py
@@ -8,8 +8,61 @@
 from keras.layers.convolutional import UpSampling2D, Conv2D
 from keras.models import Sequential, Model
 from keras.optimizers import Adam
+from msmbuilder.preprocessing import MinMaxScaler
+import mdtraj
+from matplotlib import pyplot as plt
 
 
+def plot_losses(losses):
+    """
+    Plot the discriminator and generator loss curves.
+
+    Parameters
+    ----------
+    losses: array-like
+        Converted with np.array; after transposing, row 0 is drawn as
+        'Discriminator' and row 1 as 'Generator'.
+
+    Returns
+    -------
+    fig, ax: the matplotlib figure and axes that hold the plot.
+    """
+    curves = np.array(losses).T
+    fig, ax = plt.subplots()
+    ax.plot(curves[0], label='Discriminator')
+    ax.plot(curves[1], label='Generator')
+    ax.set_title("Training Losses")
+    ax.legend()
+    ax.set_ylabel('BCE')
+    ax.set_xlabel('Epoch')
+    return fig, ax
+
+
+def make_trajectory_trainable(traj_list):
+    """
+    Build a train/test splittable array of cartesian coordinates from a list
+    of mdtraj.Trajectory objects.
+
+    Every trajectory is superposed onto the first frame of the first
+    trajectory, then all frames are scaled together.
+
+    Parameters
+    ----------
+    traj_list: list of mdtraj.Trajectory objects
+
+    Returns
+    -------
+    data: np.array, shape=(frames, n_atoms, 3)
+        A numpy array of the XYZ coordinates of all the frames in the list of
+        trajs. Coordinates are squished from -1 to 1.
+        Use a MinMaxScaler.inverse_transform to map them back to the original
+        space.
+    sc: MinMaxScaler, The scaler used to squish the coordinates.
+    """
+    reference = traj_list[0][0]
+    n_atoms = reference.n_atoms
+    # NOTE(review): mdtraj's superpose presumably aligns in place, so the
+    # caller's trajectories may be modified -- confirm this is acceptable.
+    aligned = [traj.superpose(reference) for traj in traj_list]
+    # One (n_atoms, 3) array per frame, in trajectory order.
+    per_frame_xyz = [
+        frame.xyz.reshape(n_atoms, 3)
+        for traj in aligned
+        for frame in traj
+    ]
+    sc = MinMaxScaler(feature_range=(-1, 1))
+    scaled = sc.fit_transform(per_frame_xyz)
+    # dstack puts the frame index last; move it to the front.
+    data = np.transpose(np.dstack(scaled), (2, 0, 1))
+    return data, sc
+
+
+def fake_traj_from_samples(samples, top, scaler):
+    """
+    Turn generated samples into an mdtraj.Trajectory.
+
+    Parameters
+    ----------
+    samples: np.array
+        Indexed as samples[:, :, :, 0]; presumably the generator output of
+        shape (n_samples, n_atoms, 3, 1) -- TODO confirm.
+    top: topology passed to mdtraj.Trajectory.
+    scaler: object with an inverse_transform method, applied to each sample
+        to map it back to the original coordinate space.
+
+    Returns
+    -------
+    fake_traj: mdtraj.Trajectory, centered and superposed on its frame 0.
+    """
+    # Keep index 0 of the last axis, leaving one 2-D array per sample.
+    scaled_confs = samples[:, :, :, 0]
+    restored_confs = [scaler.inverse_transform(conf) for conf in scaled_confs]
+    fake_traj = mdtraj.Trajectory(restored_confs, topology=top)
+    fake_traj.center_coordinates()
+    fake_traj.superpose(fake_traj, 0)
+    return fake_traj
+
 
 def scatter(arr, ax=None, scatter_kws=None):
     if ax is None:
@@ -20,79 +73,80 @@ def scatter(arr, ax=None, scatter_kws=None):
     return ax
 
 
-def make_latent_samples(n_samples, sample_size):
-    return np.random.normal(loc=0, scale=1, size=(n_samples, sample_size))
+def make_latent_samples(n_samples, sample_dim):
+    """Draw an (n_samples, sample_dim) array from a normal with mean 0, std 1."""
+    shape = (n_samples, sample_dim)
+    return np.random.normal(loc=0, scale=1, size=shape)
+
 
 def make_trainable(model, trainable):
     for layer in model.layers:
         layer.trainable = trainable
-
+
+
 def make_labels(size):
     return np.ones([size, 1]), np.zeros([size, 1])
 
 
-
-def make_2dtraj_GAN(sample_size, 
-                    g_hidden_size, 
-                    d_hidden_size, 
-                    leaky_alpha, 
+def make_2dtraj_GAN(sample_size,
+                    g_hidden_size,
+                    d_hidden_size,
+                    leaky_alpha,
                     g_learning_rate,
                     d_learning_rate):
     K.clear_session()
-    
+
     generator = Sequential([
         Dense(g_hidden_size, input_shape=(sample_size,)),
         LeakyReLU(alpha=leaky_alpha),
-        Dense(2),        
+        Dense(2),
         Activation('tanh')
-    ], name='generator')    
+    ], name='generator')
 
     discriminator = Sequential([
         Dense(d_hidden_size, input_shape=(2,)),
         LeakyReLU(alpha=leaky_alpha),
         Dense(1),
         Activation('sigmoid')
-    ], name='discriminator')    
-    
+    ], name='discriminator')
+
     gan = Sequential([
         generator,
         discriminator
     ])
-    
+
     discriminator.compile(optimizer=Adam(lr=d_learning_rate), loss='binary_crossentropy')
     gan.compile(optimizer=Adam(lr=g_learning_rate), loss='binary_crossentropy')
-    
+
     return gan, generator, discriminator
 
 
-def make_3dtraj_GAN(sample_size, 
-                    g_hidden_size, 
-                    d_hidden_size, 
-                    leaky_alpha, 
+def make_3dtraj_GAN(sample_size,
+                    g_hidden_size,
+                    d_hidden_size,
+                    leaky_alpha,
                     g_learning_rate,
                     d_learning_rate):
     K.clear_session()
-    
+
     generator = Sequential([
         Dense(g_hidden_size, input_shape=(sample_size,)),
         LeakyReLU(alpha=leaky_alpha),
-        Dense(2),        
+        Dense(2),
         Activation('tanh')
-    ], name='generator')    
+    ], name='generator')
 
     discriminator = Sequential([
         Dense(d_hidden_size, input_shape=(2,)),
         LeakyReLU(alpha=leaky_alpha),
         Dense(1),
         Activation('sigmoid')
-    ], name='discriminator')    
-    
+    ], name='discriminator')
+
     gan = Sequential([
         generator,
         discriminator
     ])
-    
+
     discriminator.compile(optimizer=Adam(lr=d_learning_rate), loss='binary_crossentropy')
     gan.compile(optimizer=Adam(lr=g_learning_rate), loss='binary_crossentropy')
-    
-    return gan, generator, discriminator
+
+    return gan, generator, discriminator