HumanCompatibleAI · rohinmshah · Jul 22, 2020 · Jul 22, 2020 · Aug 5, 2020 · Aug 5, 2020
diff --git a/requirements.txt b/requirements.txt
@@ -25,6 +25,7 @@ torchsummary~=1.5.1
 #webdataset introduces breaking changes in 0.1.49, so setting this to an exact equality
 webdataset==0.1.40
 tqdm~=4.48.0
+procgen==0.10.4
 
 # Jupyter Lab is used for our experiment analysis notebook
 jupyterlab~=2.2.6

diff --git a/src/il_representations/algos/__init__.py b/src/il_representations/algos/__init__.py
@@ -1,7 +1,7 @@
 from il_representations.algos.representation_learner import RepresentationLearner, DEFAULT_HARDCODED_PARAMS
 from il_representations.algos.encoders import MomentumEncoder, InverseDynamicsEncoder, TargetStoringActionEncoder, \
     RecurrentEncoder, BaseEncoder, VAEEncoder, ActionEncodingEncoder, ActionEncodingInverseDynamicsEncoder, \
-    infer_action_shape_info
+    infer_action_shape_info, SimCLRModel
 from il_representations.algos.decoders import NoOp, MomentumProjectionHead, \
     BYOLProjectionHead, ActionConditionedVectorDecoder, ContrastiveInverseDynamicsConcatenationHead, \
     ActionPredictionHead, PixelDecoder, SymmetricProjectionHead, AsymmetricProjectionHead

diff --git a/src/il_representations/algos/augmenters.py b/src/il_representations/algos/augmenters.py
@@ -11,12 +11,18 @@
 either augment just the context, or both the context and the target, depending on the algorithm.
 """
 
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
 
 class Augmenter(ABC):
-    def __init__(self, augmenter_spec, color_space):
-        augment_op = StandardAugmentations.from_string_spec(
-            augmenter_spec, color_space)
-        self.augment_op = augment_op
+    def __init__(self, augmenter_spec, color_space, augment_func=None):
+        # A truthy augment_func is used directly as the augmentation op;
+        # otherwise the op is built from the string spec and colour space.
+        self.augment_func = augment_func
+        self.augment_op = (
+            augment_func if augment_func
+            else StandardAugmentations.from_string_spec(
+                augmenter_spec, color_space))
 
     @abstractmethod
     def __call__(self, contexts, targets):
@@ -33,6 +39,21 @@ def __call__(self, contexts, targets):
 
 class AugmentContextAndTarget(Augmenter):
     def __call__(self, contexts, targets):
+        to_pil = transforms.Compose([transforms.ToPILImage()])
+        if self.augment_func:
+            # A custom augment_func is applied to one sample at a time.
+            aug_contexts, aug_targets = [], []
+            for ctx, tgt in zip(contexts, targets):
+                needs_pil = isinstance(ctx, torch.Tensor) and isinstance(
+                    self.augment_op.transforms[0], transforms.RandomResizedCrop)
+                if needs_pil:
+                    # Tensor samples go through PIL when the op starts with a crop.
+                    ctx, tgt = to_pil(ctx.cpu()), to_pil(tgt.cpu())
+                aug_contexts.append(self.augment_op(ctx))
+                aug_targets.append(self.augment_op(tgt))
+            batch_contexts = torch.stack(aug_contexts, dim=0).to(device)
+            batch_targets = torch.stack(aug_targets, dim=0).to(device)
+            return batch_contexts, batch_targets
         return self.augment_op(contexts), self.augment_op(targets)
 
 

diff --git a/src/il_representations/algos/decoders.py b/src/il_representations/algos/decoders.py
@@ -64,8 +64,8 @@ def get_sequential_from_architecture(architecture, representation_dim, projectio
     input_dim = representation_dim
     for layer_def in architecture:
         layers.append(nn.Linear(input_dim, layer_def['output_dim']))
-        layers.append(nn.ReLU())
         layers.append(nn.BatchNorm1d(num_features=layer_def['output_dim']))
+        layers.append(nn.ReLU(inplace=True))
         input_dim = layer_def['output_dim']
     layers.append(nn.Linear(input_dim, projection_dim))
     return nn.Sequential(*layers)
@@ -131,7 +131,7 @@ def _apply_projection_layer(self, z_dist, mean_layer, stdev_layer):
             # We better not have had a learned standard deviation in
             # the encoder, since there's no clear way on how to pass
             # it forward
-            assert np.all((z_dist.stddev == 1).numpy())
+            assert np.all((z_dist.stddev == 1).cpu().numpy())
             stddev = self.ones_like_projection_dim(mean)
         else:
             stddev = stdev_layer(z_vector)

diff --git a/src/il_representations/algos/encoders.py b/src/il_representations/algos/encoders.py
@@ -10,6 +10,8 @@
 from torchvision.models.resnet import BasicBlock as BasicResidualBlock
 import torch
 from torch import nn
+from torchvision.models.resnet import resnet50, resnet34
+import torch.nn.functional as F
 from pyro.distributions import Delta
 
 from gym import spaces
@@ -197,8 +199,10 @@ def __init__(self,
                  use_sn=False,
                  arch_str='MAGICALCNN-resnet-128',
                  ActivationCls=torch.nn.ReLU):
+
         super().__init__()
 
+
         # If block_type == resnet, use ResNet's basic block.
         # If block_type == magical, use MAGICAL block from its paper.
         assert arch_str in NETWORK_ARCHITECTURE_DEFINITIONS.keys()
@@ -265,11 +269,35 @@ def forward(self, x):
         warn_on_non_image_tensor(x)
         return self.shared_network(x)
 
+
+class SimCLRModel(nn.Module):
+    """ResNet-34 trunk (3x3 stride-1 stem, no max-pool, no FC head)."""
+
+    def __init__(self, observation_space, representation_dim=128):
+        # NOTE(review): representation_dim is accepted but not used here.
+        super().__init__()
+        in_channels = observation_space.shape[0]
+        trunk = resnet34()
+        stem = nn.Conv2d(in_channels, 64, kernel_size=3, stride=1,
+                         padding=1, bias=False)
+        # Keep children in order, minus the linear head and the max-pool.
+        kept = [stem if child_name == 'conv1' else child
+                for child_name, child in trunk.named_children()
+                if not isinstance(child, (nn.Linear, nn.MaxPool2d))]
+        self.f = nn.Sequential(*kept)
+
+    def forward(self, x):
+        """Apply the trunk, flatten from dim 1, L2-normalise the last dim."""
+        flat = torch.flatten(self.f(x), start_dim=1)
+        return F.normalize(flat, dim=-1)
+
+
 # string names for convolutional networks; this makes it easier to choose
 # between them from the command line
 NETWORK_SHORT_NAMES = {
     'BasicCNN': BasicCNN,
     'MAGICALCNN': MAGICALCNN,
+    'SimCLRModel': SimCLRModel
 }
 
 
@@ -348,22 +376,22 @@ class BaseEncoder(Encoder):
     def __init__(self, obs_space, representation_dim, obs_encoder_cls=None,
                  learn_scale=False, latent_dim=None, scale_constant=1, obs_encoder_cls_kwargs=None):
         """
-                :param obs_space: The observation space that this Encoder will be used on
-                :param representation_dim: The number of dimensions of the representation
-                       that will be learned
-                :param obs_encoder_cls: An internal architecture implementing `forward`
-                       to return a single vector representing the mean representation z
-                       of a fixed-variance representation distribution (in the deterministic
-                       case), or a latent dimension, in the stochastic case. This is
-                       expected NOT to end in a ReLU (i.e. final layer should be linear).
-                :param learn_scale: A flag for whether we want to learn a parametrized
-                       standard deviation. If this is set to False, a constant value of
-                       <scale_constant> will be returned as the standard deviation
-                :param latent_dim: Dimension of the latents that feed into mean and std networks
-                       If not set, this defaults to representation_dim * 2.
-                :param scale_constant: The constant value that will be returned if learn_scale is
-                       set to False.
-                :param obs_encoder_cls_kwargs: kwargs the encoder class will take.
+        :param obs_space: The observation space that this Encoder will be used on
+        :param representation_dim: The number of dimensions of the representation
+               that will be learned
+        :param obs_encoder_cls: An internal architecture implementing `forward`
+               to return a single vector representing the mean representation z
+               of a fixed-variance representation distribution (in the deterministic
+               case), or a latent dimension, in the stochastic case. This is
+               expected NOT to end in a ReLU (i.e. final layer should be linear).
+        :param learn_scale: A flag for whether we want to learn a parametrized
+               standard deviation. If this is set to False, a constant value of
+               <scale_constant> will be returned as the standard deviation
+        :param latent_dim: Dimension of the latents that feed into mean and std networks
+               If not set, this defaults to representation_dim * 2.
+        :param scale_constant: The constant value that will be returned if learn_scale is
+               set to False.
+        :param obs_encoder_cls_kwargs: kwargs the encoder class will take.
          """
         super().__init__()
         if obs_encoder_cls_kwargs is None:
@@ -380,6 +408,13 @@ def __init__(self, obs_space, representation_dim, obs_encoder_cls=None,
             self.network = obs_encoder_cls(obs_space, representation_dim, **obs_encoder_cls_kwargs)
             self.scale_constant = scale_constant
 
+        if torch.cuda.device_count() > 1:
+            print("Using", torch.cuda.device_count(), "GPUs!")
+            self.network = nn.DataParallel(self.network)
+
+        self.network.to(self.device)
+
+
     def forward(self, x, traj_info):
         if self.learn_scale:
             return self.forward_with_stddev(x, traj_info)

diff --git a/src/il_representations/algos/losses.py b/src/il_representations/algos/losses.py
@@ -1,5 +1,6 @@
 from abc import ABC, abstractmethod
 import torch
+import numpy as np
 import torch.nn.functional as F
 import stable_baselines3.common.logger as sb_logger
 from pyro.distributions import Delta
@@ -161,11 +162,12 @@ class SymmetricContrastiveLoss(RepresentationLoss):
     all similarities with J, and also all similarities with I, and calculates cross-entropy on both
     """
 
-    def __init__(self, device, sample=False, temp=0.1, normalize=True):
+    def __init__(self, device, sample=False, temp=0.1, normalize=True, use_repo_loss=False):
         super(SymmetricContrastiveLoss, self).__init__(device, sample)
 
         self.criterion = torch.nn.CrossEntropyLoss()
         self.temp = temp
+        self.use_repo_loss = use_repo_loss
 
         # Most methods use either cosine similarity or matrix multiplication similarity. Since cosine similarity equals
         # taking MatMul on normalized vectors, setting normalize=True is equivalent to using torch.CosineSimilarity().
@@ -180,50 +182,74 @@ def __call__(self, decoded_context_dist, target_dist, encoded_context_dist=None)
         # decoded_context -> representation of context + optional projection head
         # target -> representation of target + optional projection head
         # encoded_context -> not used by this loss
+
         decoded_contexts, targets = self.get_vector_forms(decoded_context_dist, target_dist)
         z_i = decoded_contexts
         z_j = targets
         batch_size = z_i.shape[0]
 
-        if self.normalize:  # Use cosine similarity
+
+        if self.use_repo_loss:
+            # Normalize to avoid infinities
             z_i = F.normalize(z_i, dim=1)
             z_j = F.normalize(z_j, dim=1)
+            out = torch.cat([z_i, z_j], dim=0)
+            # [2*B, 2*B]
+            sim_matrix = torch.exp(torch.mm(out, out.t().contiguous()) / self.temp)
+            mask = (torch.ones_like(sim_matrix) - torch.eye(2 * batch_size, device=sim_matrix.device)).bool()
+            # [2*B, 2*B-1]
+            sim_matrix = sim_matrix.masked_select(mask).view(2 * batch_size, -1)
+
+            # compute loss
+            pos_sim = torch.exp(torch.sum(z_i * z_j, dim=-1) / self.temp)
+            # [2*B]
+            pos_sim = torch.cat([pos_sim, pos_sim], dim=0)
+            loss = (- torch.log(pos_sim / sim_matrix.sum(dim=-1))).mean()
+            if torch.isnan(loss):
+                breakpoint()
+            return loss
+        else:
+            if not self.normalize:
+                breakpoint()
+            if self.normalize:  # Use cosine similarity
+                z_i = F.normalize(z_i, dim=1)
+                z_j = F.normalize(z_j, dim=1)
 
-        mask = (torch.eye(batch_size) * self.large_num).to(self.device)
-
-        # Similarity of the original images with all other original images in current batch. Return a matrix of NxN.
-        logits_aa = torch.matmul(z_i, z_i.T)  # NxN
-
-        # Values on the diagonal line are each image's similarity with itself
-        logits_aa = logits_aa - mask
-        # Similarity of the augmented images with all other augmented images.
-        logits_bb = torch.matmul(z_j, z_j.T)  # NxN
-        logits_bb = logits_bb - mask
-        # Similarity of original images and augmented images
-        logits_ab = torch.matmul(z_i, z_j.T)  # NxN
-        logits_ba = torch.matmul(z_j, z_i.T)  # NxN
 
-        avg_self_similarity = logits_ab.diag().mean().item()
-        logits_other_sim_mask = ~torch.eye(batch_size, dtype=bool, device=logits_ab.device)
-        avg_other_similarity = logits_ab.masked_select(logits_other_sim_mask).mean().item()
+            mask = (torch.eye(batch_size) * self.large_num).to(self.device)
 
-        sb_logger.record('avg_self_similarity', avg_self_similarity)
-        sb_logger.record('avg_other_similarity', avg_other_similarity)
-        sb_logger.record('self_other_sim_delta', avg_self_similarity - avg_other_similarity)
+            # Similarity of the original images with all other original images in current batch. Return a matrix of NxN.
+            logits_aa = torch.matmul(z_i, z_i.T)  # NxN
 
-        # Each row now contains an image's similarity with the batch's augmented images & original images. This applies
-        # to both original and augmented images (hence "symmetric").
-        logits_i = torch.cat((logits_ab, logits_aa), 1)  # Nx2N
-        logits_j = torch.cat((logits_ba, logits_bb), 1)  # Nx2N
-        logits = torch.cat((logits_i, logits_j), axis=0)  # 2Nx2N
-        logits /= self.temp
+            # Values on the diagonal line are each image's similarity with itself
+            logits_aa = logits_aa - mask
+            # Similarity of the augmented images with all other augmented images.
+            logits_bb = torch.matmul(z_j, z_j.T)  # NxN
+            logits_bb = logits_bb - mask
+            # Similarity of original images and augmented images
+            logits_ab = torch.matmul(z_i, z_j.T)  # NxN
+            logits_ba = torch.matmul(z_j, z_i.T)  # NxN
+
+            avg_self_similarity = logits_ab.diag().mean().item()
+            logits_other_sim_mask = ~torch.eye(batch_size, dtype=bool, device=logits_ab.device)
+            avg_other_similarity = logits_ab.masked_select(logits_other_sim_mask).mean().item()
+            sb_logger.record('avg_self_similarity', avg_self_similarity)
+            sb_logger.record('avg_other_similarity', avg_other_similarity)
+            sb_logger.record('self_other_sim_delta', avg_self_similarity - avg_other_similarity)
+
+            # Each row now contains an image's similarity with the batch's augmented images & original images. This applies
+            # to both original and augmented images (hence "symmetric").
+            logits_i = torch.cat((logits_ab, logits_aa), 1)  # Nx2N
+            logits_j = torch.cat((logits_ba, logits_bb), 1)  # Nx2N
+            logits = torch.cat((logits_i, logits_j), axis=0)  # 2Nx2N
+            logits /= self.temp
 
-        # The values we want to maximize lie on the i-th index of each row i. i.e. the dot product of
-        # represent(image_i) and represent(augmented_image_i).
-        label = torch.arange(batch_size, dtype=torch.long).to(self.device)
-        labels = torch.cat((label, label), axis=0)
+            # The values we want to maximize lie on the i-th index of each row i. i.e. the dot product of
+            # represent(image_i) and represent(augmented_image_i).
+            label = torch.arange(batch_size, dtype=torch.long).to(self.device)
+            labels = torch.cat((label, label), axis=0)
 
-        return self.criterion(logits, labels)
+            return self.criterion(logits, labels)
 
 
 class NegativeLogLikelihood(RepresentationLoss):