index.html

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>On the Data-Efficiency with Contrastive Image Transformation in Reinforcement Learning</title>

    <!-- MathJax typesets the \( ... \) formulas used in the page body. -->
    <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>

    <!-- The helper lives in its own element: inline code inside a <script> that also has a
         src attribute is never executed, so it must not share the MathJax element. -->
    <script>
        // Maps a category id prefix (as used in element ids) to the matching directory
        // name under assets/videos/demos/.
        var task_map = {
            "simple-object-manipulation": "simple_object_manipulation",
            "visual-goal-reaching": "visual_goal_reaching",
            "novel-concept-grounding": "novel_concept_grounding",
            "one-shot-video-imitation": "one_shot_video_imitation",
            "visual-constraint-satisfaction": "visual_constraint_satisfaction",
            "visual-reasoning": "visual_reasoning"
        };

        /**
         * Load and play the demo clip currently selected for one category.
         *
         * Reads the values of the elements "<category>-menu-tasks" and
         * "<category>-menu-instances", then points the element
         * "<category>-single-task-video" at
         * assets/videos/demos/<task_map[category]>/<task>/<instance>.mp4
         * and starts playback at double speed.
         *
         * @param {string} category - A key of task_map; also the id prefix of the widgets.
         */
        function updateDemoVideo(category) {
            var directory = task_map[category];
            var taskMenu = document.getElementById(category + "-menu-tasks");
            var instanceMenu = document.getElementById(category + "-menu-instances");
            var video = document.getElementById(category + "-single-task-video");

            // Bail out instead of throwing when the category is unknown or its widgets
            // are not present in the document.
            if (!directory || !taskMenu || !instanceMenu || !video) {
                console.warn("updateDemoVideo: missing widgets or unknown category:", category);
                return;
            }

            var task = taskMenu.value;
            var inst = instanceMenu.value;

            console.log(directory, task, inst)

            video.src = "assets/videos/demos/" +
                directory +
                "/" +
                task +
                "/" +
                inst +
                ".mp4";
            // Set the rate after assigning src, as the original code did.
            video.playbackRate = 2.0;
            video.play();
        }
    </script>

    <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
          rel="stylesheet">

    <link rel="stylesheet" href="./static/css/bulma.min.css">
    <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
    <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
    <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
    <link rel="stylesheet" href="./static/css/academicons.min.css">
    <link rel="stylesheet"
          href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
    <link rel="stylesheet" href="./static/css/index.css">

    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
    <script defer src="./static/js/fontawesome.all.min.js"></script>
    <script src="./static/js/bulma-carousel.min.js"></script>
    <script src="./static/js/bulma-slider.min.js"></script>
    <script src="./static/js/index.js"></script>
</head>
<body>

<header style="background-color:hsl(210, 27%, 96%);width:100%;margin:auto;">
    <div class="container">
<section class="hero">
    <div class="hero-body">
        <div class="container is-max-desktop">
            <div class="columns is-centered">
                <div class="column has-text-centered">
                    <h1 class="title is-1 publication-title">On the Data-Efficiency with Contrastive Image Transformation in 
                        Reinforcement Learning</h1>
                    <div class="is-size-4 publication-authors">
            <!-- <span class="author-block"> -->
            <span class="author-block">Sicong&#160;Liu<sup>1 2 3 *</sup>, </span>
            <span class="author-block">Xi&#160;Sheryl&#160;Zhang<sup>2 3 5&dagger;</sup>, </span>
            <span class="author-block">Yushuo&#160;Li<sup>2</sup>, </span>
            <span class="author-block">Yifan&#160;Zhang<sup>2 3 5</sup>, </span>
            <span class="author-block">Jian&#160;Cheng<sup>2 3 4</sup></span>
                <!-- <a target="_blank">Sicong&#160;Liu</a><sup>1 2 3 *</sup>,
                <a target="_blank">Xi&#160;Sheryl&#160;Zhang</a><sup>2 3 5&dagger;</sup>,
                <a target="_blank">Yushuo&#160;Li</a><sup>2</sup>,
                <a target="_blank">Yifan&#160;Zhang</a><sup>2 3 5</sup>,
                <a target="_blank">Jian&#160;Cheng</a><sup>2 3 4</sup>, -->
                <!-- <a target="_blank" href="">Jian&#160;Cheng</a><sup>2 3 4</sup>, -->
                <!-- <br> -->
                <!-- &dagger; is the dagger (cross) symbol -->
            <!-- </span> -->
                    </div>

                    <div class="is-size-5 publication-authors">
                        <span class="author-block"><sup>1</sup>NJUST, </span>
                        <span class="author-block"><sup>2</sup>
                            Institute of Automation, Chinese Academy of Sciences, </span>
                        <span class="author-block"><sup>3</sup>AIRIA, </span>
                        <span class="author-block"><sup>4</sup>School of Future Technology, University of Chinese Academy of 
                            Sciences, </span>
                        <span class="author-block"><sup>5</sup>University of Chinese Academy of Sciences, Nanjing</span>
                    </div>

                    <div class="is-size-5 publication-authors">
                        <span class="author-block"><sup>*</sup>Work done during CASIA internship</span>
                        <!-- <br> -->
                        <span class="author-block"><sup>&dagger;</sup>Corresponding Author </span>
                    </div>

                    <div class="column has-text-centered">
                        <div class="is-size-5 publication-links">
                            <!-- TODO PDF Link. -->
                            <span class="link-block">
                <a target="_blank" href="https://openreview.net/forum?id=-nm-rHXi5ga">
                   <!-- class="external-link button is-normal is-rounded is-dark"> -->
                  <!-- <span class="icon"> -->
                      <!-- <i class="ai ai-arxiv"></i> -->
                      <!-- <i class="ai ai-openreview"></i> -->
                  <!-- </span> -->
                  <span>OpenReview</span>
                </a>
              </span>

              <span>/</span>

                            <span class="link-block">
                <a target="_blank" href="assets/coit_paper.pdf">
                   <!-- class="external-link button is-normal is-rounded is-dark"> -->
                  <!-- <span class="icon">
                      <i class="fas fa-file-pdf"></i>
                  </span> -->
                  <span>PDF</span>
                </a>
              </span>

              <span>/</span>
                            <!-- Code Link. -->
                            <span class="link-block">
                <a target="_blank" href="https://github.com/mooricAnna/CoIT">
                   <!-- class="external-link button is-normal is-rounded is-dark"> -->
                  <!-- <span class="icon">
                      <i class="fab fa-github"></i>
                  </span> -->
                  <span>Code</span>
                </a>
                <!-- <a target="_blank" href="https://github.com/vimalabs/VIMA#pretrained-models"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fa fa-network-wired"></i>
                  </span>
                  <span>Models</span>
                </a>
                <a target="_blank" href="https://github.com/vimalabs/VimaBench"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fas fa-robot"></i>
                  </span>
                  <span>Benchmark</span>
                </a> -->
                <!-- <a target="_blank" href="https://huggingface.co/datasets/VIMA/VIMA-Data"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fas fa-database"></i>
                  </span>
                  <span>Dataset</span> -->
                <!-- </a> -->
              </span>
                        </div>

                    </div>
                </div>
            </div>
        </div>
    </div>
</section>
    </div>
</header>

<!-- <section class="hero is-light is-small">
    <div class="hero-body">
        <div class="container">
            <div id="results-carousel" class="carousel results-carousel">
                <div class="item item-sweep_without_exceeding">
                    <video poster="" id="sweep_without_exceeding" autoplay controls muted loop height="100%"
                           playbackRate=2.0>
                        <source src="assets/videos/demos/visual_constraint_satisfaction/sweep_without_exceeding/1.mp4"
                                type="video/mp4">
                    </video>
                </div>
                <div class="item item-sweep_without_touching">
                    <video poster="" id="sweep_without_touching" autoplay controls muted loop height="100%"
                           playbackRate=2.0>
                        <source src="assets/videos/demos/visual_goal_reaching/rearrange_then_restore/1.mp4"
                                type="video/mp4">
                    </video>
                </div>
                <div class="item item-rotate">
                    <video poster="" id="rotate" autoplay controls muted loop height="100%">
                        <source src="assets/videos/demos/one_shot_video_imitation/follow_motion/1.mp4"
                                type="video/mp4">
                    </video>
                </div>
                <div class="item item-scene_understanding">
                    <video poster="" id="scene_understanding" autoplay controls muted loop height="100%">
                        <source src="assets/videos/demos/novel_concept_grounding/twist/1.mp4"
                                type="video/mp4">
                    </video>
                </div>
                <div class="item item-simple_manipulation">
                    <video poster="" id="simple_manipulation" autoplay controls muted loop height="100%">
                        <source src="assets/videos/demos/visual_reasoning/manipulate_old_neighbor/1.mp4"
                                type="video/mp4">
                    </video>
                </div>
                <div class="item item-novel_adj">
                    <video poster="" id="novel_adj" autoplay controls muted loop height="100%">
                        <source src="assets/videos/demos/one_shot_video_imitation/follow_order/1.mp4"
                                type="video/mp4">
                    </video>
                </div>
                <div class="item item-novel_adj_and_noun">
                    <video poster="" id="novel_adj_and_noun" autoplay controls muted loop height="100%">
                        <source src="assets/videos/demos/simple_object_manipulation/scene_understanding/1.mp4"
                                type="video/mp4">
                    </video>
                </div>
                <div class="item item-novel_noun">
                    <video poster="" id="novel_noun" autoplay controls muted loop height="100%">
                        <source src="assets/videos/demos/visual_constraint_satisfaction/sweep_without_touching/1.mp4"
                                type="video/mp4">
                    </video>
                </div>
                <div class="item item-twist">
                    <video poster="" id="twist" autoplay controls muted loop height="100%">
                        <source src="assets/videos/demos/novel_concept_grounding/novel_adj/1.mp4"
                                type="video/mp4">
                    </video>
                </div>
                <div class="item item-follow_motion">
                    <video poster="" id="follow_motion" autoplay controls muted loop height="100%">
                        <source src="assets/videos/demos/simple_object_manipulation/simple_manipulation/1.mp4"
                                type="video/mp4">
                    </video>
                </div>
                <div class="item item-follow_order">
                    <video poster="" id="follow_order" autoplay controls muted loop height="100%">
                        <source src="assets/videos/demos/visual_goal_reaching/rearrange/1.mp4"
                                type="video/mp4">
                    </video>
                </div>
                <div class="item item-rearrange">
                    <video poster="" id="rearrange" autoplay controls muted loop height="100%">
                        <source src="assets/videos/demos/simple_object_manipulation/rotate/1.mp4"
                                type="video/mp4">
                    </video>
                </div>
                <div class="item item-manipulate_old_neighbor">
                    <video poster="" id="manipulate_old_neighbor" autoplay controls muted loop height="100%">
                        <source src="assets/videos/demos/visual_reasoning/same_color/1.mp4"
                                type="video/mp4">
                    </video>
                </div>
                <div class="item item-pick_in_order_then_restore">
                    <video poster="" id="pick_in_order_then_restore" autoplay controls muted loop height="100%">
                        <source src="assets/videos/demos/visual_reasoning/pick_in_order_then_restore/1.mp4"
                                type="video/mp4">
                    </video>
                </div>
                <div class="item item-same_color">
                    <video poster="" id="same_color" autoplay controls muted loop height="100%">
                        <source src="assets/videos/demos/novel_concept_grounding/novel_noun/1.mp4"
                                type="video/mp4">
                    </video>
                </div>
                <div class="item item-rearrange_then_restore">
                    <video poster="" id="rearrange_then_restore" autoplay controls muted loop height="100%">
                        <source src="assets/videos/demos/visual_reasoning/same_profile/1.mp4"
                                type="video/mp4">
                    </video>
                </div>
                <div class="item item-same_profile">
                    <video poster="" id="same_profile" autoplay controls muted loop height="100%">
                        <source src="assets/videos/demos/novel_concept_grounding/novel_adj_and_noun/1.mp4"
                                type="video/mp4">
                    </video>
                </div>
            </div>
        </div>
    </div>
</section> -->
<!-- <div style="border:1px groove #CCC"> -->
<!-- <hr width="1000" style="margin-left: auto; margin-right: auto"> -->

<section class="section">
    <div class="container is-max-desktop">
        <!-- Gifs. -->
        <div class="rows is-centered has-text-centered">
            <div class="row">
            <div class="content has-text-justified">
                <img src="assets/gifs/cartpole_swingup.gif" alt="Cartpole swingup task" style="width: 19%"/>
                <img src="assets/gifs/pendulum_swingup.gif" alt="Pendulum swingup task" style="width: 19%"/>
                <img src="assets/gifs/cup_catch.gif" alt="Cup catch task" style="width: 19%"/>
                <img src="assets/gifs/quadruped_walk.gif" alt="Quadruped walk task" style="width: 19%"/>
                <img src="assets/gifs/finger_spin.gif" alt="Finger spin task" style="width: 19%"/>
                <br>
                <img src="assets/gifs/acrobot_swingup.gif" alt="Acrobot swingup task" style="width: 19%"/>
                <img src="assets/gifs/cheetah_run.gif" alt="Cheetah run task" style="width: 19%"/>
                <img src="assets/gifs/finger_turn_hard.gif" alt="Finger turn hard task" style="width: 19%"/>
                <img src="assets/gifs/reacher_hard.gif" alt="Reacher hard task" style="width: 19%"/>
                <img src="assets/gifs/walker_run.gif" alt="Walker run task" style="width: 19%"/>
            </div>
            </div>
        </div>
    </div>
</section>

<!-- <hr width="1200" style="margin-left: auto; margin-right: auto"> -->

<section class="section">
    <div class="container is-max-desktop">
        <!-- Abstract. -->
        <div class="columns is-centered has-text-centered">
            <div class="column">
                <h2 class="title is-3">Abstract</h2>
                <div class="content has-text-justified">
                    <p style="font-size: 125%; width: 83%; height: auto; margin-left: auto; margin-right: auto">
                        Data-efficiency has always been an essential issue in pixel-based reinforcement learning (RL), as the agent
                        must learn not only decision-making but also meaningful representations from images. The line of reinforcement
                        learning with data augmentation shows significant improvements in sample-efficiency. However, it is 
                        challenging to guarantee the optimality invariant transformation, that is, the augmented data are readily 
                        recognized as a completely different state by the agent. To this end, we propose a contrastive invariant
                        transformation (CoIT), a simple yet promising learnable data augmentation combined with standard model-free 
                        algorithms to improve sample-efficiency. Concretely, the differentiable CoIT leverages original samples with 
                        augmented samples and hastens the state encoder for a contrastive invariant embedding. We evaluate our 
                        approach on DeepMind Control Suite and Atari100K. Empirical results verify advances using CoIT, enabling 
                        it to outperform the new state-of-the-art on various tasks. Source code is available at 
                        <a href="https://github.com/mooricAnna/CoIT"><span>https://github.com/mooricAnna/CoIT</span></a>.
                    </p>
                </div>
            </div>
        </div>
    </div>
</section>

<!-- <div style="border:1px solid #CCC; width: 70%; height: auto; margin-left: auto; margin-right: auto"> -->
<hr width="1200" style="margin-left: auto; margin-right: auto">

<!-- Method -->
<section class="section">
    <div class="container is-max-widescreen">
        <div class="row">
            <!-- <div class="rows is-centered has-text-centered"> -->
            <div class="rows is-centered">
                    <h2 class="title is-3">
                        <span class="dvima">
                            <!-- &#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; -->
                            Method
                        </span>
                    </h2>

                <h3 class="title is-4">
                        <span class="dvima">
                        <!-- &#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; -->
                        Learnable Invariant Transformation
                    </span>
                </h3>
                <div class="content has-text-justified">
                    <p style="font-size: 110%; margin-left: auto; margin-right: auto">
                        Our method, <b>CoIT</b>, introduces a learnable image augmentation to achieve an invariant image transformation.
                        We propose a theoretical analysis of how a learnable invariant image transformation can approximate
                        a stationary distribution over the transformed data by the optimal invariant metric, thus learning
                        better representations.
                    </p>
                
                <img src="assets/images/lemma.png" style="width: 49%" float="left"/>    
                <img src="assets/images/theo_coit.png" style=" width: 49%" float="left"/>
                <!-- <span style="font-size: 110%">
                    <span style="font-weight: bold">Scaling model and data</span>. <i>Top:</i> We compare performance of different methods with model sizes ranging from 2M to 200M parameters. Across all model sizes and generalization levels VIMA outperforms prior works. <i>Bottom:</i> For a fixed model size of 92M parameters we compare the effect of imitation learning dataset size of 0.1%, 1%, 10%, and full imitation data. VIMA is extremely sample efficient and can achieve performance comparable to other methods with 10&times; less data.
                </span> -->
                <!-- <br> -->
                <h3 class="title is-4">
                    <span class="dvima">
                    <!-- &#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; -->
                    Stabilizing Reward Function
                    </span>
                </h3>   
                <p style="font-size: 110%; margin-left: auto; margin-right: auto">
                    To further stabilize the reward function, we propose a mixed CoIT that samples multiple
                    transformed data from the learned distribution and mixes them together for model training.
                </p>
                    <div class="content has-text-justified">
                        <img src="assets/images/mixed_coit.png" class="interpolation-image" alt=""
                        style="display: block; width: 75%; margin-left: auto; margin-right: auto"/>
                    </div>
                <h3 class="title is-4">
                    <span class="dvima">
                        Framework Design
                    </span>
                </h3> 
                <p style="font-size: 110%; margin-left: auto; margin-right: auto">
                    We present a new framework with normalization variants to ensure the learning guarantees discussed above by optimizing
                    parameters. Below are the key points of our method.
                </p>
                    <ul style="font-size: 110%">
                        <li> We borrow a similar idea from <a href="https://arxiv.org/abs/1912.08795"><span>Yin et al.</span></a> to 
                            regularize and smooth the distribution shift between
                             the transformed and overall data. We use the statistical data stored in the BN layers to approximate
                             the distribution of the overall data,
                             <br>
                             <!-- <div class="row">
                                <div class="rows is-centered has-text-centered"> -->
                                <!-- <div class="rows is-centered">
                             \(         
                            \mathcal{K}_\omega(\mathbf{x}_t')= \sum_{l} \left\|\tilde{\mu}(\mathbf{x}_t')-\mathbb{E}(\tilde{\mu}_l(\mathbf{x})|\mathcal{O}) \right\|_2 + 
                             \sum_{l} \left\|\tilde{\sigma}^2(\mathbf{x}_t')-\mathbb{E}(\tilde{\sigma}_l^2(\mathbf{x})|\mathcal{O}) \right\|_2
                              \)
                                </div> -->
                             <!-- </div> -->
                             <img src="assets/images/bn.png" class="interpolation-image"
                         alt="" style="display: block; margin-left: auto; margin-right: auto; 
                         width: 60%; height: auto"/>
                         where \(\tilde{\mu}(\mathbf{x}_t')\) and \(\tilde{\sigma}^2(\mathbf{x}_t')\) are the <i>mean</i> and 
                         <i>variance</i> of the transformed data and \(\omega\) represents the parameters of the learnable image
                         transformation. The expectation terms \(\mathbb{E}(\cdot)\) denote the statistical estimation of the
                         batch-wise data stored in the \(l\)-th conv layer, and \(\mathcal{O}\) is the given observations.
                        </li>
                        <li> We utilize the similarity metric proposed by <a href="https://arxiv.org/abs/2002.05709"><span>Chen et al.</span></a> for learning the encoder that encodes the
                            observations into latent space to meet the invariant transformation,
                            <img src="assets/images/byol.png" class="interpolation-image"
                         alt="" style="display: block; margin-left: auto; margin-right: auto; 
                         width: 70%; height: auto"/>
                         Here \(\bar{\xi}\) denotes the momentum version of parameters \(\xi\), and \(\mathcal{D}\) indicates the replay buffer.
                        </li>
                        <li> We update the critic network with transformed data \(\mathbf{x}_t'\) and \(\mathbf{x}_{t+n}'\) to 
                            minimize the TD error for <i>n</i>-steps returns,
                            <img src="assets/images/critic.png" class="interpolation-image"
                         alt="" style="display: block; margin-left: auto; margin-right: auto; 
                         width: 70%; height: auto"/>
                        </li>

                    </ul>
                    <p style="font-size: 110%; margin-left: auto; margin-right: auto">
                    Finally, we give the unified objective function of CoIT,
                    where \(\alpha \) and \(\lambda\) are hyper-parameters; the overall architecture is presented below.
                    <br>
                    <br>
                    <img src="assets/images/overall.png" class="interpolation-image"
                         alt="" style="display: block; margin-left: auto; margin-right: auto; 
                         width: 40%; height: auto"/>
                    <!-- where \(\alpha \) and \(\lambda\) are hyper-parameters and the overall architecture is presented below. -->
                </p>
                    <img src="assets/images/pull.png" class="interpolation-image"
                         alt="" style="display: block; margin-left: auto; margin-right: auto; 
                         width: 70%; height: auto"/>
                    <p style="font-size: 100%; display: block; margin-left: auto; margin-right: auto; 
                    width: 77%; height: auto">
                    <b>Overall architecture of CoIT.</b> The observations are transformed following a Gaussian distribution 
                    \(\mathcal{G}(\mu, \sigma)\) and encoded by the state encoder \(g_\xi\).
                    The observation encoder \(g_{\bar{\xi}}\) and projection \(f_{\bar{\xi}}\) are exponential moving average
                    versions of the state encoder and projection.
                    </p>
                </div>
            </div>
        </div>
    </div>
</section>

<!-- Model
<section class="section">
    <div class="container is-max-widescreen">
        <div class="rows">
            <div class="rows is-centered ">
                <div class="row is-full-width">
                    <h2 class="title is-3"><span
                            class="dvima">VIMA: Visuomotor Attention Model</span></h2>
                    <video poster="" autoplay controls muted loop height="100%">
                        <source src="assets/videos/vima_arch_animation.mp4"
                                type="video/mp4">
                    </video>
                    <span style="font-size: 110%">
<span style="font-weight: bold">VIMA architecture.</span> We encode the multimodal prompts with a pre-trained T5 model, and condition the robot controller on the prompt through cross-attention layers. The controller is a causal transformer decoder consisting of alternating self and cross attention layers that predicts motor commands conditioned on prompts and interaction history.</span>
                </div>
            </div>

        </div>
    </div>
</section> -->

<!-- <hr width="1150" style="margin-left: auto; margin-right: auto">

<section class="section">
    <div class="container is-max-widescreen">

        <div class="row">
            <div class="rows is-centered has-text-centered">
                <div class="row is-full-width">
                    <h2 class="title is-3"><span
                            class="dvima">VIMA-Bench: Benchmark for Multimodal Robot Learning</span></h2>
                    <div class="content has-text-justified">
                    <span style="font-size: 125%">
                        We provide 17 representative meta-tasks with multimodal prompt templates, which can be procedurally instantiated into thousands of individual tasks by various combinations of textures and tabletop objects.
                    </span>

                    <br>
                    <br>
                    <br>

                    <div class="columns">
                        Simple Object Manipulation
                        <div class="column has-text-left">
                            <h3 class="title is-5">Simple Object Manipulation</h3>
                            <div class="select is-medium">
                                <select id="simple-object-manipulation-menu-tasks"
                                        onchange="updateDemoVideo('simple-object-manipulation')">
                                    <option value="simple_manipulation" selected="selected">Visual Manipulation</option>
                                    <option value="rotate">Rotate</option>
                                    <option value="scene_understanding">Scene Understanding</option>
                                </select>
                            </div>
                            <div class="select is-medium">
                                <select id="simple-object-manipulation-menu-instances"
                                        onchange="updateDemoVideo('simple-object-manipulation')">
                                    <option value="1" selected="selected">1</option>
                                    <option value="2">2</option>
                                    <option value="3">3</option>
                                    <option value="4">4</option>
                                    <option value="5">5</option>
                                </select>
                            </div>
                            <video id="simple-object-manipulation-single-task-video"
                                   controls
                                   muted
                                   autoplay
                                   loop
                                   width="100%">
                                <source src="assets/videos/demos/simple_object_manipulation/simple_manipulation/1.mp4"
                                        type="video/mp4">
                            </video>
                        </div>

                        Visual Goal Reaching
                        <div class="column has-text-left">
                            <h3 class="title is-5">Visual Goal Reaching</h3>
                            <div class="select is-medium">
                                <select id="visual-goal-reaching-menu-tasks"
                                        onchange="updateDemoVideo('visual-goal-reaching')">
                                    <option value="rearrange" selected="selected">Rearrange</option>
                                    <option value="rearrange_then_restore">Rearrange Then Restore</option>
                                </select>
                            </div>
                            <div class="select is-medium">
                                <select id="visual-goal-reaching-menu-instances"
                                        onchange="updateDemoVideo('visual-goal-reaching')">
                                    <option value="1" selected="selected">1</option>
                                    <option value="2">2</option>
                                    <option value="3">3</option>
                                    <option value="4">4</option>
                                    <option value="5">5</option>
                                </select>
                            </div>
                            <video id="visual-goal-reaching-single-task-video"
                                   controls
                                   muted
                                   autoplay
                                   loop
                                   width="100%">
                                <source src="assets/videos/demos/visual_goal_reaching/rearrange/1.mp4"
                                        type="video/mp4">
                            </video>
                        </div>


                    </div>

                    <br>
                    <div class="columns">
                        Novel Concept Grounding
                        <div class="column has-text-left">
                            <h3 class="title is-5">Novel Concept Grounding</h3>
                            <div class="select is-medium">
                                <select id="novel-concept-grounding-menu-tasks"
                                        onchange="updateDemoVideo('novel-concept-grounding')">
                                    <option value="novel_adj_and_noun" selected="selected">Novel Adjective and Noun
                                    </option>
                                    <option value="novel_adj">Novel Adjective</option>
                                    <option value="novel_noun">Novel Noun</option>
                                    <option value="twist">Twist</option>
                                </select>
                            </div>
                            <div class="select is-medium">
                                <select id="novel-concept-grounding-menu-instances"
                                        onchange="updateDemoVideo('novel-concept-grounding')">
                                    <option value="1" selected="selected">1</option>
                                    <option value="2">2</option>
                                    <option value="3">3</option>
                                    <option value="4">4</option>
                                    <option value="5">5</option>
                                </select>
                            </div>
                            <video id="novel-concept-grounding-single-task-video"
                                   controls
                                   muted
                                   autoplay
                                   loop
                                   width="100%">
                                <source src="assets/videos/demos/novel_concept_grounding/novel_adj_and_noun/1.mp4"
                                        type="video/mp4">
                            </video>
                        </div>

                        One Shot Video Imitation
                        <div class="column has-text-left">
                            <h3 class="title is-5">One-shot Video Imitation</h3>
                            <div class="select is-medium">
                                <select id="one-shot-video-imitation-menu-tasks"
                                        onchange="updateDemoVideo('one-shot-video-imitation')">
                                    <option value="follow_motion" selected="selected">Follow Motion</option>
                                    <option value="follow_order">Follow Order</option>
                                </select>
                            </div>
                            <div class="select is-medium">
                                <select id="one-shot-video-imitation-menu-instances"
                                        onchange="updateDemoVideo('one-shot-video-imitation')">
                                    <option value="1" selected="selected">1</option>
                                    <option value="2">2</option>
                                    <option value="3">3</option>
                                    <option value="4">4</option>
                                    <option value="5">5</option>
                                </select>
                            </div>
                            <video id="one-shot-video-imitation-single-task-video"
                                   controls
                                   muted
                                   autoplay
                                   loop
                                   width="100%">
                                <source src="assets/videos/demos/one_shot_video_imitation/follow_motion/1.mp4"
                                        type="video/mp4">
                            </video>
                        </div>
                    </div>

                    <br>
                    <div class="columns">
                        Visual Constraint Satisfaction
                        <div class="column has-text-left">
                            <h3 class="title is-5">Visual Constraint Satisfaction</h3>
                            <div class="select is-medium">
                                <select id="visual-constraint-satisfaction-menu-tasks"
                                        onchange="updateDemoVideo('visual-constraint-satisfaction')">
                                    <option value="sweep_without_exceeding" selected="selected">Sweep without
                                        Exceeding
                                    </option>
                                    <option value="sweep_without_touching">Sweep without Touching</option>
                                </select>
                            </div>
                            <div class="select is-medium">
                                <select id="visual-constraint-satisfaction-menu-instances"
                                        onchange="updateDemoVideo('visual-constraint-satisfaction')">
                                    <option value="1" selected="selected">1</option>
                                    <option value="2">2</option>
                                    <option value="3">3</option>
                                    <option value="4">4</option>
                                    <option value="5">5</option>
                                </select>
                            </div>
                            <video id="visual-constraint-satisfaction-single-task-video"
                                   controls
                                   muted
                                   autoplay
                                   loop
                                   playbackRate="2.0"
                                   width="100%">
                                <source src="assets/videos/demos/visual_constraint_satisfaction/sweep_without_exceeding/1.mp4"
                                        type="video/mp4">
                            </video>
                        </div>

                        Visual Reasoning
                        <div class="column has-text-left">
                            <h3 class="title is-5">Visual Reasoning</h3>
                            <div class="select is-medium">
                                <select id="visual-reasoning-menu-tasks" onchange="updateDemoVideo('visual-reasoning')">
                                    <option value="manipulate_old_neighbor" selected="selected">Manipulate Old
                                        Neighbor
                                    </option>
                                    <option value="pick_in_order_then_restore">Pick in Order Then Restore</option>
                                    <option value="same_color">Same Texture</option>
                                    <option value="same_profile">Same Shape</option>
                                </select>
                            </div>
                            <div class="select is-medium">
                                <select id="visual-reasoning-menu-instances"
                                        onchange="updateDemoVideo('visual-reasoning')">
                                    <option value="1" selected="selected">1</option>
                                    <option value="2">2</option>
                                    <option value="3">3</option>
                                    <option value="4">4</option>
                                    <option value="5">5</option>
                                </select>
                            </div>
                            <video id="visual-reasoning-single-task-video"
                                   controls
                                   muted
                                   autoplay
                                   loop
                                   width="100%">
                                <source src="assets/videos/demos/visual_reasoning/manipulate_old_neighbor/1.mp4"
                                        type="video/mp4">
                            </video>
                        </div>
                    </div>
                </div>
            </div>

        </div>
    </div>
</section> -->

<hr width="1200" style="margin-left: auto; margin-right: auto">

<!--Experiments-->
<section class="section">
    <!--
        Experiments section: benchmark description, evaluation results figure,
        and two ablation figures. The three nested "content" wrappers mirror the
        tree the browser built from the original markup; every wrapper and the
        final caption span are now closed explicitly.
    -->
    <div class="container is-max-widescreen">
        <div class="rows">
            <div class="rows is-centered">
                <div class="row is-full-width">
                    <h2 class="title is-3">
                        <span class="dvima">Experiments</span>
                    </h2>
                    <div class="content has-text-justified">
                        <!-- A list may not live inside a paragraph, so the intro <p> ends before the <ul>. -->
                        <p style="font-size: 125%">
                            We benchmark our method on the DeepMind control suite and compare CoIT with prior model-free methods that
                            use data augmentation and contrastive learning to improve data-efficiency:
                        </p>
                        <ul style="font-size: 125%">
                            <li>
                                <a href="https://arxiv.org/abs/2107.09645"><span>DrQ-v2</span></a>: introducing a simple data augmentation called <i>random shift</i> into a
                                fine-tuned DDPG.
                            </li>
                            <li>
                                <a href="https://arxiv.org/abs/2004.04136"><span>CURL</span></a>: combining SAC with a self-supervised learning framework.
                            </li>
                            <li>
                                Vanilla SAC and DDPG trained directly from the image input.
                            </li>
                        </ul>
                        <p style="font-size: 125%">
                            We also present ablation studies to show the details of our method.
                        </p>

                        <h3 class="title is-4">
                            <span class="dvima">Evaluation Results</span>
                        </h3>
                        <div class="content has-text-justified">
                            <img src="assets/images/dmc_results.png" class="interpolation-image"
                                 alt="Learning curves of CoIT and baseline methods on 8 DMControl tasks"
                                 style="display: block; margin-left: auto; margin-right: auto">
                            <br>
                            <span style="font-size: 110%">
                                <b>Results of 8 complex tasks in DMControl.</b> These tasks are chosen to offer varying degrees of challenge,
                                including complex dynamics, sparse rewards, hard exploration, and more. Below are key findings:
                                <!-- (i)  CoIT outperforms
                                vanilla DDPG and SAC in a wide range.  -->
                                (i) Although DrQ-v2 has already performed remarkably for continuous control, CoIT is
                                more data-efficient on multiple tasks.
                                (ii) From general trends of the learning curves,
                                CoIT improves or keeps the data-efficiency in a more stable manner, which is not trivial on DMControl tasks.
                            </span>
                            <br>

                            <h3 class="title is-4">
                                <span class="dvima">Ablation Studies</span>
                            </h3>
                            <div class="content has-text-justified">
                                <br>

                                <img src="assets/images/heatmap.png" class="interpolation-image"
                                     alt="Saliency maps for CoIT on six DMControl tasks"
                                     style="display: block; margin-left: auto; margin-right: auto">
                                <br>
                                <span style="font-size: 110%">
                                    <span style="font-weight: bold">Saliency maps for CoIT on 6 tasks:</span>
                                    (a) Cartpole Swingup Sparse,
                                    (b) Hopper Stand,
                                    (c) Walker Run,
                                    (d) Cheetah Run,
                                    (e) Quadruped Walk, and
                                    (f) Finger Turn Hard.
                                    These saliency maps demonstrate that CoIT helps the agent focus on task-relevant elements
                                    like the whole robot body and ignore task-irrelevant information like the floor and background.
                                    Especially in <i>Finger Turn Hard</i>, the brightest part in the saliency map is a red ball in the
                                    observation, which is highly related to the reward.
                                </span>

                                <br>
                                <br>
                                <br>

                                <img src="assets/images/gaussian_dis.png" class="interpolation-image"
                                     alt="Curves of the Gaussian distribution parameters used for image transformation"
                                     style="display: block; margin-left: auto; margin-right: auto">
                                <br>
                                <span style="font-size: 110%">
                                    <span style="font-weight: bold">Visualization of the parameters of the Gaussian distribution for image transformation. </span>
                                    These curves demonstrate that the Gaussian distribution proposed in CoIT
                                    can automatically find an appropriate transformation to smooth the distribution shift between the different views
                                    of the same observation, and is therefore beneficial to representation learning.
                                </span>
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>
    </div>
</section>

<hr width="1200" style="margin-left: auto; margin-right: auto">

<!--Conclusion-->
<section class="section">
    <!-- Conclusion section: a single summary paragraph. All wrapper divs are closed explicitly. -->
    <div class="container is-max-widescreen">
        <div class="row">
            <div class="rows is-centered">
                <div class="row is-full-width">
                    <h2 class="title is-3">
                        <span class="dvima">Conclusion</span>
                    </h2>
                    <div class="content has-text-justified">
                        <p style="font-size: 125%">
                            This work introduces CoIT, a novel pixel transformation for model-free RL algorithms that
                            significantly improves data-efficiency and stability on visual tasks. We theoretically analyze how
                            the learnable transformation constrains the distribution of the abstracted data, and dissect its benefits
                            to representation learning. CoIT requires no additional modifications to the backbone RL
                            algorithm and is easy to implement. We compare CoIT to SOTA methods on popular benchmarks and
                            verify that it achieves promising performance with improved stability. We hope that contrastive invariant
                            transformation can lead to a new branch of representation learning in RL.
                        </p>
                    </div>
                </div>
            </div>
        </div>
    </div>
</section>

<hr width="1200" style="margin-left: auto; margin-right: auto">

<section class="section" id="BibTeX">
    <!--
        Citation block. Whitespace inside <pre> is rendered literally, so the
        entry is deliberately flush-left in the source: indenting it to match the
        surrounding markup would show up as leading spaces and blank lines when
        the entry is displayed or copied.
    -->
    <div class="row">
        <div class="rows is-centered">
            <div class="row is-full-width">
                <div class="container is-max-widescreen content">
                    <h2 class="title">BibTeX</h2>
                    <pre><code style="font-size: 100%; margin-left: auto; margin-right: auto; width: auto; height: auto">@inproceedings{liudata,
    title={On the Data-Efficiency with Contrastive Image Transformation in Reinforcement Learning},
    author={Liu, Sicong and Zhang, Xi Sheryl and Li, Yushuo and Zhang, Yifan and Cheng, Jian},
    booktitle={The Eleventh International Conference on Learning Representations}
}</code></pre>
                </div>
            </div>
        </div>
    </div>
</section>

<!--Footer-->
<footer class="footer">
    <!-- Single centered column crediting the page templates this site is based on. -->
    <div class="container">
        <div class="columns is-centered has-text-centered">
            <div class="column">
                <div class="content">
                    <p>
                        Website template borrowed from
                        <a href="https://github.com/nerfies/nerfies.github.io">NeRFies</a>
                        and
                        <a href="https://github.com/cliport/cliport.github.io">CLIPort</a>.
                    </p>
                </div>
            </div>
        </div>
    </div>
</footer>

</body>
</html>