<!-- index.html -->

<!DOCTYPE html>
<html lang="en">
<head>
  <!-- Encoding and rendering hints: declared once, ahead of all other head content. -->
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <meta name="description"
        content="PQ3D">
  <meta name="keywords" content="PQ3D">
  <title>PQ3D</title>

  <!-- Global site tag (gtag.js) - Google Analytics -->
  <script async src="https://www.googletagmanager.com/gtag/js?id=G-PYVRSFMDRL"></script>
  <script>
    // Reuse the global dataLayer array if one already exists, otherwise create it.
    window.dataLayer = window.dataLayer || [];

    // Appends the call's arguments object to dataLayer as a single entry.
    function gtag() {
      dataLayer.push(arguments);
    }

    gtag('js', new Date());

    gtag('config', 'G-PYVRSFMDRL');
  </script>

  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
        rel="stylesheet">

  <!-- Stylesheets: the order below is preserved because later sheets override earlier ones. -->
  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">
  <link rel="icon" href="./file/pq3d-logo.png">

  <!-- Scripts: jQuery is loaded first, ahead of the Bulma plugins and index.js. -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>
  <!-- from sam graph -->
  <link href="css/bootstrap-4.4.1.css" rel="stylesheet">
  <link href="css/css2" rel="stylesheet">
  <link rel="stylesheet" href="css/index.css">
</head>
<body>

  <!-- Top navigation: home link plus a dropdown of related research projects. -->
  <nav class="navbar" role="navigation" aria-label="main navigation">
    <div class="navbar-brand">
      <a role="button" class="navbar-burger" aria-label="menu" aria-expanded="false">
        <span aria-hidden="true"></span>
        <span aria-hidden="true"></span>
        <span aria-hidden="true"></span>
      </a>
    </div>
    <div class="navbar-menu">
      <div class="navbar-start" style="flex-grow: 1; justify-content: center;">
        <!-- Icon-only link: aria-label supplies the accessible name the icon cannot. -->
        <a class="navbar-item" target="_blank" rel="noopener" href="https://zhuziyu-edward.github.io" aria-label="Home">
          <span class="icon">
            <i class="fas fa-home" aria-hidden="true"></i>
          </span>
        </a>

        <div class="navbar-item has-dropdown is-hoverable">
          <a class="navbar-link">
            More Research
          </a>
          <!-- Every entry opens in a new tab; rel="noopener" keeps the new page from reaching window.opener. -->
          <div class="navbar-dropdown">
            <a class="navbar-item" target="_blank" rel="noopener" href="https://sqa3d.github.io/">
              SQA3D
            </a>
            <a class="navbar-item" target="_blank" rel="noopener" href="https://3d-vista.github.io/">
              3D-VisTA
            </a>
            <a class="navbar-item" target="_blank" rel="noopener" href="https://embodied-generalist.github.io/">
              LEO
            </a>
            <a class="navbar-item" target="_blank" rel="noopener" href="https://scene-verse.github.io/">
              SceneVerse
            </a>
            <a class="navbar-item" target="_blank" rel="noopener" href="https://sg-3d.github.io/">
              SG3D
            </a>
          </div>
        </div>
      </div>
    </div>
  </nav>

<!-- Hero: logo, paper title, venue, authors, affiliations, and resource links. -->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <img src="file/pq3d-logo.png" width="35%" alt="PQ3D logo">
          <h1 class="title is-2 publication-title">Unifying 3D Vision-Language Understanding via Promptable Queries</h1>
          <div class="is-size-4"><b>ECCV 2024</b></div>
          <!-- Authors: superscript digits refer to the affiliations listed below; the envelope
               character (&#x1f4e7;) is the marker the commented-out legend below calls
               "corresponding author". -->
          <div class="is-size-5 publication-authors">
            <span class="author-block">
              <a href="https://zhuziyu-edward.github.io">Ziyu Zhu</a><sup>1,2</sup> &nbsp;&nbsp;&nbsp;
              <a href="https://tongclass.ac.cn/author/zhuofan-zhang/">Zhuofan Zhang</a><sup>1,2</sup> &nbsp;&nbsp;&nbsp;
              <a href="https://jeasinema.github.io">Xiaojian Ma</a><sup>2</sup> &nbsp;&nbsp;&nbsp;
              <a href="https://nxsedson.github.io">Xuesong Niu</a><sup>2</sup> &nbsp;&nbsp;&nbsp;
              <a href="https://yixchen.github.io">Yixin Chen</a><sup>2</sup> &nbsp;&nbsp;&nbsp;
              <a href="https://buzz-beater.github.io">Baoxiong Jia</a><sup>2</sup> &nbsp;&nbsp;&nbsp;
              <br>
              <a href="https://www.cs.tsinghua.edu.cn/csen/info/1165/4052.htm">Zhidong Deng</a><sup>1&#x1f4e7;</sup> &nbsp;&nbsp;&nbsp;
              <a href="https://siyuanhuang.com/">Siyuan Huang</a><sup>2&#x1f4e7;</sup> &nbsp;&nbsp;&nbsp;
              <a href="https://liqing.io/">Qing Li</a><sup>2&#x1f4e7;</sup> &nbsp;&nbsp;&nbsp;
            </span>
          </div>

          <div class="is-size-5 publication-authors">
            <!-- <span class="author-block">✉️ corresponding author</span> <br> -->
            <span class="author-block"><sup>1</sup>Tsinghua University</span> &nbsp;&nbsp;&nbsp;
            <span class="author-block"><sup>2</sup>Beijing Institute for General Artificial Intelligence (BIGAI)</span> &nbsp;&nbsp;&nbsp;<br>
          </div>

          <div class="column has-text-centered">
            <div class="publication-links">
              <!-- PDF Link. -->
              <!-- <span class="link-block">
                <a href=""
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fas fa-file-pdf"></i>
                  </span>
                  <span>Paper</span>
                </a>
              </span> -->
              <!-- arXiv Link. -->
              <span class="link-block">
                <a href="https://arxiv.org/abs/2405.11442"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="ai ai-arxiv"></i>
                  </span>
                  <span>arXiv</span>
                </a>
              </span>
              <!-- Video Link. -->
              <!-- <span class="link-block">
                <a href=""
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fab fa-youtube"></i>
                  </span>
                  <span>Video</span>
                </a>
              </span> -->
              <!-- Code Link. -->
              <span class="link-block">
                <a href="https://github.com/PQ3D/PQ3D"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fab fa-github"></i>
                  </span>
                  <span>Code</span>
                </a>
              </span>
              <!-- Thread on X Link. -->
              <span class="link-block">
                <a href="https://twitter.com/jeasinema/status/1805791108271948287"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fa-brands fa-x-twitter" aria-hidden="true"></i>
                  </span>
                  <span>Thread on X</span>
                </a>
              </span>

              <!-- Demo Link. -->
              <span class="link-block">
                <a href="https://huggingface.co/spaces/li-qing/PQ3D-Demo"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fa regular fa-comments"></i>
                  </span>
                  <span>Demo</span>
                </a>
              </span>

              <!-- Poster Link. -->
              <span class="link-block">
                <a href="file/poster.pdf"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fa fa-book"></i>
                  </span>
                  <span>Poster</span>
                </a>
              </span>

            </div>
          </div>

        </div>
      </div>
    </div>
  </div>
</section>

<!-- Teaser: one-sentence summary followed by the embedded YouTube video. -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="hero-body">
      <h2 class="subtitle has-text-justified">
        <b>TL;DR:</b> We propose PQ3D, a unified model for 3D vision-language understanding, capable of taking various prompts and representations to perform a wide range of tasks in a 3D scene.
      </h2>
      <!-- title names the frame for assistive technology; allowfullscreen enables the player's fullscreen control. -->
      <iframe width="1050" height="560"
              src="https://www.youtube.com/embed/XRxnEpgeljE?si=v1QTDo1d_hc6wHi1"
              title="PQ3D video on YouTube"
              allowfullscreen>
      </iframe>
    </div>
  </div>
</section>


<section class="section">
  <div class="container is-max-desktop">
    <!-- Abstract. -->
    <div class="columns is-centered has-text-centered">
      <div class="column">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p>
            A unified model for 3D vision-language (3D-VL) understanding is expected to take various scene representations and perform a wide
            range of tasks in a 3D scene. However, a considerable gap exists between existing methods and such a unified model, due to the
            independent application of representation and insufficient exploration of 3D multi-task training. In this paper, we introduce PQ3D,
            a unified model capable of using Promptable Queries to tackle a wide range of 3D-VL tasks, from low-level instance segmentation to
            high-level reasoning and planning.
          </p>
          <p>
            This is achieved through three key innovations: (1) unifying various 3D scene representations (i.e., voxels, point clouds,
            multi-view images) into a shared 3D coordinate space by segment-level grouping, (2) an attention-based query decoder for
            task-specific information retrieval guided by prompts, and (3) universal output heads for different tasks to support multi-task
            training.
          </p>
          <p>
            Tested across ten diverse 3D-VL datasets, PQ3D demonstrates impressive performance on these tasks, setting new records on most
            benchmarks. Particularly, PQ3D improves the state-of-the-art on ScanNet200 by 4.9% (AP25), ScanRefer by 5.4% (acc@0.5),
            Multi3DRefer by 11.7% (F1@0.5), and Scan2Cap by 13.4% (CIDEr@0.5). Moreover, PQ3D supports flexible inference with individual or
            combined forms of available 3D representations, e.g., solely voxel input.
          </p>
        </div>
        <img id="painting_icon" width="100%" src="file/teaser.png" alt="PQ3D teaser figure">
      </div>
    </div>
  </div>
</section>

<section class="section" style="background-color:#efeff081">
  <div class="container is-max-desktop">
    <!-- Contribution. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-six-fifths">
        <h2 class="title is-3">Contribution</h2>
        <div class="content has-text-justified">
          <p>
            Our main contributions are:
          </p>
          <!-- Each item is a bold label followed by its description in a slightly smaller span. -->
          <ol type="1">
            <li><b>PQ3D model</b>. <span style="font-size: 95%;">We propose a unified model adept at handling a broad spectrum of 3D vision-language tasks, ranging from low-level instance segmentation to high-level reasoning and planning.</span></li>
            <li><b>Representations alignment</b>. <span style="font-size: 95%;">Our model uniquely aligns voxels, point clouds, and multi-view images into a shared 3D space and employs an attention-based query decoder to adaptively extract task-relevant features guided by prompts, offering a flexible approach to model all 3D-VL tasks.</span></li>
            <li><b>Performance</b>. <span style="font-size: 95%;">In our extensive experimentation across various 3D-VL tasks, PQ3D not only achieves competitive results but also sets new records in most of the tasks.</span></li>
          </ol>
        </div>
      </div>
    </div>
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <!-- PQ3D model overview video. -->
    <div class="columns is-centered has-text-centered">
      <div class="column">
        <!-- The heading icon is decorative, so its alt text is empty. -->
        <h2 class="title is-3"> <img id="painting_icon" width="4%" src="https://img.icons8.com/?size=1x&amp;id=oOOSYZyuA844&amp;format=png" alt=""> PQ3D Model</h2>
        <div class="content has-text-justified">
          <p>
          </p>
        </div>
        <div style="width: 100%; margin: 0 auto;">
          <!-- Muted, looping autoplay clip; playsinline asks mobile browsers to play it in place
               instead of switching to fullscreen playback. -->
          <video id="scene_representation" autoplay muted loop playsinline height="100%">
            <source src="file/model_web.mp4" type="video/mp4">
          </video>
        </div>
      </div>
    </div>
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <!-- Qualitative examples. -->
    <div class="columns is-centered has-text-centered">
      <div class="column">
        <!-- The heading icon is decorative, so its alt text is empty. -->
        <h2 class="title is-3"> <img id="painting_icon" width="4%" src="https://img.icons8.com/?size=1x&amp;id=sei9JmRv5YVb&amp;format=png" alt=""> Examples on 3D VL Understanding </h2>
        <div class="content has-text-justified">
          <p>
            Qualitative results on promptable segmentation, visual grounding, question answering, dense captioning, object navigation, and task planning.
          </p>
        </div>
        <img id="painting_icon" width="100%" src="file/qualitative.png" alt="Qualitative results of PQ3D across 3D vision-language tasks">
      </div>
    </div>
  </div>
</section>


<!-- Promptable segmentation showcase: static figure plus an embedded Sketchfab playlist. -->
<section>
  <div class="container">
    <div class="row">
      <div class="col-12 text-center">
        <!-- The heading icon is decorative, so its alt text is empty. -->
        <h2 class="title is-3"> <img id="painting_icon" width="4%" src="https://img.icons8.com/?size=1x&amp;id=43611&amp;format=png" alt=""> Promptable Segmentation Results</h2>
        <hr style="margin-top:0px">
        <img id="painting_icon" width="90%" src="file/segmentation_prompt.png" alt="Promptable segmentation results">
        <div class="embed-responsive embed-responsive-16by9">

          <iframe style="clip-path: inset(1px 1px)" width="100%" height="100%" src="https://sketchfab.com/playlists/embed?collection=25b7b3544c884c9e8f0ca319c1950bc8&amp;autostart=0" title="Promptable segmentation results: Sketchfab playlist" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture; fullscreen" mozallowfullscreen="true" webkitallowfullscreen="true"></iframe>
        </div>
        <br>
      </div>
    </div>
  </div>
</section>
<br>

<!-- Citation block. The text inside <pre><code> is shown exactly as written, including its
     leading spaces, so whitespace edits here change what readers copy.
     NOTE(review): the entry is typed @article with journal={ECCV}; conference papers are
     commonly cited as @inproceedings with a booktitle field. Confirm with the authors before changing. -->
<section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title">BibTeX</h2>
    <pre><code>@article{zhu2024unifying,
      title={Unifying 3D Vision-Language Understanding via Promptable Queries},
      author={Zhu, Ziyu and Zhang, Zhuofan and Ma, Xiaojian and Niu, Xuesong and Chen, Yixin and Jia, Baoxiong and Deng, Zhidong and Huang, Siyuan and Li, Qing},
      journal={ECCV},
      year={2024}
}</code></pre>
  </div>
</section>

<!-- Page footer: credits the source of the website template. -->
<footer class="footer">
  <div class="container">
    <div class="columns is-centered">
      <div class="column">
        <div class="content has-text-centered">
          <p>
            Website template borrowed from
            <a href="https://github.com/nerfies/nerfies.github.io">NeRFies</a>.
          </p>
        </div>
      </div>
    </div>
  </div>
</footer>

<script src="js/script.js"></script>