<!-- index.html -->

<!DOCTYPE html>
<!-- The page content is English; declaring it lets assistive technology and translation tools pick the right language. -->
<html lang="en">
<head>
  <!-- Document metadata. -->
  <meta charset="utf-8">
  <meta name="description" content="Use co-occurrence to discover region-word pairs from image-text pairs for open-vocabulary object detection pre-training.">
  <meta name="keywords" content="CoDet, Co-occurrence, Open-vocabulary Detection">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>CoDet: Co-Occurrence Guided Region-Word Alignment for Open-Vocabulary Object Detection</title>

  <!-- Google Analytics (gtag.js): asynchronous loader, then the inline bootstrap that queues calls on dataLayer. -->
  <script src="https://www.googletagmanager.com/gtag/js?id=G-PYVRSFMDRL" async></script>
  <script>
    window.dataLayer = window.dataLayer || [];
    function gtag() { dataLayer.push(arguments); }
    gtag('js', new Date());
    gtag('config', 'G-PYVRSFMDRL');
  </script>

  <!-- Web fonts. -->
  <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro">

  <!-- Stylesheets: Bulma and its extensions, icon fonts, then the page-specific rules last. -->
  <link href="./static/css/bulma.min.css" rel="stylesheet">
  <link href="./static/css/bulma-carousel.min.css" rel="stylesheet">
  <link href="./static/css/bulma-slider.min.css" rel="stylesheet">
  <link href="./static/css/fontawesome.all.min.css" rel="stylesheet">
  <link href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css" rel="stylesheet">
  <link href="./static/css/index.css" rel="stylesheet">
<!--  <link rel="icon" href="./static/images/favicon.svg">-->

  <!-- Scripts: jQuery is loaded first, the page script last; only the Font Awesome script is deferred. -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="./static/js/fontawesome.all.min.js" defer></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>
</head>
<body>

<!--<nav class="navbar" role="navigation" aria-label="main navigation">-->
<!--  <div class="navbar-brand">-->
<!--    <a role="button" class="navbar-burger" aria-label="menu" aria-expanded="false">-->
<!--      <span aria-hidden="true"></span>-->
<!--      <span aria-hidden="true"></span>-->
<!--      <span aria-hidden="true"></span>-->
<!--    </a>-->
<!--  </div>-->
<!--  <div class="navbar-menu">-->
<!--    <div class="navbar-start" style="flex-grow: 1; justify-content: center;">-->
<!--      <a class="navbar-item" href="https://keunhong.com">-->
<!--      <span class="icon">-->
<!--          <i class="fas fa-home"></i>-->
<!--      </span>-->
<!--      </a>-->

<!--      <div class="navbar-item has-dropdown is-hoverable">-->
<!--        <a class="navbar-link">-->
<!--          More Research-->
<!--        </a>-->
<!--        <div class="navbar-dropdown">-->
<!--          <a class="navbar-item" href="https://hypernerf.github.io">-->
<!--            HyperNeRF-->
<!--          </a>-->
<!--          <a class="navbar-item" href="https://nerfies.github.io">-->
<!--            Nerfies-->
<!--          </a>-->
<!--          <a class="navbar-item" href="https://latentfusion.github.io">-->
<!--            LatentFusion-->
<!--          </a>-->
<!--          <a class="navbar-item" href="https://photoshape.github.io">-->
<!--            PhotoShape-->
<!--          </a>-->
<!--        </div>-->
<!--      </div>-->
<!--    </div>-->

<!--  </div>-->
<!--</nav>-->


<!-- Hero banner: paper title, author list, affiliations and resource links. -->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-3 publication-title">CoDet: Co-Occurrence Guided Region-Word Alignment for Open-Vocabulary Object Detection</h1>

          <!-- Authors. Superscripts refer to the numbered affiliations below.
               Every entry except the last ends with a separating comma. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block">
              <a href="https://machuofan.github.io/">Chuofan Ma</a><sup>1,2</sup>,</span>
            <span class="author-block">
              <a href="https://enjoyyi.github.io/">Yi Jiang</a><sup>2</sup>,</span>
            <span class="author-block">
              <a href="https://wen-xin.info/">Xin Wen</a><sup>1</sup>,</span>
            <span class="author-block">
              Zehuan Yuan<sup>2</sup>,</span>
            <span class="author-block">
              <a href="https://xjqi.github.io/">Xiaojuan Qi</a><sup>1</sup></span>
          </div>

          <!-- Affiliations, numbered to match the author superscripts. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block"><sup>1</sup>University of Hong Kong,</span>
            <span class="author-block"><sup>2</sup>ByteDance</span>
          </div>

          <!-- Resource buttons. Only the arXiv and Code links are live; the
               commented-out entries are unused template placeholders. -->
          <div class="column has-text-centered">
            <div class="publication-links">
              <!-- PDF Link. -->
<!--              <span class="link-block">-->
<!--                <a href="https://arxiv.org/pdf/2011.12948"-->
<!--                   class="external-link button is-normal is-rounded is-dark">-->
<!--                  <span class="icon">-->
<!--                      <i class="fas fa-file-pdf"></i>-->
<!--                  </span>-->
<!--                  <span>Paper</span>-->
<!--                </a>-->
<!--              </span>-->
              <!-- arXiv Link. -->
              <span class="link-block">
                <a href="https://arxiv.org/abs/2310.16667"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="ai ai-arxiv"></i>
                  </span>
                  <span>arXiv</span>
                </a>
              </span>
              <!-- Video Link. -->
<!--              <span class="link-block">-->
<!--                <a href="https://www.youtube.com/watch?v=MrKrnHhk8IA"-->
<!--                   class="external-link button is-normal is-rounded is-dark">-->
<!--                  <span class="icon">-->
<!--                      <i class="fab fa-youtube"></i>-->
<!--                  </span>-->
<!--                  <span>Video</span>-->
<!--                </a>-->
<!--              </span>-->
              <!-- Code Link. -->
              <span class="link-block">
                <a href="https://github.com/CVMI-Lab/CoDet"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fab fa-github"></i>
                  </span>
                  <span>Code</span>
                </a>
              </span>
              <!-- Dataset Link. -->
<!--              <span class="link-block">-->
<!--                <a href="https://github.com/google/nerfies/releases/tag/0.1"-->
<!--                   class="external-link button is-normal is-rounded is-dark">-->
<!--                  <span class="icon">-->
<!--                      <i class="far fa-images"></i>-->
<!--                  </span>-->
<!--                  <span>Data</span>-->
<!--                </a>-->
<!--              </span>  -->
            </div>

          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- Teaser: headline figure followed by a one-sentence summary of the method. -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="hero-body">
      <img alt="Teaser image." src="static/images/teaser.png">
      <h2 class="subtitle has-text-centered">CoDet leverages concept co-occurrence among image-text pairs to discover pseudo region-word pairs for open-vocabulary detection pre-training.</h2>
    </div>
  </div>
</section>

<!-- Motivation: why similarity-based pseudo-labelling is circular and how co-occurrence avoids it. -->
<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-fullwidth">
        <h2 class="title is-3">Breaking the Chicken-and-Egg Problem</h2>
        <!-- Alt text added so the figure is announced by screen readers and described when it fails to load. -->
        <img src="static/images/meme.png"
             alt="Meme illustrating the chicken-and-egg problem in region-text alignment.">
        <div class="content has-text-justified">
          <p>
            Previous studies typically rely on region-text similarity to discover pseudo region-text pairs from image-text pairs.
            But a tricky thing is that, to get accurate similarity estimation, you need a region-level aligned vision-language space,
            which in turn requires abundant region-text pairs to train.
          </p>
          <p>
            To break the chicken-and-egg problem, we introduce co-occurrence based region-word alignment,
            which solely relies on region-region similarity to discover pseudo region-text pairs.
            A nice property of co-occurrence is that you only need to scale up visual pre-training to get higher-quality pseudo-labels.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- Abstract text followed by the framework overview figure. -->
<section class="section">
  <div class="container is-max-desktop">
    <!-- Abstract. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p>
            Deriving reliable region-word alignment from image-text pairs is critical to learn object-level
            vision-language representations for open-vocabulary object detection. Existing methods typically rely on
            pre-trained or self-trained vision-language models for alignment, which are prone to limitations in
            localization accuracy or generalization capabilities. In this paper, we propose CoDet, a novel approach
            that overcomes the reliance on pre-aligned vision-language space by reformulating region-word alignment
            as a co-occurring object discovery problem. Intuitively, by grouping images that mention a shared concept
            in their captions, objects corresponding to the shared concept shall exhibit high co-occurrence among the group.
            CoDet then leverages visual similarities to discover the co-occurring objects and align them with the shared concept.
            Extensive experiments demonstrate that CoDet has superior performances and compelling scalability
            in open-vocabulary detection, e.g., by scaling up the visual backbone, CoDet achieves 37.0 mask AP for novel classes
            and 44.7 mask AP for all classes on OV-LVIS, surpassing the previous SoTA by 4.2 and 9.8 mask AP, respectively.
          </p>
        </div>
      </div>
    </div>
    <!--/ Abstract. -->

    <!-- Main Figure. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
<!--        <h2 class="title is-3">Method</h2>-->
        <!-- Alt text added; the figure previously had no text alternative. -->
        <img src="./static/images/framework.png" alt="Overview of the CoDet framework.">
      </div>
    </div>
    <!--/ Main Figure. -->
  </div>
</section>

<!-- Pseudo-label quality comparison figure. -->
<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-fullwidth">
        <h2 class="title is-3">Pseudo-Label Quality</h2>
        <!-- Alt text added; the figure previously had no text alternative. -->
        <img src="static/images/comparison.png" alt="Comparison of pseudo-label quality.">
      </div>
    </div>
  </div>
</section>

<!-- Benchmark results figure. -->
<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Benchmark Results</h2>
        <!-- Alt text added; the figure previously had no text alternative. -->
        <img src="./static/images/ov_lvis_results.png" alt="Benchmark results on OV-LVIS.">
      </div>
    </div>
  </div>
</section>

<!-- Citation entry. The text inside <pre> is copied verbatim by readers, so its braces must balance:
     one opening brace after @inproceedings and exactly one closing brace at the end. -->
<section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title">BibTeX</h2>
    <pre><code>@inproceedings{ma2023codet,
    title={CoDet: Co-Occurrence Guided Region-Word Alignment for Open-Vocabulary Object Detection},
    author={Ma, Chuofan and Jiang, Yi and Wen, Xin and Yuan, Zehuan and Qi, Xiaojuan},
    booktitle={Advances in Neural Information Processing Systems},
    year={2023}
}</code></pre>
  </div>
</section>


<!-- Page footer: credit link to the website template. The commented-out blocks are unused
     template content; their closing tags are kept inside the comments so the live markup stays balanced. -->
<footer class="footer">
  <div class="container">
<!--    <div class="content has-text-centered">-->
<!--      <a class="icon-link"-->
<!--         href="./static/videos/nerfies_paper.pdf">-->
<!--        <i class="fas fa-file-pdf"></i>-->
<!--      </a>-->
<!--      <a class="icon-link" href="https://github.com/keunhong" class="external-link" disabled>-->
<!--        <i class="fab fa-github"></i>-->
<!--      </a>-->
<!--    </div>-->
    <div class="columns is-centered">
      <a href="https://github.com/nerfies/nerfies.github.io">Website template</a>
<!--      <div class="column is-8">-->
<!--        <div class="content">-->
<!--          <p>-->
<!--            This website is licensed under a <a rel="license"-->
<!--                                                href="http://creativecommons.org/licenses/by-sa/4.0/">Creative-->
<!--            Commons Attribution-ShareAlike 4.0 International License</a>.-->
<!--          </p>-->
<!--          <p>-->
<!--            This means you are free to borrow the <a-->
<!--              href="https://github.com/nerfies/nerfies.github.io">source code</a> of this website,-->
<!--            we just ask that you link back to this page in the footer.-->
<!--            Please remember to remove the analytics code included in the header of the website which-->
<!--            you do not want on your website.-->
<!--          </p>-->
<!--        </div>-->
<!--      </div>-->
    </div>
  </div>
</footer>

</body>
</html>