Add DAB-DETR Object detection/segmentation model #30803

Open · wants to merge 94 commits into base: main
Changes shown from 69 of 94 commits
8adf1bb
initial commit
conditionedstimulus May 14, 2024
8291122
encoder+decoder layer changes WIP
conditionedstimulus May 16, 2024
09e2516
architecture checks
conditionedstimulus May 21, 2024
8a004cf
working version of detection + segmentation
conditionedstimulus May 24, 2024
defbc43
fix modeling outputs
conditionedstimulus May 25, 2024
5cfbcfc
fix return dict + output att/hs
conditionedstimulus May 26, 2024
6c7564a
found the position embedding masking bug
conditionedstimulus May 27, 2024
35e056f
pre-training version
conditionedstimulus May 28, 2024
24a9d7a
added iamge processors
conditionedstimulus May 29, 2024
d9b7af4
typo in init.py
conditionedstimulus May 29, 2024
a171339
iterupdate set to false
conditionedstimulus May 29, 2024
b8b2201
fixed num_labels in class_output linear layer bias init
conditionedstimulus May 29, 2024
abe0698
multihead attention shape fixes
conditionedstimulus Jun 2, 2024
e60b555
test improvements
conditionedstimulus Jun 10, 2024
6dafb79
test update
conditionedstimulus Jun 11, 2024
5bbdca1
dab-detr model_doc update
conditionedstimulus Jun 12, 2024
4a5ac4f
dab-detr model_doc update2
conditionedstimulus Jun 12, 2024
592796b
test fix:test_retain_grad_hidden_states_attentions
conditionedstimulus Jun 12, 2024
d76fda2
config file clean and renaming variables
conditionedstimulus Jun 17, 2024
ade9720
config file clean and renaming variables fix
conditionedstimulus Jun 17, 2024
6b58e5f
updated convert_to_hf file
conditionedstimulus Jun 17, 2024
eac19f5
small fixes
conditionedstimulus Jun 17, 2024
460e9d6
style and qulity checks
conditionedstimulus Jun 17, 2024
0151f65
Merge branch 'main' into add_dab_detr
conditionedstimulus Jun 17, 2024
97194c7
return_dict fix
conditionedstimulus Jun 20, 2024
3fc56b4
Merge branch main into add_dab_detr
conditionedstimulus Jun 20, 2024
ffbb1dc
Merge branch main into add_dab_detr
conditionedstimulus Jun 20, 2024
a23b173
small comment fix
conditionedstimulus Jun 20, 2024
886087f
skip test_inputs_embeds test
conditionedstimulus Jun 20, 2024
42f469e
image processor updates + image processor test updates
conditionedstimulus Jun 21, 2024
52d1aea
check copies test fix update
conditionedstimulus Jun 21, 2024
7f0ada9
updates for check_copies.py test
conditionedstimulus Jun 21, 2024
28f30aa
updates for check_copies.py test2
conditionedstimulus Jun 21, 2024
b3713d1
tied weights fix
conditionedstimulus Jun 24, 2024
731d0ae
fixed image processing tests and fixed shared weights issues
conditionedstimulus Jun 25, 2024
ae43a4a
Merge branch 'main' into add_dab_detr
conditionedstimulus Jun 25, 2024
f952fd6
added numpy nd array option to get_Expected_values method in test_ima…
conditionedstimulus Jun 25, 2024
6e3af24
delete prints from test file
conditionedstimulus Jun 25, 2024
baa9af7
SafeTensor modification to solve HF Trainer issue
conditionedstimulus Jun 25, 2024
7de850d
removing the safetensor modifications
conditionedstimulus Jun 25, 2024
17ae1c4
make fix copies and hf uplaod has been added.
conditionedstimulus Jul 10, 2024
56f0846
Merge branch 'main' into add_dab_detr
conditionedstimulus Jul 10, 2024
c13a096
fixed index.md
conditionedstimulus Jul 10, 2024
d7e9e22
fixed repo consistency
conditionedstimulus Jul 10, 2024
8bf75c8
styel fix and dabdetrimageprocessor docstring update
conditionedstimulus Jul 10, 2024
b09f996
requested modifications after the first review
conditionedstimulus Jul 29, 2024
8ae2e1b
Update src/transformers/models/dab_detr/image_processing_dab_detr.py
conditionedstimulus Jul 29, 2024
7ba65b1
repo consistency has been fixed
conditionedstimulus Jul 29, 2024
78cedb4
Merge branch 'main' into add_dab_detr
conditionedstimulus Jul 30, 2024
2b37103
update copied NestedTensor function after main merge
conditionedstimulus Jul 30, 2024
8870773
Update src/transformers/models/dab_detr/modeling_dab_detr.py
conditionedstimulus Aug 2, 2024
a402d0d
temp commit
conditionedstimulus Aug 3, 2024
c4bd33d
temp commit2
conditionedstimulus Aug 5, 2024
973db0c
temp commit 3
conditionedstimulus Aug 7, 2024
adebdc1
Merge branch 'main' into add_dab_detr
conditionedstimulus Aug 7, 2024
75a780c
unit tests are fixed
conditionedstimulus Aug 7, 2024
ee7e11b
fixed repo consistency
conditionedstimulus Aug 7, 2024
738a693
updated expected_boxes varible values based on related notebook resul…
conditionedstimulus Aug 8, 2024
01c7702
Merge branch 'main' into add_dab_detr
conditionedstimulus Aug 26, 2024
ce549c5
temporarialy config modifications and repo consistency fixes
conditionedstimulus Aug 26, 2024
38f91f1
Put dilation parameter back to config
conditionedstimulus Sep 10, 2024
b28b2a6
pattern embeddings have been added to the rename_keys method
conditionedstimulus Sep 10, 2024
1dcd978
add dilation comment to config + add as an exception in check_config_…
conditionedstimulus Sep 29, 2024
46eb24c
Merge branch 'main' into add_dab_detr
conditionedstimulus Sep 29, 2024
13af19b
delete FeatureExtractor part from docs.md
conditionedstimulus Sep 29, 2024
b3bf25e
requested modifications in modeling_dab_detr.py
conditionedstimulus Oct 3, 2024
b76a73a
[run_slow] dab_detr
conditionedstimulus Oct 3, 2024
638f8f5
deleted last segmentation code part, updated conversion script and ch…
conditionedstimulus Oct 5, 2024
9d5dafd
Merge branch 'main' into add_dab_detr
conditionedstimulus Oct 5, 2024
049b625
temp commit of requested modifications
conditionedstimulus Oct 12, 2024
6b0fc91
temp commit of requested modifications 2
conditionedstimulus Oct 12, 2024
7f2e2e2
updated config file, resolved codepaths and refactored conversion script
conditionedstimulus Oct 13, 2024
fac9ee9
updated decodelayer block types and refactored conversion script
conditionedstimulus Oct 14, 2024
78004d0
style and quality update
conditionedstimulus Oct 14, 2024
0bf9e3b
Merge branch 'main' into add_dab_detr
conditionedstimulus Oct 14, 2024
95d7a71
small modifications based on the request
conditionedstimulus Oct 28, 2024
2663c26
attentions are refactored
conditionedstimulus Oct 31, 2024
724e767
Merge branch 'main' into add_dab_detr
conditionedstimulus Nov 1, 2024
04d3e31
removed loss functions from modeling file, added loss function to los…
conditionedstimulus Nov 1, 2024
0122e62
deleted imageprocessor
conditionedstimulus Nov 3, 2024
53e2bd2
fixed conversion script + quality and style
conditionedstimulus Nov 3, 2024
4fd9bfc
fixed config_att
conditionedstimulus Nov 3, 2024
e32cf92
Merge branch 'main' into add_dab_detr
conditionedstimulus Nov 3, 2024
9345341
[run_slow] dab_detr
conditionedstimulus Nov 3, 2024
3ef47cf
changing model path in conversion file and in test file
conditionedstimulus Nov 3, 2024
dc9f359
fix Decoder variable naming
conditionedstimulus Nov 5, 2024
93ec65e
testing the old loss function
conditionedstimulus Nov 6, 2024
c73c0fa
switched back to the new loss function and testing with the odl atten…
conditionedstimulus Nov 6, 2024
e69545d
switched back to the new last good result modeling file
conditionedstimulus Nov 6, 2024
61c5189
moved back to the version when I asked the review
conditionedstimulus Nov 6, 2024
a310f6a
missing new line at the end of the file
conditionedstimulus Nov 6, 2024
464ac93
Merge branch 'main' into add_dab_detr
conditionedstimulus Dec 21, 2024
fc0ced6
old version test
conditionedstimulus Dec 21, 2024
7bf5267
turn back to newest mdoel versino but change image processor
conditionedstimulus Dec 21, 2024
2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
Original file line number Diff line number Diff line change
@@ -617,6 +617,8 @@
title: ConvNeXTV2
- local: model_doc/cvt
title: CvT
- local: model_doc/dab-detr
title: DAB-DETR
- local: model_doc/deformable_detr
title: Deformable DETR
- local: model_doc/deit
1 change: 1 addition & 0 deletions docs/source/en/index.md
@@ -105,6 +105,7 @@ Flax), PyTorch, and/or TensorFlow.
| [CPM-Ant](model_doc/cpmant) | ✅ | ❌ | ❌ |
| [CTRL](model_doc/ctrl) | ✅ | ✅ | ❌ |
| [CvT](model_doc/cvt) | ✅ | ✅ | ❌ |
| [DAB-DETR](model_doc/dab-detr) | ✅ | ❌ | ❌ |
| [DAC](model_doc/dac) | ✅ | ❌ | ❌ |
| [Data2VecAudio](model_doc/data2vec) | ✅ | ❌ | ❌ |
| [Data2VecText](model_doc/data2vec) | ✅ | ❌ | ❌ |
86 changes: 86 additions & 0 deletions docs/source/en/model_doc/dab-detr.md
@@ -0,0 +1,86 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# DAB-DETR

## Overview

The DAB-DETR model was proposed in [DAB-DETR: Dynamic Anchor Boxes are Better Queries for DETR](https://arxiv.org/abs/2201.12329) by Shilong Liu, Feng Li, Hao Zhang, Xiao Yang, Xianbiao Qi, Hang Su, Jun Zhu, Lei Zhang.
DAB-DETR is an enhanced variant of Conditional DETR. It utilizes dynamically updated anchor boxes to provide both a reference query point (x, y) and a reference anchor size (w, h), improving cross-attention computation. This new approach achieves 45.7% AP when trained for 50 epochs with a single ResNet-50 model as the backbone.

<img src="https://github.com/conditionedstimulus/hf_media/blob/main/dab_detr_convergence_plot.png"
alt="drawing" width="600"/>

The abstract from the paper is the following:

*We present in this paper a novel query formulation using dynamic anchor boxes
for DETR (DEtection TRansformer) and offer a deeper understanding of the role
of queries in DETR. This new formulation directly uses box coordinates as queries
in Transformer decoders and dynamically updates them layer-by-layer. Using box
coordinates not only helps using explicit positional priors to improve the query-to-feature similarity and eliminate the slow training convergence issue in DETR,
but also allows us to modulate the positional attention map using the box width
and height information. Such a design makes it clear that queries in DETR can be
implemented as performing soft ROI pooling layer-by-layer in a cascade manner.
As a result, it leads to the best performance on MS-COCO benchmark among
the DETR-like detection models under the same setting, e.g., AP 45.7% using
ResNet50-DC5 as backbone trained in 50 epochs. We also conducted extensive
experiments to confirm our analysis and verify the effectiveness of our methods.*
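The core idea in the abstract — feeding a 4D anchor box directly into the decoder as a positional query that can be refined layer-by-layer — can be reduced to a short sketch. This is an illustrative simplification, not the implementation in this PR; the function names, feature dimension, and temperature constant are assumptions:

```python
import math

def sine_embed(coord, num_pos_feats=128, temperature=10000):
    """Standard sinusoidal embedding of one normalized coordinate in [0, 1]."""
    scale = 2 * math.pi
    embedding = []
    for i in range(num_pos_feats // 2):
        freq = temperature ** (2 * i / num_pos_feats)
        embedding.append(math.sin(coord * scale / freq))
        embedding.append(math.cos(coord * scale / freq))
    return embedding

def anchor_box_query(box, num_pos_feats=128):
    """A (cx, cy, w, h) anchor box becomes one positional query by
    concatenating the sine embeddings of all four coordinates. Because the
    query is derived from plain box coordinates, each decoder layer can
    update the box and recompute the query (the "dynamic" part)."""
    return [v for coord in box for v in sine_embed(coord, num_pos_feats)]

query = anchor_box_query((0.5, 0.5, 0.2, 0.3))  # one 512-dim positional query
```

Including `w` and `h` in the embedding is what lets the model modulate the positional attention map by box size, per the paper.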

This model was contributed by [davidhajdu](https://huggingface.co/davidhajdu).
The original code can be found [here](https://github.com/IDEA-Research/DAB-DETR).

There are three ways to instantiate a DAB-DETR model (depending on what you prefer):

Option 1: Instantiate DAB-DETR with pre-trained weights for the entire model
```py
>>> from transformers import DABDETRForObjectDetection

>>> model = DABDETRForObjectDetection.from_pretrained("IDEA-Research/dab_detr_resnet50")
```

Option 2: Instantiate DAB-DETR with randomly initialized weights for the Transformer, but pre-trained weights for the backbone
```py
>>> from transformers import DABDETRConfig, DABDETRForObjectDetection

>>> config = DABDETRConfig()
>>> model = DABDETRForObjectDetection(config)
```
Option 3: Instantiate DAB-DETR with randomly initialized weights for both the backbone and the Transformer
```py
>>> from transformers import DABDETRConfig, DABDETRForObjectDetection

>>> config = DABDETRConfig(use_pretrained_backbone=False)
>>> model = DABDETRForObjectDetection(config)
```


## DABDETRConfig

[[autodoc]] DABDETRConfig

## DABDETRImageProcessor

[[autodoc]] DABDETRImageProcessor
- preprocess
- post_process_object_detection
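For orientation, the essence of a DETR-style `post_process_object_detection` — thresholding scores and converting normalized center-format boxes to absolute corner coordinates — can be sketched in plain Python. The names and signature below are illustrative assumptions, not the actual `DABDETRImageProcessor` API:

```python
def center_to_corners(box):
    # (cx, cy, w, h) -> (x_min, y_min, x_max, y_max), all normalized to [0, 1]
    cx, cy, w, h = box
    return (cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2)

def post_process_sketch(detections, threshold=0.5, target_size=(480, 640)):
    """Keep detections scoring above `threshold` and scale their boxes to
    pixel coordinates. `detections` is a list of (score, label, box) tuples
    with the box in normalized (cx, cy, w, h) format."""
    height, width = target_size
    results = []
    for score, label, box in detections:
        if score < threshold:
            continue
        x0, y0, x1, y1 = center_to_corners(box)
        results.append(
            (score, label, (x0 * width, y0 * height, x1 * width, y1 * height))
        )
    return results
```

The real method additionally runs softmax/sigmoid over logits and returns batched tensors; this sketch only shows the box-format and scaling step.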

## DABDETRModel

[[autodoc]] DABDETRModel
- forward

## DABDETRForObjectDetection

[[autodoc]] DABDETRForObjectDetection
- forward
18 changes: 18 additions & 0 deletions src/transformers/__init__.py
@@ -315,6 +315,7 @@
"CTRLTokenizer",
],
"models.cvt": ["CvtConfig"],
"models.dab_detr": ["DABDETRConfig"],
"models.dac": ["DacConfig", "DacFeatureExtractor"],
"models.data2vec": [
"Data2VecAudioConfig",
@@ -1176,6 +1177,7 @@
["ConditionalDetrFeatureExtractor", "ConditionalDetrImageProcessor"]
)
_import_structure["models.convnext"].extend(["ConvNextFeatureExtractor", "ConvNextImageProcessor"])
_import_structure["models.dab_detr"].extend(["DABDETRImageProcessor"])
_import_structure["models.deformable_detr"].extend(
["DeformableDetrFeatureExtractor", "DeformableDetrImageProcessor"]
)
@@ -1795,6 +1797,13 @@
"CvtPreTrainedModel",
]
)
_import_structure["models.dab_detr"].extend(
[
"DABDETRForObjectDetection",
"DABDETRModel",
"DABDETRPreTrainedModel",
]
)
_import_structure["models.dac"].extend(
[
"DacModel",
@@ -5129,6 +5138,9 @@
CTRLTokenizer,
)
from .models.cvt import CvtConfig
from .models.dab_detr import (
DABDETRConfig,
)
from .models.dac import (
DacConfig,
DacFeatureExtractor,
@@ -6045,6 +6057,7 @@
ConditionalDetrImageProcessor,
)
from .models.convnext import ConvNextFeatureExtractor, ConvNextImageProcessor
from .models.dab_detr import DABDETRImageProcessor
from .models.deformable_detr import (
DeformableDetrFeatureExtractor,
DeformableDetrImageProcessor,
@@ -6596,6 +6609,11 @@
CvtModel,
CvtPreTrainedModel,
)
from .models.dab_detr import (
DABDETRForObjectDetection,
DABDETRModel,
DABDETRPreTrainedModel,
)
from .models.dac import (
DacModel,
DacPreTrainedModel,
1 change: 1 addition & 0 deletions src/transformers/activations.py
@@ -217,6 +217,7 @@ def __getitem__(self, key):
"silu": nn.SiLU,
"swish": nn.SiLU,
"tanh": nn.Tanh,
"prelu": nn.PReLU,
}
ACT2FN = ClassInstantier(ACT2CLS)
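For context, `ACT2CLS`/`ClassInstantier` is a small registry pattern: the dict stores activation *classes* (optionally paired with constructor kwargs), and lookup instantiates them on demand, so the new `"prelu"` entry yields a fresh `nn.PReLU` per access. A self-contained sketch of the pattern, using stand-in classes instead of `torch.nn` modules so it runs without torch:

```python
class ClassInstantier(dict):
    """Dict whose lookup instantiates the stored class, optionally with
    stored keyword arguments (mirrors the pattern in activations.py)."""
    def __getitem__(self, key):
        content = super().__getitem__(key)
        cls, kwargs = content if isinstance(content, tuple) else (content, {})
        return cls(**kwargs)

# Stand-ins for torch.nn activation modules, to keep the sketch dependency-free.
class ReLU:
    def __call__(self, x):
        return max(0.0, x)

class PReLU:
    def __init__(self, alpha=0.25):
        self.alpha = alpha
    def __call__(self, x):
        return x if x > 0 else self.alpha * x

ACT2FN_SKETCH = ClassInstantier({"relu": ReLU, "prelu": (PReLU, {"alpha": 0.1})})
```

Each lookup returns a new instance, which is why configs can refer to activations by string name.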

1 change: 1 addition & 0 deletions src/transformers/models/__init__.py
@@ -59,6 +59,7 @@
cpmant,
ctrl,
cvt,
dab_detr,
dac,
data2vec,
dbrx,
2 changes: 2 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
@@ -74,6 +74,7 @@
("cpmant", "CpmAntConfig"),
("ctrl", "CTRLConfig"),
("cvt", "CvtConfig"),
("dab-detr", "DABDETRConfig"),
("dac", "DacConfig"),
("data2vec-audio", "Data2VecAudioConfig"),
("data2vec-text", "Data2VecTextConfig"),
@@ -369,6 +370,7 @@
("cpmant", "CPM-Ant"),
("ctrl", "CTRL"),
("cvt", "CvT"),
("dab-detr", "DAB-DETR"),
("dac", "DAC"),
("data2vec-audio", "Data2VecAudio"),
("data2vec-text", "Data2VecText"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/image_processing_auto.py
@@ -67,6 +67,7 @@
("convnext", ("ConvNextImageProcessor",)),
("convnextv2", ("ConvNextImageProcessor",)),
("cvt", ("ConvNextImageProcessor",)),
("dab-detr", ("DABDETRImageProcessor",)),
("data2vec-vision", ("BeitImageProcessor",)),
("deformable_detr", ("DeformableDetrImageProcessor",)),
("deit", ("DeiTImageProcessor",)),
3 changes: 3 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
@@ -74,6 +74,7 @@
("cpmant", "CpmAntModel"),
("ctrl", "CTRLModel"),
("cvt", "CvtModel"),
("dab-detr", "DABDETRModel"),
("dac", "DacModel"),
("data2vec-audio", "Data2VecAudioModel"),
("data2vec-text", "Data2VecTextModel"),
@@ -559,6 +560,7 @@
("conditional_detr", "ConditionalDetrModel"),
("convnext", "ConvNextModel"),
("convnextv2", "ConvNextV2Model"),
("dab-detr", "DABDETRModel"),
("data2vec-vision", "Data2VecVisionModel"),
("deformable_detr", "DeformableDetrModel"),
("deit", "DeiTModel"),
@@ -812,6 +814,7 @@
[
# Model for Object Detection mapping
("conditional_detr", "ConditionalDetrForObjectDetection"),
("dab-detr", "DABDETRForObjectDetection"),
("deformable_detr", "DeformableDetrForObjectDetection"),
("deta", "DetaForObjectDetection"),
("detr", "DetrForObjectDetection"),
@@ -52,7 +52,7 @@ class ConditionalDetrConfig(PretrainedConfig):
Number of object queries, i.e. detection slots. This is the maximal number of objects
[`ConditionalDetrModel`] can detect in a single image. For COCO, we recommend 100 queries.
d_model (`int`, *optional*, defaults to 256):
Dimension of the layers.
This is a general dimension parameter, defining the dimensions for components such as the encoder layers and the projection parameters in the decoder layers, among others.
encoder_layers (`int`, *optional*, defaults to 6):
Number of encoder layers.
decoder_layers (`int`, *optional*, defaults to 6):
@@ -86,6 +86,8 @@ class ConditionalDetrDecoderOutput(BaseModelOutputWithCrossAttentions):
intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
layernorm.
reference_points (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, 2)`):
Reference points of each decoder layer, i.e. the 2D anchor points (x, y).
"""

intermediate_hidden_states: Optional[torch.FloatTensor] = None
@@ -128,6 +130,8 @@ class ConditionalDetrModelOutput(Seq2SeqModelOutput):
intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
layernorm.
reference_points (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, 2)`):
Reference points of each decoder layer, i.e. the 2D anchor points (x, y).
"""

intermediate_hidden_states: Optional[torch.FloatTensor] = None
78 changes: 78 additions & 0 deletions src/transformers/models/dab_detr/__init__.py
@@ -0,0 +1,78 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING

from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available


_import_structure = {
"configuration_dab_detr": [
"DABDETRConfig",
"DABDETROnnxConfig",
]
}

try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_dab_detr"] = ["DABDETRImageProcessor"]


try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_dab_detr"] = [
"DABDETRForObjectDetection",
"DABDETRModel",
"DABDETRPreTrainedModel",
]


if TYPE_CHECKING:
from .configuration_dab_detr import (
DABDETRConfig,
DABDETROnnxConfig,
)

try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_dab_detr import DABDETRImageProcessor

try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_dab_detr import (
DABDETRForObjectDetection,
DABDETRModel,
DABDETRPreTrainedModel,
)

else:
import sys

sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
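The `_LazyModule` registration above defers every heavy import until an attribute is actually accessed. A minimal, self-contained analogue of that idea (a sketch of the pattern, not the actual `_LazyModule` implementation):

```python
import importlib

class LazyNamespace:
    """Maps attribute names to the module that defines them and performs
    the import only on first access, caching the result."""
    def __init__(self, import_structure):
        # invert {module: [attr, ...]} into {attr: module}
        self._attr_to_module = {
            attr: module
            for module, attrs in import_structure.items()
            for attr in attrs
        }
        self._cache = {}

    def __getattr__(self, name):
        # only called when normal attribute lookup fails
        if name not in self._attr_to_module:
            raise AttributeError(name)
        if name not in self._cache:
            module = importlib.import_module(self._attr_to_module[name])
            self._cache[name] = getattr(module, name)
        return self._cache[name]

# Nothing is imported until an attribute is touched.
ns = LazyNamespace({"json": ["JSONDecoder"], "collections": ["OrderedDict"]})
```

This is why the `_import_structure` dict in the file above only lists names: importing `transformers.models.dab_detr` stays cheap, and torch/vision code loads only when a class is first used.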