Add support for AdapterPlus #746

Merged (35 commits, Nov 25, 2024)
Changes from 16 commits
Commits (35)
abeca7e
initial commit/added new scaling option channel
julian-fong Oct 18, 2024
20f8eee
added the houlsby initialization
julian-fong Oct 19, 2024
f42a855
add drop_path to adapterplus config
julian-fong Oct 19, 2024
54551ab
added drop_path implementation
julian-fong Oct 22, 2024
8f741f6
fixed typo inside AdapterPlusConfig and added DropPath inside __init_…
julian-fong Oct 22, 2024
3845c76
reverted pre-commit changes in adapter_config.py
julian-fong Oct 22, 2024
d31f9ad
Update adapter_config.py
julian-fong Oct 22, 2024
c92b390
Update adapter_config.py
julian-fong Oct 22, 2024
b6a43d7
revert pre-commit updates to modeling.py
julian-fong Oct 22, 2024
cc77959
Update modeling.py
julian-fong Oct 22, 2024
9437d5b
added config to init file
julian-fong Oct 23, 2024
009b2fe
update adapter_config.py
julian-fong Oct 23, 2024
0660776
fixed StochasticDepth
julian-fong Oct 23, 2024
ca9905b
update Adapter class
julian-fong Oct 23, 2024
25b1488
made docstring consistent
julian-fong Oct 26, 2024
9c25050
fixed bug with DropPath in forward function
julian-fong Oct 26, 2024
79dd694
removed vision.py and added torchvision implementation of stochastic …
julian-fong Oct 28, 2024
9166624
updated reduction_factor to 96 to that we get a rank of 8 with ViT mo…
julian-fong Oct 28, 2024
169b303
updated __init__ file
julian-fong Oct 28, 2024
0f54dae
updated documentation
julian-fong Oct 28, 2024
b484947
Merge branch 'adapter-hub:main' into adapterplus
julian-fong Oct 30, 2024
49bb668
Merge branch 'adapterplus' of github.com:julian-fong/adapters into ad…
julian-fong Oct 31, 2024
d899b50
added torchvision as an optional dependency, and added torchvision to…
julian-fong Nov 3, 2024
2452306
update
julian-fong Nov 3, 2024
dea8830
updates
julian-fong Nov 3, 2024
e6d6fa2
Merge branch 'main' into adapterplus
julian-fong Nov 3, 2024
a7f7705
updates
julian-fong Nov 3, 2024
e99304b
Merge branch 'adapterplus' of github.com:julian-fong/adapters into ad…
julian-fong Nov 3, 2024
89ff2b4
updates
julian-fong Nov 3, 2024
66093ec
re-added new optional dependency torchvision
julian-fong Nov 3, 2024
21f8aad
fixed typo
julian-fong Nov 3, 2024
b770279
added notebook
julian-fong Nov 23, 2024
ef7a7dd
updated readme
julian-fong Nov 23, 2024
1abcdac
fixed code formatting on modeling.py
julian-fong Nov 24, 2024
e896c52
Merge branch 'adapter-hub:main' into adapterplus
julian-fong Nov 24, 2024
1 change: 1 addition & 0 deletions src/adapters/__init__.py
@@ -40,6 +40,7 @@
"DEFAULT_ADAPTER_CONFIG",
"DEFAULT_ADAPTERFUSION_CONFIG",
"AdapterConfig",
"AdapterPlusConfig",
Review comment (Member): imports are added twice in this file, once here and once in the type checking block below.

"AdapterFusionConfig",
"BnConfig",
"CompacterConfig",
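To illustrate the reviewer's note, here is a simplified, hypothetical sketch of the dual-registration pattern used by this `__init__.py` (names abridged; not the actual file contents): a new config such as `AdapterPlusConfig` is listed once in the lazy `_import_structure` mapping and once more in the `TYPE_CHECKING` block.

```python
# Hypothetical, abridged sketch of the lazy-import layout; not the real file.
from typing import TYPE_CHECKING

_import_structure = {
    "configuration": [
        "AdapterConfig",
        "AdapterPlusConfig",  # first registration: lazy import structure
        # ...
    ],
}

if TYPE_CHECKING:
    # second registration: static imports for type checkers and IDEs
    from .configuration import AdapterConfig, AdapterPlusConfig
```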
30 changes: 27 additions & 3 deletions src/adapters/configuration/adapter_config.py
@@ -156,13 +156,14 @@ class BnConfig(AdapterConfig):
ln_after (:obj:`bool`, optional): If True, add a new layer normalization after the adapter bottleneck.
Defaults to False.
init_weights (:obj:`str`, optional): Initialization method for the weights of the adapter modules.
Currently, this can be either "bert" (default) or "mam_adapter".
Currently, this can be either "bert" (default) or "mam_adapter" or "houlsby".
is_parallel (:obj:`bool`, optional): If True, apply adapter transformations in parallel.
By default (False), sequential application is used.
scaling (:obj:`float` or :obj:`str`, optional):
Scaling factor to use for scaled addition of adapter outputs as done by He et al. (2021). Can be either a
constant factor (float) or the string "learned", in which case the scaling factor is learned. Defaults to
1.0.
constant factor (float), the string "learned", in which case the scaling factor is learned, or the string
"channel", in which case a scaling vector with one learned entry per channel (the input hidden dimension) is used.
Defaults to 1.0.
use_gating (:obj:`bool`, optional):
Place a trainable gating module besides the added parameter module to control module activation. This is
e.g. used for UniPELT. Defaults to False.
@@ -213,6 +214,10 @@ class BnConfig(AdapterConfig):
phm_bias (:obj:`bool`, optional):
If True the down and up projection PHMLayer has a bias term. If `phm_layer` is False this is ignored.
Defaults to True
stochastic_depth (:obj:`float`, optional):
Probability of dropping the adapter output for an entire sample during training
(stochastic depth, also known as DropPath). This parameter should only be used for
vision-based tasks involving residual networks. Defaults to 0.0.
"""

# Required options
@@ -250,6 +255,7 @@ class BnConfig(AdapterConfig):
hypercomplex_nonlinearity: Optional[str] = "glorot-uniform"
phm_rank: Optional[int] = 1
phm_bias: Optional[bool] = True
stochastic_depth: Optional[float] = 0.0

# We want to emulate a simple form of immutability while keeping the ability to add custom attributes.
# Therefore, we don't allow changing attribute values if set once.
@@ -364,6 +370,24 @@ class ParBnConfig(BnConfig):
scaling: Union[float, str] = 4.0


@dataclass(eq=False)
class AdapterPlusConfig(BnConfig):
"""
The AdapterPlus adapter architecture proposed by Jan-Martin O. Steitz and Stefan Roth. See https://arxiv.org/pdf/2406.06820
"""

original_ln_after: bool = False
residual_before_ln: bool = True
stochastic_depth: float = 0.1
init_weights: str = "houlsby"
scaling: Union[float, str] = "channel"

mh_adapter: bool = False
output_adapter: bool = True
reduction_factor: Union[float, Mapping] = 2
non_linearity: str = "gelu"


@dataclass(eq=False)
class PrefixTuningConfig(AdapterConfig):
"""
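For context on how the new config is meant to be used, here is a usage sketch (not part of this diff) assuming the standard adapters workflow; the checkpoint and adapter name are illustrative.

```python
import adapters
from adapters import AdapterPlusConfig
from transformers import ViTModel

# Illustrative checkpoint; AdapterPlus targets ViT-style vision models
model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
adapters.init(model)  # add adapter support to the plain transformers model

# Defaults from this PR: houlsby init, "channel" scaling, stochastic_depth=0.1
config = AdapterPlusConfig()
model.add_adapter("adapterplus", config=config)  # "adapterplus" is an arbitrary name
model.train_adapter("adapterplus")  # freeze the backbone, train only the adapter weights
```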
21 changes: 20 additions & 1 deletion src/adapters/methods/modeling.py
@@ -7,6 +7,7 @@

from ..configuration import AdapterFusionConfig, BnConfig
from ..context import ForwardContext
from .vision import StochasticDepth


class Activation_Function_Class(nn.Module):
@@ -99,6 +100,8 @@ def __init__(
self.scaling = config["scaling"]
elif config["scaling"] == "learned":
self.scaling = nn.Parameter(torch.ones(1))
elif config["scaling"] == "channel":
self.scaling = nn.Parameter(torch.ones(input_size))
else:
raise ValueError("Unknown scaling type: {}".format(config["scaling"]))

@@ -126,9 +129,24 @@ def __init__(
nn.init.zeros_(self.adapter_up.bias)
if self.use_gating:
self.gate.apply(self.init_bert_weights)
elif config["init_weights"] == "houlsby":
for layer in self.adapter_down:
if isinstance(layer, nn.Linear) or isinstance(layer, PHMLayer):
nn.init.trunc_normal_(
layer.weight, mean=0, std=1e-2, a=-2 * 1e-2, b=2 * 1e-2
)
nn.init.zeros_(layer.bias)

nn.init.trunc_normal_(
self.adapter_up.weight, mean=0, std=1e-2, a=-2 * 1e-2, b=2 * 1e-2
)
nn.init.zeros_(self.adapter_up.bias)
else:
raise ValueError("Unknown init_weights type: {}".format(config["init_weights"]))

if config["stochastic_depth"] > 0.0:
self.DropPath = StochasticDepth(drop_prob=config["stochastic_depth"])

def pre_forward(
self,
hidden_states,
@@ -176,6 +194,8 @@ def forward(self, x, residual_input, output_gating=False):
down = self.adapter_down(x)

up = self.adapter_up(down)
if hasattr(self, "DropPath"):
up = self.DropPath(up)
up = up * self.scaling
output = self.dropout(up)

@@ -364,7 +384,6 @@ def __init__(
self.reduction = self.T / 1000.0

def forward(self, query, key, value, residual, output_attentions: bool = False):

if self.config["residual_before"]:
value += residual[:, :, None, :].repeat(1, 1, value.size(2), 1)

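As a standalone illustration of the two additions to modeling.py above (the "channel" scaling vector and the stochastic-depth drop in the forward pass), here is a minimal sketch with made-up shapes; it uses torchvision's StochasticDepth, which a later commit in this PR adopts in place of vision.py, as a stand-in for the module below.

```python
import torch
import torch.nn as nn
from torchvision.ops import StochasticDepth  # mode="row" drops whole samples, like vision.py below

hidden_size = 768
up = torch.randn(4, 16, hidden_size)             # pretend adapter up-projection output (batch, seq, hidden)
scaling = nn.Parameter(torch.ones(hidden_size))  # "channel" scaling: one learned factor per channel

drop_path = StochasticDepth(p=0.1, mode="row")
drop_path.train()

up = drop_path(up)  # randomly zeroes entire samples during training and rescales survivors
up = up * scaling   # per-channel scaling, broadcast over batch and sequence dimensions
```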
66 changes: 66 additions & 0 deletions src/adapters/methods/vision.py
@@ -0,0 +1,66 @@
# Module to support adapter training for vision related tasks

import torch.nn as nn


class StochasticDepth(nn.Module):
"""
Applies Stochastic Depth (aka Drop Path) to residual networks.
Loosely based on the implementations in the `torchvision`
and `timm` libraries.

Randomly drops entire samples in a batch after the layer with probability `drop_prob`,
and rescales the kept samples by `1 / (1 - drop_prob)` if `keep_prob_scaling` is True.

Paper: https://arxiv.org/pdf/1603.09382
References: https://pytorch.org/vision/main/_modules/torchvision/ops/stochastic_depth.html#stochastic_depth

"""

def __init__(self, drop_prob: float = 0.0, keep_prob_scaling: bool = True):
super().__init__()
self.drop_prob = drop_prob
self.keep_prob_scaling = keep_prob_scaling

def forward(self, x):
return stochastic_depth(
x, self.drop_prob, self.keep_prob_scaling, self.training
)


def stochastic_depth(
x, drop_prob: float = 0.0, keep_prob_scaling: bool = True, training: bool = False
):
"""
Applies stochastic_depth to a batch.

Args:
x: torch.Tensor of size (batch_size, ...)
The output of a residual branch, with the batch dimension first
drop_prob: float with 0.0 <= drop_prob < 1.0
The probability of dropping each sample inside the batch
keep_prob_scaling: bool, optional
Whether to rescale kept samples by 1 / keep_prob so that the expected
output matches the input
training: bool, optional
Whether the model is in training mode. Stochastic depth is not applied
during inference, similar to Dropout.
"""
if drop_prob >= 1.0 or drop_prob < 0.0:
raise ValueError("drop_prob must be in the range [0.0, 1.0)")

if drop_prob == 0.0 or not training:
return x

keep_prob = 1.0 - drop_prob
# build a broadcastable mask shape with one entry per sample, i.e. (x.shape[0], 1, ..., 1)
sample_shape = [x.shape[0]] + [1] * (x.ndim - 1)

bernoulli_tensor = x.new_empty(
sample_shape, dtype=x.dtype, device=x.device
).bernoulli_(keep_prob)
if keep_prob_scaling:
bernoulli_tensor.div_(keep_prob)

return x * bernoulli_tensor
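A quick sanity check of the behaviour documented above (a sketch, not part of the PR, assuming the `stochastic_depth` function defined here is in scope): in training mode roughly `drop_prob` of the samples are zeroed and the survivors are rescaled by `1 / keep_prob`; in eval mode the input passes through unchanged.

```python
import torch

x = torch.ones(1000, 8)

out_train = stochastic_depth(x, drop_prob=0.25, keep_prob_scaling=True, training=True)
dropped = (out_train.sum(dim=1) == 0).float().mean()
print(f"fraction of dropped samples ~ {dropped:.2f}")  # hovers around 0.25; kept rows are scaled to 1 / 0.75

out_eval = stochastic_depth(x, drop_prob=0.25, training=False)
assert torch.equal(out_eval, x)  # identity at inference time, like Dropout
```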