diff --git a/timm/layers/__init__.py b/timm/layers/__init__.py
index 38c824077c..6111558908 100644
--- a/timm/layers/__init__.py
+++ b/timm/layers/__init__.py
@@ -29,6 +29,7 @@
 from .helpers import to_ntuple, to_2tuple, to_3tuple, to_4tuple, make_divisible, extend_tuple
 from .hybrid_embed import HybridEmbed, HybridEmbedWithSize
 from .inplace_abn import InplaceAbn
+from .layer_scale import LayerScale, LayerScale2d
 from .linear import Linear
 from .mixed_conv2d import MixedConv2d
 from .mlp import Mlp, GluMlp, GatedMlp, SwiGLU, SwiGLUPacked, ConvMlp, GlobalResponseNormMlp
diff --git a/timm/layers/layer_scale.py b/timm/layers/layer_scale.py
new file mode 100644
index 0000000000..08566b2bd1
--- /dev/null
+++ b/timm/layers/layer_scale.py
@@ -0,0 +1,38 @@
+import torch
+from torch import nn
+
+
+class LayerScale(nn.Module):
+    """ LayerScale on tensors with channels in last-dim.
+    """
+    def __init__(
+            self,
+            dim: int,
+            init_values: float = 1e-5,
+            inplace: bool = False,
+    ) -> None:
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x.mul_(self.gamma) if self.inplace else x * self.gamma
+
+
+class LayerScale2d(nn.Module):
+    """ LayerScale for tensors with torch 2D NCHW layout.
+    """
+    def __init__(
+            self,
+            dim: int,
+            init_values: float = 1e-5,
+            inplace: bool = False,
+    ):
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+
+    def forward(self, x):
+        gamma = self.gamma.view(1, -1, 1, 1)
+        return x.mul_(gamma) if self.inplace else x * gamma
+
diff --git a/timm/models/hiera.py b/timm/models/hiera.py
index 69af2f48ad..808053e9ee 100644
--- a/timm/models/hiera.py
+++ b/timm/models/hiera.py
@@ -31,10 +31,8 @@
 import torch.nn.functional as F
 from torch.utils.checkpoint import checkpoint
 
-
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
-from timm.layers import DropPath, Mlp, use_fused_attn, _assert, get_norm_layer, to_2tuple
-
+from timm.layers import DropPath, Mlp, LayerScale, use_fused_attn, _assert, get_norm_layer, to_2tuple
 from ._registry import generate_default_cfgs, register_model
 from ._builder import build_model_with_cfg
 
@@ -289,7 +287,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         """ Input should be of shape [batch, tokens, channels].
         """
         B, N, _ = x.shape
         num_windows = (N // (self.q_stride * self.window_size)) if self.use_mask_unit_attn else 1
-
         qkv = self.qkv(x).reshape(B, -1, num_windows, 3, self.heads, self.head_dim).permute(3, 0, 4, 2, 1, 5)
         q, k, v = qkv.unbind(0)
@@ -310,21 +307,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return x
 
 
-class LayerScale(nn.Module):
-    def __init__(
-            self,
-            dim: int,
-            init_values: float = 1e-5,
-            inplace: bool = False,
-    ) -> None:
-        super().__init__()
-        self.inplace = inplace
-        self.gamma = nn.Parameter(init_values * torch.ones(dim))
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return x.mul_(self.gamma) if self.inplace else x * self.gamma
-
-
 class HieraBlock(nn.Module):
     def __init__(
             self,
@@ -342,7 +324,6 @@ def __init__(
             use_mask_unit_attn: bool = False,
     ):
         super().__init__()
-
         self.dim = dim
         self.dim_out = dim_out
 
@@ -631,10 +612,8 @@ def __init__(
             nn.init.trunc_normal_(self.pos_embed_win, std=0.02)
 
         if weight_init != 'skip':
-            if weight_init == 'jax':
-                named_apply(partial(_init_weight_jax, head_bias=-math.log(self.num_classes)), self)
-            else:
-                named_apply(_init_weight_vit, self)
+            init_fn = _init_weight_jax if weight_init == 'jax' else _init_weight_vit
+            named_apply(init_fn, self)
         if fix_init:
             self.fix_init_weight()
         if isinstance(self.head.fc, nn.Linear):
@@ -868,11 +847,13 @@ def _init_weight_vit(module, name, init_bias=0.02, head_bias=0.):
         nn.init.trunc_normal_(module.weight, std=0.02)
         if isinstance(module, nn.Linear) and module.bias is not None:
             nn.init.constant_(module.bias, init_bias)
+    elif hasattr(module, 'init_weights'):
+        module.init_weights()
 
 
 def _init_weight_jax(module, name, head_bias=0.):
     if isinstance(module, nn.Linear):
-        if name.startswith('head'):
+        if name.startswith('head.fc'):
             nn.init.zeros_(module.weight)
             nn.init.constant_(module.bias, head_bias)
         else:
@@ -960,7 +941,7 @@ def _cfg(url='', **kwargs):
         num_classes=0,
     ),
 
-    "hiera_small_abswin_256.sbb2_ep200_in12k": _cfg(
+    "hiera_small_abswin_256.sbb2_e200_in12k": _cfg(
         hf_hub_id='timm/',
         num_classes=11821, input_size=(3, 256, 256), crop_pct=0.95,
     ),
@@ -1007,6 +988,7 @@ def _create_hiera(variant: str, pretrained: bool = False, **kwargs) -> Hiera:
         **kwargs,
     )
 
+
 @register_model
 def hiera_tiny_224(pretrained=False, **kwargs):
     model_args = dict(embed_dim=96, num_heads=1, stages=(1, 2, 7, 2))
diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py
index 64cab9ee71..afb5e00200 100644
--- a/timm/models/vision_transformer.py
+++ b/timm/models/vision_transformer.py
@@ -1967,7 +1967,7 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
     'vit_mediumd_patch16_reg4_gap_256.sbb_in12k_ft_in1k': _cfg(
         hf_hub_id='timm/',
         input_size=(3, 256, 256), crop_pct=0.95),
-    'vit_mediumd_patch16_reg4_gap_256.sbb2_ep200_in12k': _cfg(
+    'vit_mediumd_patch16_reg4_gap_256.sbb2_e200_in12k': _cfg(
         hf_hub_id='timm/', num_classes=11821,
         input_size=(3, 256, 256), crop_pct=0.95),
@@ -1984,7 +1984,7 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
     'vit_betwixt_patch16_reg4_gap_256.sbb_in1k': _cfg(
         hf_hub_id='timm/',
         input_size=(3, 256, 256), crop_pct=0.95),
-    'vit_betwixt_patch16_reg4_gap_256.sbb2_ep200_in12k': _cfg(
+    'vit_betwixt_patch16_reg4_gap_256.sbb2_e200_in12k': _cfg(
         hf_hub_id='timm/', num_classes=11821,
         input_size=(3, 256, 256), crop_pct=0.95),