From c089cce53ec60871e24b81e6af6f6ccaab19a0dc Mon Sep 17 00:00:00 2001 From: liuxuewen <18435135529@163.com> Date: Mon, 29 May 2023 14:34:17 +0800 Subject: [PATCH] feat(BeitV2): modify some format for review --- ppcls/arch/backbone/__init__.py | 3 +- .../{modeling_finetune.py => BeiTV2.py} | 346 ++++++++++++++++- .../backbone/model_zoo/modeling_pretrain.py | 365 ------------------ .../backbone/model_zoo/norm_ema_quantizer.py | 242 ------------ ppcls/arch/backbone/model_zoo/vqkd.py | 243 +++++++++++- .../BeitV2/BeitV2_base_patch16_224_pt.yaml | 2 +- ppcls/engine/engine.py | 4 - ppcls/engine/train/__init__.py | 1 + ppcls/engine/train/train.py | 6 +- ppcls/engine/train/train_mask_samples.py | 110 ++++++ ppcls/loss/distillationloss.py | 2 +- 11 files changed, 700 insertions(+), 624 deletions(-) rename ppcls/arch/backbone/model_zoo/{modeling_finetune.py => BeiTV2.py} (64%) delete mode 100644 ppcls/arch/backbone/model_zoo/modeling_pretrain.py delete mode 100644 ppcls/arch/backbone/model_zoo/norm_ema_quantizer.py create mode 100644 ppcls/engine/train/train_mask_samples.py diff --git a/ppcls/arch/backbone/__init__.py b/ppcls/arch/backbone/__init__.py index 9dc12bd3dd..1f964c482a 100644 --- a/ppcls/arch/backbone/__init__.py +++ b/ppcls/arch/backbone/__init__.py @@ -76,8 +76,7 @@ from .model_zoo.nextvit import NextViT_small_224, NextViT_base_224, NextViT_large_224, NextViT_small_384, NextViT_base_384, NextViT_large_384 from .model_zoo.cae import cae_base_patch16_224, cae_large_patch16_224 from .model_zoo.vqkd import vqkd_encoder_base_decoder_3x768x12_clip -from .model_zoo.modeling_pretrain import beit_base_patch16_224_8k_vocab_cls_pt -from .model_zoo.modeling_finetune import beit_base_patch16_224 +from .model_zoo.BeiTV2 import beit_base_patch16_224, beit_base_patch16_224_8k_vocab_cls_pt from .variant_models.resnet_variant import ResNet50_last_stage_stride1 from .variant_models.resnet_variant import ResNet50_adaptive_max_pool2d diff --git a/ppcls/arch/backbone/model_zoo/modeling_finetune.py b/ppcls/arch/backbone/model_zoo/BeiTV2.py similarity index 64% rename from ppcls/arch/backbone/model_zoo/modeling_finetune.py rename to ppcls/arch/backbone/model_zoo/BeiTV2.py index 6f2ffbdbbe..8a431dec65 100644 --- a/ppcls/arch/backbone/model_zoo/modeling_finetune.py +++ b/ppcls/arch/backbone/model_zoo/BeiTV2.py @@ -10,6 +10,7 @@ from paddle.nn.initializer import TruncatedNormal, Constant, Normal from collections import OrderedDict + def _cfg(url='', **kwargs): return { 'url': url, @@ -35,7 +36,7 @@ def drop_path(x, drop_prob=0., training=False): """ if drop_prob == 0. or not training: return x - keep_prob = paddle.to_tensor(1 - drop_prob) + keep_prob = paddle.to_tensor(1 - drop_prob).astype(x.dtype) shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) random_tensor = paddle.floor(random_tensor) # binarize @@ -545,8 +546,6 @@ def get_intermediate_layers(self, x, use_last_norm=False): return features - - def beit_base_patch16_224(pretrained=False, finetune_weight=None, model_filter_name='', **kwargs): model = VisionTransformer( patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, # qkv_bias=True, @@ -676,4 +675,345 @@ def geometric_progression(a, r, n): checkpoint_model['pos_embed'] = new_pos_embed model.set_dict(checkpoint_model) model.default_cfg = _cfg() + return model + +''' pretrain ''' + +class VisionTransformerForMaskedImageModeling(nn.Layer): + def __init__(self, img_size=224, patch_size=16, in_chans=3, vocab_size=8192, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., norm_layer=None, init_values=None, attn_head_dim=None, + use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, init_std=0.02): + super().__init__() + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + self.num_heads = num_heads + + self.cls_token = self.create_parameter(shape=(1, 1, embed_dim), default_initializer=zeros_) + self.add_parameter("cls_token", self.cls_token) + self.mask_token = self.create_parameter(shape=(1, 1, embed_dim), default_initializer=zeros_) + self.add_parameter("mask_token", self.mask_token) + if use_abs_pos_emb: + self.pos_embed = self.create_parameter(shape=(1, num_patches + 1, embed_dim), default_initializer=zeros_) + self.add_parameter("pos_embed", self.pos_embed) + else: + self.pos_embed = None + self.pos_drop = nn.Dropout(p=drop_rate) + + if use_shared_rel_pos_bias: + self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads) + else: + self.rel_pos_bias = None + + dpr = np.linspace(0, drop_path_rate, depth, dtype=np.float32) + self.use_rel_pos_bias = use_rel_pos_bias + self.blocks = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + init_values=init_values, + window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None, + attn_head_dim=attn_head_dim) + for i in range(depth) + ]) + self.norm = eval(norm_layer)(embed_dim, epsilon=1e-6) + + self.init_std = init_std + self.lm_head = nn.Linear(embed_dim, vocab_size) + + if self.pos_embed is not None: + trunc_normal_(self.pos_embed) + trunc_normal_(self.cls_token) + trunc_normal_(self.mask_token) + trunc_normal_(self.lm_head.weight) + self.apply(self._init_weights) + self.fix_init_weight() + + def fix_init_weight(self): + def rescale(param, layer_id): + x = param.divide(paddle.to_tensor([math.sqrt(2.0 * layer_id)])) + param = paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight, layer_id + 1) + rescale(layer.mlp.fc2.weight, layer_id + 1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + elif isinstance(m, nn.Conv2D): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def get_num_layers(self): + return len(self.blocks) + + def forward_features(self, x, bool_masked_pos): + x = self.patch_embed(x, bool_masked_pos=bool_masked_pos) + batch_size, seq_len, _ = x.shape + + cls_tokens = self.cls_token.expand((batch_size, -1, -1)) # stole cls_tokens impl from Phil Wang, thanks + mask_token = self.mask_token.expand((batch_size, seq_len, -1)) + + # replace the masked visual tokens by mask_token + w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype) + x = x * (1 - w) + mask_token * w + + x = paddle.concat((cls_tokens, x), axis=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + for i, blk in enumerate(self.blocks): + x = blk(x, rel_pos_bias=rel_pos_bias) + + x = self.norm(x) + return x + + + def forward(self, x, bool_masked_pos=None, return_all_tokens=False, return_patch_tokens=False): + if bool_masked_pos is None: + bool_masked_pos = paddle.zeros([x.shape[0], self.patch_embed.num_patches], dtype=paddle.bool).set_device(x.device) + x = self.forward_features(x, bool_masked_pos=bool_masked_pos) + x = x[:, 1:] + if return_patch_tokens: + return x + if return_all_tokens: + return self.lm_head(x) + else: + # return the masked tokens + return self.lm_head(x[bool_masked_pos]) + + def forward_return_qkv(self, x, bool_masked_pos=None, split_out_as_qkv=False): + if bool_masked_pos is None: + bool_masked_pos = paddle.zeros([x.shape[0], self.patch_embed.num_patches], dtype=paddle.bool).set_device(x.device) + x = self.patch_embed(x, bool_masked_pos=bool_masked_pos) + batch_size, seq_len, _ = x.shape + + cls_tokens = self.cls_token.expand((batch_size, -1, -1)) # stole cls_tokens impl from Phil Wang, thanks + mask_token = self.mask_token.expand((batch_size, seq_len, -1)) + + # replace the masked visual tokens by mask_token + w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype) + x = x * (1 - w) + mask_token * w + + x = paddle.concat((cls_tokens, x), axis=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + for i, blk in enumerate(self.blocks): + if i < len(self.blocks) - 1: + x = blk(x, rel_pos_bias=rel_pos_bias) + else: + x, qkv = blk(x, rel_pos_bias=rel_pos_bias, return_qkv=True) + + if split_out_as_qkv: + x = self.norm(x) + x = self.lm_head(x) + q, k, v = x.chunks(3, axis=-1) + b, n, c = q.shape + q = q.reshape(b, n, self.num_heads, -1).transpose([0, 2, 1, 3]) + k = k.reshape(b, n, self.num_heads, -1).transpose([0, 2, 1, 3]) + v = v.reshape(b, n, self.num_heads, -1).transpose([0, 2, 1, 3]) + return x, q, k, v + else: + x = self.norm(x) + x = x[:, 1:] + x = self.lm_head(x[bool_masked_pos]) + + q, k, v = qkv[0], qkv[1], qkv[2] + + return x, q, k, v + + def forward_intermediate(self, x, bool_masked_pos=None, layer_id=12): + if bool_masked_pos is None: + bool_masked_pos = paddle.zeros([x.shape[0], self.patch_embed.num_patches], dtype=paddle.bool).set_device(x.device) + x = self.patch_embed(x, bool_masked_pos=bool_masked_pos) + batch_size, seq_len, _ = x.shape + + cls_tokens = self.cls_token.expand((batch_size, -1, -1)) # stole cls_tokens impl from Phil Wang, thanks + mask_token = self.mask_token.expand((batch_size, seq_len, -1)) + + # replace the masked visual tokens by mask_token + w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype) + x = x * (1 - w) + mask_token * w + + x = paddle.concat((cls_tokens, x), axis=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + if isinstance(layer_id, list): + output_list = [] + for l, blk in enumerate(self.blocks): + x = blk(x, rel_pos_bias=rel_pos_bias) + if l in layer_id: + output_list.append(x[:, 1:]) + return output_list + elif isinstance(layer_id, int): + for l, blk in enumerate(self.blocks): + if l < layer_id: + x = blk(x, rel_pos_bias=rel_pos_bias) + elif l == layer_id: + x = blk.norm1(x) + else: + break + return x[:, 1:] + else: + raise NotImplementedError(f"Not support for layer id is {layer_id} now!") + + def get_last_selfattention(self, x): + x = self.patch_embed(x) + batch_size, seq_len, _ = x.shape + cls_tokens = self.cls_token.expand((batch_size, -1, -1)) + x = paddle.concat((cls_tokens, x), axis=1) + if self.pos_embed is not None: + x = x + self.patch_embed + x = self.pos_drop(x) + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + + for i, blk in enumerate(self.blocks): + if i < len(self.blocks) - 1: + x = blk(x, rel_pos_bias=rel_pos_bias) + else: + # return attention of the last block + return blk(x, rel_pos_bias=rel_pos_bias, return_attention=True) + + +class VisionTransformerForMaskedImageModelingCLS(VisionTransformerForMaskedImageModeling): + def __init__(self, img_size=224, patch_size=16, in_chans=3, vocab_size=8192, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., norm_layer=None, init_values=None, attn_head_dim=None, + use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, init_std=0.02, + early_layers=6, head_layers=2, shared_lm_head=True): + super().__init__(img_size=img_size, patch_size=patch_size, in_chans=in_chans, vocab_size=vocab_size, embed_dim=embed_dim, depth=depth, + num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, + drop_path_rate=drop_path_rate, norm_layer=norm_layer, init_values=init_values, attn_head_dim=attn_head_dim, + use_abs_pos_emb=use_abs_pos_emb, use_rel_pos_bias=use_rel_pos_bias, use_shared_rel_pos_bias=use_shared_rel_pos_bias, init_std=init_std) + + self.early_layers = early_layers + print(f'early layer {early_layers}, late layer {depth - early_layers}, condenser head layers {head_layers}, shared_lm_head {shared_lm_head}') + + dpr = np.linspace(0, drop_path_rate, max(depth, early_layers + head_layers), dtype=np.float32) + self.cls_pt_layers = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + init_values=init_values, + window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None, + attn_head_dim=attn_head_dim) + for i in range(early_layers, early_layers + head_layers) + ]) + self.fix_init_cls_pt_weight() + + self.shared_lm_head = shared_lm_head + if not self.shared_lm_head: + self.cls_pt_norm = norm_layer(embed_dim) + self.cls_pt_lm_head = nn.Linear(embed_dim, vocab_size) + + self.cls_pt_norm.apply(self._init_weights) + self.cls_pt_lm_head.apply(self._init_weights) + + def fix_init_cls_pt_weight(self): + def rescale(param, layer_id): + x = param.divide(paddle.to_tensor([math.sqrt(2.0 * layer_id)])) + param = paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight, self.early_layers + layer_id + 1) + rescale(layer.mlp.fc2.weight, self.early_layers + layer_id + 1) + + def forward_features(self, x, bool_masked_pos): + x = self.patch_embed(x, bool_masked_pos=bool_masked_pos) + batch_size, seq_len, _ = x.shape + + cls_tokens = self.cls_token.expand((batch_size, -1, -1)) # stole cls_tokens impl from Phil Wang, thanks + mask_token = self.mask_token.expand((batch_size, seq_len, -1)) + + # replace the masked visual tokens by mask_token + w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype) + x = x * (1 - w) + mask_token * w + + x = paddle.concat((cls_tokens, x), axis=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + for i, blk in enumerate(self.blocks): + x = blk(x, rel_pos_bias=rel_pos_bias) + if i + 1 == self.early_layers: + early_states = x[:, 1:] + + x_cls_pt = paddle.concat((x[:, 0].unsqueeze(1), early_states), axis=1) + for blk in self.cls_pt_layers: + x_cls_pt = blk(x_cls_pt, rel_pos_bias=rel_pos_bias) + + return self.norm(x), self.norm(x_cls_pt) if self.shared_lm_head else self.cls_pt_norm(x_cls_pt) + + def forward(self, x, bool_masked_pos=None, return_all_tokens=False, return_patch_tokens=False): + if bool_masked_pos is None: + bool_masked_pos = paddle.zeros([x.shape[0], self.patch_embed.num_patches], dtype=paddle.bool).set_device(x.device) + x, x_cls_pt = self.forward_features(x, bool_masked_pos=bool_masked_pos) + x = x[:, 1:] + x_cls_pt = x_cls_pt[:, 1:] + if return_patch_tokens: + return [x, x_cls_pt] + if return_all_tokens: + return [self.lm_head(x), self.lm_head(x_cls_pt) if self.shared_lm_head else self.cls_pt_lm_head(x_cls_pt)] + else: + # return the masked tokens + return [self.lm_head(x[bool_masked_pos]), self.lm_head(x_cls_pt[bool_masked_pos]) if self.shared_lm_head else self.cls_pt_lm_head(x_cls_pt[bool_masked_pos])] + + + +def beit_base_patch16_224_8k_vocab_cls_pt(pretrained=False, pretrained_weight=None, **kwargs): + if "num_classes" in kwargs: + _ = kwargs.pop("num_classes") + if 'vocab_size' in kwargs: + vocab_size = kwargs['vocab_size'] + _ = kwargs.pop("vocab_size") + else: + vocab_size = 8192 + model = VisionTransformerForMaskedImageModelingCLS( + patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, + norm_layer="nn.LayerNorm", vocab_size=vocab_size, **kwargs) + if pretrained: + weight = paddle.load(pretrained_weight) + model.set_dict(weight) + model.default_cfg = _cfg() return model \ No newline at end of file diff --git a/ppcls/arch/backbone/model_zoo/modeling_pretrain.py b/ppcls/arch/backbone/model_zoo/modeling_pretrain.py deleted file mode 100644 index 8e795eec8b..0000000000 --- a/ppcls/arch/backbone/model_zoo/modeling_pretrain.py +++ /dev/null @@ -1,365 +0,0 @@ -# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Code was heavily based on https://github.com/facebookresearch/deit -# reference: https://arxiv.org/abs/2012.12877 - -import math -import numpy as np -import paddle -import paddle.nn as nn - -from paddle.nn.initializer import TruncatedNormal, Constant, Normal -from .modeling_finetune import Block, PatchEmbed, RelativePositionBias, _cfg, zeros_, ones_, Identity -trunc_normal_ = TruncatedNormal(std=.02) - - -class VisionTransformerForMaskedImageModeling(nn.Layer): - def __init__(self, img_size=224, patch_size=16, in_chans=3, vocab_size=8192, embed_dim=768, depth=12, - num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., - drop_path_rate=0., norm_layer=None, init_values=None, attn_head_dim=None, - use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, init_std=0.02): - super().__init__() - self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models - - self.patch_embed = PatchEmbed( - img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) - num_patches = self.patch_embed.num_patches - self.num_heads = num_heads - - self.cls_token = self.create_parameter(shape=(1, 1, embed_dim), default_initializer=zeros_) - self.add_parameter("cls_token", self.cls_token) - self.mask_token = self.create_parameter(shape=(1, 1, embed_dim), default_initializer=zeros_) - self.add_parameter("mask_token", self.mask_token) - if use_abs_pos_emb: - self.pos_embed = self.create_parameter(shape=(1, num_patches + 1, embed_dim), default_initializer=zeros_) - self.add_parameter("pos_embed", self.pos_embed) - else: - self.pos_embed = None - self.pos_drop = nn.Dropout(p=drop_rate) - - if use_shared_rel_pos_bias: - self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads) - else: - self.rel_pos_bias = None - - dpr = np.linspace(0, drop_path_rate, depth, dtype=np.float32) - self.use_rel_pos_bias = use_rel_pos_bias - self.blocks = nn.LayerList([ - Block( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[i], - norm_layer=norm_layer, - init_values=init_values, - window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None, - attn_head_dim=attn_head_dim) - for i in range(depth) - ]) - self.norm = eval(norm_layer)(embed_dim, epsilon=1e-6) - - self.init_std = init_std - self.lm_head = nn.Linear(embed_dim, vocab_size) - - if self.pos_embed is not None: - trunc_normal_(self.pos_embed) - trunc_normal_(self.cls_token) - trunc_normal_(self.mask_token) - trunc_normal_(self.lm_head.weight) - self.apply(self._init_weights) - self.fix_init_weight() - - def fix_init_weight(self): - def rescale(param, layer_id): - x = param.divide(paddle.to_tensor([math.sqrt(2.0 * layer_id)])) - param = paddle.create_parameter(shape=x.shape, - dtype=str(x.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(x)) - - for layer_id, layer in enumerate(self.blocks): - rescale(layer.attn.proj.weight, layer_id + 1) - rescale(layer.mlp.fc2.weight, layer_id + 1) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight) - if isinstance(m, nn.Linear) and m.bias is not None: - zeros_(m.bias) - elif isinstance(m, nn.LayerNorm): - zeros_(m.bias) - ones_(m.weight) - elif isinstance(m, nn.Conv2D): - trunc_normal_(m.weight) - if m.bias is not None: - zeros_(m.bias) - - def no_weight_decay(self): - return {'pos_embed', 'cls_token'} - - def get_num_layers(self): - return len(self.blocks) - - def forward_features(self, x, bool_masked_pos): - x = self.patch_embed(x, bool_masked_pos=bool_masked_pos) - batch_size, seq_len, _ = x.shape - - cls_tokens = self.cls_token.expand((batch_size, -1, -1)) # stole cls_tokens impl from Phil Wang, thanks - mask_token = self.mask_token.expand((batch_size, seq_len, -1)) - - # replace the masked visual tokens by mask_token - w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype) - x = x * (1 - w) + mask_token * w - - x = paddle.concat((cls_tokens, x), axis=1) - if self.pos_embed is not None: - x = x + self.pos_embed - x = self.pos_drop(x) - - rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None - for i, blk in enumerate(self.blocks): - x = blk(x, rel_pos_bias=rel_pos_bias) - - x = self.norm(x) - return x - - - def forward(self, x, bool_masked_pos=None, return_all_tokens=False, return_patch_tokens=False): - if bool_masked_pos is None: - bool_masked_pos = paddle.zeros([x.shape[0], self.patch_embed.num_patches], dtype=paddle.bool).set_device(x.device) - x = self.forward_features(x, bool_masked_pos=bool_masked_pos) - x = x[:, 1:] - if return_patch_tokens: - return x - if return_all_tokens: - return self.lm_head(x) - else: - # return the masked tokens - return self.lm_head(x[bool_masked_pos]) - - def forward_return_qkv(self, x, bool_masked_pos=None, split_out_as_qkv=False): - if bool_masked_pos is None: - bool_masked_pos = paddle.zeros([x.shape[0], self.patch_embed.num_patches], dtype=paddle.bool).set_device(x.device) - x = self.patch_embed(x, bool_masked_pos=bool_masked_pos) - batch_size, seq_len, _ = x.shape - - cls_tokens = self.cls_token.expand((batch_size, -1, -1)) # stole cls_tokens impl from Phil Wang, thanks - mask_token = self.mask_token.expand((batch_size, seq_len, -1)) - - # replace the masked visual tokens by mask_token - w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype) - x = x * (1 - w) + mask_token * w - - x = paddle.concat((cls_tokens, x), axis=1) - if self.pos_embed is not None: - x = x + self.pos_embed - x = self.pos_drop(x) - - rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None - for i, blk in enumerate(self.blocks): - if i < len(self.blocks) - 1: - x = blk(x, rel_pos_bias=rel_pos_bias) - else: - x, qkv = blk(x, rel_pos_bias=rel_pos_bias, return_qkv=True) - - if split_out_as_qkv: - x = self.norm(x) - x = self.lm_head(x) - q, k, v = x.chunks(3, axis=-1) - b, n, c = q.shape - q = q.reshape(b, n, self.num_heads, -1).transpose([0, 2, 1, 3]) - k = k.reshape(b, n, self.num_heads, -1).transpose([0, 2, 1, 3]) - v = v.reshape(b, n, self.num_heads, -1).transpose([0, 2, 1, 3]) - return x, q, k, v - else: - x = self.norm(x) - x = x[:, 1:] - x = self.lm_head(x[bool_masked_pos]) - - q, k, v = qkv[0], qkv[1], qkv[2] - - return x, q, k, v - - def forward_intermediate(self, x, bool_masked_pos=None, layer_id=12): - if bool_masked_pos is None: - bool_masked_pos = paddle.zeros([x.shape[0], self.patch_embed.num_patches], dtype=paddle.bool).set_device(x.device) - x = self.patch_embed(x, bool_masked_pos=bool_masked_pos) - batch_size, seq_len, _ = x.shape - - cls_tokens = self.cls_token.expand((batch_size, -1, -1)) # stole cls_tokens impl from Phil Wang, thanks - mask_token = self.mask_token.expand((batch_size, seq_len, -1)) - - # replace the masked visual tokens by mask_token - w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype) - x = x * (1 - w) + mask_token * w - - x = paddle.concat((cls_tokens, x), axis=1) - if self.pos_embed is not None: - x = x + self.pos_embed - x = self.pos_drop(x) - - rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None - if isinstance(layer_id, list): - output_list = [] - for l, blk in enumerate(self.blocks): - x = blk(x, rel_pos_bias=rel_pos_bias) - if l in layer_id: - output_list.append(x[:, 1:]) - return output_list - elif isinstance(layer_id, int): - for l, blk in enumerate(self.blocks): - if l < layer_id: - x = blk(x, rel_pos_bias=rel_pos_bias) - elif l == layer_id: - x = blk.norm1(x) - else: - break - return x[:, 1:] - else: - raise NotImplementedError(f"Not support for layer id is {layer_id} now!") - - def get_last_selfattention(self, x): - x = self.patch_embed(x) - batch_size, seq_len, _ = x.shape - cls_tokens = self.cls_token.expand((batch_size, -1, -1)) - x = paddle.concat((cls_tokens, x), axis=1) - if self.pos_embed is not None: - x = x + self.patch_embed - x = self.pos_drop(x) - rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None - - for i, blk in enumerate(self.blocks): - if i < len(self.blocks) - 1: - x = blk(x, rel_pos_bias=rel_pos_bias) - else: - # return attention of the last block - return blk(x, rel_pos_bias=rel_pos_bias, return_attention=True) - - -class VisionTransformerForMaskedImageModelingCLS(VisionTransformerForMaskedImageModeling): - def __init__(self, img_size=224, patch_size=16, in_chans=3, vocab_size=8192, embed_dim=768, depth=12, - num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., - drop_path_rate=0., norm_layer=None, init_values=None, attn_head_dim=None, - use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, init_std=0.02, - early_layers=6, head_layers=2, shared_lm_head=True): - super().__init__(img_size=img_size, patch_size=patch_size, in_chans=in_chans, vocab_size=vocab_size, embed_dim=embed_dim, depth=depth, - num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, - drop_path_rate=drop_path_rate, norm_layer=norm_layer, init_values=init_values, attn_head_dim=attn_head_dim, - use_abs_pos_emb=use_abs_pos_emb, use_rel_pos_bias=use_rel_pos_bias, use_shared_rel_pos_bias=use_shared_rel_pos_bias, init_std=init_std) - - self.early_layers = early_layers - print(f'early layer {early_layers}, late layer {depth - early_layers}, condenser head layers {head_layers}, shared_lm_head {shared_lm_head}') - - dpr = np.linspace(0, drop_path_rate, max(depth, early_layers + head_layers), dtype=np.float32) - self.cls_pt_layers = nn.LayerList([ - Block( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[i], - norm_layer=norm_layer, - init_values=init_values, - window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None, - attn_head_dim=attn_head_dim) - for i in range(early_layers, early_layers + head_layers) - ]) - self.fix_init_cls_pt_weight() - - self.shared_lm_head = shared_lm_head - if not self.shared_lm_head: - self.cls_pt_norm = norm_layer(embed_dim) - self.cls_pt_lm_head = nn.Linear(embed_dim, vocab_size) - - self.cls_pt_norm.apply(self._init_weights) - self.cls_pt_lm_head.apply(self._init_weights) - - def fix_init_cls_pt_weight(self): - def rescale(param, layer_id): - x = param.divide(paddle.to_tensor([math.sqrt(2.0 * layer_id)])) - param = paddle.create_parameter(shape=x.shape, - dtype=str(x.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(x)) - - for layer_id, layer in enumerate(self.blocks): - rescale(layer.attn.proj.weight, self.early_layers + layer_id + 1) - rescale(layer.mlp.fc2.weight, self.early_layers + layer_id + 1) - - def forward_features(self, x, bool_masked_pos): - x = self.patch_embed(x, bool_masked_pos=bool_masked_pos) - batch_size, seq_len, _ = x.shape - - cls_tokens = self.cls_token.expand((batch_size, -1, -1)) # stole cls_tokens impl from Phil Wang, thanks - mask_token = self.mask_token.expand((batch_size, seq_len, -1)) - - # replace the masked visual tokens by mask_token - w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype) - x = x * (1 - w) + mask_token * w - - x = paddle.concat((cls_tokens, x), axis=1) - if self.pos_embed is not None: - x = x + self.pos_embed - x = self.pos_drop(x) - - rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None - for i, blk in enumerate(self.blocks): - x = blk(x, rel_pos_bias=rel_pos_bias) - if i + 1 == self.early_layers: - early_states = x[:, 1:] - - x_cls_pt = paddle.concat((x[:, 0].unsqueeze(1), early_states), axis=1) - for blk in self.cls_pt_layers: - x_cls_pt = blk(x_cls_pt, rel_pos_bias=rel_pos_bias) - - return self.norm(x), self.norm(x_cls_pt) if self.shared_lm_head else self.cls_pt_norm(x_cls_pt) - - def forward(self, x, bool_masked_pos=None, return_all_tokens=False, return_patch_tokens=False): - if bool_masked_pos is None: - bool_masked_pos = paddle.zeros([x.shape[0], self.patch_embed.num_patches], dtype=paddle.bool).set_device(x.device) - x, x_cls_pt = self.forward_features(x, bool_masked_pos=bool_masked_pos) - x = x[:, 1:] - x_cls_pt = x_cls_pt[:, 1:] - if return_patch_tokens: - return [x, x_cls_pt] - if return_all_tokens: - return [self.lm_head(x), self.lm_head(x_cls_pt) if self.shared_lm_head else self.cls_pt_lm_head(x_cls_pt)] - else: - # return the masked tokens - return [self.lm_head(x[bool_masked_pos]), self.lm_head(x_cls_pt[bool_masked_pos]) if self.shared_lm_head else self.cls_pt_lm_head(x_cls_pt[bool_masked_pos])] - - - -def beit_base_patch16_224_8k_vocab_cls_pt(pretrained=False, pretrained_weight=None, **kwargs): - if "num_classes" in kwargs: - _ = kwargs.pop("num_classes") - if 'vocab_size' in kwargs: - vocab_size = kwargs['vocab_size'] - _ = kwargs.pop("vocab_size") - else: - vocab_size = 8192 - model = VisionTransformerForMaskedImageModelingCLS( - patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, - norm_layer="nn.LayerNorm", vocab_size=vocab_size, **kwargs) - if pretrained: - weight = paddle.load(pretrained_weight) - model.set_dict(weight) - model.default_cfg = _cfg() - return model \ No newline at end of file diff --git a/ppcls/arch/backbone/model_zoo/norm_ema_quantizer.py b/ppcls/arch/backbone/model_zoo/norm_ema_quantizer.py deleted file mode 100644 index 52fe6ac11b..0000000000 --- a/ppcls/arch/backbone/model_zoo/norm_ema_quantizer.py +++ /dev/null @@ -1,242 +0,0 @@ -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -import paddle.distributed as distributed -from einops import rearrange, repeat - -from .modeling_finetune import zeros_, ones_, Identity - -def l2norm(t): - return F.normalize(t, p=2, axis=-1) - -def ema_inplace(moving_avg, new, decay): - x = moving_avg * decay - x = x + new*(1-decay) - moving_avg = paddle.create_parameter(shape=x.shape, - dtype=str(x.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(x)) - -def sample_vectors(samples, num): - num_samples, device = samples.shape[0], samples.device - - if num_samples >= num: - indices = paddle.randperm(num_samples)[:num] - else: - indices = paddle.randint(0, num_samples, [num,]) - - return samples[indices] - -def kmeans(samples, num_clusters, num_iters = 10, use_cosine_sim = False): - dim, dtype, device = samples.shape[-1], samples.dtype, samples.device - - means = sample_vectors(samples, num_clusters) - - for _ in range(num_iters): - if use_cosine_sim: - dists = samples @ means.t() - else: - diffs = rearrange(samples, 'n d -> n () d') \ - - rearrange(means, 'c d -> () c d') - dists = -(diffs ** 2).sum(axis = -1) - - buckets = dists.max(axis = -1).indices - bins = paddle.bincount(buckets, minlength = num_clusters) - zero_mask = bins == 0 - bins_min_clamped = bins.masked_fill(zero_mask, 1) - - new_means = buckets.new_zeros(num_clusters, dim, dtype = dtype) - new_means.scatter_add_(0, repeat(buckets, 'n -> n d', d = dim), samples) - new_means = new_means / bins_min_clamped[..., None] - - if use_cosine_sim: - new_means = l2norm(new_means) - - means = paddle.where(zero_mask[..., None], means, new_means) - - return means, bins - - -class EmbeddingEMA(nn.Layer): - def __init__(self, num_tokens, codebook_dim, decay=0.99, eps=1e-5, kmeans_init=True, codebook_init_path=''): - super().__init__() - self.num_tokens = num_tokens - self.codebook_dim = codebook_dim - self.decay = decay - self.eps = eps - if codebook_init_path == '': - if not kmeans_init: - weight = paddle.randn([num_tokens, codebook_dim]) - weight = l2norm(weight) - else: - weight = paddle.zeros([num_tokens, codebook_dim]) - self.register_buffer('initted', paddle.to_tensor([not kmeans_init], dtype='float32')) - else: - print(f"load init codebook weight from {codebook_init_path}") - codebook_ckpt_weight = paddle.load(codebook_init_path, map_location='cpu') - weight = codebook_ckpt_weight.clone() - self.register_buffer('initted', paddle.to_tensor([True])) - - self.weight = paddle.create_parameter(shape=weight.shape, - dtype=str(weight.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(weight)) - self.cluster_size = self.create_parameter(shape=[num_tokens], default_initializer=zeros_) - self.add_parameter("cluster_size", self.cluster_size) - self.embed_avg = paddle.create_parameter(shape=weight.shape, - dtype=str(weight.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(weight)) - self.update = True - - def init_embed_(self, data): - if self.initted: - return - print("Performing Kemans init for codebook") - embed, cluster_size = kmeans(data, self.num_tokens, 10, use_cosine_sim=True) - self.weight = paddle.create_parameter(shape=embed.shape, - dtype=str(embed.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(embed)) - self.cluster_size = paddle.create_parameter(shape=cluster_size.shape, - dtype=str(cluster_size.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(cluster_size)) - self.initted = paddle.create_parameter(shape=[1], - dtype="bool", - default_initializer=paddle.nn.initializer.Assign(paddle.to_tensor([True]))) - - def forward(self, embed_id): - return F.embedding(embed_id, self.weight) - - def cluster_size_ema_update(self, new_cluster_size): - x = self.cluster_size.multiply(self.decay) - x = x.add(new_cluster_size*(1 - self.decay)) - self.cluster_size = paddle.create_parameter(shape=x.shape, - dtype=str(x.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(x)) - - def embed_avg_ema_update(self, new_embed_avg): - x = self.cluster_size.multiply(self.decay) - x = x.add(new_embed_avg*(1 - self.decay)) - self.embed_avg = paddle.create_parameter(shape=x.shape, - dtype=str(x.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(x)) - - def weight_update(self, num_tokens): - n = self.cluster_size.sum() - smoothed_cluster_size = ( - (self.cluster_size + self.eps) / (n + num_tokens * self.eps) * n - ) - #normalize embedding average with smoothed cluster size - embed_normalized = self.embed_avg / smoothed_cluster_size.unsqueeze(1) - # embed_normalized = l2norm(self.embed_avg / smoothed_cluster_size.unsqueeze(1)) - self.weight = paddle.create_parameter(shape=embed_normalized.shape, - dtype=str(embed_normalized.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(embed_normalized)) - - -def norm_ema_inplace(moving_avg, new, decay): - x = moving_avg.multiply(paddle.to_tensor(decay)) - x = x.add(new*(1 - decay)) - x = l2norm(x) - moving_avg = paddle.create_parameter(shape=x.shape, - dtype=str(x.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(x)) - - - -class NormEMAVectorQuantizer(nn.Layer): - def __init__(self, n_embed, embedding_dim, beta, decay=0.99, eps=1e-5, - statistic_code_usage=True, kmeans_init=False, codebook_init_path=''): - super().__init__() - self.codebook_dim = embedding_dim - self.num_tokens = n_embed - self.beta = beta - self.decay = decay - - # learnable = True if orthogonal_reg_weight > 0 else False - self.embedding = EmbeddingEMA(self.num_tokens, self.codebook_dim, decay, eps, kmeans_init, codebook_init_path) - - self.statistic_code_usage = statistic_code_usage - if statistic_code_usage: - self.register_buffer('cluster_size', paddle.zeros([n_embed])) - # if distributed.is_available() and distributed.is_initialized(): - # print("ddp is enable, so use ddp_reduce to sync the statistic_code_usage for each gpu!") - # self.all_reduce_fn = distributed.all_reduce - # else: - # self.all_reduce_fn = Identity - # self.all_reduce_fn = paddle.distributed.all_reduce() - - def reset_cluster_size(self, device): - if self.statistic_code_usage: - self.register_buffer('cluster_size', paddle.zeros([self.num_tokens])) - self.cluster_size = self.cluster_size.to(device) - - def _masked_fill(self, x, mask, value): - y = paddle.full(x.shape, value, x.dtype) - return paddle.where(mask, y, x) - - def forward(self, z): - # reshape z -> (batch, height, width, channel) and flatten - #z, 'b c h w -> b h w c' - b, c, h, w = z.shape - z = paddle.reshape(z, [b, h, w, c]) - # z = rearrange(z, 'b c h w -> b h w c') - z = l2norm(z) - z_flattened = z.reshape([-1, self.codebook_dim]) - - self.embedding.init_embed_(z_flattened) - - d = z_flattened.pow(2).sum(axis=1, keepdim=True) + \ - self.embedding.weight.pow(2).sum(axis=1) - 2 * \ - paddle.einsum('bd,nd->bn', z_flattened, self.embedding.weight) # 'n d -> d n' - - encoding_indices = paddle.argmin(d, axis=1) - - z_q = self.embedding(encoding_indices).reshape(z.shape) - - encodings = F.one_hot(encoding_indices, self.num_tokens).astype(z.dtype) - - if not self.training: - with paddle.no_grad(): - cluster_size = encodings.sum(0) - # self.all_reduce_fn(cluster_size) - if paddle.distributed.get_world_size() > 1: - paddle.distributed.all_reduce(cluster_size) - ema_inplace(self.cluster_size, cluster_size, self.decay) - - if self.training and self.embedding.update: - # EMA cluster size - - bins = encodings.sum(0) - # self.all_reduce_fn(bins) - if paddle.distributed.get_world_size() > 1: - paddle.distributed.all_reduce(bins) - - # self.embedding.cluster_size_ema_update(bins) - ema_inplace(self.cluster_size, bins, self.decay) - - zero_mask = (bins == 0) - # bins = bins.masked_fill(zero_mask, 1.) - bins = self._masked_fill(bins, zero_mask, 1.) - - embed_sum = z_flattened.t() @ encodings - # self.all_reduce_fn(embed_sum) - if paddle.distributed.get_world_size() > 1: - paddle.distributed.all_reduce(embed_sum) - - embed_normalized = (embed_sum / bins.unsqueeze(0)).t() - embed_normalized = l2norm(embed_normalized) - - embed_normalized = paddle.where(zero_mask[..., None], self.embedding.weight, - embed_normalized) - norm_ema_inplace(self.embedding.weight, embed_normalized, self.decay) - - # compute loss for embedding - loss = self.beta * F.mse_loss(z_q.detach(), z) - - # preserve gradients - z_q = z + (z_q - z).detach() - - # reshape back to match original input shape - #z_q, 'b h w c -> b c h w' - b, h, w, c = z_q.shape - z_q = paddle.reshape(z_q, [b, c, h, w]) - # z_q = rearrange(z_q, 'b h w c -> b c h w') - return z_q, loss, encoding_indices diff --git a/ppcls/arch/backbone/model_zoo/vqkd.py b/ppcls/arch/backbone/model_zoo/vqkd.py index 5a5598f4fc..f387a9fcf7 100644 --- a/ppcls/arch/backbone/model_zoo/vqkd.py +++ b/ppcls/arch/backbone/model_zoo/vqkd.py @@ -19,9 +19,13 @@ import paddle import paddle.nn as nn from paddle.nn.initializer import TruncatedNormal +import paddle.nn.functional as F +import paddle.distributed as distributed +from einops import rearrange, repeat + +from .BeiTV2 import VisionTransformer, zeros_, ones_, Identity + -from .modeling_finetune import VisionTransformer, zeros_, ones_ -from .norm_ema_quantizer import NormEMAVectorQuantizer MODEL_URLS = { "vqkd": @@ -34,6 +38,241 @@ IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) +def l2norm(t): + return F.normalize(t, p=2, axis=-1) + +def ema_inplace(moving_avg, new, decay): + x = moving_avg * decay + x = x + new*(1-decay) + moving_avg = paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + +def sample_vectors(samples, num): + num_samples, device = samples.shape[0], samples.device + + if num_samples >= num: + indices = paddle.randperm(num_samples)[:num] + else: + indices = paddle.randint(0, num_samples, [num,]) + + return samples[indices] + +def kmeans(samples, num_clusters, num_iters = 10, use_cosine_sim = False): + dim, dtype, device = samples.shape[-1], samples.dtype, samples.device + + means = sample_vectors(samples, num_clusters) + + for _ in range(num_iters): + if use_cosine_sim: + dists = samples @ means.t() + else: + diffs = rearrange(samples, 'n d -> n () d') \ + - rearrange(means, 'c d -> () c d') + dists = -(diffs ** 2).sum(axis = -1) + + buckets = dists.max(axis = -1).indices + bins = paddle.bincount(buckets, minlength = num_clusters) + zero_mask = bins == 0 + bins_min_clamped = bins.masked_fill(zero_mask, 1) + + new_means = buckets.new_zeros(num_clusters, dim, dtype = dtype) + new_means.scatter_add_(0, repeat(buckets, 'n -> n d', d = dim), samples) + new_means = new_means / bins_min_clamped[..., None] + + if use_cosine_sim: + new_means = l2norm(new_means) + + means = paddle.where(zero_mask[..., None], means, new_means) + + return means, bins + + +class EmbeddingEMA(nn.Layer): + def __init__(self, num_tokens, codebook_dim, decay=0.99, eps=1e-5, kmeans_init=True, codebook_init_path=''): + super().__init__() + self.num_tokens = num_tokens + self.codebook_dim = codebook_dim + self.decay = decay + self.eps = eps + if codebook_init_path == '': + if not kmeans_init: + weight = paddle.randn([num_tokens, codebook_dim]) + weight = l2norm(weight) + else: + weight = paddle.zeros([num_tokens, codebook_dim]) + self.register_buffer('initted', paddle.to_tensor([not kmeans_init], dtype='float32')) + else: + print(f"load init codebook weight from {codebook_init_path}") + codebook_ckpt_weight = paddle.load(codebook_init_path, map_location='cpu') + weight = codebook_ckpt_weight.clone() + self.register_buffer('initted', paddle.to_tensor([True])) + + self.weight = paddle.create_parameter(shape=weight.shape, + dtype=str(weight.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(weight)) + self.cluster_size = self.create_parameter(shape=[num_tokens], default_initializer=zeros_) + self.add_parameter("cluster_size", self.cluster_size) + self.embed_avg = paddle.create_parameter(shape=weight.shape, + dtype=str(weight.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(weight)) + self.update = True + + def init_embed_(self, data): + if self.initted: + return + print("Performing Kemans init for codebook") + embed, cluster_size = kmeans(data, self.num_tokens, 10, use_cosine_sim=True) + self.weight = paddle.create_parameter(shape=embed.shape, + dtype=str(embed.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(embed)) + self.cluster_size = paddle.create_parameter(shape=cluster_size.shape, + dtype=str(cluster_size.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(cluster_size)) + self.initted = paddle.create_parameter(shape=[1], + dtype="bool", + default_initializer=paddle.nn.initializer.Assign(paddle.to_tensor([True]))) + + def forward(self, embed_id): + return F.embedding(embed_id, self.weight) + + def cluster_size_ema_update(self, new_cluster_size): + x = self.cluster_size.multiply(self.decay) + x = x.add(new_cluster_size*(1 - self.decay)) + self.cluster_size = paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + + def embed_avg_ema_update(self, new_embed_avg): + x = self.cluster_size.multiply(self.decay) + x = x.add(new_embed_avg*(1 - self.decay)) + self.embed_avg = paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + + def weight_update(self, num_tokens): + n = self.cluster_size.sum() + smoothed_cluster_size = ( + (self.cluster_size + self.eps) / (n + num_tokens * self.eps) * n + ) + #normalize embedding average with smoothed cluster size + embed_normalized = self.embed_avg / smoothed_cluster_size.unsqueeze(1) + # embed_normalized = l2norm(self.embed_avg / smoothed_cluster_size.unsqueeze(1)) + self.weight = paddle.create_parameter(shape=embed_normalized.shape, + dtype=str(embed_normalized.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(embed_normalized)) + + +def norm_ema_inplace(moving_avg, new, decay): + x = moving_avg.multiply(paddle.to_tensor(decay)) + x = x.add(new*(1 - decay)) + x = l2norm(x) + moving_avg = paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + + + +class NormEMAVectorQuantizer(nn.Layer): + def __init__(self, n_embed, embedding_dim, beta, decay=0.99, eps=1e-5, + statistic_code_usage=True, kmeans_init=False, codebook_init_path=''): + super().__init__() + self.codebook_dim = embedding_dim + self.num_tokens = n_embed + self.beta = beta + self.decay = decay + + # learnable = True if orthogonal_reg_weight > 0 else False + self.embedding = EmbeddingEMA(self.num_tokens, self.codebook_dim, decay, eps, kmeans_init, codebook_init_path) + + self.statistic_code_usage = statistic_code_usage + if statistic_code_usage: + self.register_buffer('cluster_size', paddle.zeros([n_embed])) + # if distributed.is_available() and distributed.is_initialized(): + # print("ddp is enable, so use ddp_reduce to sync the statistic_code_usage for each gpu!") + # self.all_reduce_fn = distributed.all_reduce + # else: + # self.all_reduce_fn = Identity + # self.all_reduce_fn = paddle.distributed.all_reduce() + + def reset_cluster_size(self, device): + if self.statistic_code_usage: + self.register_buffer('cluster_size', paddle.zeros([self.num_tokens])) + self.cluster_size = self.cluster_size.to(device) + + def _masked_fill(self, x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + def forward(self, z): + # reshape z -> (batch, height, width, channel) and flatten + #z, 'b c h w -> b h w c' + b, c, h, w = z.shape + z = paddle.reshape(z, [b, h, w, c]) + # z = rearrange(z, 'b c h w -> b h w c') + z = l2norm(z) + z_flattened = z.reshape([-1, self.codebook_dim]) + + self.embedding.init_embed_(z_flattened) + + d = z_flattened.pow(2).sum(axis=1, keepdim=True) + \ + self.embedding.weight.pow(2).sum(axis=1) - 2 * \ + paddle.einsum('bd,nd->bn', z_flattened, self.embedding.weight) # 'n d -> d n' + + encoding_indices = paddle.argmin(d, axis=1) + + z_q = self.embedding(encoding_indices).reshape(z.shape) + + encodings = F.one_hot(encoding_indices, self.num_tokens).astype(z.dtype) + + if not self.training: + with paddle.no_grad(): + cluster_size = encodings.sum(0) + # self.all_reduce_fn(cluster_size) + if paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce(cluster_size) + ema_inplace(self.cluster_size, cluster_size, self.decay) + + if self.training and self.embedding.update: + # EMA cluster size + + bins = encodings.sum(0) + # self.all_reduce_fn(bins) + if paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce(bins) + + # self.embedding.cluster_size_ema_update(bins) + ema_inplace(self.cluster_size, bins, self.decay) + + zero_mask = (bins == 0) + # bins = bins.masked_fill(zero_mask, 1.) + bins = self._masked_fill(bins, zero_mask, 1.) + + embed_sum = z_flattened.t() @ encodings + # self.all_reduce_fn(embed_sum) + if paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce(embed_sum) + + embed_normalized = (embed_sum / bins.unsqueeze(0)).t() + embed_normalized = l2norm(embed_normalized) + + embed_normalized = paddle.where(zero_mask[..., None], self.embedding.weight, + embed_normalized) + norm_ema_inplace(self.embedding.weight, embed_normalized, self.decay) + + # compute loss for embedding + loss = self.beta * F.mse_loss(z_q.detach(), z) + + # preserve gradients + z_q = z + (z_q - z).detach() + + # reshape back to match original input shape + #z_q, 'b h w c -> b c h w' + b, h, w, c = z_q.shape + z_q = paddle.reshape(z_q, [b, c, h, w]) + # z_q = rearrange(z_q, 'b h w c -> b c h w') + return z_q, loss, encoding_indices + class VQKD(nn.Layer): def __init__(self, encoder_config, diff --git a/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml b/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml index 01c48c7073..5fc486ce23 100644 --- a/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml +++ b/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml @@ -5,6 +5,7 @@ Global: output_dir: ./output/ device: gpu save_interval: 1 + train_mode: mask_samples eval_during_train: False eval_interval: 1 epochs: 300 @@ -30,7 +31,6 @@ Arch: name: "Beitv2Model" drop_path_rate : 0.1 class_num: 1000 - is_beitv2: True # if not null, its lengths should be same as models pretrained_list: # if not null, its lengths should be same as models diff --git a/ppcls/engine/engine.py b/ppcls/engine/engine.py index 21d318d8d1..16599db65e 100755 --- a/ppcls/engine/engine.py +++ b/ppcls/engine/engine.py @@ -60,10 +60,6 @@ def __init__(self, config, mode="train"): self.is_rec = True else: self.is_rec = False - if self.config["Arch"].get("is_beitv2", False): - self.is_beitv2 = True - else: - self.is_beitv2 = False # set seed seed = self.config["Global"].get("seed", False) diff --git a/ppcls/engine/train/__init__.py b/ppcls/engine/train/__init__.py index 50bf9037f4..6fce0672d9 100644 --- a/ppcls/engine/train/__init__.py +++ b/ppcls/engine/train/__init__.py @@ -16,3 +16,4 @@ from ppcls.engine.train.train_fixmatch_ccssl import train_epoch_fixmatch_ccssl from ppcls.engine.train.train_progressive import train_epoch_progressive from ppcls.engine.train.train_metabin import train_epoch_metabin +from ppcls.engine.train.train_mask_samples import train_epoch_mask_samples diff --git a/ppcls/engine/train/train.py b/ppcls/engine/train/train.py index 5d638d79c2..7a253706f4 100644 --- a/ppcls/engine/train/train.py +++ b/ppcls/engine/train/train.py @@ -108,9 +108,7 @@ def train_epoch(engine, epoch_id, print_batch_step): def forward(engine, batch): - if not engine.is_rec and not engine.is_beitv2: + if not engine.is_rec: return engine.model(batch[0]) - elif engine.is_rec: - return engine.model(batch[0], batch[1]) else: - return engine.model(batch[0], batch[1], batch[2]) \ No newline at end of file + return engine.model(batch[0], batch[1]) diff --git a/ppcls/engine/train/train_mask_samples.py b/ppcls/engine/train/train_mask_samples.py new file mode 100644 index 0000000000..f55e9c6656 --- /dev/null +++ b/ppcls/engine/train/train_mask_samples.py @@ -0,0 +1,110 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import, division, print_function + +import time +import paddle +from ppcls.engine.train.utils import update_loss, update_metric, log_info, type_name +from ppcls.utils import profiler + + +def train_epoch_mask_samples(engine, epoch_id, print_batch_step): + tic = time.time() + + if not hasattr(engine, "train_dataloader_iter"): + engine.train_dataloader_iter = iter(engine.train_dataloader) + + for iter_id in range(engine.iter_per_epoch): + # fetch data batch from dataloader + try: + batch = next(engine.train_dataloader_iter) + except Exception: + # NOTE: reset DALI dataloader manually + if engine.use_dali: + engine.train_dataloader.reset() + engine.train_dataloader_iter = iter(engine.train_dataloader) + batch = next(engine.train_dataloader_iter) + + profiler.add_profiler_step(engine.config["profiler_options"]) + if iter_id == 5: + for key in engine.time_info: + engine.time_info[key].reset() + engine.time_info["reader_cost"].update(time.time() - tic) + + batch_size = batch[0].shape[0] + if not engine.config["Global"].get("use_multilabel", False): + batch[1] = batch[1].reshape([batch_size, -1]) + engine.global_step += 1 + + # image input + if engine.amp: + amp_level = engine.config["AMP"].get("level", "O1").upper() + with paddle.amp.auto_cast( + custom_black_list={ + "flatten_contiguous_range", "greater_than" + }, + level=amp_level): + out = forward(engine, batch) + loss_dict = engine.train_loss_func(out) + else: + out = forward(engine, batch) + loss_dict = engine.train_loss_func(out) + + # loss + loss = loss_dict["loss"] / engine.update_freq + + # backward & step opt + if engine.amp: + scaled = engine.scaler.scale(loss) + scaled.backward() + if (iter_id + 1) % engine.update_freq == 0: + for i in range(len(engine.optimizer)): + engine.scaler.minimize(engine.optimizer[i], scaled) + else: + loss.backward() + if (iter_id + 1) % engine.update_freq == 0: + for i in range(len(engine.optimizer)): + engine.optimizer[i].step() + + if (iter_id + 1) % engine.update_freq == 0: + # clear grad + for i in range(len(engine.optimizer)): + engine.optimizer[i].clear_grad() + # step lr(by step) + for i in range(len(engine.lr_sch)): + if not getattr(engine.lr_sch[i], "by_epoch", False): + engine.lr_sch[i].step() + # update ema + if engine.ema: + engine.model_ema.update(engine.model) + + # below code just for logging + # update metric_for_logger + update_metric(engine, out, batch, batch_size) + # update_loss_for_logger + update_loss(engine, loss_dict, batch_size) + engine.time_info["batch_cost"].update(time.time() - tic) + if iter_id % print_batch_step == 0: + log_info(engine, batch_size, epoch_id, iter_id) + tic = time.time() + + # step lr(by epoch) + for i in range(len(engine.lr_sch)): + if getattr(engine.lr_sch[i], "by_epoch", False) and \ + type_name(engine.lr_sch[i]) != "ReduceOnPlateau": + engine.lr_sch[i].step() + + +def forward(engine, batch): + return engine.model(batch[0], batch[1], batch[2]) \ No newline at end of file diff --git a/ppcls/loss/distillationloss.py b/ppcls/loss/distillationloss.py index 49435dba8e..fc12c8f0ef 100644 --- a/ppcls/loss/distillationloss.py +++ b/ppcls/loss/distillationloss.py @@ -439,7 +439,7 @@ def __init__(self, self.model_name_pairs = model_name_pairs self.name = name - def forward(self, predicts, batch): + def forward(self, predicts): loss_dict = dict() loss = dict() for idx, pair in enumerate(self.model_name_pairs):