From a510416cb20df1d1a25ebbe75ef9a8b841d816c0 Mon Sep 17 00:00:00 2001 From: Jiabei-prog <79827906+Jiabei-prog@users.noreply.github.com> Date: Thu, 13 Oct 2022 10:52:17 +0800 Subject: [PATCH 1/9] add backbone model 'EdgeVit' --- .../edgevit/EdgeVit_b1024x8_300e_jpg.py | 133 +++++++ easycv/models/backbones/__init__.py | 1 + easycv/models/backbones/edgevit.py | 342 ++++++++++++++++++ 3 files changed, 476 insertions(+) create mode 100644 configs/classification/imagenet/edgevit/EdgeVit_b1024x8_300e_jpg.py create mode 100644 easycv/models/backbones/edgevit.py diff --git a/configs/classification/imagenet/edgevit/EdgeVit_b1024x8_300e_jpg.py b/configs/classification/imagenet/edgevit/EdgeVit_b1024x8_300e_jpg.py new file mode 100644 index 00000000..8da10c60 --- /dev/null +++ b/configs/classification/imagenet/edgevit/EdgeVit_b1024x8_300e_jpg.py @@ -0,0 +1,133 @@ +_base_ = '../../../base.py' + +log_config = dict( + interval=10, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) + +# model settings +model = dict( + type='Classification', + # train_preprocess=['mixUp'], + # mixup_cfg=dict( + # mixup_alpha=0.8, + # cutmix_alpha=1.0, + # cutmix_minmax=None, + # prob=1.0, + # switch_prob=0.5, + # mode='batch', + # label_smoothing=0.1, + # num_classes=1000), + backbone=dict( + type='EdgeVit', + depth=[1, 1, 3, 2], + embed_dim=[36, 72, 144, 288], + head_dim=36, + mlp_ratio=[4] * 4, + qkv_bias=True, + num_classes=1000, + drop_path_rate=0.1, + sr_ratios=[4, 2, 2, 1]), + head=dict( + type='ClsHead', + with_avg_pool=True, + in_channels=288, + loss_config=dict( + type='CrossEntropyLossWithLabelSmooth', + + label_smooth=0), + # head=dict( + # type='ClsHead', + # with_avg_pool=True, + # in_channels=288, + # loss_config={ + # 'type': 'SoftTargetCrossEntropy', + # }, + # with_fc=True)) + +data_train_list = '/root/database/imagenet-raw/meta/train_labeled.txt' +data_train_root = '/root/database/imagenet-raw/train/' +data_test_list = '/root/database/imagenet-raw/meta/val_labeled.txt' +data_test_root = '/root/database/imagenet-raw/val/' +data_all_list = '/root/database/imagenet-raw/meta/all_labeled.txt' +data_root = '/root/database/imagenet-raw/' + +dataset_type = 'ClsDataset' +img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) +train_pipeline = [ + dict( + type='MAEFtAugment', + input_size=224, + color_jitter=0.4, + auto_augment='rand-m9-mstd0.5-inc1', + interpolation='bicubic', + re_prob=0.25, + re_mode='pixel', + re_count=1, + mean=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + is_train=True), +] +test_pipeline = [ + dict( + type='MAEFtAugment', + input_size=224, + mean=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + is_train=False, + ), +] + +data = dict( + imgs_per_gpu=1024, # total 256 + workers_per_gpu=10, + use_repeated_augment_sampler=True, + train=dict( + type=dataset_type, + data_source=dict( + list_file=data_train_list, + root=data_train_root, + type='ClsSourceImageList'), + pipeline=train_pipeline), + val=dict( + type=dataset_type, + data_source=dict( + list_file=data_test_list, + root=data_test_root, + type='ClsSourceImageList'), + pipeline=test_pipeline)) + +eval_config = dict(initial=True, interval=1, gpu_collect=True) +eval_pipelines = [ + dict( + mode='test', + data=data['val'], + dist_eval=True, + evaluators=[dict(type='ClsEvaluator', topk=(1, 5))], + ) +] + +# additional hooks +custom_hooks = [] + +# optimizer +optimizer = dict(type='AdamW', lr=2e-3, weight_decay=0.05) + +# learning policy +lr_config = dict( + policy='CosineAnnealingWarmupByEpoch', + min_lr=1e-5, + warmup='linear', + warmup_iters=5, + warmup_ratio=1e-6, + # warmup_lr=1e-6, + warmup_by_epoch=True, + by_epoch=True) + + +checkpoint_config = dict(interval=10) + +# runtime settings +total_epochs = 300 + +ema = dict(decay=0.99996) \ No newline at end of file diff --git a/easycv/models/backbones/__init__.py b/easycv/models/backbones/__init__.py index 625f5282..0328c898 100644 --- a/easycv/models/backbones/__init__.py +++ b/easycv/models/backbones/__init__.py @@ -23,3 +23,4 @@ from .swin_transformer import SwinTransformer from .vision_transformer import VisionTransformer from .vitdet import ViTDet +from .edgevit import EdgeVit \ No newline at end of file diff --git a/easycv/models/backbones/edgevit.py b/easycv/models/backbones/edgevit.py new file mode 100644 index 00000000..4a9913c2 --- /dev/null +++ b/easycv/models/backbones/edgevit.py @@ -0,0 +1,342 @@ +from collections import OrderedDict +import torch +import torch.nn as nn +from functools import partial +import torch.nn.functional as F +import math +from timm.models.vision_transformer import _cfg + +from timm.models.layers import trunc_normal_, DropPath, to_2tuple +import torch.utils.checkpoint as checkpoint +from mmcv.cnn import build_norm_layer, constant_init, kaiming_init +from mmcv.runner import get_dist_info +from torch.nn.modules.batchnorm import _BatchNorm +from easycv.utils.checkpoint import load_checkpoint +from easycv.utils.logger import get_root_logger +from ..registry import BACKBONES +from ..utils import build_conv_layer + + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class CMlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2d(in_features, hidden_features, 1) + self.act = act_layer() + self.fc2 = nn.Conv2d(hidden_features, out_features, 1) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class GlobalSparseAttn(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + # self.upsample = nn.Upsample(scale_factor=sr_ratio, mode='nearest') + self.sr = sr_ratio + if self.sr > 1: + self.sampler = nn.AvgPool2d(1, sr_ratio) + kernel_size = sr_ratio + self.LocalProp = nn.ConvTranspose2d(dim, dim, kernel_size, stride=sr_ratio, groups=dim) + self.norm = nn.LayerNorm(dim) + else: + self.sampler = nn.Identity() + self.upsample = nn.Identity() + self.norm = nn.Identity() + + def forward(self, x, H: int, W: int): + B, N, C = x.shape + if self.sr > 1.: + x = x.transpose(1, 2).reshape(B, C, H, W) + x = self.sampler(x) + x = x.flatten(2).transpose(1, 2) + + qkv = self.qkv(x).reshape(B, -1, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, -1, C) + + if self.sr > 1: + x = x.permute(0, 2, 1).reshape(B, C, int(H / self.sr), int(W / self.sr)) + x = self.LocalProp(x) + x = x.reshape(B, C, -1).permute(0, 2, 1) + x = self.norm(x) + + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class LocalAgg(nn.Module): + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim) + self.norm1 = nn.BatchNorm2d(dim) + self.conv1 = nn.Conv2d(dim, dim, 1) + self.conv2 = nn.Conv2d(dim, dim, 1) + self.attn = nn.Conv2d(dim, dim, 5, padding=2, groups=dim) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = nn.BatchNorm2d(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = CMlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = x + self.pos_embed(x) + x = x + self.drop_path(self.conv2(self.attn(self.conv1(self.norm1(x))))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class SelfAttn(nn.Module): + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1.): + super().__init__() + self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim) + self.norm1 = norm_layer(dim) + self.attn = GlobalSparseAttn( + dim, + num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, + attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + # global layer_scale + # self.ls = layer_scale + + def forward(self, x): + x = x + self.pos_embed(x) + B, N, H, W = x.shape + x = x.flatten(2).transpose(1, 2) + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + x = x.transpose(1, 2).reshape(B, N, H, W) + return x + + +class LGLBlock(nn.Module): + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1.): + super().__init__() + + if sr_ratio > 1: + self.LocalAgg = LocalAgg(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop, attn_drop, drop_path, + act_layer, norm_layer) + else: + self.LocalAgg = nn.Identity() + + self.SelfAttn = SelfAttn(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop, attn_drop, drop_path, act_layer, + norm_layer, sr_ratio) + + def forward(self, x): + x = self.LocalAgg(x) + x = self.SelfAttn(x) + return x + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.norm = nn.LayerNorm(embed_dim) + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({B}*{C}*{H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x) + B, C, H, W = x.shape + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + return x + + +@BACKBONES.register_module() +class EdgeVit(nn.Module): + """ Vision Transformer + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` - + https://arxiv.org/abs/2010.11929 + """ + + def __init__(self, depth=[1, 2, 3, 2], img_size=224, in_chans=3, num_classes=1000, embed_dim=[48, 96, 240, 384], + head_dim=48, mlp_ratio=[4] * 4, qkv_bias=True, qk_scale=None, representation_size=None, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-8), + sr_ratios=[4, 2, 2, 1], pretrained=None): + """ + Args: + depth (list): depth of each stage + img_size (int, tuple): input image size + in_chans (int): number of input channels + num_classes (int): number of classes for classification head + embed_dim (list): embedding dimension of each stage + head_dim (int): head dimension + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + qk_scale (float): override default qk scale of head_dim ** -0.5 if set + representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + norm_layer (nn.Module): normalization layer + """ + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + + self.patch_embed1 = PatchEmbed( + img_size=img_size, patch_size=4, in_chans=in_chans, embed_dim=embed_dim[0]) + self.patch_embed2 = PatchEmbed( + img_size=img_size // 4, patch_size=2, in_chans=embed_dim[0], embed_dim=embed_dim[1]) + self.patch_embed3 = PatchEmbed( + img_size=img_size // 8, patch_size=2, in_chans=embed_dim[1], embed_dim=embed_dim[2]) + self.patch_embed4 = PatchEmbed( + img_size=img_size // 16, patch_size=2, in_chans=embed_dim[2], embed_dim=embed_dim[3]) + + self.pos_drop = nn.Dropout(p=drop_rate) + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depth))] # stochastic depth decay rule + num_heads = [dim // head_dim for dim in embed_dim] + self.blocks1 = nn.ModuleList([ + LGLBlock( + dim=embed_dim[0], num_heads=num_heads[0], mlp_ratio=mlp_ratio[0], qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + sr_ratio=sr_ratios[0]) + for i in range(depth[0])]) + self.blocks2 = nn.ModuleList([ + LGLBlock( + dim=embed_dim[1], num_heads=num_heads[1], mlp_ratio=mlp_ratio[1], qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i + depth[0]], norm_layer=norm_layer, + sr_ratio=sr_ratios[1]) + for i in range(depth[1])]) + self.blocks3 = nn.ModuleList([ + LGLBlock( + dim=embed_dim[2], num_heads=num_heads[2], mlp_ratio=mlp_ratio[2], qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i + depth[0] + depth[1]], norm_layer=norm_layer, + sr_ratio=sr_ratios[2]) + for i in range(depth[2])]) + self.blocks4 = nn.ModuleList([ + LGLBlock( + dim=embed_dim[3], num_heads=num_heads[3], mlp_ratio=mlp_ratio[3], qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i + depth[0] + depth[1] + depth[2]], + norm_layer=norm_layer, sr_ratio=sr_ratios[3]) + for i in range(depth[3])]) + self.norm = nn.BatchNorm2d(embed_dim[-1]) + + # Representation layer + if representation_size: + self.num_features = representation_size + self.pre_logits = nn.Sequential(OrderedDict([ + ('fc', nn.Linear(embed_dim, representation_size)), + ('act', nn.Tanh()) + ])) + else: + self.pre_logits = nn.Identity() + + self.pretrained = pretrained + self.init_weights() + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + pretrained = pretrained or self.pretrained + + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + + if isinstance(pretrained, str): + self.apply(_init_weights) + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + self.apply(_init_weights) + else: + raise TypeError('pretrained must be a str or None') + + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def forward_features(self, x): + x = self.patch_embed1(x) + x = self.pos_drop(x) + for blk in self.blocks1: + x = blk(x) + x = self.patch_embed2(x) + for blk in self.blocks2: + x = blk(x) + x = self.patch_embed3(x) + for blk in self.blocks3: + x = blk(x) + x = self.patch_embed4(x) + for blk in self.blocks4: + x = blk(x) + x = self.norm(x) + x = self.pre_logits(x) + return x + + def forward(self, x): + x = self.forward_features(x) + return [x] \ No newline at end of file From fbef605be1012edcc86aab80cc397b0686bf5019 Mon Sep 17 00:00:00 2001 From: Jiabei-prog <79827906+Jiabei-prog@users.noreply.github.com> Date: Thu, 13 Oct 2022 14:11:05 +0800 Subject: [PATCH 2/9] add backbone model 'EdgeVit' --- .../imagenet/edgevit/EdgeVit_b1024x8_300e_jpg.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/configs/classification/imagenet/edgevit/EdgeVit_b1024x8_300e_jpg.py b/configs/classification/imagenet/edgevit/EdgeVit_b1024x8_300e_jpg.py index 8da10c60..865f1450 100644 --- a/configs/classification/imagenet/edgevit/EdgeVit_b1024x8_300e_jpg.py +++ b/configs/classification/imagenet/edgevit/EdgeVit_b1024x8_300e_jpg.py @@ -35,7 +35,7 @@ loss_config=dict( type='CrossEntropyLossWithLabelSmooth', - label_smooth=0), + label_smooth=0),)) # head=dict( # type='ClsHead', # with_avg_pool=True, @@ -45,12 +45,13 @@ # }, # with_fc=True)) -data_train_list = '/root/database/imagenet-raw/meta/train_labeled.txt' -data_train_root = '/root/database/imagenet-raw/train/' -data_test_list = '/root/database/imagenet-raw/meta/val_labeled.txt' -data_test_root = '/root/database/imagenet-raw/val/' -data_all_list = '/root/database/imagenet-raw/meta/all_labeled.txt' -data_root = '/root/database/imagenet-raw/' + +data_train_list = 'data/imagenet_raw/meta/train_labeled.txt' +data_train_root = 'data/imagenet_raw/train/' +data_test_list = 'data/imagenet_raw/meta/val_labeled.txt' +data_test_root = 'data/imagenet_raw/validation/' +data_all_list = 'data/imagenet_raw/meta/all_labeled.txt' +data_root = 'data/imagenet_raw/' dataset_type = 'ClsDataset' img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) From 757a3a6da16efeb7dd5bff4d34ba7feb6a583115 Mon Sep 17 00:00:00 2001 From: Jiabei-prog <79827906+Jiabei-prog@users.noreply.github.com> Date: Fri, 21 Oct 2022 15:07:21 +0800 Subject: [PATCH 3/9] add config file in different scale and put in benchmark result for edgexxs --- ...300e_jpg.py => EdgeVit_b512x8_300e_jpg.py} | 23 ++++++++------- .../edgevit/imagenet_edgeVIT_s_jpg.py | 28 +++++++++++++++++++ .../edgevit/imagenet_edgeVIT_xs_jpg.py | 28 +++++++++++++++++++ docs/source/model_zoo_cls.md | 1 + easycv/models/backbones/edgevit.py | 4 +-- 5 files changed, 69 insertions(+), 15 deletions(-) rename configs/classification/imagenet/edgevit/{EdgeVit_b1024x8_300e_jpg.py => EdgeVit_b512x8_300e_jpg.py} (90%) create mode 100644 configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py create mode 100644 configs/classification/imagenet/edgevit/imagenet_edgeVIT_xs_jpg.py diff --git a/configs/classification/imagenet/edgevit/EdgeVit_b1024x8_300e_jpg.py b/configs/classification/imagenet/edgevit/EdgeVit_b512x8_300e_jpg.py similarity index 90% rename from configs/classification/imagenet/edgevit/EdgeVit_b1024x8_300e_jpg.py rename to configs/classification/imagenet/edgevit/EdgeVit_b512x8_300e_jpg.py index 865f1450..cd1be5ab 100644 --- a/configs/classification/imagenet/edgevit/EdgeVit_b1024x8_300e_jpg.py +++ b/configs/classification/imagenet/edgevit/EdgeVit_b512x8_300e_jpg.py @@ -35,15 +35,15 @@ loss_config=dict( type='CrossEntropyLossWithLabelSmooth', - label_smooth=0),)) - # head=dict( - # type='ClsHead', - # with_avg_pool=True, - # in_channels=288, - # loss_config={ - # 'type': 'SoftTargetCrossEntropy', - # }, - # with_fc=True)) + label_smooth=0), )) +# head=dict( +# type='ClsHead', +# with_avg_pool=True, +# in_channels=288, +# loss_config={ +# 'type': 'SoftTargetCrossEntropy', +# }, +# with_fc=True)) data_train_list = 'data/imagenet_raw/meta/train_labeled.txt' @@ -80,7 +80,7 @@ ] data = dict( - imgs_per_gpu=1024, # total 256 + imgs_per_gpu=512, # total 256 workers_per_gpu=10, use_repeated_augment_sampler=True, train=dict( @@ -125,10 +125,9 @@ warmup_by_epoch=True, by_epoch=True) - checkpoint_config = dict(interval=10) # runtime settings total_epochs = 300 -ema = dict(decay=0.99996) \ No newline at end of file +ema = dict(decay=0.99996) diff --git a/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py new file mode 100644 index 00000000..715ffce2 --- /dev/null +++ b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py @@ -0,0 +1,28 @@ +_base_ = './EdgeVit_b256x4_300e_jpg.py' +# model settings +model = dict( + type='Classification', + backbone=dict( + type='EdgeVit', + depth=[1, 2, 5, 3], + embed_dim=[48, 96, 240, 384], + head_dim=48, + mlp_ratio=[4] * 4, + qkv_bias=True, + num_classes=1000, + drop_path_rate=0.1, + sr_ratios=[4, 2, 2, 1]), + head=dict( + type='ClsHead', + with_avg_pool=True, + in_channels=384, + loss_config=dict( + type='CrossEntropyLossWithLabelSmooth', + + label_smooth=0), + )) + +data = dict( + imgs_per_gpu=128, # total 256 + workers_per_gpu=10, + use_repeated_augment_sampler=True,) diff --git a/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xs_jpg.py b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xs_jpg.py new file mode 100644 index 00000000..f1874903 --- /dev/null +++ b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xs_jpg.py @@ -0,0 +1,28 @@ +_base_ = './EdgeVit_b256x4_300e_jpg.py' +# model settings +model = dict( + type='Classification', + backbone=dict( + type='EdgeVit', + depth=[1, 1, 3, 1], + embed_dim=[48, 96, 240, 384], + head_dim=48, + mlp_ratio=[4] * 4, + qkv_bias=True, + num_classes=1000, + drop_path_rate=0.1, + sr_ratios=[4, 2, 2, 1]), + head=dict( + type='ClsHead', + with_avg_pool=True, + in_channels=384, + loss_config=dict( + type='CrossEntropyLossWithLabelSmooth', + + label_smooth=0), + )) + +data = dict( + imgs_per_gpu=256, # total 256 + workers_per_gpu=10, + use_repeated_augment_sampler=True,) diff --git a/docs/source/model_zoo_cls.md b/docs/source/model_zoo_cls.md index a2254ddf..47256a4d 100644 --- a/docs/source/model_zoo_cls.md +++ b/docs/source/model_zoo_cls.md @@ -80,5 +80,6 @@ | efficientformer_l1 | [efficientformer_l1](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/efficientformer/efficientformer_l1.py) | 80.102 | 94.934 | 1820 | 7.5 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/efficientformer/efficientformer_l1_1000d.pth) | | efficientformer_l3 | [efficientformer_l3](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/efficientformer/efficientformer_l3.py) | 82.272 | 96.028 | 2436 | 13.07 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/efficientformer/efficientformer_l3_300d.pth) | | efficientformer_l7 | [efficientformer_l7](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/efficientformer/efficientformer_l7.py) | 83.076 | 96.44 | 1622 | 18.96 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/efficientformer/efficientformer_l7_300d.pth) | +| EdgeVit_xxs_b512_224 | [EdgeVit_xxs_b512_224](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/edgevit/EdgeVit_b512x8_300e_jpg.py) | 75.18 | 92.19 | 13876 | 8.632 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/edgevit/edgexxs/ClsEvaluator_neck_top1_best.pth) | (ps: 通过导入官方模型得到推理结果,需要torch.__version__ >= 1.9.0,推理的输入尺寸默认为224,机器默认为V100 16G,其中gpu memory记录的是gpu peak memory) diff --git a/easycv/models/backbones/edgevit.py b/easycv/models/backbones/edgevit.py index 4a9913c2..fbd1a73e 100644 --- a/easycv/models/backbones/edgevit.py +++ b/easycv/models/backbones/edgevit.py @@ -304,7 +304,6 @@ def _init_weights(m): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) - if isinstance(pretrained, str): self.apply(_init_weights) logger = get_root_logger() @@ -314,7 +313,6 @@ def _init_weights(m): else: raise TypeError('pretrained must be a str or None') - @torch.jit.ignore def no_weight_decay(self): return {'pos_embed', 'cls_token'} @@ -339,4 +337,4 @@ def forward_features(self, x): def forward(self, x): x = self.forward_features(x) - return [x] \ No newline at end of file + return [x] From 3cfb0e0248e3315a10ec42d7c925402d6d6fe528 Mon Sep 17 00:00:00 2001 From: Jiabei-prog <79827906+Jiabei-prog@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:56:10 +0800 Subject: [PATCH 4/9] Fixed #214 --- ...300e_jpg.py => EdgeVit_b512x8_300e_jpg.py} | 31 +- .../edgevit/imagenet_edgeVIT_s_jpg.py | 32 + .../edgevit/imagenet_edgeVIT_xs_jpg.py | 32 + .../edgevit/imagenet_edgeVIT_xxs_jpg.py | 32 + easycv/models/backbones/__init__.py | 2 +- easycv/models/backbones/edgevit.py | 282 +++++---- easycv/models/utils/__init__.py | 2 +- easycv/models/utils/transformer.py | 569 +----------------- 8 files changed, 306 insertions(+), 676 deletions(-) rename configs/classification/imagenet/edgevit/{EdgeVit_b1024x8_300e_jpg.py => EdgeVit_b512x8_300e_jpg.py} (78%) create mode 100644 configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py create mode 100644 configs/classification/imagenet/edgevit/imagenet_edgeVIT_xs_jpg.py create mode 100644 configs/classification/imagenet/edgevit/imagenet_edgeVIT_xxs_jpg.py diff --git a/configs/classification/imagenet/edgevit/EdgeVit_b1024x8_300e_jpg.py b/configs/classification/imagenet/edgevit/EdgeVit_b512x8_300e_jpg.py similarity index 78% rename from configs/classification/imagenet/edgevit/EdgeVit_b1024x8_300e_jpg.py rename to configs/classification/imagenet/edgevit/EdgeVit_b512x8_300e_jpg.py index 865f1450..d9bd6647 100644 --- a/configs/classification/imagenet/edgevit/EdgeVit_b1024x8_300e_jpg.py +++ b/configs/classification/imagenet/edgevit/EdgeVit_b512x8_300e_jpg.py @@ -8,16 +8,6 @@ # model settings model = dict( type='Classification', - # train_preprocess=['mixUp'], - # mixup_cfg=dict( - # mixup_alpha=0.8, - # cutmix_alpha=1.0, - # cutmix_minmax=None, - # prob=1.0, - # switch_prob=0.5, - # mode='batch', - # label_smoothing=0.1, - # num_classes=1000), backbone=dict( type='EdgeVit', depth=[1, 1, 3, 2], @@ -33,25 +23,13 @@ with_avg_pool=True, in_channels=288, loss_config=dict( - type='CrossEntropyLossWithLabelSmooth', - - label_smooth=0),)) - # head=dict( - # type='ClsHead', - # with_avg_pool=True, - # in_channels=288, - # loss_config={ - # 'type': 'SoftTargetCrossEntropy', - # }, - # with_fc=True)) - + type='CrossEntropyLossWithLabelSmooth', label_smooth=0), + )) data_train_list = 'data/imagenet_raw/meta/train_labeled.txt' data_train_root = 'data/imagenet_raw/train/' data_test_list = 'data/imagenet_raw/meta/val_labeled.txt' data_test_root = 'data/imagenet_raw/validation/' -data_all_list = 'data/imagenet_raw/meta/all_labeled.txt' -data_root = 'data/imagenet_raw/' dataset_type = 'ClsDataset' img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) @@ -80,7 +58,7 @@ ] data = dict( - imgs_per_gpu=1024, # total 256 + imgs_per_gpu=512, workers_per_gpu=10, use_repeated_augment_sampler=True, train=dict( @@ -125,10 +103,9 @@ warmup_by_epoch=True, by_epoch=True) - checkpoint_config = dict(interval=10) # runtime settings total_epochs = 300 -ema = dict(decay=0.99996) \ No newline at end of file +ema = dict(decay=0.99996) diff --git a/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py new file mode 100644 index 00000000..42650c36 --- /dev/null +++ b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py @@ -0,0 +1,32 @@ +_base_ = './EdgeVit_b512x8_300e_jpg.py' +# model settings +model = dict( + type='Classification', + backbone=dict( + type='EdgeVit', + depth=[1, 2, 5, 3], + embed_dim=[48, 96, 240, 384], + head_dim=48, + mlp_ratio=[4] * 4, + qkv_bias=True, + num_classes=1000, + drop_path_rate=0.1, + sr_ratios=[4, 2, 2, 1]), + head=dict( + type='ClsHead', + with_avg_pool=True, + in_channels=384, + loss_config=dict( + type='CrossEntropyLossWithLabelSmooth', label_smooth=0.1), + )) + +# input data settings +data = dict( + imgs_per_gpu=128, + workers_per_gpu=10, + use_repeated_augment_sampler=True, +) + +# optimizer +update_interval = 8 +optimizer_config = dict(update_interval=update_interval) diff --git a/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xs_jpg.py b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xs_jpg.py new file mode 100644 index 00000000..ebaba756 --- /dev/null +++ b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xs_jpg.py @@ -0,0 +1,32 @@ +_base_ = './EdgeVit_b512x8_300e_jpg.py' +# model settings +model = dict( + type='Classification', + backbone=dict( + type='EdgeVit', + depth=[1, 1, 3, 1], + embed_dim=[48, 96, 240, 384], + head_dim=48, + mlp_ratio=[4] * 4, + qkv_bias=True, + num_classes=1000, + drop_path_rate=0.1, + sr_ratios=[4, 2, 2, 1]), + head=dict( + type='ClsHead', + with_avg_pool=True, + in_channels=384, + loss_config=dict( + type='CrossEntropyLossWithLabelSmooth', label_smooth=0.1), + )) + +# input data settings +data = dict( + imgs_per_gpu=256, + workers_per_gpu=10, + use_repeated_augment_sampler=True, +) + +# optimizer +update_interval = 4 +optimizer_config = dict(update_interval=update_interval) diff --git a/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xxs_jpg.py b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xxs_jpg.py new file mode 100644 index 00000000..74b53c69 --- /dev/null +++ b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xxs_jpg.py @@ -0,0 +1,32 @@ +_base_ = './EdgeVit_b512x8_300e_jpg.py' +# model settings +model = dict( + type='Classification', + backbone=dict( + type='EdgeVit', + depth=[1, 1, 3, 2], + embed_dim=[36, 72, 144, 288], + head_dim=36, + mlp_ratio=[4] * 4, + qkv_bias=True, + num_classes=1000, + drop_path_rate=0.1, + sr_ratios=[4, 2, 2, 1]), + head=dict( + type='ClsHead', + with_avg_pool=True, + in_channels=288, + loss_config=dict( + type='CrossEntropyLossWithLabelSmooth', label_smooth=0.1), + )) + +# input data settings +data = dict( + imgs_per_gpu=512, + workers_per_gpu=10, + use_repeated_augment_sampler=True, +) + +# optimizer +update_interval = 2 +optimizer_config = dict(update_interval=update_interval) diff --git a/easycv/models/backbones/__init__.py b/easycv/models/backbones/__init__.py index 0328c898..1acb2fcb 100644 --- a/easycv/models/backbones/__init__.py +++ b/easycv/models/backbones/__init__.py @@ -3,6 +3,7 @@ from .bninception import BNInception from .conv_mae_vit import FastConvMAEViT from .conv_vitdet import ConvViTDet +from .edgevit import EdgeVit from .efficientformer import EfficientFormer from .face_keypoint_backbone import FaceKeypointBackbone from .genet import PlainNet @@ -23,4 +24,3 @@ from .swin_transformer import SwinTransformer from .vision_transformer import VisionTransformer from .vitdet import ViTDet -from .edgevit import EdgeVit \ No newline at end of file diff --git a/easycv/models/backbones/edgevit.py b/easycv/models/backbones/edgevit.py index 4a9913c2..f2f4499e 100644 --- a/easycv/models/backbones/edgevit.py +++ b/easycv/models/backbones/edgevit.py @@ -1,67 +1,37 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +""" +This model is taken from +https://github.com/SamsungLabs/EdgeViTs +""" + from collections import OrderedDict +from functools import partial + import torch import torch.nn as nn -from functools import partial -import torch.nn.functional as F -import math -from timm.models.vision_transformer import _cfg - -from timm.models.layers import trunc_normal_, DropPath, to_2tuple -import torch.utils.checkpoint as checkpoint -from mmcv.cnn import build_norm_layer, constant_init, kaiming_init -from mmcv.runner import get_dist_info -from torch.nn.modules.batchnorm import _BatchNorm +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ + +from easycv.models.utils import ConvMlp, Mlp from easycv.utils.checkpoint import load_checkpoint from easycv.utils.logger import get_root_logger from ..registry import BACKBONES -from ..utils import build_conv_layer - - -class Mlp(nn.Module): - def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -class CMlp(nn.Module): - def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Conv2d(in_features, hidden_features, 1) - self.act = act_layer() - self.fc2 = nn.Conv2d(hidden_features, out_features, 1) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x class GlobalSparseAttn(nn.Module): - def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1): + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + sr_ratio=1): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights - self.scale = qk_scale or head_dim ** -0.5 + self.scale = qk_scale or head_dim**-0.5 self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) @@ -73,7 +43,8 @@ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0. if self.sr > 1: self.sampler = nn.AvgPool2d(1, sr_ratio) kernel_size = sr_ratio - self.LocalProp = nn.ConvTranspose2d(dim, dim, kernel_size, stride=sr_ratio, groups=dim) + self.LocalProp = nn.ConvTranspose2d( + dim, dim, kernel_size, stride=sr_ratio, groups=dim) self.norm = nn.LayerNorm(dim) else: self.sampler = nn.Identity() @@ -87,7 +58,8 @@ def forward(self, x, H: int, W: int): x = self.sampler(x) x = x.flatten(2).transpose(1, 2) - qkv = self.qkv(x).reshape(B, -1, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + qkv = self.qkv(x).reshape(B, -1, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) q, k, v = qkv[0], qkv[1], qkv[2] attn = (q @ k.transpose(-2, -1)) * self.scale @@ -97,7 +69,8 @@ def forward(self, x, H: int, W: int): x = (attn @ v).transpose(1, 2).reshape(B, -1, C) if self.sr > 1: - x = x.permute(0, 2, 1).reshape(B, C, int(H / self.sr), int(W / self.sr)) + x = x.permute(0, 2, 1).reshape(B, C, int(H / self.sr), + int(W / self.sr)) x = self.LocalProp(x) x = x.reshape(B, C, -1).permute(0, 2, 1) x = self.norm(x) @@ -108,41 +81,77 @@ def forward(self, x, H: int, W: int): class LocalAgg(nn.Module): - def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., - drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): super().__init__() self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim) self.norm1 = nn.BatchNorm2d(dim) self.conv1 = nn.Conv2d(dim, dim, 1) self.conv2 = nn.Conv2d(dim, dim, 1) self.attn = nn.Conv2d(dim, dim, 5, padding=2, groups=dim) - self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() self.norm2 = nn.BatchNorm2d(dim) mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = CMlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + self.mlp = ConvMlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) def forward(self, x): x = x + self.pos_embed(x) - x = x + self.drop_path(self.conv2(self.attn(self.conv1(self.norm1(x))))) + x = x + self.drop_path( + self.conv2(self.attn(self.conv1(self.norm1(x))))) x = x + self.drop_path(self.mlp(self.norm2(x))) return x class SelfAttn(nn.Module): - def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., - drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1.): + + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + sr_ratio=1.): super().__init__() self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim) self.norm1 = norm_layer(dim) self.attn = GlobalSparseAttn( dim, - num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, - attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio) + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + sr_ratio=sr_ratio) # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) # global layer_scale # self.ls = layer_scale @@ -157,17 +166,30 @@ def forward(self, x): class LGLBlock(nn.Module): - def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., - drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1.): + + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + sr_ratio=1.): super().__init__() if sr_ratio > 1: - self.LocalAgg = LocalAgg(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop, attn_drop, drop_path, + self.LocalAgg = LocalAgg(dim, num_heads, mlp_ratio, qkv_bias, + qk_scale, drop, attn_drop, drop_path, act_layer, norm_layer) else: self.LocalAgg = nn.Identity() - self.SelfAttn = SelfAttn(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop, attn_drop, drop_path, act_layer, + self.SelfAttn = SelfAttn(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, + drop, attn_drop, drop_path, act_layer, norm_layer, sr_ratio) def forward(self, x): @@ -184,12 +206,14 @@ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): super().__init__() img_size = to_2tuple(img_size) patch_size = to_2tuple(patch_size) - num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + num_patches = (img_size[1] // patch_size[1]) * ( + img_size[0] // patch_size[0]) self.img_size = img_size self.patch_size = patch_size self.num_patches = num_patches self.norm = nn.LayerNorm(embed_dim) - self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) def forward(self, x): B, C, H, W = x.shape @@ -210,10 +234,23 @@ class EdgeVit(nn.Module): https://arxiv.org/abs/2010.11929 """ - def __init__(self, depth=[1, 2, 3, 2], img_size=224, in_chans=3, num_classes=1000, embed_dim=[48, 96, 240, 384], - head_dim=48, mlp_ratio=[4] * 4, qkv_bias=True, qk_scale=None, representation_size=None, - drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-8), - sr_ratios=[4, 2, 2, 1], pretrained=None): + def __init__(self, + depth=[1, 2, 3, 2], + img_size=224, + in_chans=3, + num_classes=1000, + embed_dim=[48, 96, 240, 384], + head_dim=48, + mlp_ratio=[4] * 4, + qkv_bias=True, + qk_scale=None, + representation_size=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=partial(nn.LayerNorm, eps=1e-8), + sr_ratios=[4, 2, 2, 1], + pretrained=None): """ Args: depth (list): depth of each stage @@ -237,50 +274,91 @@ def __init__(self, depth=[1, 2, 3, 2], img_size=224, in_chans=3, num_classes=100 norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) self.patch_embed1 = PatchEmbed( - img_size=img_size, patch_size=4, in_chans=in_chans, embed_dim=embed_dim[0]) + img_size=img_size, + patch_size=4, + in_chans=in_chans, + embed_dim=embed_dim[0]) self.patch_embed2 = PatchEmbed( - img_size=img_size // 4, patch_size=2, in_chans=embed_dim[0], embed_dim=embed_dim[1]) + img_size=img_size // 4, + patch_size=2, + in_chans=embed_dim[0], + embed_dim=embed_dim[1]) self.patch_embed3 = PatchEmbed( - img_size=img_size // 8, patch_size=2, in_chans=embed_dim[1], embed_dim=embed_dim[2]) + img_size=img_size // 8, + patch_size=2, + in_chans=embed_dim[1], + embed_dim=embed_dim[2]) self.patch_embed4 = PatchEmbed( - img_size=img_size // 16, patch_size=2, in_chans=embed_dim[2], embed_dim=embed_dim[3]) + img_size=img_size // 16, + patch_size=2, + in_chans=embed_dim[2], + embed_dim=embed_dim[3]) self.pos_drop = nn.Dropout(p=drop_rate) - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depth))] # stochastic depth decay rule + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depth)) + ] # stochastic depth decay rule num_heads = [dim // head_dim for dim in embed_dim] self.blocks1 = nn.ModuleList([ LGLBlock( - dim=embed_dim[0], num_heads=num_heads[0], mlp_ratio=mlp_ratio[0], qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, - sr_ratio=sr_ratios[0]) - for i in range(depth[0])]) + dim=embed_dim[0], + num_heads=num_heads[0], + mlp_ratio=mlp_ratio[0], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + sr_ratio=sr_ratios[0]) for i in range(depth[0]) + ]) self.blocks2 = nn.ModuleList([ LGLBlock( - dim=embed_dim[1], num_heads=num_heads[1], mlp_ratio=mlp_ratio[1], qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i + depth[0]], norm_layer=norm_layer, - sr_ratio=sr_ratios[1]) - for i in range(depth[1])]) + dim=embed_dim[1], + num_heads=num_heads[1], + mlp_ratio=mlp_ratio[1], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i + depth[0]], + norm_layer=norm_layer, + sr_ratio=sr_ratios[1]) for i in range(depth[1]) + ]) self.blocks3 = nn.ModuleList([ LGLBlock( - dim=embed_dim[2], num_heads=num_heads[2], mlp_ratio=mlp_ratio[2], qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i + depth[0] + depth[1]], norm_layer=norm_layer, - sr_ratio=sr_ratios[2]) - for i in range(depth[2])]) + dim=embed_dim[2], + num_heads=num_heads[2], + mlp_ratio=mlp_ratio[2], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i + depth[0] + depth[1]], + norm_layer=norm_layer, + sr_ratio=sr_ratios[2]) for i in range(depth[2]) + ]) self.blocks4 = nn.ModuleList([ LGLBlock( - dim=embed_dim[3], num_heads=num_heads[3], mlp_ratio=mlp_ratio[3], qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i + depth[0] + depth[1] + depth[2]], - norm_layer=norm_layer, sr_ratio=sr_ratios[3]) - for i in range(depth[3])]) + dim=embed_dim[3], + num_heads=num_heads[3], + mlp_ratio=mlp_ratio[3], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i + depth[0] + depth[1] + depth[2]], + norm_layer=norm_layer, + sr_ratio=sr_ratios[3]) for i in range(depth[3]) + ]) self.norm = nn.BatchNorm2d(embed_dim[-1]) # Representation layer if representation_size: self.num_features = representation_size - self.pre_logits = nn.Sequential(OrderedDict([ - ('fc', nn.Linear(embed_dim, representation_size)), - ('act', nn.Tanh()) - ])) + self.pre_logits = nn.Sequential( + OrderedDict([('fc', nn.Linear(embed_dim, representation_size)), + ('act', nn.Tanh())])) else: self.pre_logits = nn.Identity() @@ -304,7 +382,6 @@ def _init_weights(m): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) - if isinstance(pretrained, str): self.apply(_init_weights) logger = get_root_logger() @@ -314,7 +391,6 @@ def _init_weights(m): else: raise TypeError('pretrained must be a str or None') - @torch.jit.ignore def no_weight_decay(self): return {'pos_embed', 'cls_token'} @@ -339,4 +415,4 @@ def forward_features(self, x): def forward(self, x): x = self.forward_features(x) - return [x] \ No newline at end of file + return [x] diff --git a/easycv/models/utils/__init__.py b/easycv/models/utils/__init__.py index 6568df3f..f5d5d149 100644 --- a/easycv/models/utils/__init__.py +++ b/easycv/models/utils/__init__.py @@ -18,7 +18,7 @@ # from .weight_init import (bias_init_with_prob, kaiming_init, normal_init, # uniform_init, xavier_init) from .sobel import Sobel -from .transformer import (MLP, DropPath, Mlp, TransformerEncoder, +from .transformer import (MLP, ConvMlp, DropPath, Mlp, TransformerEncoder, TransformerEncoderLayer, _get_activation_fn, _get_clones) diff --git a/easycv/models/utils/transformer.py b/easycv/models/utils/transformer.py index 8fdb5ee4..dd96065f 100644 --- a/easycv/models/utils/transformer.py +++ b/easycv/models/utils/transformer.py @@ -1,24 +1,12 @@ import copy -import warnings from typing import Optional import torch import torch.nn as nn import torch.nn.functional as F -from mmcv.cnn.bricks import Linear -from mmcv.cnn.bricks.drop import build_dropout -from mmcv.runner.base_module import BaseModule, ModuleList, Sequential -from mmcv.utils import ConfigDict from torch import Tensor from easycv.framework.errors import RuntimeError -from easycv.models.builder import (build_attention, build_feedforward_network, - build_transformer_layer) -from easycv.models.registry import (ATTENTION, FEEDFORWARD_NETWORK, - TRANSFORMER_LAYER, - TRANSFORMER_LAYER_SEQUENCE) -from easycv.models.utils.activation import build_activation_layer -from easycv.models.utils.norm import build_norm_layer class MLP(nn.Module): @@ -66,6 +54,31 @@ def forward(self, x): return x +class ConvMlp(nn.Module): + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2d(in_features, hidden_features, 1) + self.act = act_layer() + self.fc2 = nn.Conv2d(hidden_features, out_features, 1) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + def drop_path(x, drop_prob: float = 0., training: bool = False): if drop_prob == 0. or not training: return x @@ -202,535 +215,3 @@ def _get_activation_fn(activation): if activation == 'selu': return F.selu raise RuntimeError(F'activation should be relu/gelu, not {activation}.') - - -@FEEDFORWARD_NETWORK.register_module() -class FFN(BaseModule): - """Implements feed-forward networks (FFNs) with identity connection. - - Args: - embed_dims (int): The feature dimension. Same as - `MultiheadAttention`. Defaults: 256. - feedforward_channels (int): The hidden dimension of FFNs. - Defaults: 1024. - num_fcs (int, optional): The number of fully-connected layers in - FFNs. Default: 2. - act_cfg (dict, optional): The activation config for FFNs. - Default: dict(type='ReLU') - ffn_drop (float, optional): Probability of an element to be - zeroed in FFN. Default 0.0. - add_identity (bool, optional): Whether to add the - identity connection. Default: `True`. - dropout_layer (obj:`ConfigDict`): The dropout_layer used - when adding the shortcut. - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. - """ - - def __init__(self, - embed_dims=256, - feedforward_channels=1024, - num_fcs=2, - act_cfg=dict(type='ReLU', inplace=True), - ffn_drop=0., - dropout_layer=None, - add_identity=True, - init_cfg=None, - **kwargs): - super().__init__(init_cfg) - assert num_fcs >= 2, 'num_fcs should be no less ' \ - f'than 2. got {num_fcs}.' - self.embed_dims = embed_dims - self.feedforward_channels = feedforward_channels - self.num_fcs = num_fcs - self.act_cfg = act_cfg - self.activate = build_activation_layer(act_cfg) - - layers = [] - in_channels = embed_dims - for _ in range(num_fcs - 1): - layers.append( - Sequential( - Linear(in_channels, feedforward_channels), self.activate, - nn.Dropout(ffn_drop))) - in_channels = feedforward_channels - layers.append(Linear(feedforward_channels, embed_dims)) - layers.append(nn.Dropout(ffn_drop)) - self.layers = Sequential(*layers) - self.dropout_layer = build_dropout( - dropout_layer) if dropout_layer else torch.nn.Identity() - self.add_identity = add_identity - - def forward(self, x, identity=None): - """Forward function for `FFN`. - - The function would add x to the output tensor if residue is None. - """ - out = self.layers(x) - if not self.add_identity: - return self.dropout_layer(out) - if identity is None: - identity = x - return identity + self.dropout_layer(out) - - -@TRANSFORMER_LAYER.register_module() -class BaseTransformerLayer(BaseModule): - """Base `TransformerLayer` for vision transformer. - - It can be built from `mmcv.ConfigDict` and support more flexible - customization, for example, using any number of `FFN or LN ` and - use different kinds of `attention` by specifying a list of `ConfigDict` - named `attn_cfgs`. It is worth mentioning that it supports `prenorm` - when you specifying `norm` as the first element of `operation_order`. - More details about the `prenorm`: `On Layer Normalization in the - Transformer Architecture `_ . - - Args: - attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): - Configs for `self_attention` or `cross_attention` modules, - The order of the configs in the list should be consistent with - corresponding attentions in operation_order. - If it is a dict, all of the attention modules in operation_order - will be built with this config. Default: None. - ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): - Configs for FFN, The order of the configs in the list should be - consistent with corresponding ffn in operation_order. - If it is a dict, all of the attention modules in operation_order - will be built with this config. - operation_order (tuple[str]): The execution order of operation - in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). - Support `prenorm` when you specifying first element as `norm`. - Default:None. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='LN'). - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. - batch_first (bool): Key, Query and Value are shape - of (batch, n, embed_dim) - or (n, batch, embed_dim). Default to False. - """ - - def __init__(self, - attn_cfgs=None, - ffn_cfgs=dict( - type='FFN', - embed_dims=256, - feedforward_channels=1024, - num_fcs=2, - ffn_drop=0., - act_cfg=dict(type='ReLU', inplace=True), - ), - operation_order=None, - norm_cfg=dict(type='LN'), - init_cfg=None, - batch_first=False, - **kwargs): - - deprecated_args = dict( - feedforward_channels='feedforward_channels', - ffn_dropout='ffn_drop', - ffn_num_fcs='num_fcs') - for ori_name, new_name in deprecated_args.items(): - if ori_name in kwargs: - warnings.warn( - f'The arguments `{ori_name}` in BaseTransformerLayer ' - f'has been deprecated, now you should set `{new_name}` ' - f'and other FFN related arguments ' - f'to a dict named `ffn_cfgs`. ', DeprecationWarning) - ffn_cfgs[new_name] = kwargs[ori_name] - - super().__init__(init_cfg) - - self.batch_first = batch_first - - assert set(operation_order) & { - 'self_attn', 'norm', 'ffn', 'cross_attn'} == \ - set(operation_order), f'The operation_order of' \ - f' {self.__class__.__name__} should ' \ - f'contains all four operation type ' \ - f"{['self_attn', 'norm', 'ffn', 'cross_attn']}" - - num_attn = operation_order.count('self_attn') + operation_order.count( - 'cross_attn') - if isinstance(attn_cfgs, dict): - attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] - else: - assert num_attn == len(attn_cfgs), f'The length ' \ - f'of attn_cfg {num_attn} is ' \ - f'not consistent with the number of attention' \ - f'in operation_order {operation_order}.' - - self.num_attn = num_attn - self.operation_order = operation_order - self.norm_cfg = norm_cfg - self.pre_norm = operation_order[0] == 'norm' - self.attentions = ModuleList() - - index = 0 - for operation_name in operation_order: - if operation_name in ['self_attn', 'cross_attn']: - if 'batch_first' in attn_cfgs[index]: - assert self.batch_first == attn_cfgs[index]['batch_first'] - else: - attn_cfgs[index]['batch_first'] = self.batch_first - attention = build_attention(attn_cfgs[index]) - # Some custom attentions used as `self_attn` - # or `cross_attn` can have different behavior. - attention.operation_name = operation_name - self.attentions.append(attention) - index += 1 - - self.embed_dims = self.attentions[0].embed_dims - - self.ffns = ModuleList() - num_ffns = operation_order.count('ffn') - if isinstance(ffn_cfgs, dict): - ffn_cfgs = ConfigDict(ffn_cfgs) - if isinstance(ffn_cfgs, dict): - ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] - assert len(ffn_cfgs) == num_ffns - for ffn_index in range(num_ffns): - if 'embed_dims' not in ffn_cfgs[ffn_index]: - ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims - else: - assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims - self.ffns.append( - build_feedforward_network(ffn_cfgs[ffn_index], - dict(type='FFN'))) - - self.norms = ModuleList() - num_norms = operation_order.count('norm') - for _ in range(num_norms): - self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) - - def forward(self, - query, - key=None, - value=None, - query_pos=None, - key_pos=None, - attn_masks=None, - query_key_padding_mask=None, - key_padding_mask=None, - **kwargs): - """Forward function for `TransformerDecoderLayer`. - - **kwargs contains some specific arguments of attentions. - - Args: - query (Tensor): The input query with shape - [num_queries, bs, embed_dims] if - self.batch_first is False, else - [bs, num_queries embed_dims]. - key (Tensor): The key tensor with shape [num_keys, bs, - embed_dims] if self.batch_first is False, else - [bs, num_keys, embed_dims] . - value (Tensor): The value tensor with same shape as `key`. - query_pos (Tensor): The positional encoding for `query`. - Default: None. - key_pos (Tensor): The positional encoding for `key`. - Default: None. - attn_masks (List[Tensor] | None): 2D Tensor used in - calculation of corresponding attention. The length of - it should equal to the number of `attention` in - `operation_order`. Default: None. - query_key_padding_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_queries]. Only used in `self_attn` layer. - Defaults to None. - key_padding_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_keys]. Default: None. - - Returns: - Tensor: forwarded results with shape [num_queries, bs, embed_dims]. - """ - - norm_index = 0 - attn_index = 0 - ffn_index = 0 - identity = query - if attn_masks is None: - attn_masks = [None for _ in range(self.num_attn)] - elif isinstance(attn_masks, torch.Tensor): - attn_masks = [ - copy.deepcopy(attn_masks) for _ in range(self.num_attn) - ] - warnings.warn(f'Use same attn_mask in all attentions in ' - f'{self.__class__.__name__} ') - else: - assert len(attn_masks) == self.num_attn, f'The length of ' \ - f'attn_masks {len(attn_masks)} must be equal ' \ - f'to the number of attention in ' \ - f'operation_order {self.num_attn}' - - for layer in self.operation_order: - if layer == 'self_attn': - temp_key = temp_value = query - query = self.attentions[attn_index]( - query, - temp_key, - temp_value, - identity if self.pre_norm else None, - query_pos=query_pos, - key_pos=query_pos, - attn_mask=attn_masks[attn_index], - key_padding_mask=query_key_padding_mask, - **kwargs) - attn_index += 1 - identity = query - - elif layer == 'norm': - query = self.norms[norm_index](query) - norm_index += 1 - - elif layer == 'cross_attn': - query = self.attentions[attn_index]( - query, - key, - value, - identity if self.pre_norm else None, - query_pos=query_pos, - key_pos=key_pos, - attn_mask=attn_masks[attn_index], - key_padding_mask=key_padding_mask, - **kwargs) - attn_index += 1 - identity = query - - elif layer == 'ffn': - query = self.ffns[ffn_index]( - query, identity if self.pre_norm else None) - ffn_index += 1 - - return query - - -@TRANSFORMER_LAYER_SEQUENCE.register_module() -class TransformerLayerSequence(BaseModule): - """Base class for TransformerEncoder and TransformerDecoder in vision - transformer. - - As base-class of Encoder and Decoder in vision transformer. - Support customization such as specifying different kind - of `transformer_layer` in `transformer_coder`. - - Args: - transformerlayer (list[obj:`mmcv.ConfigDict`] | - obj:`mmcv.ConfigDict`): Config of transformerlayer - in TransformerCoder. If it is obj:`mmcv.ConfigDict`, - it would be repeated `num_layer` times to a - list[`mmcv.ConfigDict`]. Default: None. - num_layers (int): The number of `TransformerLayer`. Default: None. - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. - """ - - def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None): - super().__init__(init_cfg) - if isinstance(transformerlayers, dict): - transformerlayers = [ - copy.deepcopy(transformerlayers) for _ in range(num_layers) - ] - else: - assert isinstance(transformerlayers, list) and \ - len(transformerlayers) == num_layers - self.num_layers = num_layers - self.layers = ModuleList() - for i in range(num_layers): - self.layers.append(build_transformer_layer(transformerlayers[i])) - self.embed_dims = self.layers[0].embed_dims - self.pre_norm = self.layers[0].pre_norm - - def forward(self, - query, - key, - value, - query_pos=None, - key_pos=None, - attn_masks=None, - query_key_padding_mask=None, - key_padding_mask=None, - **kwargs): - """Forward function for `TransformerCoder`. - - Args: - query (Tensor): Input query with shape - `(num_queries, bs, embed_dims)`. - key (Tensor): The key tensor with shape - `(num_keys, bs, embed_dims)`. - value (Tensor): The value tensor with shape - `(num_keys, bs, embed_dims)`. - query_pos (Tensor): The positional encoding for `query`. - Default: None. - key_pos (Tensor): The positional encoding for `key`. - Default: None. - attn_masks (List[Tensor], optional): Each element is 2D Tensor - which is used in calculation of corresponding attention in - operation_order. Default: None. - query_key_padding_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_queries]. Only used in self-attention - Default: None. - key_padding_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_keys]. Default: None. - - Returns: - Tensor: results with shape [num_queries, bs, embed_dims]. - """ - for layer in self.layers: - query = layer( - query, - key, - value, - query_pos=query_pos, - key_pos=key_pos, - attn_masks=attn_masks, - query_key_padding_mask=query_key_padding_mask, - key_padding_mask=key_padding_mask, - **kwargs) - return query - - -@ATTENTION.register_module() -class MultiheadAttention(BaseModule): - """A wrapper for ``torch.nn.MultiheadAttention``. - - This module implements MultiheadAttention with identity connection, - and positional encoding is also passed as input. - - Args: - embed_dims (int): The embedding dimension. - num_heads (int): Parallel attention heads. - attn_drop (float): A Dropout layer on attn_output_weights. - Default: 0.0. - proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. - Default: 0.0. - dropout_layer (obj:`ConfigDict`): The dropout_layer used - when adding the shortcut. - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. - batch_first (bool): When it is True, Key, Query and Value are shape of - (batch, n, embed_dim), otherwise (n, batch, embed_dim). - Default to False. - """ - - def __init__(self, - embed_dims, - num_heads, - attn_drop=0., - proj_drop=0., - dropout_layer=dict(type='Dropout', drop_prob=0.), - init_cfg=None, - batch_first=False, - **kwargs): - super().__init__(init_cfg) - if 'dropout' in kwargs: - warnings.warn( - 'The arguments `dropout` in MultiheadAttention ' - 'has been deprecated, now you can separately ' - 'set `attn_drop`(float), proj_drop(float), ' - 'and `dropout_layer`(dict) ', DeprecationWarning) - attn_drop = kwargs['dropout'] - dropout_layer['drop_prob'] = kwargs.pop('dropout') - - self.embed_dims = embed_dims - self.num_heads = num_heads - self.batch_first = batch_first - - self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop, - **kwargs) - - self.proj_drop = nn.Dropout(proj_drop) - self.dropout_layer = build_dropout( - dropout_layer) if dropout_layer else nn.Identity() - - def forward(self, - query, - key=None, - value=None, - identity=None, - query_pos=None, - key_pos=None, - attn_mask=None, - key_padding_mask=None, - **kwargs): - """Forward function for `MultiheadAttention`. - - **kwargs allow passing a more general data flow when combining - with other operations in `transformerlayer`. - - Args: - query (Tensor): The input query with shape [num_queries, bs, - embed_dims] if self.batch_first is False, else - [bs, num_queries embed_dims]. - key (Tensor): The key tensor with shape [num_keys, bs, - embed_dims] if self.batch_first is False, else - [bs, num_keys, embed_dims] . - If None, the ``query`` will be used. Defaults to None. - value (Tensor): The value tensor with same shape as `key`. - Same in `nn.MultiheadAttention.forward`. Defaults to None. - If None, the `key` will be used. - identity (Tensor): This tensor, with the same shape as x, - will be used for the identity link. - If None, `x` will be used. Defaults to None. - query_pos (Tensor): The positional encoding for query, with - the same shape as `x`. If not None, it will - be added to `x` before forward function. Defaults to None. - key_pos (Tensor): The positional encoding for `key`, with the - same shape as `key`. Defaults to None. If not None, it will - be added to `key` before forward function. If None, and - `query_pos` has the same shape as `key`, then `query_pos` - will be used for `key_pos`. Defaults to None. - attn_mask (Tensor): ByteTensor mask with shape [num_queries, - num_keys]. Same in `nn.MultiheadAttention.forward`. - Defaults to None. - key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. - Defaults to None. - - Returns: - Tensor: forwarded results with shape - [num_queries, bs, embed_dims] - if self.batch_first is False, else - [bs, num_queries embed_dims]. - """ - - if key is None: - key = query - if value is None: - value = key - if identity is None: - identity = query - if key_pos is None: - if query_pos is not None: - # use query_pos if key_pos is not available - if query_pos.shape == key.shape: - key_pos = query_pos - else: - warnings.warn(f'position encoding of key is' - f'missing in {self.__class__.__name__}.') - if query_pos is not None: - query = query + query_pos - if key_pos is not None: - key = key + key_pos - - # Because the dataflow('key', 'query', 'value') of - # ``torch.nn.MultiheadAttention`` is (num_query, batch, - # embed_dims), We should adjust the shape of dataflow from - # batch_first (batch, num_query, embed_dims) to num_query_first - # (num_query ,batch, embed_dims), and recover ``attn_output`` - # from num_query_first to batch_first. - if self.batch_first: - query = query.transpose(0, 1) - key = key.transpose(0, 1) - value = value.transpose(0, 1) - - out = self.attn( - query=query, - key=key, - value=value, - attn_mask=attn_mask, - key_padding_mask=key_padding_mask)[0] - - if self.batch_first: - out = out.transpose(0, 1) - - return identity + self.dropout_layer(self.proj_drop(out)) From 99ee278867e281e4e4c767c3b19f56ceebcaac39 Mon Sep 17 00:00:00 2001 From: Jiabei-prog <79827906+Jiabei-prog@users.noreply.github.com> Date: Fri, 4 Nov 2022 16:14:05 +0800 Subject: [PATCH 5/9] Fixed #214 --- easycv/models/utils/transformer.py | 544 +++++++++++++++++++++++++++++ 1 file changed, 544 insertions(+) diff --git a/easycv/models/utils/transformer.py b/easycv/models/utils/transformer.py index dd96065f..535f0425 100644 --- a/easycv/models/utils/transformer.py +++ b/easycv/models/utils/transformer.py @@ -1,12 +1,24 @@ import copy +import warnings from typing import Optional import torch import torch.nn as nn import torch.nn.functional as F +from mmcv.cnn.bricks import Linear +from mmcv.cnn.bricks.drop import build_dropout +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential +from mmcv.utils import ConfigDict from torch import Tensor from easycv.framework.errors import RuntimeError +from easycv.models.builder import (build_attention, build_feedforward_network, + build_transformer_layer) +from easycv.models.registry import (ATTENTION, FEEDFORWARD_NETWORK, + TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE) +from easycv.models.utils.activation import build_activation_layer +from easycv.models.utils.norm import build_norm_layer class MLP(nn.Module): @@ -215,3 +227,535 @@ def _get_activation_fn(activation): if activation == 'selu': return F.selu raise RuntimeError(F'activation should be relu/gelu, not {activation}.') + + +@FEEDFORWARD_NETWORK.register_module() +class FFN(BaseModule): + """Implements feed-forward networks (FFNs) with identity connection. + + Args: + embed_dims (int): The feature dimension. Same as + `MultiheadAttention`. Defaults: 256. + feedforward_channels (int): The hidden dimension of FFNs. + Defaults: 1024. + num_fcs (int, optional): The number of fully-connected layers in + FFNs. Default: 2. + act_cfg (dict, optional): The activation config for FFNs. + Default: dict(type='ReLU') + ffn_drop (float, optional): Probability of an element to be + zeroed in FFN. Default 0.0. + add_identity (bool, optional): Whether to add the + identity connection. Default: `True`. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0., + dropout_layer=None, + add_identity=True, + init_cfg=None, + **kwargs): + super().__init__(init_cfg) + assert num_fcs >= 2, 'num_fcs should be no less ' \ + f'than 2. got {num_fcs}.' + self.embed_dims = embed_dims + self.feedforward_channels = feedforward_channels + self.num_fcs = num_fcs + self.act_cfg = act_cfg + self.activate = build_activation_layer(act_cfg) + + layers = [] + in_channels = embed_dims + for _ in range(num_fcs - 1): + layers.append( + Sequential( + Linear(in_channels, feedforward_channels), self.activate, + nn.Dropout(ffn_drop))) + in_channels = feedforward_channels + layers.append(Linear(feedforward_channels, embed_dims)) + layers.append(nn.Dropout(ffn_drop)) + self.layers = Sequential(*layers) + self.dropout_layer = build_dropout( + dropout_layer) if dropout_layer else torch.nn.Identity() + self.add_identity = add_identity + + def forward(self, x, identity=None): + """Forward function for `FFN`. + + The function would add x to the output tensor if residue is None. + """ + out = self.layers(x) + if not self.add_identity: + return self.dropout_layer(out) + if identity is None: + identity = x + return identity + self.dropout_layer(out) + + +@TRANSFORMER_LAYER.register_module() +class BaseTransformerLayer(BaseModule): + """Base `TransformerLayer` for vision transformer. + + It can be built from `mmcv.ConfigDict` and support more flexible + customization, for example, using any number of `FFN or LN ` and + use different kinds of `attention` by specifying a list of `ConfigDict` + named `attn_cfgs`. It is worth mentioning that it supports `prenorm` + when you specifying `norm` as the first element of `operation_order`. + More details about the `prenorm`: `On Layer Normalization in the + Transformer Architecture `_ . + + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for `self_attention` or `cross_attention` modules, + The order of the configs in the list should be consistent with + corresponding attentions in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. Default: None. + ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for FFN, The order of the configs in the list should be + consistent with corresponding ffn in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Support `prenorm` when you specifying first element as `norm`. + Default:None. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): Key, Query and Value are shape + of (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + """ + + def __init__(self, + attn_cfgs=None, + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0., + act_cfg=dict(type='ReLU', inplace=True), + ), + operation_order=None, + norm_cfg=dict(type='LN'), + init_cfg=None, + batch_first=False, + **kwargs): + + deprecated_args = dict( + feedforward_channels='feedforward_channels', + ffn_dropout='ffn_drop', + ffn_num_fcs='num_fcs') + for ori_name, new_name in deprecated_args.items(): + if ori_name in kwargs: + warnings.warn( + f'The arguments `{ori_name}` in BaseTransformerLayer ' + f'has been deprecated, now you should set `{new_name}` ' + f'and other FFN related arguments ' + f'to a dict named `ffn_cfgs`. ', DeprecationWarning) + ffn_cfgs[new_name] = kwargs[ori_name] + + super().__init__(init_cfg) + + self.batch_first = batch_first + + assert set(operation_order) & { + 'self_attn', 'norm', 'ffn', 'cross_attn'} == \ + set(operation_order), f'The operation_order of' \ + f' {self.__class__.__name__} should ' \ + f'contains all four operation type ' \ + f"{['self_attn', 'norm', 'ffn', 'cross_attn']}" + + num_attn = operation_order.count('self_attn') + operation_order.count( + 'cross_attn') + if isinstance(attn_cfgs, dict): + attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] + else: + assert num_attn == len(attn_cfgs), f'The length ' \ + f'of attn_cfg {num_attn} is ' \ + f'not consistent with the number of attention' \ + f'in operation_order {operation_order}.' + + self.num_attn = num_attn + self.operation_order = operation_order + self.norm_cfg = norm_cfg + self.pre_norm = operation_order[0] == 'norm' + self.attentions = ModuleList() + + index = 0 + for operation_name in operation_order: + if operation_name in ['self_attn', 'cross_attn']: + if 'batch_first' in attn_cfgs[index]: + assert self.batch_first == attn_cfgs[index]['batch_first'] + else: + attn_cfgs[index]['batch_first'] = self.batch_first + attention = build_attention(attn_cfgs[index]) + # Some custom attentions used as `self_attn` + # or `cross_attn` can have different behavior. + attention.operation_name = operation_name + self.attentions.append(attention) + index += 1 + + self.embed_dims = self.attentions[0].embed_dims + + self.ffns = ModuleList() + num_ffns = operation_order.count('ffn') + if isinstance(ffn_cfgs, dict): + ffn_cfgs = ConfigDict(ffn_cfgs) + if isinstance(ffn_cfgs, dict): + ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] + assert len(ffn_cfgs) == num_ffns + for ffn_index in range(num_ffns): + if 'embed_dims' not in ffn_cfgs[ffn_index]: + ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims + else: + assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims + self.ffns.append( + build_feedforward_network(ffn_cfgs[ffn_index], + dict(type='FFN'))) + + self.norms = ModuleList() + num_norms = operation_order.count('norm') + for _ in range(num_norms): + self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) + + def forward(self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + + **kwargs contains some specific arguments of attentions. + + Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. + """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + if layer == 'self_attn': + temp_key = temp_value = query + query = self.attentions[attn_index]( + query, + temp_key, + temp_value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class TransformerLayerSequence(BaseModule): + """Base class for TransformerEncoder and TransformerDecoder in vision + transformer. + + As base-class of Encoder and Decoder in vision transformer. + Support customization such as specifying different kind + of `transformer_layer` in `transformer_coder`. + + Args: + transformerlayer (list[obj:`mmcv.ConfigDict`] | + obj:`mmcv.ConfigDict`): Config of transformerlayer + in TransformerCoder. If it is obj:`mmcv.ConfigDict`, + it would be repeated `num_layer` times to a + list[`mmcv.ConfigDict`]. Default: None. + num_layers (int): The number of `TransformerLayer`. Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None): + super().__init__(init_cfg) + if isinstance(transformerlayers, dict): + transformerlayers = [ + copy.deepcopy(transformerlayers) for _ in range(num_layers) + ] + else: + assert isinstance(transformerlayers, list) and \ + len(transformerlayers) == num_layers + self.num_layers = num_layers + self.layers = ModuleList() + for i in range(num_layers): + self.layers.append(build_transformer_layer(transformerlayers[i])) + self.embed_dims = self.layers[0].embed_dims + self.pre_norm = self.layers[0].pre_norm + + def forward(self, + query, + key, + value, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `TransformerCoder`. + + Args: + query (Tensor): Input query with shape + `(num_queries, bs, embed_dims)`. + key (Tensor): The key tensor with shape + `(num_keys, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_keys, bs, embed_dims)`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor], optional): Each element is 2D Tensor + which is used in calculation of corresponding attention in + operation_order. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in self-attention + Default: None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + + Returns: + Tensor: results with shape [num_queries, bs, embed_dims]. + """ + for layer in self.layers: + query = layer( + query, + key, + value, + query_pos=query_pos, + key_pos=key_pos, + attn_masks=attn_masks, + query_key_padding_mask=query_key_padding_mask, + key_padding_mask=key_padding_mask, + **kwargs) + return query + + +@ATTENTION.register_module() +class MultiheadAttention(BaseModule): + """A wrapper for ``torch.nn.MultiheadAttention``. + + This module implements MultiheadAttention with identity connection, + and positional encoding is also passed as input. + + Args: + embed_dims (int): The embedding dimension. + num_heads (int): Parallel attention heads. + attn_drop (float): A Dropout layer on attn_output_weights. + Default: 0.0. + proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. + Default: 0.0. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): When it is True, Key, Query and Value are shape of + (batch, n, embed_dim), otherwise (n, batch, embed_dim). + Default to False. + """ + + def __init__(self, + embed_dims, + num_heads, + attn_drop=0., + proj_drop=0., + dropout_layer=dict(type='Dropout', drop_prob=0.), + init_cfg=None, + batch_first=False, + **kwargs): + super().__init__(init_cfg) + if 'dropout' in kwargs: + warnings.warn( + 'The arguments `dropout` in MultiheadAttention ' + 'has been deprecated, now you can separately ' + 'set `attn_drop`(float), proj_drop(float), ' + 'and `dropout_layer`(dict) ', DeprecationWarning) + attn_drop = kwargs['dropout'] + dropout_layer['drop_prob'] = kwargs.pop('dropout') + + self.embed_dims = embed_dims + self.num_heads = num_heads + self.batch_first = batch_first + + self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop, + **kwargs) + + self.proj_drop = nn.Dropout(proj_drop) + self.dropout_layer = build_dropout( + dropout_layer) if dropout_layer else nn.Identity() + + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_pos=None, + attn_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `MultiheadAttention`. + + **kwargs allow passing a more general data flow when combining + with other operations in `transformerlayer`. + + Args: + query (Tensor): The input query with shape [num_queries, bs, + embed_dims] if self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + If None, the ``query`` will be used. Defaults to None. + value (Tensor): The value tensor with same shape as `key`. + Same in `nn.MultiheadAttention.forward`. Defaults to None. + If None, the `key` will be used. + identity (Tensor): This tensor, with the same shape as x, + will be used for the identity link. + If None, `x` will be used. Defaults to None. + query_pos (Tensor): The positional encoding for query, with + the same shape as `x`. If not None, it will + be added to `x` before forward function. Defaults to None. + key_pos (Tensor): The positional encoding for `key`, with the + same shape as `key`. Defaults to None. If not None, it will + be added to `key` before forward function. If None, and + `query_pos` has the same shape as `key`, then `query_pos` + will be used for `key_pos`. Defaults to None. + attn_mask (Tensor): ByteTensor mask with shape [num_queries, + num_keys]. Same in `nn.MultiheadAttention.forward`. + Defaults to None. + key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. + Defaults to None. + + Returns: + Tensor: forwarded results with shape + [num_queries, bs, embed_dims] + if self.batch_first is False, else + [bs, num_queries embed_dims]. + """ + + if key is None: + key = query + if value is None: + value = key + if identity is None: + identity = query + if key_pos is None: + if query_pos is not None: + # use query_pos if key_pos is not available + if query_pos.shape == key.shape: + key_pos = query_pos + else: + warnings.warn(f'position encoding of key is' + f'missing in {self.__class__.__name__}.') + if query_pos is not None: + query = query + query_pos + if key_pos is not None: + key = key + key_pos + + # Because the dataflow('key', 'query', 'value') of + # ``torch.nn.MultiheadAttention`` is (num_query, batch, + # embed_dims), We should adjust the shape of dataflow from + # batch_first (batch, num_query, embed_dims) to num_query_first + # (num_query ,batch, embed_dims), and recover ``attn_output`` + # from num_query_first to batch_first. + if self.batch_first: + query = query.transpose(0, 1) + key = key.transpose(0, 1) + value = value.transpose(0, 1) + + out = self.attn( + query=query, + key=key, + value=value, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask)[0] + + if self.batch_first: + out = out.transpose(0, 1) + + return identity + self.dropout_layer(self.proj_drop(out)) From 4111f149e15583657c3f2244e057d0f3d1a9dac2 Mon Sep 17 00:00:00 2001 From: Jiabei-prog <79827906+Jiabei-prog@users.noreply.github.com> Date: Mon, 7 Nov 2022 15:51:58 +0800 Subject: [PATCH 6/9] add unittest file --- tests/models/backbones/test_edgevit.py | 36 ++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 tests/models/backbones/test_edgevit.py diff --git a/tests/models/backbones/test_edgevit.py b/tests/models/backbones/test_edgevit.py new file mode 100644 index 00000000..4978dbaa --- /dev/null +++ b/tests/models/backbones/test_edgevit.py @@ -0,0 +1,36 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +import torch + +from easycv.models.backbones import EdgeVit + + +class EdgeVitTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + + def test_vitdet(self): + model = EdgeVit( + img_size=224, + depth=[1, 1, 3, 2], + embed_dim=[36, 72, 144, 288], + head_dim=36, + mlp_ratio=[4] * 4, + qkv_bias=True, + num_classes=1000, + drop_path_rate=0.1, + sr_ratios=[4, 2, 2, 1], + ) + + model.init_weights() + model.train() + imgs = torch.rand(36, 3, 224, 224) + feat = model(imgs) + self.assertEqual(len(feat), 1) + self.assertEqual(feat[0].shape, torch.Size([36, 288, 7, 7])) + + +if __name__ == '__main__': + unittest.main() From 89f2735267dd10535b9c52e8fe9f0d8fd7ccd667 Mon Sep 17 00:00:00 2001 From: Jiabei-prog <79827906+Jiabei-prog@users.noreply.github.com> Date: Tue, 15 Nov 2022 11:06:01 +0800 Subject: [PATCH 7/9] add benchmark data for edgevit_xs and edgevit_s --- .../imagenet/edgevit/imagenet_edgeVIT_s_jpg.py | 18 ++++++++++++++---- docs/source/model_zoo_cls.md | 4 +++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py index 42650c36..d5997454 100644 --- a/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py +++ b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py @@ -2,6 +2,16 @@ # model settings model = dict( type='Classification', + train_preprocess=['mixUp'], + mixup_cfg=dict( + mixup_alpha=0.8, + cutmix_alpha=1.0, + cutmix_minmax=None, + prob=1.0, + switch_prob=0.5, + mode='batch', + label_smoothing=0.1, + num_classes=1000), backbone=dict( type='EdgeVit', depth=[1, 2, 5, 3], @@ -16,11 +26,11 @@ type='ClsHead', with_avg_pool=True, in_channels=384, - loss_config=dict( - type='CrossEntropyLossWithLabelSmooth', label_smooth=0.1), - )) + loss_config={ + 'type': 'SoftTargetCrossEntropy', + }, + with_fc=True)) -# input data settings data = dict( imgs_per_gpu=128, workers_per_gpu=10, diff --git a/docs/source/model_zoo_cls.md b/docs/source/model_zoo_cls.md index cc6a7ccd..72e791b2 100644 --- a/docs/source/model_zoo_cls.md +++ b/docs/source/model_zoo_cls.md @@ -82,6 +82,8 @@ | efficientformer_l1 | [efficientformer_l1](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/efficientformer/efficientformer_l1.py) | 80.102 | 94.934 | 1820 | 7.5 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/efficientformer/efficientformer_l1_1000d.pth) | | efficientformer_l3 | [efficientformer_l3](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/efficientformer/efficientformer_l3.py) | 82.272 | 96.028 | 2436 | 13.07 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/efficientformer/efficientformer_l3_300d.pth) | | efficientformer_l7 | [efficientformer_l7](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/efficientformer/efficientformer_l7.py) | 83.076 | 96.44 | 1622 | 18.96 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/efficientformer/efficientformer_l7_300d.pth) | -| EdgeVit_xxs_b512_224 | [EdgeVit_xxs_b512_224](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/edgevit/EdgeVit_b512x8_300e_jpg.py) | 75.18 | 92.19 | 13876 | 8.632 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/edgevit/edgexxs/ClsEvaluator_neck_top1_best.pth) | +| EdgeVit_xxs_b512_224 | [EdgeVit_xxs_b512_224](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xxs_jpg.py) | 75.18 | 92.188 | 206 | 8.67 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/edgevit/edgexxs/ClsEvaluator_neck_top1_best.pth) | +| EdgeVit_xs_b256_224 | [EdgeVit_xs_b256_224](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xs_jpg.py) | 77.624 | 93.47 | 551 | 8.04 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/edgevit/edgexs/ClsEvaluator_neck_top1_best.pth) | +| EdgeVit_s_b128_224 | [EdgeVit_s_b128_224](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py) | 80.3 | 95.302 | 576 | 13.49 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/edgevit/edges/ClsEvaluator_neck_top1_best.pth) | (ps: 通过导入官方模型得到推理结果,需要torch.__version__ >= 1.9.0,推理的输入尺寸默认为224,机器默认为V100 16G,其中gpu memory记录的是gpu peak memory) From a9bc3922badf6f5d568c34bb5eedde9930d4e86d Mon Sep 17 00:00:00 2001 From: Jiabei-prog <79827906+Jiabei-prog@users.noreply.github.com> Date: Fri, 13 Jan 2023 02:49:01 +0800 Subject: [PATCH 8/9] add pretrained model paths --- .../imagenet/edgevit/EdgeVit_b512x8_300e_jpg.py | 1 + .../imagenet/edgevit/imagenet_edgeVIT_s_jpg.py | 1 + .../imagenet/edgevit/imagenet_edgeVIT_xs_jpg.py | 1 + .../imagenet/edgevit/imagenet_edgeVIT_xxs_jpg.py | 1 + docs/source/model_zoo_cls.md | 6 +++--- easycv/models/backbones/edgevit.py | 10 ++++++++-- easycv/models/modelzoo.py | 9 +++++++++ 7 files changed, 24 insertions(+), 5 deletions(-) diff --git a/configs/classification/imagenet/edgevit/EdgeVit_b512x8_300e_jpg.py b/configs/classification/imagenet/edgevit/EdgeVit_b512x8_300e_jpg.py index d9bd6647..4bc4e156 100644 --- a/configs/classification/imagenet/edgevit/EdgeVit_b512x8_300e_jpg.py +++ b/configs/classification/imagenet/edgevit/EdgeVit_b512x8_300e_jpg.py @@ -15,6 +15,7 @@ head_dim=36, mlp_ratio=[4] * 4, qkv_bias=True, + model_size='xxs', num_classes=1000, drop_path_rate=0.1, sr_ratios=[4, 2, 2, 1]), diff --git a/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py index d5997454..6c8d33c5 100644 --- a/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py +++ b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py @@ -19,6 +19,7 @@ head_dim=48, mlp_ratio=[4] * 4, qkv_bias=True, + model_size='s', num_classes=1000, drop_path_rate=0.1, sr_ratios=[4, 2, 2, 1]), diff --git a/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xs_jpg.py b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xs_jpg.py index c2c48b2f..2eceb50c 100644 --- a/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xs_jpg.py +++ b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xs_jpg.py @@ -8,6 +8,7 @@ head_dim=48, mlp_ratio=[4] * 4, qkv_bias=True, + model_size='xs', num_classes=1000, drop_path_rate=0.1, sr_ratios=[4, 2, 2, 1]), diff --git a/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xxs_jpg.py b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xxs_jpg.py index 74b53c69..999e6fc4 100644 --- a/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xxs_jpg.py +++ b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xxs_jpg.py @@ -9,6 +9,7 @@ head_dim=36, mlp_ratio=[4] * 4, qkv_bias=True, + model_size='xxs', num_classes=1000, drop_path_rate=0.1, sr_ratios=[4, 2, 2, 1]), diff --git a/docs/source/model_zoo_cls.md b/docs/source/model_zoo_cls.md index 72e791b2..79a4168f 100644 --- a/docs/source/model_zoo_cls.md +++ b/docs/source/model_zoo_cls.md @@ -82,8 +82,8 @@ | efficientformer_l1 | [efficientformer_l1](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/efficientformer/efficientformer_l1.py) | 80.102 | 94.934 | 1820 | 7.5 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/efficientformer/efficientformer_l1_1000d.pth) | | efficientformer_l3 | [efficientformer_l3](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/efficientformer/efficientformer_l3.py) | 82.272 | 96.028 | 2436 | 13.07 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/efficientformer/efficientformer_l3_300d.pth) | | efficientformer_l7 | [efficientformer_l7](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/efficientformer/efficientformer_l7.py) | 83.076 | 96.44 | 1622 | 18.96 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/efficientformer/efficientformer_l7_300d.pth) | -| EdgeVit_xxs_b512_224 | [EdgeVit_xxs_b512_224](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xxs_jpg.py) | 75.18 | 92.188 | 206 | 8.67 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/edgevit/edgexxs/ClsEvaluator_neck_top1_best.pth) | -| EdgeVit_xs_b256_224 | [EdgeVit_xs_b256_224](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xs_jpg.py) | 77.624 | 93.47 | 551 | 8.04 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/edgevit/edgexs/ClsEvaluator_neck_top1_best.pth) | -| EdgeVit_s_b128_224 | [EdgeVit_s_b128_224](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py) | 80.3 | 95.302 | 576 | 13.49 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/edgevit/edges/ClsEvaluator_neck_top1_best.pth) | +| EdgeVit_xxs_b512_224 | [EdgeVit_xxs_b512_224](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xxs_jpg.py) | 75.18 | 92.188 | 206 | 8.67 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/edgevit/edgexxs/edgevit_xxs.pth) | +| EdgeVit_xs_b256_224 | [EdgeVit_xs_b256_224](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xs_jpg.py) | 77.624 | 93.47 | 551 | 8.04 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/edgevit/edgexs/edgevit_xs.pth) | +| EdgeVit_s_b128_224 | [EdgeVit_s_b128_224](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py) | 80.3 | 95.302 | 576 | 13.49 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/edgevit/edges/edgevit_s.pth) | (ps: 通过导入官方模型得到推理结果,需要torch.__version__ >= 1.9.0,推理的输入尺寸默认为224,机器默认为V100 16G,其中gpu memory记录的是gpu peak memory) diff --git a/easycv/models/backbones/edgevit.py b/easycv/models/backbones/edgevit.py index f2f4499e..b99d2fea 100644 --- a/easycv/models/backbones/edgevit.py +++ b/easycv/models/backbones/edgevit.py @@ -14,6 +14,7 @@ from easycv.models.utils import ConvMlp, Mlp from easycv.utils.checkpoint import load_checkpoint from easycv.utils.logger import get_root_logger +from ..modelzoo import EdgeVit as model_urls from ..registry import BACKBONES @@ -244,6 +245,7 @@ def __init__(self, mlp_ratio=[4] * 4, qkv_bias=True, qk_scale=None, + model_size='s', representation_size=None, drop_rate=0., attn_drop_rate=0., @@ -270,6 +272,7 @@ def __init__(self, """ super().__init__() self.num_classes = num_classes + self.model_size = model_size self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) @@ -365,13 +368,16 @@ def __init__(self, self.pretrained = pretrained self.init_weights() - def init_weights(self, pretrained=None): + self.default_pretrained_model_path = model_urls.get( + self.__class__.__name__ + '_' + self.model_size, None) + + def init_weights(self): """Initialize the weights in backbone. Args: pretrained (str, optional): Path to pre-trained weights. Defaults to None. """ - pretrained = pretrained or self.pretrained + pretrained = self.pretrained def _init_weights(m): if isinstance(m, nn.Linear): diff --git a/easycv/models/modelzoo.py b/easycv/models/modelzoo.py index 58f005c4..8d92c146 100644 --- a/easycv/models/modelzoo.py +++ b/easycv/models/modelzoo.py @@ -260,3 +260,12 @@ 'dynamic_vit_large_p16': 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/timm/vit/L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_224.npz', } + +EdgeVit = { + 'EdgeVit_xxs': + 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/edgevit/edgexxs/edgevit_xxs.pth', + 'EdgeVit_xs': + 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/edgevit/edgexs/edgevit_xs.pth', + 'EdgeVit_s': + 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/edgevit/edges/edgevit_s.pth', +} From 77e1c2ca844f4a300eb1e61c1b89a1b8e368f9be Mon Sep 17 00:00:00 2001 From: Jiabei-prog <79827906+Jiabei-prog@users.noreply.github.com> Date: Mon, 16 Jan 2023 16:13:02 +0800 Subject: [PATCH 9/9] add pretrained model path --- .../edgevit/EdgeVit_b512x8_300e_jpg.py | 1 - .../edgevit/imagenet_edgeVIT_s_jpg.py | 1 - .../edgevit/imagenet_edgeVIT_xs_jpg.py | 1 - .../edgevit/imagenet_edgeVIT_xxs_jpg.py | 1 - easycv/models/backbones/edgevit.py | 28 ++++++------------- 5 files changed, 9 insertions(+), 23 deletions(-) diff --git a/configs/classification/imagenet/edgevit/EdgeVit_b512x8_300e_jpg.py b/configs/classification/imagenet/edgevit/EdgeVit_b512x8_300e_jpg.py index 4bc4e156..d9bd6647 100644 --- a/configs/classification/imagenet/edgevit/EdgeVit_b512x8_300e_jpg.py +++ b/configs/classification/imagenet/edgevit/EdgeVit_b512x8_300e_jpg.py @@ -15,7 +15,6 @@ head_dim=36, mlp_ratio=[4] * 4, qkv_bias=True, - model_size='xxs', num_classes=1000, drop_path_rate=0.1, sr_ratios=[4, 2, 2, 1]), diff --git a/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py index 6c8d33c5..d5997454 100644 --- a/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py +++ b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_s_jpg.py @@ -19,7 +19,6 @@ head_dim=48, mlp_ratio=[4] * 4, qkv_bias=True, - model_size='s', num_classes=1000, drop_path_rate=0.1, sr_ratios=[4, 2, 2, 1]), diff --git a/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xs_jpg.py b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xs_jpg.py index 2eceb50c..c2c48b2f 100644 --- a/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xs_jpg.py +++ b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xs_jpg.py @@ -8,7 +8,6 @@ head_dim=48, mlp_ratio=[4] * 4, qkv_bias=True, - model_size='xs', num_classes=1000, drop_path_rate=0.1, sr_ratios=[4, 2, 2, 1]), diff --git a/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xxs_jpg.py b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xxs_jpg.py index 999e6fc4..74b53c69 100644 --- a/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xxs_jpg.py +++ b/configs/classification/imagenet/edgevit/imagenet_edgeVIT_xxs_jpg.py @@ -9,7 +9,6 @@ head_dim=36, mlp_ratio=[4] * 4, qkv_bias=True, - model_size='xxs', num_classes=1000, drop_path_rate=0.1, sr_ratios=[4, 2, 2, 1]), diff --git a/easycv/models/backbones/edgevit.py b/easycv/models/backbones/edgevit.py index b99d2fea..405d9d3c 100644 --- a/easycv/models/backbones/edgevit.py +++ b/easycv/models/backbones/edgevit.py @@ -245,14 +245,12 @@ def __init__(self, mlp_ratio=[4] * 4, qkv_bias=True, qk_scale=None, - model_size='s', representation_size=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-8), - sr_ratios=[4, 2, 2, 1], - pretrained=None): + sr_ratios=[4, 2, 2, 1]): """ Args: depth (list): depth of each stage @@ -272,7 +270,6 @@ def __init__(self, """ super().__init__() self.num_classes = num_classes - self.model_size = model_size self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) @@ -365,19 +362,21 @@ def __init__(self, else: self.pre_logits = nn.Identity() - self.pretrained = pretrained self.init_weights() + size_dict = { + 'xxs': [1, 1, 3, 2], + 'xs': [1, 1, 3, 1], + 's': [1, 2, 5, 3] + } + self.default_pretrained_model_path = model_urls.get( - self.__class__.__name__ + '_' + self.model_size, None) + self.__class__.__name__ + '_' + + [k for k, v in size_dict.items() if v == depth][0], None) def init_weights(self): """Initialize the weights in backbone. - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. """ - pretrained = self.pretrained def _init_weights(m): if isinstance(m, nn.Linear): @@ -388,15 +387,6 @@ def _init_weights(m): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) - if isinstance(pretrained, str): - self.apply(_init_weights) - logger = get_root_logger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - self.apply(_init_weights) - else: - raise TypeError('pretrained must be a str or None') - @torch.jit.ignore def no_weight_decay(self): return {'pos_embed', 'cls_token'}