Source code for mmaction.models.backbones.uniformer

# Copyright (c) OpenMMLab. All rights reserved.
import os
from typing import Dict, List, Optional, Union

import torch
import torch.nn as nn
from mmcv.cnn.bricks import DropPath
from mmengine.logging import MMLogger
from mmengine.model import BaseModule, ModuleList
from mmengine.runner.checkpoint import _load_checkpoint
from mmengine.utils import to_2tuple

from mmaction.registry import MODELS

logger = MMLogger.get_current_instance()

MODEL_PATH = 'https://download.openmmlab.com/mmaction/v1.0/recognition'
_MODELS = {
    'uniformer_small_in1k':
    os.path.join(MODEL_PATH,
                 'uniformerv1/uniformer_small_in1k_20221219-fe0a7ae0.pth'),
    'uniformer_base_in1k':
    os.path.join(MODEL_PATH,
                 'uniformerv1/uniformer_base_in1k_20221219-82c01015.pth'),
}


def conv_3xnxn(inp: int,
               oup: int,
               kernel_size: int = 3,
               stride: int = 3,
               groups: int = 1):
    """3D convolution with kernel size of 3xnxn.

    Args:
        inp (int): Dimension of input features.
        oup (int): Dimension of output features.
        kernel_size (int): The spatial kernel size (i.e., n).
            Defaults to 3.
        stride (int): The spatial stride.
            Defaults to 3.
        groups (int): Number of groups for the convolution.
            Defaults to 1.
    """
    return nn.Conv3d(
        inp,
        oup, (3, kernel_size, kernel_size), (2, stride, stride), (1, 0, 0),
        groups=groups)
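

# Illustrative usage sketch (not part of the original module): conv_3xnxn
# always uses a temporal kernel of 3 with temporal stride 2 and padding 1, so
# it halves the frame count while downsampling space by `stride`. The helper
# name and the shapes below are assumed example values.
def _demo_conv_3xnxn():
    conv = conv_3xnxn(3, 64, kernel_size=4, stride=4)
    out = conv(torch.randn(1, 3, 16, 224, 224))  # (B, C, T, H, W)
    # temporal: (16 + 2 - 3) // 2 + 1 = 8; spatial: (224 - 4) // 4 + 1 = 56
    return out.shape  # torch.Size([1, 64, 8, 56, 56])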


def conv_1xnxn(inp: int,
               oup: int,
               kernel_size: int = 3,
               stride: int = 3,
               groups: int = 1):
    """3D convolution with kernel size of 1xnxn.

    Args:
        inp (int): Dimension of input features.
        oup (int): Dimension of output features.
        kernel_size (int): The spatial kernel size (i.e., n).
            Defaults to 3.
        stride (int): The spatial stride.
            Defaults to 3.
        groups (int): Number of groups for the convolution.
            Defaults to 1.
    """
    return nn.Conv3d(
        inp,
        oup, (1, kernel_size, kernel_size), (1, stride, stride), (0, 0, 0),
        groups=groups)


def conv_1x1x1(inp: int, oup: int, groups: int = 1):
    """3D convolution with kernel size of 1x1x1.

    Args:
        inp (int): Dimension of input features.
        oup (int): Dimension of output features.
        groups (int): Number of groups for the convolution.
            Defaults to 1.
    """
    return nn.Conv3d(inp, oup, (1, 1, 1), (1, 1, 1), (0, 0, 0), groups=groups)


def conv_3x3x3(inp: int, oup: int, groups: int = 1):
    """3D convolution with kernel size of 3x3x3.

    Args:
        inp (int): Dimension of input features.
        oup (int): Dimension of output features.
        groups (int): Number of groups for the convolution.
            Defaults to 1.
    """
    return nn.Conv3d(inp, oup, (3, 3, 3), (1, 1, 1), (1, 1, 1), groups=groups)


def conv_5x5x5(inp: int, oup: int, groups: int = 1):
    """3D convolution with kernel size of 5x5x5.

    Args:
        inp (int): Dimension of input features.
        oup (int): Dimension of output features.
        groups (int): Number of groups for the convolution.
            Defaults to 1.
    """
    return nn.Conv3d(inp, oup, (5, 5, 5), (1, 1, 1), (2, 2, 2), groups=groups)


def bn_3d(dim):
    """3D batch normalization.

    Args:
        dim (int): Dimension of input features.
    """
    return nn.BatchNorm3d(dim)
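

# Illustrative usage sketch (not part of the original module): the 1x1x1,
# 3x3x3 and 5x5x5 helpers above use stride 1 with matching padding, so they
# keep (T, H, W) unchanged and only the channel count may change. The helper
# name and the shapes are assumed example values.
def _demo_shape_preserving_convs():
    x = torch.randn(1, 64, 8, 56, 56)
    y = conv_1x1x1(64, 128)(x)            # pointwise channel projection
    z = conv_3x3x3(64, 64, groups=64)(x)  # depthwise 3x3x3
    w = conv_5x5x5(64, 64, groups=64)(x)  # depthwise 5x5x5
    # y: (1, 128, 8, 56, 56); z and w: (1, 64, 8, 56, 56)
    return y.shape, z.shape, w.shape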


class Mlp(BaseModule):
    """Multilayer perceptron.

    Args:
        in_features (int): Number of input features.
        hidden_features (int): Number of hidden features.
            Defaults to None.
        out_features (int): Number of output features.
            Defaults to None.
        drop (float): Dropout rate. Defaults to 0.0.
        init_cfg (dict, optional): Config dict for initialization.
            Defaults to None.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: int = None,
        out_features: int = None,
        drop: float = 0.,
        init_cfg: Optional[dict] = None,
    ) -> None:
        super().__init__(init_cfg=init_cfg)

        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x
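

# Illustrative usage sketch (not part of the original module): Mlp operates on
# token tensors of shape (B, N, C); when hidden_features is omitted it falls
# back to in_features. The helper name and sizes are assumed example values.
def _demo_mlp():
    mlp = Mlp(in_features=320, hidden_features=1280, drop=0.1)
    tokens = torch.randn(2, 8 * 14 * 14, 320)  # (B, N, C)
    return mlp(tokens).shape  # torch.Size([2, 1568, 320])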


class Attention(BaseModule):
    """Self-Attention.

    Args:
        dim (int): Number of input features.
        num_heads (int): Number of attention heads.
            Defaults to 8.
        qkv_bias (bool): If True, add a learnable bias to query, key, value.
            Defaults to True.
        qk_scale (float, optional): Override default qk scale of
            ``head_dim ** -0.5`` if set. Defaults to None.
        attn_drop (float): Attention dropout rate.
            Defaults to 0.0.
        proj_drop (float): Dropout rate.
            Defaults to 0.0.
        init_cfg (dict, optional): Config dict for initialization.
            Defaults to None.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = True,
        qk_scale: float = None,
        attn_drop: float = 0.,
        proj_drop: float = 0.,
        init_cfg: Optional[dict] = None,
    ) -> None:
        super().__init__(init_cfg=init_cfg)

        self.num_heads = num_heads
        head_dim = dim // num_heads
        # NOTE scale factor was wrong in my original version,
        # can set manually to be compat with prev weights
        self.scale = qk_scale or head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[
            2]  # make torchscript happy (cannot use tensor as tuple)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
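

# Illustrative usage sketch (not part of the original module): Attention
# expects flattened tokens of shape (B, N, C) with C divisible by num_heads.
# The helper name and sizes are assumed example values.
def _demo_attention():
    attn = Attention(dim=320, num_heads=5, qkv_bias=True)
    tokens = torch.randn(2, 1568, 320)  # (B, N, C)
    return attn(tokens).shape  # torch.Size([2, 1568, 320])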


class CMlp(BaseModule):
    """Multilayer perceptron via convolution.

    Args:
        in_features (int): Number of input features.
        hidden_features (int): Number of hidden features.
            Defaults to None.
        out_features (int): Number of output features.
            Defaults to None.
        drop (float): Dropout rate. Defaults to 0.0.
        init_cfg (dict, optional): Config dict for initialization.
            Defaults to None.
    """

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        drop=0.,
        init_cfg: Optional[dict] = None,
    ) -> None:
        super().__init__(init_cfg=init_cfg)

        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = conv_1x1x1(in_features, hidden_features)
        self.act = nn.GELU()
        self.fc2 = conv_1x1x1(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x
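

# Illustrative usage sketch (not part of the original module): CMlp is the
# convolutional counterpart of Mlp; it works directly on 5D feature maps of
# shape (B, C, T, H, W) through 1x1x1 convolutions. The helper name and sizes
# are assumed example values.
def _demo_cmlp():
    cmlp = CMlp(in_features=64, hidden_features=256)
    x = torch.randn(2, 64, 8, 56, 56)  # (B, C, T, H, W)
    return cmlp(x).shape  # torch.Size([2, 64, 8, 56, 56])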


class CBlock(BaseModule):
    """Convolution Block.

    Args:
        dim (int): Number of input features.
        mlp_ratio (float): Ratio of mlp hidden dimension
            to embedding dimension. Defaults to 4.
        drop (float): Dropout rate.
            Defaults to 0.0.
        drop_path (float): Stochastic depth rate.
            Defaults to 0.0.
        init_cfg (dict, optional): Config dict for initialization.
            Defaults to None.
    """

    def __init__(
        self,
        dim: int,
        mlp_ratio: float = 4.,
        drop: float = 0.,
        drop_path: float = 0.,
        init_cfg: Optional[dict] = None,
    ) -> None:
        super().__init__(init_cfg=init_cfg)

        self.pos_embed = conv_3x3x3(dim, dim, groups=dim)
        self.norm1 = bn_3d(dim)
        self.conv1 = conv_1x1x1(dim, dim, 1)
        self.conv2 = conv_1x1x1(dim, dim, 1)
        self.attn = conv_5x5x5(dim, dim, groups=dim)
        # NOTE: drop path for stochastic depth,
        # we shall see if this is better than dropout here
        self.drop_path = DropPath(
            drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = bn_3d(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = CMlp(
            in_features=dim, hidden_features=mlp_hidden_dim, drop=drop)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.pos_embed(x)
        x = x + self.drop_path(
            self.conv2(self.attn(self.conv1(self.norm1(x)))))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x
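

# Illustrative usage sketch (not part of the original module): a CBlock keeps
# the (B, C, T, H, W) shape; its "attention" is a depthwise 5x5x5 convolution
# wrapped by two 1x1x1 projections. The helper name and sizes are assumed
# example values.
def _demo_cblock():
    block = CBlock(dim=64, mlp_ratio=4., drop_path=0.1)
    x = torch.randn(2, 64, 8, 56, 56)
    return block(x).shape  # torch.Size([2, 64, 8, 56, 56])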


class SABlock(BaseModule):
    """Self-Attention Block.

    Args:
        dim (int): Number of input features.
        num_heads (int): Number of attention heads.
        mlp_ratio (float): Ratio of mlp hidden dimension
            to embedding dimension. Defaults to 4.
        qkv_bias (bool): If True, add a learnable bias to query, key, value.
            Defaults to False.
        qk_scale (float, optional): Override default qk scale of
            ``head_dim ** -0.5`` if set. Defaults to None.
        drop (float): Dropout rate. Defaults to 0.0.
        attn_drop (float): Attention dropout rate. Defaults to 0.0.
        drop_path (float): Stochastic depth rate.
            Defaults to 0.0.
        init_cfg (dict, optional): Config dict for initialization.
            Defaults to None.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.,
        qkv_bias: bool = False,
        qk_scale: float = None,
        drop: float = 0.,
        attn_drop: float = 0.,
        drop_path: float = 0.,
        init_cfg: Optional[dict] = None,
    ) -> None:
        super().__init__(init_cfg=init_cfg)

        self.pos_embed = conv_3x3x3(dim, dim, groups=dim)
        self.norm1 = nn.LayerNorm(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop)
        # NOTE: drop path for stochastic depth,
        # we shall see if this is better than dropout here
        self.drop_path = DropPath(
            drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = nn.LayerNorm(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(
            in_features=dim, hidden_features=mlp_hidden_dim, drop=drop)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.pos_embed(x)
        B, C, T, H, W = x.shape
        x = x.flatten(2).transpose(1, 2)
        x = x + self.drop_path(self.attn(self.norm1(x)))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        x = x.transpose(1, 2).reshape(B, C, T, H, W)
        return x
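

# Illustrative usage sketch (not part of the original module): SABlock
# flattens the spatiotemporal grid into N = T * H * W tokens for global
# self-attention and then reshapes back to (B, C, T, H, W); dim must be
# divisible by num_heads. The helper name and sizes are assumed example
# values.
def _demo_sablock():
    block = SABlock(dim=320, num_heads=5, qkv_bias=True)
    x = torch.randn(2, 320, 8, 14, 14)
    return block(x).shape  # torch.Size([2, 320, 8, 14, 14])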


class SpeicalPatchEmbed(BaseModule):
    """Image to Patch Embedding.

    Adds extra temporal downsampling by using a temporal kernel of size 3
    with stride 2.

    Args:
        img_size (int): Size of the input image.
            Defaults to 224.
        patch_size (int): Spatial patch size.
            Defaults to 16.
        in_chans (int): Number of input channels.
            Defaults to 3.
        embed_dim (int): Dimension of the patch embedding.
            Defaults to 768.
        init_cfg (dict, optional): Config dict for initialization.
            Defaults to None.
    """

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        init_cfg: Optional[dict] = None,
    ) -> None:
        super().__init__(init_cfg=init_cfg)

        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        num_patches = (img_size[1] // patch_size[1]) * (
            img_size[0] // patch_size[0])
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches
        self.norm = nn.LayerNorm(embed_dim)
        self.proj = conv_3xnxn(
            in_chans,
            embed_dim,
            kernel_size=patch_size[0],
            stride=patch_size[0])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.proj(x)
        B, _, T, H, W = x.shape
        x = x.flatten(2).transpose(1, 2)
        x = self.norm(x)
        x = x.reshape(B, T, H, W, -1).permute(0, 4, 1, 2, 3).contiguous()
        return x
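

# Illustrative usage sketch (not part of the original module): the stem
# embedding halves the temporal length (temporal kernel 3, stride 2) while
# dividing the spatial size by patch_size. The helper name and sizes are
# assumed example values.
def _demo_special_patch_embed():
    embed = SpeicalPatchEmbed(
        img_size=224, patch_size=4, in_chans=3, embed_dim=64)
    clip = torch.randn(1, 3, 16, 224, 224)  # (B, C, T, H, W)
    return embed(clip).shape  # torch.Size([1, 64, 8, 56, 56])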


class PatchEmbed(BaseModule):
    """Image to Patch Embedding.

    Args:
        img_size (int): Size of the input image.
            Defaults to 224.
        patch_size (int): Spatial patch size.
            Defaults to 16.
        in_chans (int): Number of input channels.
            Defaults to 3.
        embed_dim (int): Dimension of the patch embedding.
            Defaults to 768.
        init_cfg (dict, optional): Config dict for initialization.
            Defaults to None.
    """

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        init_cfg: Optional[dict] = None,
    ) -> None:
        super().__init__(init_cfg=init_cfg)

        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        num_patches = (img_size[1] // patch_size[1]) * (
            img_size[0] // patch_size[0])
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches
        self.norm = nn.LayerNorm(embed_dim)
        self.proj = conv_1xnxn(
            in_chans,
            embed_dim,
            kernel_size=patch_size[0],
            stride=patch_size[0])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.proj(x)
        B, _, T, H, W = x.shape
        x = x.flatten(2).transpose(1, 2)
        x = self.norm(x)
        x = x.reshape(B, T, H, W, -1).permute(0, 4, 1, 2, 3).contiguous()
        return x
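

# Illustrative usage sketch (not part of the original module): PatchEmbed
# keeps the temporal length and downsamples space by patch_size, which is how
# stages 2-4 shrink the spatial grid. The helper name and sizes are assumed
# example values.
def _demo_patch_embed():
    embed = PatchEmbed(img_size=56, patch_size=2, in_chans=64, embed_dim=128)
    x = torch.randn(1, 64, 8, 56, 56)
    return embed(x).shape  # torch.Size([1, 128, 8, 28, 28])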


@MODELS.register_module()
class UniFormer(BaseModule):
    """UniFormer.

    A PyTorch implementation of: `UniFormer: Unified Transformer
    for Efficient Spatiotemporal Representation Learning
    <https://arxiv.org/abs/2201.04676>`

    Args:
        depth (List[int]): List of block depths for each stage.
            Defaults to [5, 8, 20, 7].
        img_size (int): Size of the input image.
            Defaults to 224.
        in_chans (int): Number of input channels.
            Defaults to 3.
        head_dim (int): Dimension of attention head.
            Defaults to 64.
        embed_dim (List[int]): List of embedding dimensions for each stage.
            Defaults to [64, 128, 320, 512].
        mlp_ratio (float): Ratio of mlp hidden dimension
            to embedding dimension. Defaults to 4.
        qkv_bias (bool): If True, add a learnable bias to query, key, value.
            Defaults to True.
        qk_scale (float, optional): Override default qk scale of
            ``head_dim ** -0.5`` if set. Defaults to None.
        drop_rate (float): Dropout rate. Defaults to 0.0.
        attn_drop_rate (float): Attention dropout rate. Defaults to 0.0.
        drop_path_rate (float): Stochastic depth rate. Defaults to 0.0.
        pretrained2d (bool): Whether to load pretrained from 2D model.
            Defaults to True.
        pretrained (str): Name of pretrained model. Defaults to None.
        init_cfg (dict or list[dict]): Initialization config dict. Defaults to
            ``[
            dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
            dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
            ]``.
    """

    def __init__(
        self,
        depth: List[int] = [5, 8, 20, 7],
        img_size: int = 224,
        in_chans: int = 3,
        embed_dim: List[int] = [64, 128, 320, 512],
        head_dim: int = 64,
        mlp_ratio: float = 4.,
        qkv_bias: bool = True,
        qk_scale: float = None,
        drop_rate: float = 0.,
        attn_drop_rate: float = 0.,
        drop_path_rate: float = 0.,
        pretrained2d: bool = True,
        pretrained: Optional[str] = None,
        init_cfg: Optional[Union[Dict, List[Dict]]] = [
            dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
            dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
        ]
    ) -> None:
        super().__init__(init_cfg=init_cfg)

        self.pretrained = pretrained
        self.pretrained2d = pretrained2d
        self.patch_embed1 = SpeicalPatchEmbed(
            img_size=img_size,
            patch_size=4,
            in_chans=in_chans,
            embed_dim=embed_dim[0])
        self.patch_embed2 = PatchEmbed(
            img_size=img_size // 4,
            patch_size=2,
            in_chans=embed_dim[0],
            embed_dim=embed_dim[1])
        self.patch_embed3 = PatchEmbed(
            img_size=img_size // 8,
            patch_size=2,
            in_chans=embed_dim[1],
            embed_dim=embed_dim[2])
        self.patch_embed4 = PatchEmbed(
            img_size=img_size // 16,
            patch_size=2,
            in_chans=embed_dim[2],
            embed_dim=embed_dim[3])

        self.pos_drop = nn.Dropout(p=drop_rate)
        dpr = [
            x.item() for x in torch.linspace(0, drop_path_rate, sum(depth))
        ]  # stochastic depth decay rule
        num_heads = [dim // head_dim for dim in embed_dim]
        self.blocks1 = ModuleList([
            CBlock(
                dim=embed_dim[0],
                mlp_ratio=mlp_ratio,
                drop=drop_rate,
                drop_path=dpr[i]) for i in range(depth[0])
        ])
        self.blocks2 = ModuleList([
            CBlock(
                dim=embed_dim[1],
                mlp_ratio=mlp_ratio,
                drop=drop_rate,
                drop_path=dpr[i + depth[0]]) for i in range(depth[1])
        ])
        self.blocks3 = ModuleList([
            SABlock(
                dim=embed_dim[2],
                num_heads=num_heads[2],
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i + depth[0] + depth[1]])
            for i in range(depth[2])
        ])
        self.blocks4 = ModuleList([
            SABlock(
                dim=embed_dim[3],
                num_heads=num_heads[3],
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i + depth[0] + depth[1] + depth[2]])
            for i in range(depth[3])
        ])
        self.norm = bn_3d(embed_dim[-1])

    def _inflate_weight(self,
                        weight_2d: torch.Tensor,
                        time_dim: int,
                        center: bool = True) -> torch.Tensor:
        logger.info(f'Init center: {center}')
        if center:
            weight_3d = torch.zeros(*weight_2d.shape)
            weight_3d = weight_3d.unsqueeze(2).repeat(1, 1, time_dim, 1, 1)
            middle_idx = time_dim // 2
            weight_3d[:, :, middle_idx, :, :] = weight_2d
        else:
            weight_3d = weight_2d.unsqueeze(2).repeat(1, 1, time_dim, 1, 1)
            weight_3d = weight_3d / time_dim
        return weight_3d

    def _load_pretrained(self, pretrained: str = None) -> None:
        """Load an ImageNet-1K pretrained model.

        The pretrained models are released at
        https://github.com/Sense-X/UniFormer.

        Args:
            pretrained (str): Model name of ImageNet-1K pretrained model.
                Defaults to None.
        """
        if pretrained is not None:
            model_path = _MODELS[pretrained]
            logger.info(f'Load ImageNet pretrained model from {model_path}')
            state_dict = _load_checkpoint(model_path, map_location='cpu')
            state_dict_3d = self.state_dict()
            for k in state_dict.keys():
                if k in state_dict_3d.keys(
                ) and state_dict[k].shape != state_dict_3d[k].shape:
                    if len(state_dict_3d[k].shape) <= 2:
                        logger.info(f'Ignore: {k}')
                        continue
                    logger.info(f'Inflate: {k}, {state_dict[k].shape}' +
                                f' => {state_dict_3d[k].shape}')
                    time_dim = state_dict_3d[k].shape[2]
                    state_dict[k] = self._inflate_weight(
                        state_dict[k], time_dim)
            self.load_state_dict(state_dict, strict=False)

    def init_weights(self):
        """Initialize the weights in backbone."""
        if self.pretrained2d:
            logger = MMLogger.get_current_instance()
            logger.info(f'load model from: {self.pretrained}')
            self._load_pretrained(self.pretrained)
        else:
            if self.pretrained:
                self.init_cfg = dict(
                    type='Pretrained', checkpoint=self.pretrained)
            super().init_weights()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.patch_embed1(x)
        x = self.pos_drop(x)
        for blk in self.blocks1:
            x = blk(x)
        x = self.patch_embed2(x)
        for blk in self.blocks2:
            x = blk(x)
        x = self.patch_embed3(x)
        for blk in self.blocks3:
            x = blk(x)
        x = self.patch_embed4(x)
        for blk in self.blocks4:
            x = blk(x)
        x = self.norm(x)
        return x
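

# Illustrative usage sketch (not part of the original module): build the
# backbone directly with pretrained2d=False so no checkpoint is fetched, and
# run a random clip through it. With a 16-frame 224x224 input the output is
# (B, 512, 8, 7, 7): the stem halves T, and the four stages divide the
# spatial size by 4, 2, 2 and 2. The helper name, the smaller depth and the
# input sizes are assumed example values.
def _demo_uniformer_backbone():
    model = UniFormer(
        depth=[3, 4, 8, 3],
        embed_dim=[64, 128, 320, 512],
        head_dim=64,
        pretrained2d=False)
    model.eval()
    with torch.no_grad():
        clip = torch.randn(1, 3, 16, 224, 224)  # (B, C, T, H, W)
        feat = model(clip)
    return feat.shape  # torch.Size([1, 512, 8, 7, 7])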