第一次提交Yolo项目

This commit is contained in:
lhr
2025-12-27 02:14:11 +08:00
commit 604951f9c2
33 changed files with 5891 additions and 0 deletions

View File

@@ -0,0 +1,6 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2024 Apple Inc. All rights reserved.
#

View File

@@ -0,0 +1,6 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2024 Apple Inc. All rights reserved.
#

View File

@@ -0,0 +1,330 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
# For licensing see accompanying LICENSE file.
# Copyright (C) 2024 Apple Inc. All Rights Reserved.
from __future__ import annotations
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
__all__ = ["MobileOneBlock", "reparameterize_model"]
class SEBlock(nn.Module):
    """Squeeze-and-Excitation channel gate.

    PyTorch implementation of `Squeeze-and-Excitation Networks` - https://arxiv.org/pdf/1709.01507.pdf
    """

    def __init__(self, in_channels: int, rd_ratio: float = 0.0625) -> None:
        """Build the squeeze (reduce) and excite (expand) 1x1 convolutions.

        Args:
            in_channels: Number of input channels.
            rd_ratio: Ratio applied to ``in_channels`` to get the bottleneck width (truncated with ``int``).
        """
        super().__init__()
        squeezed_channels = int(in_channels * rd_ratio)
        self.reduce = nn.Conv2d(in_channels, squeezed_channels, kernel_size=1, stride=1, bias=True)
        self.expand = nn.Conv2d(squeezed_channels, in_channels, kernel_size=1, stride=1, bias=True)

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        """Rescale every channel of ``inputs`` by a learned gate in (0, 1).

        Args:
            inputs: 4D tensor; the size is unpacked as (batch, channels, height, width).

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        channels, height, width = inputs.shape[1:]
        # Average over the whole spatial extent so each channel collapses to one value.
        pooled = F.avg_pool2d(inputs, kernel_size=[height, width])
        gate = torch.sigmoid(self.expand(F.relu(self.reduce(pooled))))
        return inputs * gate.view(-1, channels, 1, 1)
class MobileOneBlock(nn.Module):
    """MobileOne building block.

    This block has a multi-branched architecture at train-time and a plain-CNN style architecture at inference time.
    For more details, please refer to the paper: `An Improved One millisecond Mobile Backbone` -
    https://arxiv.org/pdf/2206.04040.pdf
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int | tuple[int, int],
        stride: int = 1,
        padding: int | tuple[int, int] = 0,
        dilation: int = 1,
        groups: int = 1,
        inference_mode: bool = False,
        use_se: bool = False,
        use_act: bool = True,
        use_scale_branch: bool = True,
        num_conv_branches: int = 1,
        activation: nn.Module = nn.GELU(),
    ) -> None:
        """Construct a MobileOneBlock module.

        Args:
            in_channels: Number of channels in the input.
            out_channels: Number of channels produced by the block.
            kernel_size: Size of the convolution kernel (int or ``(height, width)`` pair).
            stride: Stride size.
            padding: Zero-padding size.
            dilation: Kernel dilation factor.
            groups: Group number.
            inference_mode: If True, instantiates model in inference mode.
            use_se: Whether to apply a Squeeze-and-Excite block before the activation.
            use_act: Whether to use activation. Default: ``True``
            use_scale_branch: Whether to use scale branch. Default: ``True``
            num_conv_branches: Number of linear conv branches.
            activation: Activation module used when ``use_act`` is True. NOTE: the default instance is created once
                at import time and shared by every block that relies on it.
        """
        super().__init__()
        self.inference_mode = inference_mode
        self.groups = groups
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.kernel_size = kernel_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_conv_branches = num_conv_branches

        self.se = SEBlock(out_channels) if use_se else nn.Identity()
        self.activation = activation if use_act else nn.Identity()

        if inference_mode:
            self.reparam_conv = nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
                bias=True,
            )
        else:
            # Re-parameterizable skip connection (only valid when the block preserves shape).
            self.rbr_skip = (
                nn.BatchNorm2d(num_features=in_channels) if out_channels == in_channels and stride == 1 else None
            )

            # Re-parameterizable conv branches
            if num_conv_branches > 0:
                self.rbr_conv = nn.ModuleList(
                    [self._conv_bn(kernel_size=kernel_size, padding=padding) for _ in range(self.num_conv_branches)]
                )
            else:
                self.rbr_conv = None

            # Re-parameterizable scale branch. For tuple kernels only the first entry decides whether the
            # 1x1 branch is created; this is kept as-is so existing module structures do not change.
            self.rbr_scale = None
            first_dim = kernel_size if isinstance(kernel_size, int) else kernel_size[0]
            if (first_dim > 1) and use_scale_branch:
                self.rbr_scale = self._conv_bn(kernel_size=1, padding=0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply forward pass."""
        # Inference mode forward pass.
        if self.inference_mode:
            return self.activation(self.se(self.reparam_conv(x)))

        # Multi-branched train-time forward pass.
        # Missing branches contribute the integer 0 so the sum still works.
        identity_out = self.rbr_skip(x) if self.rbr_skip is not None else 0
        scale_out = self.rbr_scale(x) if self.rbr_scale is not None else 0

        out = scale_out + identity_out
        if self.rbr_conv is not None:
            for branch in self.rbr_conv:
                out = out + branch(x)

        return self.activation(self.se(out))

    def reparameterize(self):
        """Fuse all train-time branches into a single convolution.

        Following works like `RepVGG: Making VGG-style ConvNets Great Again` - https://arxiv.org/pdf/2101.03697.pdf,
        the multi-branched architecture used at training time is re-parameterized to obtain a plain CNN-like
        structure for inference. Calling this on a block that is already in inference mode is a no-op.
        """
        if self.inference_mode:
            return
        kernel, bias = self._get_kernel_bias()
        self.reparam_conv = nn.Conv2d(
            in_channels=self.in_channels,
            out_channels=self.out_channels,
            kernel_size=self.kernel_size,
            stride=self.stride,
            padding=self.padding,
            dilation=self.dilation,
            groups=self.groups,
            bias=True,
        )
        self.reparam_conv.weight.data = kernel
        self.reparam_conv.bias.data = bias

        # Delete un-used branches
        for para in self.parameters():
            para.detach_()
        del self.rbr_conv
        del self.rbr_scale
        if hasattr(self, "rbr_skip"):
            del self.rbr_skip

        self.inference_mode = True

    def _kernel_hw(self) -> tuple[int, int]:
        """Return the kernel size as a ``(height, width)`` pair regardless of how it was specified."""
        if isinstance(self.kernel_size, int):
            return self.kernel_size, self.kernel_size
        return self.kernel_size[0], self.kernel_size[1]

    def _get_kernel_bias(self) -> tuple[torch.Tensor, torch.Tensor]:
        """Obtain the re-parameterized kernel and bias.

        Reference: https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py#L83.

        Returns:
            Tuple of (kernel, bias) after fusing branches.
        """
        # get weights and bias of scale branch
        kernel_scale = 0
        bias_scale = 0
        if self.rbr_scale is not None:
            kernel_scale, bias_scale = self._fuse_bn_tensor(self.rbr_scale)
            # Pad the 1x1 scale kernel to the conv-branch kernel size. Padding is computed per axis so that
            # tuple kernel sizes work too; F.pad takes the last dimension (width) first.
            kernel_h, kernel_w = self._kernel_hw()
            pad_h, pad_w = kernel_h // 2, kernel_w // 2
            kernel_scale = torch.nn.functional.pad(kernel_scale, [pad_w, pad_w, pad_h, pad_h])

        # get weights and bias of skip branch
        kernel_identity = 0
        bias_identity = 0
        if self.rbr_skip is not None:
            kernel_identity, bias_identity = self._fuse_bn_tensor(self.rbr_skip)

        # get weights and bias of conv branches
        kernel_conv = 0
        bias_conv = 0
        if self.rbr_conv is not None:
            for branch in self.rbr_conv:
                branch_kernel, branch_bias = self._fuse_bn_tensor(branch)
                kernel_conv = kernel_conv + branch_kernel
                bias_conv = bias_conv + branch_bias

        kernel_final = kernel_conv + kernel_scale + kernel_identity
        bias_final = bias_conv + bias_scale + bias_identity
        return kernel_final, bias_final

    def _fuse_bn_tensor(self, branch: nn.Sequential | nn.BatchNorm2d) -> tuple[torch.Tensor, torch.Tensor]:
        """Fuse a batchnorm layer with its preceding conv layer (or with an identity kernel).

        Reference: https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py#L95.

        Args:
            branch: Either a conv-bn ``nn.Sequential`` built by ``_conv_bn`` or the skip-branch ``nn.BatchNorm2d``.

        Returns:
            Tuple of (kernel, bias) after fusing batchnorm.
        """
        if isinstance(branch, nn.Sequential):
            kernel = branch.conv.weight
            bn = branch.bn
        else:
            assert isinstance(branch, nn.BatchNorm2d)
            if not hasattr(self, "id_tensor"):
                # Build (once) a kernel that acts as the identity: a single 1 at the spatial center of the
                # matching input channel within each group. It is cached on the instance as ``id_tensor``.
                input_dim = self.in_channels // self.groups
                kernel_h, kernel_w = self._kernel_hw()
                kernel_value = torch.zeros(
                    (self.in_channels, input_dim, kernel_h, kernel_w),
                    dtype=branch.weight.dtype,
                    device=branch.weight.device,
                )
                for i in range(self.in_channels):
                    kernel_value[i, i % input_dim, kernel_h // 2, kernel_w // 2] = 1
                self.id_tensor = kernel_value
            kernel = self.id_tensor
            bn = branch
        std = (bn.running_var + bn.eps).sqrt()
        t = (bn.weight / std).reshape(-1, 1, 1, 1)
        return kernel * t, bn.bias - bn.running_mean * bn.weight / std

    def _conv_bn(self, kernel_size: int | tuple[int, int], padding: int | tuple[int, int]) -> nn.Sequential:
        """Construct a conv-batchnorm branch.

        Args:
            kernel_size: Size of the convolution kernel.
            padding: Zero-padding size.

        Returns:
            Conv-BN module with sub-modules named ``conv`` and ``bn``.
        """
        mod_list = nn.Sequential()
        mod_list.add_module(
            "conv",
            nn.Conv2d(
                in_channels=self.in_channels,
                out_channels=self.out_channels,
                kernel_size=kernel_size,
                stride=self.stride,
                padding=padding,
                # Use the block's dilation so the branches match the fused reparam_conv, which is dilated.
                dilation=self.dilation,
                groups=self.groups,
                bias=False,
            ),
        )
        mod_list.add_module("bn", nn.BatchNorm2d(num_features=self.out_channels))
        return mod_list
def reparameterize_model(model: torch.nn.Module) -> nn.Module:
    """Return an inference-ready copy of ``model`` with every re-parameterizable module fused.

    The input model is deep-copied first, so the caller's instance is left untouched. Every sub-module of the
    copy that exposes a ``reparameterize`` attribute has that method invoked.

    Args:
        model: MobileOne model in train mode.

    Returns:
        MobileOne model in inference mode.
    """
    fused_model = copy.deepcopy(model)  # never edit the original graph
    # Traverse lazily: modules removed by an earlier reparameterize() call are not visited afterwards.
    for submodule in filter(lambda m: hasattr(m, "reparameterize"), fused_model.modules()):
        submodule.reparameterize()
    return fused_model

View File

@@ -0,0 +1,410 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2024 Apple Inc. All Rights Reserved.
#
"""
Implementation of the following modules is borrowed from ml-cvnets repo:
https://github.com/apple/ml-cvnets/blob/main/cvnets/layers/multi_head_attention.py
https://github.com/apple/ml-cvnets/blob/main/cvnets/text_encoders/transformer.py.
Please see ACKNOWLEDGMENTS for license details.
"""
from __future__ import annotations
import torch
from torch import Size, Tensor, nn
from torch.nn import functional as F
from torchvision.ops import StochasticDepth
from mobileclip import logger
class LayerNormFP32(nn.LayerNorm):
    """Layer Normalization (https://arxiv.org/abs/1607.06450) that always normalizes in FP32 precision."""

    def __init__(
        self,
        normalized_shape: int | list[int] | Size,
        eps: float | None = 1e-5,
        elementwise_affine: bool | None = True,
        *args,
        **kwargs,
    ):
        """Forward all arguments to ``nn.LayerNorm``.

        Args:
            normalized_shape: Shape of the trailing dimensions to normalize over.
            eps: Value added to the denominator for numerical stability.
            elementwise_affine: Whether to learn a per-element scale and shift.
            *args: Extra positional arguments forwarded to ``nn.LayerNorm``.
            **kwargs: Extra keyword arguments forwarded to ``nn.LayerNorm``.
        """
        super().__init__(
            *args,
            normalized_shape=normalized_shape,
            eps=eps,
            elementwise_affine=elementwise_affine,
            **kwargs,
        )

    def forward(self, x: Tensor) -> Tensor:
        """Normalize ``x`` in float32 and cast the result back to the dtype of ``x``."""
        # Up-casting may help with the underflow/overflow issues typically seen in normalization layers.
        original_dtype = x.dtype
        normalized = super().forward(x.float())
        return normalized.to(original_dtype)
def get_normalization_layer(norm_type, num_features):
    """Create the normalization layer selected by ``norm_type``.

    Args:
        norm_type: Either ``"layer_norm"`` or ``"layer_norm_fp32"``.
        num_features: Size of the normalized (last) dimension.

    Returns:
        A freshly constructed ``nn.LayerNorm`` or ``LayerNormFP32`` module.

    Raises:
        NotImplementedError: If ``norm_type`` is not one of the supported options.
    """
    if norm_type == "layer_norm_fp32":
        return LayerNormFP32(num_features)
    if norm_type == "layer_norm":
        return nn.LayerNorm(num_features)
    raise NotImplementedError(f"Option: {norm_type} not supported.")
class PositionalEmbedding(nn.Module):
    """Thin wrapper that owns a positional-embedding module and delegates to it.

    Only ``LearnablePositionalEmbedding`` is wired in at the moment; ``is_learnable`` is accepted but not read.
    """

    def __init__(
        self,
        num_embeddings: int,
        embedding_dim: int,
        padding_idx: int | None = None,
        is_learnable: bool | None = False,
        interpolation_mode: str | None = "bilinear",
        *args,
        **kwargs,
    ):
        """Build the wrapped positional embedding.

        Args:
            num_embeddings: Number of positions in the embedding table.
            embedding_dim: Size of each positional vector.
            padding_idx: Optional position whose embedding is kept at zero.
            is_learnable: Currently unused.
            interpolation_mode: Interpolation mode used when the requested length differs from the table length.
            *args: Extra positional arguments forwarded to the wrapped module.
            **kwargs: Extra keyword arguments forwarded to the wrapped module.
        """
        super().__init__()
        # Other positional-embedding variants (and the logic to pick one) would be added here.
        self.pos_embed = LearnablePositionalEmbedding(
            *args,
            num_embeddings=num_embeddings,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx,
            interpolation_mode=interpolation_mode,
            **kwargs,
        )

    def forward(self, seq_len: int, *args, **kwargs) -> Tensor:
        """Return the positional embeddings for a sequence of length ``seq_len``."""
        return self.pos_embed(seq_len, *args, **kwargs)

    def __repr__(self):
        """Use the representation of the wrapped embedding module."""
        return self.pos_embed.__repr__()
class LearnablePositionalEmbedding(nn.Module):
    """Learnable positional embedding table that can be resized by interpolation."""

    def __init__(
        self,
        num_embeddings: int,
        embedding_dim: int,
        padding_idx: int | None = None,
        interpolation_mode: str | None = "bilinear",
        *args,
        **kwargs,
    ):
        """Allocate and initialize the embedding table.

        Args:
            num_embeddings: Number of positions stored in the table.
            embedding_dim: Size of each positional vector.
            padding_idx: Optional position whose embedding is forced to zero.
            interpolation_mode: Mode passed to ``F.interpolate`` when resizing the table.
            *args: Ignored.
            **kwargs: Ignored.
        """
        super().__init__()
        # Stored as [1, 1, num_embeddings, embedding_dim] so it can be fed straight to F.interpolate.
        self.pos_embed = nn.Parameter(torch.empty(1, 1, num_embeddings, embedding_dim))
        self.embedding_dim = embedding_dim
        self.num_embeddings = num_embeddings
        self.padding_idx = padding_idx
        self.interpolation_mode = interpolation_mode
        self.reset_parameters()

    def _zero_padding_row(self) -> None:
        """Zero the ``padding_idx`` row of the table in place (no-op when ``padding_idx`` is None)."""
        if self.padding_idx is None:
            return
        with torch.no_grad():
            self.pos_embed[:, :, self.padding_idx, ...] = 0.0

    def reset_parameters(self) -> None:
        """Draw the table from a truncated normal with std ``embedding_dim**-0.5`` and clear the padding row."""
        nn.init.trunc_normal_(self.pos_embed, mean=0, std=self.embedding_dim**-0.5)
        self._zero_padding_row()

    def forward(self, seq_len: int, *args, **kwargs) -> Tensor:
        """Return positional embeddings of shape ``[1, seq_len, embedding_dim]``.

        Args:
            seq_len: Requested number of positions; the table is interpolated when it differs from
                ``num_embeddings``.
            *args: Ignored.
            **kwargs: Ignored.
        """
        # The padding row is re-zeroed on every call because training updates may have moved it.
        self._zero_padding_row()
        table = self.pos_embed
        if seq_len != self.num_embeddings:
            table = F.interpolate(
                table,
                size=(seq_len, self.embedding_dim),
                mode=self.interpolation_mode,
            )
        # Callers expect [Batch, Seq_len, Embedding_dim].
        return table.reshape(1, seq_len, self.embedding_dim)

    def __repr__(self):
        """Summarize the table size and padding index."""
        return (
            f"{self.__class__.__name__}(num_embeddings={self.num_embeddings}, "
            f"embedding_dim={self.embedding_dim}, padding_idx={self.padding_idx})"
        )
class MultiHeadAttention(nn.Module):
    """Multi-head self- or cross-attention as described in `Attention is all you need
    <https://arxiv.org/abs/1706.03762>`_.

    Args:
        embed_dim (int): :math:`C_{in}` from an expected input of size :math:`(N, S, C_{in})`
        num_heads (int): Number of heads in multi-head attention
        attn_dropout (Optional[float]): Attention dropout. Default: 0.0
        bias (Optional[bool]): Use bias or not. Default: ``True``
        output_dim (Optional[int]): Output width of the final projection. Default: ``embed_dim``

    Notes:
        - Input:
            - Query tensor (x_q) :math:`(N, S, C_{in})` where :math:`N` is batch size, :math:`S` is number of
              source tokens, and :math:`C_{in}` is input embedding dim
            - Optional Key-Value tensor (x_kv) :math:`(N, T, C_{in})` where :math:`T` is number of target tokens
        - Output: :math:`(N, S, C_{out})` where :math:`C_{out}` is ``output_dim``
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        attn_dropout: float | None = 0.0,
        bias: bool | None = True,
        output_dim: int | None = None,
        *args,
        **kwargs,
    ) -> None:
        super().__init__()
        if output_dim is None:
            output_dim = embed_dim
        if embed_dim % num_heads != 0:
            logger.error(
                f"Embedding dim must be divisible by number of heads in {self.__class__.__name__}. Got: embed_dim={embed_dim} and num_heads={num_heads}"
            )

        # A single linear layer yields query, key and value stacked along the feature axis.
        self.qkv_proj = nn.Linear(in_features=embed_dim, out_features=3 * embed_dim, bias=bias)
        self.attn_dropout = nn.Dropout(p=attn_dropout)
        self.out_proj = nn.Linear(in_features=embed_dim, out_features=output_dim, bias=bias)

        self.head_dim = embed_dim // num_heads
        self.scaling = self.head_dim**-0.5
        self.softmax = nn.Softmax(dim=-1)
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.use_separate_proj_weight = embed_dim != output_dim

    def __repr__(self):
        """Summarize head size, head count and attention dropout probability."""
        return f"{self.__class__.__name__}(head_dim={self.head_dim}, num_heads={self.num_heads}, attn_dropout={self.attn_dropout.p})"

    def _forward_impl(
        self,
        x_q: Tensor,
        x_kv: Tensor | None = None,
        key_padding_mask: Tensor | None = None,
        attn_mask: Tensor | None = None,
    ) -> Tensor:
        """Compute scaled dot-product attention.

        Args:
            x_q: Query input of shape [N, S, C].
            x_kv: Optional key/value input of shape [N, T, C]; self-attention is used when it is None.
            key_padding_mask: Optional [N, T] mask; truthy positions are excluded from attention.
            attn_mask: Optional additive [N, S, T] mask.

        Returns:
            Tensor of shape [N, S, output_dim].
        """
        batch, src_len, _ = x_q.shape
        heads = self.num_heads

        if x_kv is None:
            # Self-attention: [N, S, C] --> [N, S, 3C] --> [N, S, 3, h, c] --> [N, h, 3, S, c]
            packed = self.qkv_proj(x_q).reshape(batch, src_len, 3, heads, -1)
            packed = packed.transpose(1, 3).contiguous()
            # Three [N, h, S, c] views.
            query, key, value = packed.unbind(dim=2)
        else:
            # Cross-attention: the first embed_dim rows of the fused projection give the query,
            # the remaining 2 * embed_dim rows give key and value.
            tgt_len = x_kv.shape[1]
            split = self.embed_dim
            weight = self.qkv_proj.weight
            bias = self.qkv_proj.bias
            q_bias = bias[:split] if bias is not None else None
            kv_bias = bias[split:] if bias is not None else None

            # [N, S, C] --> [N, S, h, c] --> [N, h, S, c]
            query = F.linear(x_q, weight=weight[:split, ...], bias=q_bias)
            query = query.reshape(batch, src_len, heads, self.head_dim).transpose(1, 2).contiguous()

            # [N, T, C] --> [N, T, 2C] --> [N, T, 2, h, c] --> [N, h, 2, T, c]
            kv = F.linear(x_kv, weight=weight[split:, ...], bias=kv_bias)
            kv = kv.reshape(batch, tgt_len, 2, heads, self.head_dim).transpose(1, 3).contiguous()
            key, value = kv.unbind(dim=2)

        # QK^T: [N, h, S, c] x [N, h, c, T] --> [N, h, S, T]
        scores = torch.matmul(query * self.scaling, key.transpose(-1, -2))

        n_batch, _, n_src, n_tgt = scores.shape
        if attn_mask is not None:
            # The mask is shared across heads, hence [N, S, T] --> [N, 1, S, T].
            assert list(attn_mask.shape) == [n_batch, n_src, n_tgt], (
                f"Shape of attention mask should be [{n_batch}, {n_src}, {n_tgt}]. Got: {attn_mask.shape}"
            )
            scores = scores + attn_mask.unsqueeze(1)

        if key_padding_mask is not None:
            # Do not attend to padding positions: [N, T] --> [N, 1, 1, T]
            assert key_padding_mask.dim() == 2 and list(key_padding_mask.shape) == [n_batch, n_tgt], (
                f"Key_padding_mask should be 2-dimension with shape [{n_batch}, {n_tgt}]. Got: {key_padding_mask.shape}"
            )
            blocked = key_padding_mask[:, None, None, :].to(torch.bool)
            scores = scores.masked_fill(blocked, float("-inf"))

        # Softmax runs in float32 and is cast back to the score dtype afterwards.
        weights = self.softmax(scores.float()).to(scores.dtype)
        weights = self.attn_dropout(weights)

        # Weighted sum: [N, h, S, T] x [N, h, T, c] --> [N, h, S, c] --> [N, S, h, c] --> [N, S, C]
        context = torch.matmul(weights, value).transpose(1, 2).reshape(batch, src_len, -1)
        return self.out_proj(context)

    def forward(
        self,
        x_q: Tensor,
        x_kv: Tensor | None = None,
        key_padding_mask: Tensor | None = None,
        attn_mask: Tensor | None = None,
        *args,
        **kwargs,
    ) -> Tensor:
        """Run attention on [Batch, Sequence, Hidden_dim] inputs; extra arguments are accepted and ignored."""
        return self._forward_impl(
            x_q=x_q,
            x_kv=x_kv,
            key_padding_mask=key_padding_mask,
            attn_mask=attn_mask,
        )
class TransformerEncoder(nn.Module):
    """Pre-norm `Transformer encoder <https://arxiv.org/abs/1706.03762>`_ layer.

    Args:
        embed_dim: :math:`C_{in}` from an expected input of size :math:`(N, P, C_{in})`.
        ffn_latent_dim: Inner dimension of the FFN.
        num_heads: Number of heads in multi-head attention. Default: 8.
        attn_dropout: Dropout rate for attention in multi-head attention. Default: 0.0
        dropout: Dropout rate. Default: 0.0.
        ffn_dropout: Dropout between FFN layers. Default: 0.0.
        transformer_norm_layer: Normalization layer. Default: layer_norm.
        stochastic_dropout: Stochastic dropout setting. Default: 0.0.

    Notes:
        - Input: :math:`(N, P, C_{in})` where :math:`N` is batch size, :math:`P` is number of patches,
          and :math:`C_{in}` is input embedding dim
        - Output: same shape as the input
    """

    def __init__(
        self,
        embed_dim: int,
        ffn_latent_dim: int,
        num_heads: int | None = 8,
        attn_dropout: float | None = 0.0,
        dropout: float | None = 0.0,
        ffn_dropout: float | None = 0.0,
        transformer_norm_layer: str | None = "layer_norm",
        stochastic_dropout: float | None = 0.0,
        *args,
        **kwargs,
    ) -> None:
        super().__init__()

        # Attention sub-block: norm -> multi-head attention -> dropout.
        attention = MultiHeadAttention(
            embed_dim,
            num_heads,
            attn_dropout=attn_dropout,
            bias=True,
        )
        self.pre_norm_mha = nn.Sequential(
            get_normalization_layer(norm_type=transformer_norm_layer, num_features=embed_dim),
            attention,
            nn.Dropout(p=dropout),
        )

        # Feed-forward sub-block: norm -> linear -> GELU -> dropout -> linear -> dropout.
        gelu = nn.GELU()
        self.pre_norm_ffn = nn.Sequential(
            get_normalization_layer(norm_type=transformer_norm_layer, num_features=embed_dim),
            nn.Linear(in_features=embed_dim, out_features=ffn_latent_dim, bias=True),
            gelu,
            nn.Dropout(p=ffn_dropout),
            nn.Linear(in_features=ffn_latent_dim, out_features=embed_dim, bias=True),
            nn.Dropout(p=dropout),
        )

        # Stochastic depth replaces the identity on the residual branches only when requested.
        use_stochastic_depth = stochastic_dropout > 0.0
        if use_stochastic_depth and dropout > 0.0:
            logger.error(
                "Stochastic dropout and dropout are mutually exclusive. "
                "Use either of them, but not both."
                f"Got: {stochastic_dropout} and {dropout}"
            )
        self.drop_path = StochasticDepth(p=stochastic_dropout, mode="row") if use_stochastic_depth else nn.Identity()

        # Bookkeeping that is only used by __repr__.
        self.embed_dim = embed_dim
        self.ffn_dim = ffn_latent_dim
        self.ffn_dropout = ffn_dropout
        self.stochastic_dropout = stochastic_dropout
        self.std_dropout = dropout
        self.attn_fn_name = attention.__class__.__name__
        self.act_fn_name = gelu.__class__.__name__
        self.norm_type = transformer_norm_layer

    def __repr__(self) -> str:
        """Summarize the layer configuration."""
        return (
            f"{self.__class__.__name__}(embed_dim={self.embed_dim}, ffn_dim={self.ffn_dim}, "
            f"dropout={self.std_dropout}, ffn_dropout={self.ffn_dropout}, "
            f"stochastic_dropout={self.stochastic_dropout}, attn_fn={self.attn_fn_name}, "
            f"act_fn={self.act_fn_name}, norm_fn={self.norm_type})"
        )

    def forward(
        self,
        x: Tensor,
        x_prev: Tensor | None = None,
        key_padding_mask: Tensor | None = None,
        attn_mask: Tensor | None = None,
        *args,
        **kwargs,
    ) -> Tensor:
        """Apply the attention and feed-forward sub-blocks, each with a residual connection.

        Args:
            x: Input tensor, used as the attention query.
            x_prev: Optional key/value source; self-attention is used when it is None.
            key_padding_mask: Optional key padding mask forwarded to the attention layer.
            attn_mask: Optional attention mask forwarded to the attention layer.
            *args: Extra positional arguments forwarded to the attention layer.
            **kwargs: Extra keyword arguments forwarded to the attention layer.
        """
        norm, attention, post_dropout = self.pre_norm_mha

        # Multi-head attention branch (the Sequential is unrolled because attention takes extra inputs).
        attended = attention(
            x_q=norm(x),
            x_kv=x_prev,
            key_padding_mask=key_padding_mask,
            attn_mask=attn_mask,
            *args,
            **kwargs,
        )
        x = self.drop_path(post_dropout(attended)) + x

        # Feed-forward branch.
        return x + self.drop_path(self.pre_norm_ffn(x))

View File

@@ -0,0 +1,6 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2024 Apple Inc. All rights reserved.
#

View File

@@ -0,0 +1,97 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
# For licensing see accompanying LICENSE file.
# Copyright (C) 2024 Apple Inc. All Rights Reserved.
from __future__ import annotations
import torch
import torch.nn as nn
from torch import Tensor
from mobileclip import logger
class GlobalPool(nn.Module):
    """Global pooling over the spatial dimensions of a 4D or 5D input tensor.

    Args:
        pool_type (Optional[str]): Pooling type. It can be mean, rms, or abs. Default: `mean`
        keep_dim (Optional[bool]): Do not squeeze the dimensions of a tensor. Default: `False`

    Notes:
        - Input: :math:`(N, C, H, W)` or :math:`(N, C, D, H, W)`
        - Output: :math:`(N, C, 1, 1)` or :math:`(N, C, 1, 1, 1)` if keep_dim else :math:`(N, C)`
    """

    pool_types = ["mean", "rms", "abs"]

    def __init__(self, pool_type: str | None = "mean", keep_dim: bool | None = False, *args, **kwargs) -> None:
        super().__init__()
        if pool_type not in self.pool_types:
            logger.error(f"Supported pool types are: {self.pool_types}. Got {pool_type}")
        self.pool_type = pool_type
        self.keep_dim = keep_dim

    def _global_pool(self, x: Tensor, dims: list):
        """Reduce ``x`` over ``dims`` according to ``self.pool_type``.

        Args:
            x: Input tensor.
            dims: Dimensions to reduce over.

        Returns:
            The pooled tensor; reduced dimensions are kept with size 1 only when ``keep_dim`` is set.
        """
        if self.pool_type == "rms":
            # Root mean square: sqrt(mean(x^2)). The exponent must be +0.5; -0.5 would yield the
            # reciprocal of the RMS instead.
            mean_of_squares = torch.mean(x**2, dim=dims, keepdim=self.keep_dim)
            return mean_of_squares**0.5
        if self.pool_type == "abs":
            # Mean of absolute values.
            return torch.mean(torch.abs(x), dim=dims, keepdim=self.keep_dim)
        # Default is mean, same as AdaptiveAvgPool.
        return torch.mean(x, dim=dims, keepdim=self.keep_dim)

    def forward(self, x: Tensor) -> Tensor:
        """Pool over the last two (4D input) or last three (5D input) dimensions.

        Raises:
            NotImplementedError: If ``x`` is neither 4D nor 5D.
        """
        if x.dim() == 4:
            dims = [-2, -1]
        elif x.dim() == 5:
            dims = [-3, -2, -1]
        else:
            raise NotImplementedError("Currently 2D and 3D global pooling supported")
        return self._global_pool(x, dims=dims)
class GlobalPool2D(nn.Module):
    """Global mean pooling over the spatial dimensions followed by a bias-free linear projection."""

    def __init__(self, in_dim: int, out_dim: int, *args, **kwargs) -> None:
        """Create the pooling layer and the projection matrix.

        Args:
            in_dim: Number of input channels.
            out_dim: Number of output features.
            *args: Ignored.
            **kwargs: Ignored.
        """
        super().__init__()
        self.pool = GlobalPool(pool_type="mean", keep_dim=False)
        # Projection is drawn from a normal distribution scaled by in_dim ** -0.5.
        init_scale = in_dim**-0.5
        self.proj = nn.Parameter(init_scale * torch.randn(size=(in_dim, out_dim)))
        self.in_dim = in_dim
        self.out_dim = out_dim

    def forward(self, x: Tensor, *args, **kwargs) -> Tensor:
        """Pool a 4D feature map and project it to ``out_dim`` features.

        Args:
            x: Tensor of shape [batch, in_dim, in_height, in_width].
            *args: Ignored.
            **kwargs: Ignored.

        Returns:
            Tensor of shape [batch, out_dim].
        """
        assert x.dim() == 4, f"Input should be 4-dimensional (Batch x in_dim x in_height x in_width). Got: {x.shape}"
        # [batch, in_dim, in_height, in_width] --> [batch, in_dim]
        pooled = self.pool(x)
        # [batch, in_dim] x [in_dim, out_dim] --> [batch, out_dim]
        return torch.matmul(pooled, self.proj)
class SimpleImageProjectionHead(nn.Module):
    """Bias-free linear projection head for already-pooled image features."""

    def __init__(self, in_dim: int, out_dim: int) -> None:
        """Create the projection matrix.

        Args:
            in_dim: Size of the incoming feature vector.
            out_dim: Size of the projected feature vector.
        """
        super().__init__()
        # Projection is drawn from a normal distribution scaled by in_dim ** -0.5.
        init_scale = in_dim**-0.5
        self.proj = nn.Parameter(init_scale * torch.randn(size=(in_dim, out_dim)))
        self.in_dim = in_dim
        self.out_dim = out_dim

    def forward(self, x: Tensor, *args, **kwargs) -> Tensor:
        """Project ``x`` from ``in_dim`` to ``out_dim`` features.

        Args:
            x: Tensor of shape [batch, in_dim].
            *args: Ignored.
            **kwargs: Ignored.

        Returns:
            Tensor of shape [batch, out_dim].
        """
        assert x.dim() == 2, f"Input should be 2-dimensional (Batch x in_dim). Got: {x.shape}"
        # [batch, in_dim] x [in_dim, out_dim] --> [batch, out_dim]
        return torch.matmul(x, self.proj)

View File

@@ -0,0 +1,177 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
#
# For acknowledgment see accompanying ACKNOWLEDGMENTS file.
# Copyright (C) 2024 Apple Inc. All rights reserved.
#
import torch
import torch.nn as nn
from timm.models.layers import SqueezeExcite
__all__ = ["ReparamLargeKernelConv"]
class ReparamLargeKernelConv(nn.Module):
    """Building Block of RepLKNet.

    This class defines the over-parameterized large kernel conv block introduced in `RepLKNet
    <https://arxiv.org/abs/2203.06717>`_

    Reference: https://github.com/DingXiaoH/RepLKNet-pytorch
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int,
        groups: int,
        small_kernel: int,
        inference_mode: bool = False,
        use_se: bool = False,
        activation: nn.Module = nn.GELU(),
    ) -> None:
        """Construct a ReparamLargeKernelConv module.

        Args:
            in_channels: Number of input channels.
            out_channels: Number of output channels.
            kernel_size: Kernel size of the large kernel conv branch.
            stride: Stride size.
            groups: Group number.
            small_kernel: Kernel size of the small kernel conv branch, or None to disable that branch.
            inference_mode: If True, instantiates model in inference mode. Default: ``False``
            use_se: If True, applies a squeeze-excite block before the activation. Default: ``False``
            activation: Activation module. Default: ``nn.GELU``. NOTE: the default instance is created once at
                import time and shared by every block that relies on it.
        """
        super().__init__()

        self.stride = stride
        self.groups = groups
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.activation = activation

        self.kernel_size = kernel_size
        self.small_kernel = small_kernel
        self.padding = kernel_size // 2

        # Check if SE is requested
        self.se = SqueezeExcite(out_channels, rd_ratio=0.25) if use_se else nn.Identity()

        if inference_mode:
            self.lkb_reparam = nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                stride=stride,
                padding=self.padding,
                dilation=1,
                groups=groups,
                bias=True,
            )
        else:
            self.lkb_origin = self._conv_bn(kernel_size=kernel_size, padding=self.padding)
            if small_kernel is not None:
                assert small_kernel <= kernel_size, (
                    "The kernel size for re-param cannot be larger than the large kernel!"
                )
                self.small_conv = self._conv_bn(kernel_size=small_kernel, padding=small_kernel // 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply forward pass."""
        if hasattr(self, "lkb_reparam"):
            out = self.lkb_reparam(x)
        else:
            out = self.lkb_origin(x)
            if hasattr(self, "small_conv"):
                out = out + self.small_conv(x)

        return self.activation(self.se(out))

    def get_kernel_bias(self) -> tuple[torch.Tensor, torch.Tensor]:
        """Obtain the re-parameterized kernel and bias.

        Reference: https://github.com/DingXiaoH/RepLKNet-pytorch.

        Returns:
            Tuple of (kernel, bias) after fusing branches.
        """
        eq_k, eq_b = self._fuse_bn(self.lkb_origin.conv, self.lkb_origin.bn)
        if hasattr(self, "small_conv"):
            small_k, small_b = self._fuse_bn(self.small_conv.conv, self.small_conv.bn)
            eq_b = eq_b + small_b
            # Zero-pad the small kernel on all four sides so it sits at the center of the large kernel.
            border = (self.kernel_size - self.small_kernel) // 2
            eq_k = eq_k + nn.functional.pad(small_k, [border] * 4)
        return eq_k, eq_b

    def reparameterize(self) -> None:
        """Fuse the large- and small-kernel branches into a single convolution.

        Following works like `RepVGG: Making VGG-style ConvNets Great Again` - https://arxiv.org/pdf/2101.03697.pdf,
        the multi-branched architecture used at training time is re-parameterized to obtain a plain CNN-like
        structure for inference. Calling this on a block that is already fused (including one built with
        ``inference_mode=True``) is a no-op.
        """
        if hasattr(self, "lkb_reparam"):
            # Already fused: there is no lkb_origin left to read weights from.
            return

        eq_k, eq_b = self.get_kernel_bias()
        self.lkb_reparam = nn.Conv2d(
            in_channels=self.in_channels,
            out_channels=self.out_channels,
            kernel_size=self.kernel_size,
            stride=self.stride,
            padding=self.padding,
            dilation=self.lkb_origin.conv.dilation,
            groups=self.groups,
            bias=True,
        )

        self.lkb_reparam.weight.data = eq_k
        self.lkb_reparam.bias.data = eq_b
        del self.lkb_origin
        if hasattr(self, "small_conv"):
            del self.small_conv

    @staticmethod
    def _fuse_bn(conv: nn.Conv2d, bn: nn.BatchNorm2d) -> tuple[torch.Tensor, torch.Tensor]:
        """Fuse a batchnorm layer into the preceding bias-free conv layer.

        Args:
            conv: Convolution layer whose ``weight`` is fused.
            bn: Batchnorm 2d layer.

        Returns:
            Tuple of (kernel, bias) after fusing batchnorm.
        """
        std = (bn.running_var + bn.eps).sqrt()
        t = (bn.weight / std).reshape(-1, 1, 1, 1)
        return conv.weight * t, bn.bias - bn.running_mean * bn.weight / std

    def _conv_bn(self, kernel_size: int, padding: int = 0) -> nn.Sequential:
        """Construct a conv-batchnorm branch.

        Args:
            kernel_size: Size of the convolution kernel.
            padding: Zero-padding size.

        Returns:
            A nn.Sequential Conv-BN module with sub-modules named ``conv`` and ``bn``.
        """
        mod_list = nn.Sequential()
        mod_list.add_module(
            "conv",
            nn.Conv2d(
                in_channels=self.in_channels,
                out_channels=self.out_channels,
                kernel_size=kernel_size,
                stride=self.stride,
                padding=padding,
                groups=self.groups,
                bias=False,
            ),
        )
        mod_list.add_module("bn", nn.BatchNorm2d(num_features=self.out_channels))
        return mod_list

View File

@@ -0,0 +1,6 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2024 Apple Inc. All rights reserved.
#

View File

@@ -0,0 +1,265 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
# For licensing see accompanying LICENSE file.
# Copyright (C) 2024 Apple Inc. All Rights Reserved.
from __future__ import annotations
import torch
import torch.nn as nn
from timm.models.layers import DropPath, trunc_normal_
from mobileclip.modules.common.mobileone import MobileOneBlock
class ConvFFN(nn.Module):
    """Convolutional feed-forward module: a grouped (1 x context) conv + BN followed by two 1x1 convs."""

    def __init__(
        self,
        in_channels: int,
        context_size: int,
        hidden_channels: int | None = None,
        out_channels: int | None = None,
        act_layer: nn.Module = nn.GELU,
        drop: float = 0.0,
    ) -> None:
        """Build convolutional FFN module.

        Args:
            in_channels: Number of input channels.
            context_size: Context size for 1D signals (width of the grouped convolution kernel).
            hidden_channels: Number of channels after expansion. Falls back to ``in_channels`` when falsy.
            out_channels: Number of output channels. Falls back to ``in_channels`` when falsy.
            act_layer: Activation layer class, instantiated without arguments. Default: ``GELU``
            drop: Dropout rate. Default: ``0.0``.
        """
        super().__init__()
        hidden_channels = hidden_channels or in_channels
        out_channels = out_channels or in_channels

        # Grouped convolution that mixes along the last axis only (kernel height is 1), then batchnorm.
        context_conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=(1, int(context_size)),
            padding=(0, int(context_size // 2)),
            groups=in_channels,
            bias=False,
        )
        self.conv = nn.Sequential()
        self.conv.add_module("conv", context_conv)
        self.conv.add_module("bn", nn.BatchNorm2d(num_features=out_channels))

        # Point-wise expansion and projection.
        self.fc1 = nn.Conv2d(in_channels, hidden_channels, kernel_size=1)
        self.act = act_layer()
        self.fc2 = nn.Conv2d(hidden_channels, out_channels, kernel_size=1)
        self.drop = nn.Dropout(drop)
        self.apply(self._init_weights)

    def _init_weights(self, m: nn.Module) -> None:
        """Initialize conv weights with a truncated normal (std 0.02) and conv biases with zero."""
        if not isinstance(m, nn.Conv2d):
            return
        trunc_normal_(m.weight, std=0.02)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply conv-bn, then fc1 -> activation -> dropout -> fc2 -> dropout."""
        hidden = self.fc1(self.conv(x))
        hidden = self.drop(self.act(hidden))
        return self.drop(self.fc2(hidden))
class RepMixer(nn.Module):
    """Reparameterizable token mixer.

    At train time the output is ``x + layer_scale * (mixer(x) - norm(x))`` (or the same expression without the
    scale when layer scale is disabled), where ``mixer`` and ``norm`` are ``MobileOneBlock`` branches. At inference
    time the whole expression is folded into a single grouped ``(1, kernel_size)`` convolution, ``reparam_conv``.

    For more details, please refer to the paper: `FastViT: A Fast Hybrid Vision Transformer using Structural
    Reparameterization <https://arxiv.org/pdf/2303.14189.pdf>`_
    """

    def __init__(
        self,
        dim: int,
        kernel_size: int = 3,
        use_layer_scale: bool = True,
        layer_scale_init_value: float = 1e-5,
        inference_mode: bool = False,
    ) -> None:
        """Build RepMixer Module.

        Args:
            dim: Input feature map dimension. :math:`C_{in}` from an expected input of size
                :math:`(B, C_{in}, H, W)`.
            kernel_size: Kernel width for spatial mixing (kernel height is fixed to 1). Default: 3
            use_layer_scale: If True, a learnable per-channel layer scale is used. Default: ``True``
            layer_scale_init_value: Initial value for layer scale. Default: 1e-5
            inference_mode: If True, instantiates the module directly in its fused form. Default: ``False``
        """
        super().__init__()
        self.dim = dim
        self.kernel_size = kernel_size
        self.inference_mode = inference_mode

        if inference_mode:
            self.reparam_conv = self._build_reparam_conv()
        else:
            # ``norm`` is configured with no conv branches and no scale branch; ``mixer`` uses the block defaults.
            self.norm = MobileOneBlock(
                dim,
                dim,
                (1, kernel_size),
                padding=(0, kernel_size // 2),
                groups=dim,
                use_act=False,
                use_scale_branch=False,
                num_conv_branches=0,
            )
            self.mixer = MobileOneBlock(
                dim,
                dim,
                (1, kernel_size),
                padding=(0, kernel_size // 2),
                groups=dim,
                use_act=False,
            )
            self.use_layer_scale = use_layer_scale
            if use_layer_scale:
                self.layer_scale = nn.Parameter(layer_scale_init_value * torch.ones((dim, 1, 1)), requires_grad=True)

    def _build_reparam_conv(self) -> nn.Conv2d:
        """Create the grouped (1, kernel_size) convolution that holds the fused weights."""
        return nn.Conv2d(
            in_channels=self.dim,
            out_channels=self.dim,
            kernel_size=(1, self.kernel_size),
            stride=1,
            padding=(0, self.kernel_size // 2),
            groups=self.dim,
            bias=True,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the fused convolution if present, otherwise the multi-branch train-time computation."""
        if hasattr(self, "reparam_conv"):
            return self.reparam_conv(x)
        if self.use_layer_scale:
            return x + self.layer_scale * (self.mixer(x) - self.norm(x))
        return x + self.mixer(x) - self.norm(x)

    def reparameterize(self) -> None:
        """Reparameterize mixer and norm into a single convolutional layer for efficient inference.

        Calling this on a module that is already fused (built with ``inference_mode=True`` or reparameterized
        before) is a no-op. Without that guard a second call would fail, because the train-time branches are
        deleted by the first one.
        """
        if self.inference_mode or hasattr(self, "reparam_conv"):
            return

        self.mixer.reparameterize()
        self.norm.reparameterize()
        mixer_conv = self.mixer.reparam_conv
        norm_conv = self.norm.reparam_conv

        # ``id_tensor`` accounts for the residual ``x +`` term of the train-time forward pass.
        if self.use_layer_scale:
            # layer_scale is (dim, 1, 1); add a trailing axis to broadcast against (dim, 1, 1, kernel_size).
            w = self.mixer.id_tensor + self.layer_scale.unsqueeze(-1) * (mixer_conv.weight - norm_conv.weight)
            b = torch.squeeze(self.layer_scale) * (mixer_conv.bias - norm_conv.bias)
        else:
            w = self.mixer.id_tensor + mixer_conv.weight - norm_conv.weight
            b = mixer_conv.bias - norm_conv.bias

        self.reparam_conv = self._build_reparam_conv()
        self.reparam_conv.weight.data = w
        self.reparam_conv.bias.data = b

        for para in self.parameters():
            para.detach_()

        # Drop the train-time branches so only the fused convolution remains in the module.
        self.__delattr__("mixer")
        self.__delattr__("norm")
        if self.use_layer_scale:
            self.__delattr__("layer_scale")

        # Keep the flag consistent with the module's actual state.
        self.inference_mode = True
class RepMixerBlock(nn.Module):
    """Metaformer-style block that uses ``RepMixer`` for token mixing and ``ConvFFN`` for the channel MLP.

    For more details on the Metaformer structure, please refer to: `MetaFormer Is Actually What You Need for Vision
    <https://arxiv.org/pdf/2111.11418.pdf>`_
    """

    def __init__(
        self,
        dim: int,
        kernel_size: int = 11,
        mlp_ratio: float = 4.0,
        act_layer: nn.Module = nn.GELU,
        drop: float = 0.0,
        drop_path: float = 0.0,
        use_layer_scale: bool = True,
        layer_scale_init_value: float = 1e-5,
        inference_mode: bool = False,
        *args,
        **kwargs,
    ):
        """Build RepMixer Block.

        Args:
            dim: Number of embedding dimensions.
            kernel_size: Kernel size passed to the RepMixer and used as the ConvFFN context size. Default: 11
            mlp_ratio: MLP expansion ratio; must be greater than 0. Default: 4.0
            act_layer: Activation layer. Default: ``nn.GELU``
            drop: Dropout rate. Default: 0.0
            drop_path: Drop path rate; values <= 0 disable drop path. Default: 0.0
            use_layer_scale: Flag to turn on layer scale. Default: ``True``
            layer_scale_init_value: Layer scale value at initialization. Default: 1e-5
            inference_mode: Flag to instantiate block in inference mode. Default: ``False``
            *args: Accepted for interface compatibility and ignored.
            **kwargs: Accepted for interface compatibility and ignored.
        """
        super().__init__()

        self.token_mixer = RepMixer(
            dim,
            kernel_size=kernel_size,
            use_layer_scale=use_layer_scale,
            layer_scale_init_value=layer_scale_init_value,
            inference_mode=inference_mode,
        )

        assert mlp_ratio > 0, f"MLP ratio should be greater than 0, found: {mlp_ratio}"
        self.convffn = ConvFFN(
            in_channels=dim,
            context_size=kernel_size,
            hidden_channels=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=drop,
        )

        # Stochastic depth is only instantiated for a positive rate.
        if drop_path > 0.0:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

        # Learnable per-channel scale applied to the FFN branch.
        self.use_layer_scale = use_layer_scale
        if use_layer_scale:
            self.layer_scale = nn.Parameter(layer_scale_init_value * torch.ones((dim, 1, 1)), requires_grad=True)

    def forward(self, x, *args, **kwargs):
        """Mix tokens, add the (optionally scaled) FFN residual, and return a tensor in the input layout.

        Raises:
            ValueError: If ``x`` is not a 3-dimensional tensor.
        """
        if x.dim() != 3:
            raise ValueError(f"Expected tensor of dim=3, obtained tensor of dim={x.dim()}")

        # Input is (B, C, D) with C the context length; the mixers operate on (B, D, 1, C).
        x = torch.unsqueeze(x.permute(0, 2, 1), dim=2)

        x = self.token_mixer(x)
        residual = self.convffn(x)
        if self.use_layer_scale:
            residual = self.layer_scale * residual
        x = x + self.drop_path(residual)

        # Undo the layout change: drop the singleton axis and swap back to (B, C, D).
        return x.squeeze(dim=2).permute(0, 2, 1)

View File

@@ -0,0 +1,39 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2024 Apple Inc. All Rights Reserved.
#
import open_clip
from torch import Tensor, nn
class ClipTokenizer(nn.Module):
    """``nn.Module`` wrapper around an ``open_clip`` tokenizer with a fixed context length."""

    def __init__(self, cfg, *args, **kwargs):
        """Create the tokenizer described by ``cfg["text_cfg"]``.

        Args:
            cfg: Configuration indexable by ``"text_cfg"``. The text config must provide ``"context_length"`` and
                may provide ``"open_clip_tokenizer"`` (an open_clip model name; ``"ViT-B-16"`` when absent).
            *args: Accepted for interface compatibility and ignored.
            **kwargs: Accepted for interface compatibility and ignored.
        """
        super().__init__()
        text_cfg = cfg["text_cfg"]
        self.context_length = text_cfg["context_length"]
        self.tokenizer = open_clip.get_tokenizer(self._resolve_tokenizer_name(text_cfg))

    @staticmethod
    def _resolve_tokenizer_name(text_cfg, default: str = "ViT-B-16") -> str:
        """Return the configured open_clip tokenizer name, looking at mapping keys first and attributes second."""
        # The text config is indexed like a mapping, so a plain getattr() would never see a dict key.
        getter = getattr(text_cfg, "get", None)
        if callable(getter):
            name = getter("open_clip_tokenizer", None)
            if name is not None:
                return name
        return getattr(text_cfg, "open_clip_tokenizer", default)

    def _special_token(self, index: int) -> int:
        """Return the token id at ``index`` of the tokenized empty string."""
        tokens = self.tokenizer("")
        # open_clip tokenizers return a 2-D tensor (num_texts, context_length); flatten it so that positions
        # 0 and 1 are the start and end tokens. A flat [sot_id, eot_id] sequence is indexed directly.
        if isinstance(tokens, Tensor):
            tokens = tokens.flatten()
        return int(tokens[index])

    def get_vocab_size(self) -> int:
        """Return the number of entries in the tokenizer's ``encoder`` table."""
        return len(self.tokenizer.encoder)

    def get_encodings(self) -> dict[str, int]:
        """Return the tokenizer's ``encoder`` table."""
        return self.tokenizer.encoder

    def get_eot_token(self) -> int:
        """Return the end-of-text token id (second token of the tokenized empty string)."""
        return self._special_token(1)

    def get_sot_token(self) -> int:
        """Return the start-of-text token id (first token of the tokenized empty string)."""
        return self._special_token(0)

    def forward(self, input_sentence: str, *args, **kwargs) -> Tensor:
        """Tokenize ``input_sentence`` to ``context_length`` token ids.

        Raises:
            AssertionError: If the last dimension of the tokenizer output is not ``context_length``.
        """
        # The tokenizer is called with the target length and is expected to pad/truncate to it.
        tokenized_sentence = self.tokenizer(input_sentence, self.context_length)
        assert tokenized_sentence.shape[-1] == self.context_length, (
            "Tokenized tensor should be exactly `context_length` long."
        )
        return tokenized_sentence