I found the issue with my code. I was implementing a simple Transformer block like this:
import math

import torch
import torch.nn as nn

# Assumed imports: in my setup DropPath and trunc_normal_ come from timm;
# SelfAttention and MLP are my own modules (defined elsewhere, not shown).
from timm.models.layers import DropPath, trunc_normal_


class TransformerBlock(nn.Module):
    """
    Transformer Block

    Args:
        dim (int): input dimension
        num_heads (int): number of attention heads
        mlp_ratio (float): ratio of MLP hidden dim to input dim
        qkv_bias (bool): if True, add bias to the qkv projection
        qk_scale (float): scale factor for the qk projection
        drop (float): dropout rate
        attn_drop (float): attention dropout rate
        drop_path (float): stochastic depth rate
        act_layer (nn.Module): activation layer
        norm_layer (nn.Module): normalization layer
        sr_ratio (int): spatial reduction ratio
    Returns:
        torch.Tensor: output tensor, shape (B, N, C), N=H*W
    """
    def __init__(self, dim: int, num_heads: int, mlp_ratio: float = 4.,
                 qkv_bias: bool = False, qk_scale: float = None, drop: float = 0.,
                 attn_drop: float = 0., drop_path: float = 0.,
                 act_layer: nn.Module = nn.GELU, norm_layer: nn.Module = nn.LayerNorm,
                 sr_ratio: int = 1) -> None:
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = SelfAttention(dim, num_heads=num_heads, qkv_bias=qkv_bias,
                                  qk_scale=qk_scale, attn_drop=attn_drop,
                                  proj_drop=drop, sr_ratio=sr_ratio)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = MLP(input_dim=dim, hidden_dim=mlp_hidden_dim,
                       act_layer=act_layer, drop=drop)
        self.apply(self._init_weights)

    def _init_weights(self, m: nn.Module) -> None:
        """
        Initialise the weights of layer m.

        Args:
            m (nn.Module): model layer
        Returns:
            None
        """
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            # Kaiming-style init scaled by fan-out
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()
    def forward(self, x: torch.Tensor, H: int, W: int) -> torch.Tensor:
        """
        Forward method.

        Args:
            x (torch.Tensor): input tensor, shape (B, N, C), N=H*W
            H (int): height
            W (int): width
        """
        # Out-of-place residuals: each `+` allocates a new tensor, so the
        # activations autograd saved for the backward pass stay untouched.
        x_ = x + self.drop_path(self.attn(self.norm1(x), H, W))
        x_ = x_ + self.drop_path(self.mlp(self.norm2(x_), H, W))
        return x_
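For completeness, here is a rough way to exercise the block, run after the class definition above. SelfAttention and MLP below are just hypothetical stand-ins that mimic the (x, H, W) call signature of my real modules, not the actual implementations:

# Hypothetical stand-ins for my real SelfAttention / MLP modules, only so the
# block can be driven end to end here.
class SelfAttention(nn.Module):
    def __init__(self, dim, num_heads=1, qkv_bias=False, qk_scale=None,
                 attn_drop=0., proj_drop=0., sr_ratio=1):
        super().__init__()
        self.proj = nn.Linear(dim, dim)

    def forward(self, x, H, W):
        return self.proj(x)

class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, act_layer=nn.GELU, drop=0.):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_dim, input_dim)

    def forward(self, x, H, W):
        return self.fc2(self.act(self.fc1(x)))

B, H, W, C = 2, 8, 8, 64
block = TransformerBlock(dim=C, num_heads=4)
x = torch.randn(B, H * W, C)
out = block(x, H, W)     # shape (B, N, C) with N = H * W
out.sum().backward()     # backward runs fine with the out-of-place residuals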
Initially, my forward method contained an in-place operation:
    def forward(self, x: torch.Tensor, H: int, W: int) -> torch.Tensor:
        """
        Forward method.

        Args:
            x (torch.Tensor): input tensor, shape (B, N, C), N=H*W
            H (int): height
            W (int): width
        """
        x += self.drop_path(self.attn(self.norm1(x), H, W))   # in-place add on x
        x += self.drop_path(self.mlp(self.norm2(x), H, W))    # in-place add on x
        return x
I am still confused as to why such an in-place operation would cause this error.
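In case it helps pin down what I am missing, here is a minimal sketch of the failure outside my model, assuming the error in question is autograd's usual in-place RuntimeError ("one of the variables needed for gradient computation has been modified by an inplace operation"):

import torch

# Minimal, model-free sketch of the failure mode (assumption: the error is
# autograd's "... has been modified by an inplace operation" RuntimeError).
x = torch.randn(4, requires_grad=True)
y = x.sigmoid()      # autograd saves the output y to compute sigmoid's backward
y += x               # in-place add overwrites that saved output
y.sum().backward()   # raises the in-place RuntimeError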