Hi, I am using the TimeSformer code for attention. It has an attention class:
# class Attention1(nn.Module):
#     def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., with_qkv=True):
#         super().__init__()
#         self.num_heads = num_heads
#         head_dim = dim // num_heads
#         self.scale = qk_scale or head_dim ** -0.5
#         self.with_qkv = with_qkv
#         if self.with_qkv:
#             self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
#             self.proj = nn.Linear(dim, dim)
#             self.proj_drop = nn.Dropout(proj_drop)
#         self.attn_drop = nn.Dropout(attn_drop)
#
#     def forward(self, x):
#         # print("x shape", x.shape, x.dtype)
#         B, N, C = x.shape
#         if self.with_qkv:
#             qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
#             q, k, v = qkv[0], qkv[1], qkv[2]
#         else:
#             qkv = x.reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
#             q, k, v = qkv, qkv, qkv
#         # print("q", "k", "v", q.shape, k.shape, v.shape)
#         attn = (q @ k.transpose(-2, -1)) * self.scale
#         attn = attn.softmax(dim=-1)
#         attn = self.attn_drop(attn)
#         x = (attn @ v).transpose(1, 2).reshape(B, N, C)
#         if self.with_qkv:
#             x = self.proj(x)
#             x = self.proj_drop(x)
#         # print("returned x shape", x.shape)
#         return x
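For context, this module maps a token tensor of shape (B, N, C) to a tensor of the same shape. A minimal sketch of a forward pass (the batch, token, and embedding sizes are arbitrary placeholders, and I use the NaiveAttention name from my copy of the code below):

import torch

B, N, C = 2, 197, 768                      # placeholder batch size, token count, embedding dim
attn = NaiveAttention(dim=C, num_heads=8)  # same class, renamed in my code below
x = torch.randn(B, N, C)
out = attn(x)
print(out.shape)                           # torch.Size([2, 197, 768]) -- shape is preserved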
I don't do anything except wrap it in a wrapper class and try to train that. Basically, I renamed Attention to NaiveAttention in the code and defined an Attention1 wrapper around it, but the error rates do not decrease. Here is what I did; could someone please suggest what I might have done wrong? I have already checked that no pre-trained weights are loaded during training, that the requires_grad flag is set on the layer, and that each parameter is randomly initialized.
Thanks,
class NaiveAttention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., with_qkv=True):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5
        self.with_qkv = with_qkv
        if self.with_qkv:
            self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
            self.proj = nn.Linear(dim, dim)
            self.proj_drop = nn.Dropout(proj_drop)
        self.attn_drop = nn.Dropout(attn_drop)

    def forward(self, x):
        # x: (B, N, C) = (batch, tokens, embedding dim)
        B, N, C = x.shape
        if self.with_qkv:
            # project to q, k, v and split heads: each is (B, num_heads, N, head_dim)
            qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
            q, k, v = qkv[0], qkv[1], qkv[2]
        else:
            # no projection: reuse the input itself as q, k, and v
            qkv = x.reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
            q, k, v = qkv, qkv, qkv
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)
        # merge heads back to (B, N, C)
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        if self.with_qkv:
            x = self.proj(x)
            x = self.proj_drop(x)
        return x
class Attention1(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., with_qkv=True):
        super().__init__()
        # delegate all attention work to the renamed original class
        self.global_attn = NaiveAttention(dim, num_heads, qkv_bias, qk_scale, attn_drop, proj_drop, with_qkv)

    def forward(self, x):
        x = self.global_attn(x)
        return x
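To be concrete about the checks I mentioned, this is a minimal sketch of the kind of sanity check I mean (the sizes are arbitrary placeholders, not my actual training setup): copy the wrapper's weights into a bare NaiveAttention, confirm the two produce the same output, and confirm gradients actually reach the qkv/proj weights.

import torch

torch.manual_seed(0)
dim, B, N = 768, 2, 197                                   # arbitrary placeholder sizes

wrapped = Attention1(dim)                                 # the wrapper defined above
bare = NaiveAttention(dim)
bare.load_state_dict(wrapped.global_attn.state_dict())    # copy weights so both modules match

x = torch.randn(B, N, dim)
# both dropout rates default to 0., so the forward passes should be identical
assert torch.allclose(wrapped(x), bare(x), atol=1e-6)

# verify that gradients flow through the wrapper to the underlying parameters
wrapped(x).sum().backward()
print(wrapped.global_attn.qkv.weight.grad.abs().mean())   # non-zero => gradients reach the layer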