Putting a model layer in an nn.Module wrapper class does not lead to weight convergence

Hi, I am using the TimeSformer code for attention.
It has an Attention class:

# class Attention1(nn.Module):
#     def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., with_qkv=True):
#         super().__init__()
#         self.num_heads = num_heads
#         head_dim = dim // num_heads
#         self.scale = qk_scale or head_dim ** -0.5
#         self.with_qkv = with_qkv
#         if self.with_qkv:
#            self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
#            self.proj = nn.Linear(dim, dim)
#            self.proj_drop = nn.Dropout(proj_drop)
#         self.attn_drop = nn.Dropout(attn_drop)

#     def forward(self, x):
#         # print("x shape",x.shape,x.dtype)

#         B, N, C = x.shape
#         if self.with_qkv:
#            qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
#            q, k, v = qkv[0], qkv[1], qkv[2]
#         else:
#            qkv = x.reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
#            q, k, v  = qkv, qkv, qkv
#         #print("q","k","v",q.shape,k.shape,v.shape)
#         attn = (q @ k.transpose(-2, -1)) * self.scale
#         attn = attn.softmax(dim=-1)
#         attn = self.attn_drop(attn)

#         x = (attn @ v).transpose(1, 2).reshape(B, N, C)
#         if self.with_qkv:
#            x = self.proj(x)
#            x = self.proj_drop(x)
#         # print("returned x shape",x.shape)
#         return x

I don't do anything except wrap it in a wrapper class and try to train that. Basically, I renamed Attention to NaiveAttention in the code and defined an Attention1 wrapper, but the error rates do not decrease. Here is what I did (code below); could someone please suggest what I might have done wrong? I have already checked that no pre-trained weights are loaded during training, that requires_grad is set on the layer's parameters, and that each parameter is randomly initialized.
Thanks,

import torch.nn as nn

class NaiveAttention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., with_qkv=True):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5
        self.with_qkv = with_qkv
        if self.with_qkv:
           self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
           self.proj = nn.Linear(dim, dim)
           self.proj_drop = nn.Dropout(proj_drop)
        self.attn_drop = nn.Dropout(attn_drop)

    def forward(self, x):
        # print("x shape",x.shape,x.dtype)

        B, N, C = x.shape
        if self.with_qkv:
           qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
           q, k, v = qkv[0], qkv[1], qkv[2]
        else:
           qkv = x.reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
           q, k, v  = qkv, qkv, qkv
        #print("q","k","v",q.shape,k.shape,v.shape)
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        if self.with_qkv:
           x = self.proj(x)
           x = self.proj_drop(x)
        # print("returned x shape",x.shape)
        return x

class Attention1(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., with_qkv=True):
        super().__init__()
        self.global_attn = NaiveAttention(dim, num_heads, qkv_bias, qk_scale, attn_drop, proj_drop, with_qkv)

    def forward(self, x):
        x = self.global_attn(x)
        return x
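
For reference, this is roughly how I checked that the wrapper's parameters are registered, trainable, and receive gradients (a minimal sketch; dim=768, the dummy input shape, and the mean() "loss" are just placeholder values, not my actual training setup):

import torch

# Build the wrapped attention with placeholder sizes (assumption: dim=768, 8 heads).
attn = Attention1(dim=768, num_heads=8)

# 1) Every parameter should be registered (now prefixed with "global_attn.") and trainable.
for name, p in attn.named_parameters():
    print(name, p.shape, p.requires_grad)

# 2) Gradients should flow through the wrapper after a dummy backward pass.
x = torch.randn(2, 16, 768)   # (batch, tokens, dim), placeholder shapes
out = attn(x)
out.mean().backward()         # dummy scalar loss
print(all(p.grad is not None for p in attn.parameters()))

The only difference I can see versus the original Attention is that the parameter names gain a global_attn. prefix (e.g. global_attn.qkv.weight instead of qkv.weight).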