RuntimeError: Given groups=1, weight[64, 3, 3, 3], so expected input[16, 64, 256, 256] to have 3 channels, but got 64 channels instead

Hi,

Please help! I have a problem with a tensor shape mismatch:

RuntimeError: mat1 and mat2 shapes cannot be multiplied (288000x64 and 180x540).

Everything was fine until I tried to add a WindowAttention instance after the conv layer shown below:

self.conv_first = nn.Conv2d(num_in_ch, embed_dim, 3, 1, 1)
self.multi_attn = WindowAttention(dim=embed_dim, window_size=(window_size, window_size),
                                  num_heads=num_heads[0], qkv_bias=qkv_bias, qk_scale=qk_scale,
                                  attn_drop=attn_drop_rate, proj_drop=drop_rate)

def forward(self, x):
    x = self.conv_first(x)   # 4D conv activation: (B, embed_dim, H, W)
    x = self.multi_attn(x)   # WindowAttention is called directly on the 4D tensor

import torch
import torch.nn as nn
from torch.nn.init import trunc_normal_  # the original Swin code imports trunc_normal_ from timm.models.layers


class WindowAttention(nn.Module):
    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        # x is expected to be 3D: (num_windows * B, N, C) with N = window_size[0] * window_size[1]
        B_, N, C = x.shape
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
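
For context: WindowAttention expects a 3D input of shape (num_windows * B, window_size * window_size, C). A minimal call that works (the hyperparameters here are assumed for illustration, not taken from the actual config):

attn = WindowAttention(dim=180, window_size=(8, 8), num_heads=6)
tokens = torch.randn(64, 8 * 8, 180)   # (num_windows * B, window_size * window_size, C)
out = attn(tokens)
print(out.shape)                        # torch.Size([64, 64, 180])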

Your code is hard to read, as it's not properly formatted, and it's also not executable since the input shapes etc. are missing.
I would suggest checking the newly added layer and understanding how it manipulates its input shape.
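
To make that concrete, here is a rough sketch (embed_dim, window_size, and the input size are assumed here, not taken from your config) of the reshaping that has to happen between nn.Conv2d, which returns a 4D (B, C, H, W) tensor, and WindowAttention, which expects (num_windows * B, window_size * window_size, C):

import torch
import torch.nn as nn

def window_partition(x, window_size):
    # (B, H, W, C) -> (num_windows * B, window_size * window_size, C)
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    return x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size * window_size, C)

embed_dim, window_size = 180, 8                 # assumed values
conv_first = nn.Conv2d(3, embed_dim, 3, 1, 1)
feat = conv_first(torch.randn(1, 3, 64, 64))    # (1, 180, 64, 64)
feat = feat.permute(0, 2, 3, 1)                 # (1, 64, 64, 180), channels last
windows = window_partition(feat, window_size)   # (64, 64, 180)
# windows can now be passed to WindowAttention; feeding the raw 4D conv output
# is what triggers the mat1/mat2 shape mismatch inside self.qkv.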

This is the issue:

I am trying to incorporate the Attention module into a conv layer. The Attention module works fine in the transformer model. Here are the shape results; I don't understand why there is a mismatch.

The shapes before adding the Attention instance to the conv layer:

Input shape: torch.Size([4, 4096, 360])
Weight shape: torch.Size([360, 180])

The shapes after adding the Attention instance to the conv layer:

Input shape: torch.Size([4, 3, 64, 64])
Weight shape: torch.Size([540, 180])


I get this error. What should I do?
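
For what it's worth, the weight shape already hints at the problem. Assuming torch.Size([540, 180]) is the weight of nn.Linear(180, 540), i.e. the qkv projection with embed_dim = 180 (an inference from the shapes above, not confirmed by the code), that layer needs an input whose last dimension is 180, not a 4D image tensor:

import torch
import torch.nn as nn

qkv = nn.Linear(180, 540)        # dim = 180, dim * 3 = 540
print(qkv.weight.shape)          # torch.Size([540, 180])

ok = torch.randn(4, 64, 180)     # last dim matches in_features, so this works
qkv(ok)

bad = torch.randn(4, 3, 64, 64)  # last dim is 64
# qkv(bad)  # -> "mat1 and mat2 shapes cannot be multiplied (... x64 and 180x540)"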

self.weight was initialized such that the corresponding conv layer expects an input activation with 6 channels, while 3 are given. Either fix the self.weight initialization or expand the incoming activation to have 6 channels.
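
A rough sketch of those two options (the layer names and sizes here are made up, not the actual efficientnet_pytorch layer):

import torch
import torch.nn as nn

conv = nn.Conv2d(6, 64, kernel_size=3, padding=1)  # a layer whose weight expects 6 input channels
x = torch.randn(16, 3, 224, 224)                   # activation with only 3 channels

# Option 1: re-initialize the layer to match the 3-channel input
conv3 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
out = conv3(x)

# Option 2: expand the activation to 6 channels, e.g. by repeating it
out = conv(x.repeat(1, 2, 1, 1))                   # (16, 6, 224, 224)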

Any guesses where self.weight is initialized?

No, as I’m unfamiliar with your code.
From the stacktrace you might want to check the efficientnet_pytorch code.

If I run the shape command I get these dimensions, but the runtime error reports different ones. Any thoughts?

I guess processed_images might be the input to the model and other transformations or layers are called before the failing operation. You would need to narrow down which ops are applied to the model input before the error is raised.
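
One generic way to narrow that down (a sketch, not specific to your model): register forward pre-hooks that print every module's input shape, run the failing batch once, and look at the last line printed before the error.

import torch

def log_input_shapes(model):
    def pre_hook(module, inputs):
        shapes = [tuple(t.shape) for t in inputs if torch.is_tensor(t)]
        print(module.__class__.__name__, shapes)
    for m in model.modules():
        m.register_forward_pre_hook(pre_hook)

# log_input_shapes(model)
# model(processed_images)  # the last module printed is the one receiving the unexpected shape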

class UNet(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(UNet, self).__init__()
        self.encoder1 = self.conv_block(in_channels, 64)
        self.encoder2 = self.conv_block(64, 128)
        self.encoder3 = self.conv_block(128, 256)
        self.encoder4 = self.conv_block(256, 512)

        self.decoder1 = self.up_conv_block(512, 256)
        self.decoder2 = self.up_conv_block(256, 128)
        self.decoder3 = self.up_conv_block(128, 64)

        self.out = nn.Conv2d(64, out_channels, kernel_size=1)
        self.sigmoid = nn.Sigmoid()

        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

RuntimeError: Given transposed=1, weight of size [256, 128, 2, 2], expected input[16, 512, 64, 64] to have 256 channels, but got 512 channels instead
I got this error too, but neither I nor GPT could understand where my error was.
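
In case it helps to decode the message: for nn.ConvTranspose2d the weight is stored as (in_channels, out_channels, kH, kW), so weight [256, 128, 2, 2] would belong to the 256 -> 128 up-convolution (decoder2 above, assuming up_conv_block uses a ConvTranspose2d with kernel_size=2), and it is receiving 512 channels. Since the forward pass isn't shown this is only a guess, but a common cause is concatenating a skip connection without enlarging the decoder block's in_channels:

import torch
import torch.nn as nn

up = nn.ConvTranspose2d(256, 128, kernel_size=2, stride=2)
print(up.weight.shape)                  # torch.Size([256, 128, 2, 2])

d1_out = torch.randn(16, 256, 64, 64)   # hypothetical decoder1 output
skip = torch.randn(16, 256, 64, 64)     # hypothetical encoder feature map of the same size
x = torch.cat([d1_out, skip], dim=1)    # (16, 512, 64, 64) after the skip connection
# up(x)  # -> "expected input ... to have 256 channels, but got 512 channels instead"
# If skips are concatenated, the block needs the doubled channel count,
# e.g. self.decoder2 = self.up_conv_block(512, 128).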

I am having a similar error.

from diffusers import UNet1DModel
import torch
import torch.nn as nn
class ClassConditionedUned(nn.Module):
    def __init__(self, num_ela=8, class_emb_size=4):
        super().__init__()
        self.class_emb = nn.Sequential(
            nn.Linear(num_ela, 32),
            nn.ReLU(),
            nn.Linear(32, class_emb_size)
        )
        self.model = UNet1DModel(
            sample_size=500,
            in_channels=1+class_emb_size,
            out_channels=1,
            layers_per_block=1,  
            block_out_channels = (32, 32, 64),   
            down_block_types = ("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"),
            up_block_types = ("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"),   
        )
        
    def forward(self, x, t, ela_vec):
        bs, ch, h = x.shape
        class_cond = self.class_emb(ela_vec) # Map to embedding dimension
        class_cond = class_cond.view(bs, -1, 1).expand(-1, -1, h)
        net_input = torch.cat((x, class_cond), 1)
        print(t.shape)
        return self.model(net_input, t).sample

model = ClassConditionedUned()
x = torch.randn(1, 1, 512)
t = torch.randint(0, 1000, (1,))
ela_vec = torch.rand(1, 8)  # normalized ELA vectors

with torch.no_grad():
    out = model(x, t, ela_vec)

With this code I am getting the following error:
out = model(x, t, ela_vec)
^^^^^^^^^^^^^^^^^^^^
RuntimeError: Given groups=1, weight of size [32, 5, 1], expected input[1, 21, 512] to have 5 channels, but got 21 channels instead

Maybe you can help me. Thank you in advance.
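
One thing worth checking (a quick sanity sketch, not the actual fix from the cross-post below): net_input itself should already be (1, 5, 512), which would mean the extra 16 channels are added inside UNet1DModel, possibly by its timestep handling when use_timestep_embedding is left at its default, rather than by the torch.cat in your forward.

with torch.no_grad():
    class_cond = model.class_emb(ela_vec)                   # (1, 4)
    class_cond = class_cond.view(1, -1, 1).expand(-1, -1, 512)
    net_input = torch.cat((x, class_cond), 1)
    print(net_input.shape)                                  # torch.Size([1, 5, 512])
# 5 channels match in_channels = 1 + class_emb_size, yet the failing conv sees 21,
# so the remaining 16 channels are appended somewhere inside UNet1DModel.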

Fixed in this cross-post.