Invalid argument 0: Sizes of tensors must match except in dimension 1. Got 29 and 30 in dimension 2 at /pytorch/aten/src/THC/generic/THCTensorMath.cu:71

Mohamed_Nabih · July 6, 2021, 2:13pm

Hello, I am trying to train a speech enhancement CNN based on Wave-U-Net but I get this error

invalid argument 0: Sizes of tensors must match except in dimension 1. Got 29 and 30 in dimension 2 at /pytorch/aten/src/THC/generic/THCTensorMath.cu:71

class DownSamplingLayer(nn.Module):

def __init__(self, channel_in, channel_out, dilation=1, kernel_size=15, stride=1, padding=7):
    super(DownSamplingLayer, self).__init__()
    self.main = nn.Sequential(
        nn.Conv1d(channel_in, channel_out, kernel_size=kernel_size,
                  stride=stride, padding=padding, dilation=dilation),
        nn.BatchNorm1d(channel_out),
        nn.LeakyReLU(negative_slope=0.1)
    )

def forward(self, ipt):
    return self.main(ipt)

class UpSamplingLayer(nn.Module):

def __init__(self, channel_in, channel_out, kernel_size=5, stride=1, padding=2):
    super(UpSamplingLayer, self).__init__()
    self.main = nn.Sequential(
        nn.Conv1d(channel_in, channel_out, kernel_size=kernel_size,
                  stride=stride, padding=padding),
        nn.BatchNorm1d(channel_out),
        nn.LeakyReLU(negative_slope=0.1, inplace=True),
    )

def forward(self, ipt):
    return self.main(ipt)

class SE_Model(nn.Module):

def __init__(self, n_layers=12, channels_interval=24):
    super(SEModel, self).__init__()

    self.n_layers = n_layers
    self.channels_interval = channels_interval
    encoder_in_channels_list = [1] + [i * self.channels_interval for i in range(1, self.n_layers)]
    encoder_out_channels_list = [i * self.channels_interval for i in range(1, self.n_layers + 1)]
    self.encoder = nn.ModuleList()
    for i in range(self.n_layers):
        self.encoder.append(
            DownSamplingLayer(
                channel_in=encoder_in_channels_list[i],
                channel_out=encoder_out_channels_list[i]
            )
        )

    self.middle = nn.Sequential(
        nn.Conv1d(self.n_layers * self.channels_interval, self.n_layers * self.channels_interval, 15, stride=1,
                  padding=7),
        nn.BatchNorm1d(self.n_layers * self.channels_interval),
        nn.LeakyReLU(negative_slope=0.1, inplace=True)
    )

    decoder_in_channels_list = [(2 * i + 1) * self.channels_interval for i in range(1, self.n_layers)] + [
        2 * self.n_layers * self.channels_interval]
    decoder_in_channels_list = decoder_in_channels_list[::-1]
    decoder_out_channels_list = encoder_out_channels_list[::-1]
    self.decoder = nn.ModuleList()
    for i in range(self.n_layers):
        self.decoder.append(
            UpSamplingLayer(
                channel_in=decoder_in_channels_list[i],
                channel_out=decoder_out_channels_list[i]
            )
        )

    self.out = nn.Sequential(
        nn.Conv1d(1 + self.channels_interval, 1, kernel_size=1, stride=1),
        nn.Tanh()
    )

def forward(self, input):
    tmp = []
    o = input

    # Up Sampling
    for i in range(self.n_layers):
        o = self.encoder[i](o)
        print(o.shape)
        
        tmp.append(o)
        # [batch_size, T // 2, channels]
        o = o[:, :, ::2]
    print(o.shape)

    o = self.middle(o)
    print(o.shape)

    # Down Sampling
    for i in range(self.n_layers):
        # [batch_size, T * 2, channels]
        print(o.shape)
        o = F.interpolate(o, scale_factor=2, mode="linear", align_corners=True)
        print(o.shape)
        # Skip Connection
        o = torch.cat([o, tmp[self.n_layers - i - 1]], dim=1)
        o = self.decoder[i](o)
        print(o.shape)

    o = torch.cat([o, input], dim=1)
    o = self.out(o)
    return o

ptrblck · July 9, 2021, 6:21am

These shape errors are often created for odd input shapes (or generally shapes, which weren’t used while developing the model). You could add custom shape checks and either slice the larger activation or pad the smaller one before concatenating them. Alternatively (and if possible) you could also use the expected shapes (often these kind of models work for the default shapes such as [batch_size, channels, 224, 224]).