Hello, I am trying to train a speech enhancement CNN based on Wave-U-Net but I get this error
invalid argument 0: Sizes of tensors must match except in dimension 1. Got 29 and 30 in dimension 2 at /pytorch/aten/src/THC/generic/THCTensorMath.cu:71
class DownSamplingLayer(nn.Module):
def __init__(self, channel_in, channel_out, dilation=1, kernel_size=15, stride=1, padding=7):
super(DownSamplingLayer, self).__init__()
self.main = nn.Sequential(
nn.Conv1d(channel_in, channel_out, kernel_size=kernel_size,
stride=stride, padding=padding, dilation=dilation),
nn.BatchNorm1d(channel_out),
nn.LeakyReLU(negative_slope=0.1)
)
def forward(self, ipt):
return self.main(ipt)
class UpSamplingLayer(nn.Module):
def __init__(self, channel_in, channel_out, kernel_size=5, stride=1, padding=2):
super(UpSamplingLayer, self).__init__()
self.main = nn.Sequential(
nn.Conv1d(channel_in, channel_out, kernel_size=kernel_size,
stride=stride, padding=padding),
nn.BatchNorm1d(channel_out),
nn.LeakyReLU(negative_slope=0.1, inplace=True),
)
def forward(self, ipt):
return self.main(ipt)
class SE_Model(nn.Module):
def __init__(self, n_layers=12, channels_interval=24):
super(SEModel, self).__init__()
self.n_layers = n_layers
self.channels_interval = channels_interval
encoder_in_channels_list = [1] + [i * self.channels_interval for i in range(1, self.n_layers)]
encoder_out_channels_list = [i * self.channels_interval for i in range(1, self.n_layers + 1)]
self.encoder = nn.ModuleList()
for i in range(self.n_layers):
self.encoder.append(
DownSamplingLayer(
channel_in=encoder_in_channels_list[i],
channel_out=encoder_out_channels_list[i]
)
)
self.middle = nn.Sequential(
nn.Conv1d(self.n_layers * self.channels_interval, self.n_layers * self.channels_interval, 15, stride=1,
padding=7),
nn.BatchNorm1d(self.n_layers * self.channels_interval),
nn.LeakyReLU(negative_slope=0.1, inplace=True)
)
decoder_in_channels_list = [(2 * i + 1) * self.channels_interval for i in range(1, self.n_layers)] + [
2 * self.n_layers * self.channels_interval]
decoder_in_channels_list = decoder_in_channels_list[::-1]
decoder_out_channels_list = encoder_out_channels_list[::-1]
self.decoder = nn.ModuleList()
for i in range(self.n_layers):
self.decoder.append(
UpSamplingLayer(
channel_in=decoder_in_channels_list[i],
channel_out=decoder_out_channels_list[i]
)
)
self.out = nn.Sequential(
nn.Conv1d(1 + self.channels_interval, 1, kernel_size=1, stride=1),
nn.Tanh()
)
def forward(self, input):
tmp = []
o = input
# Up Sampling
for i in range(self.n_layers):
o = self.encoder[i](o)
print(o.shape)
tmp.append(o)
# [batch_size, T // 2, channels]
o = o[:, :, ::2]
print(o.shape)
o = self.middle(o)
print(o.shape)
# Down Sampling
for i in range(self.n_layers):
# [batch_size, T * 2, channels]
print(o.shape)
o = F.interpolate(o, scale_factor=2, mode="linear", align_corners=True)
print(o.shape)
# Skip Connection
o = torch.cat([o, tmp[self.n_layers - i - 1]], dim=1)
o = self.decoder[i](o)
print(o.shape)
o = torch.cat([o, input], dim=1)
o = self.out(o)
return o