I modified the WaveNet architecture for binary classification:
```python
import torch
import torch.nn as nn


class CustomConv(nn.Module):
    def __init__(self, num_blocks):
        super(CustomConv, self).__init__()
        # lift the single input channel up to 256 channels
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=256, kernel_size=3, padding=1)
        # stack of dilated residual blocks (ConvBlock is my own WaveNet-style block)
        self.blocks = self.build_conv_block(num_blocks, 256)
        # 1x1 convs to reduce the channel count down to a single output channel
        self.conv2 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=1)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=1, kernel_size=1)
        self.act = nn.ReLU()
        # collapse the length-2000 sequence to one logit, then squash to [0, 1]
        self.linear = nn.Linear(2000, 1)
        self.sigmoid = nn.Sigmoid()

    def build_conv_block(self, num_layers, num_channels):
        block = []
        for _ in range(num_layers):
            # dilations 1, 2, 4, ..., 2048, repeated num_layers times
            for i in range(12):
                block.append(ConvBlock(num_channels, 2 ** i))
        return nn.Sequential(*block)

    def forward(self, x):
        x = self.conv1(x)
        x = self.act(x)
        # the ConvBlocks are chained on tuples (hence the (x, 0) input); keep the second element
        _, x = self.blocks((x, 0))
        x = self.act(x)
        x = self.conv2(x)
        x = self.act(x)
        x = self.conv3(x)
        x = torch.squeeze(x)        # intended: (batch, 1, 2000) -> (batch, 2000)
        x = self.linear(x)
        x = self.sigmoid(x)
        return x
```
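For completeness, this is roughly how I instantiate and call it. ConvBlock is my own WaveNet-style residual block and isn't shown above, so the one below is just a stripped-down placeholder to keep the snippet self-contained; the num_blocks value and batch size are arbitrary, and the input shape (batch, 1, 2000) is what the in_channels=1 and Linear(2000, 1) assume.

```python
class ConvBlock(nn.Module):
    """Stripped-down placeholder for my real dilated residual block."""
    def __init__(self, num_channels, dilation):
        super().__init__()
        self.conv = nn.Conv1d(num_channels, num_channels, kernel_size=3,
                              padding=dilation, dilation=dilation)

    def forward(self, inputs):
        x, skip = inputs                   # blocks are chained on (x, skip) tuples
        out = torch.relu(self.conv(x))
        return out + x, skip + out         # residual path, accumulated skip path


model = CustomConv(num_blocks=2)           # arbitrary num_blocks, just for illustration
x = torch.randn(16, 1, 2000)               # (batch, in_channels=1, sequence_length=2000)
out = model(x)
print(out.shape)                           # torch.Size([16, 1])
```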
My model hasn't been training very well, and I was wondering whether torch.squeeze() could have any negative effect on the gradients produced during backprop, or on training in general. Is there a better practice for something like this? Any ideas?
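In case it clarifies what I mean, here is a minimal sketch of the shape behaviour I suspect could be part of the problem, assuming inputs of shape (batch, 1, 2000) as above:

```python
x = torch.randn(8, 1, 2000)       # conv3 output shape: (batch, 1, seq_len)
print(torch.squeeze(x).shape)     # torch.Size([8, 2000]) -- what I intended

x = torch.randn(1, 1, 2000)       # same thing with a batch of size 1
print(torch.squeeze(x).shape)     # torch.Size([2000]) -- the batch dim is squeezed away too,
                                  # so self.linear outputs shape (1,) instead of (1, 1)

print(x.squeeze(1).shape)         # torch.Size([1, 2000]) -- only the channel dim removed
```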