Hello everybody,
I have a problem in training a network composed of both convolutional and recurrent layers. Specifically, the training goes as expected (loss decreasing for both training and validation, around 10^-1) and the accuracy follows.
However, after a seemingly random number of epochs (different at each run) the loss suddenly explodes to very high values and the accuracy goes to the equivalent of random. From there, the training does not recover at all.
This is the code of my net:
class Net(nn.Module):
    """Convolutional + recurrent classifier split across several devices.

    Three Conv2d/BatchNorm2d/ReLU/MaxPool2d stages feed a single-layer GRU
    whose flattened output is mapped to 10 classes by a linear layer. Each
    stage lives on its own device and activations are moved between devices
    inside ``forward`` (model parallelism).

    The layer sizes fix the geometry of the data: the GRU input size of 1280
    equals 256 channels * 5 remaining rows, and the linear input size of
    89088 equals 348 time steps * 256 hidden units.
    # NOTE(review): this presumably corresponds to inputs of shape
    # (batch, 1, 500, 696) -- confirm against the dataset.

    Args:
        devices: Four entries giving the placement of ``layer1``, ``layer2``,
            ``layer3`` and the recurrent/linear head, in that order. An int
            is a CUDA device index; ``None`` keeps that stage on the CPU.
            The default reproduces the original 4-GPU layout.

    Raises:
        ValueError: If ``devices`` does not contain exactly four entries.
    """

    def __init__(self, devices=(0, 1, 2, 3)):
        super(Net, self).__init__()
        devices = tuple(devices)
        if len(devices) != 4:
            raise ValueError("devices must contain exactly four entries")
        self.devices = devices

        self.layer1 = self._place(
            nn.Sequential(
                nn.Conv2d(1, 128, kernel_size=(5, 5), padding=2),
                nn.BatchNorm2d(128),
                nn.ReLU(),
                nn.MaxPool2d((5, 1)),
            ),
            devices[0],
        )
        self.layer2 = self._place(
            nn.Sequential(
                nn.Conv2d(128, 256, kernel_size=(5, 5), padding=2),
                nn.BatchNorm2d(256),
                nn.ReLU(),
                nn.MaxPool2d((5, 2)),
            ),
            devices[1],
        )
        self.layer3 = self._place(
            nn.Sequential(
                nn.Conv2d(256, 256, kernel_size=(5, 5), padding=2),
                nn.BatchNorm2d(256),
                nn.ReLU(),
                nn.MaxPool2d((4, 1)),
            ),
            devices[2],
        )
        # forward() feeds the GRU a (batch, steps, features) tensor, so it
        # must be batch-first. With the default batch_first=False the GRU
        # would treat dim 0 (the batch) as the time axis and run the
        # recurrence across different samples. The nn.Sequential wrapper is
        # kept so that parameter names in the state dict do not change.
        self.recurrent_layer1 = self._place(
            nn.Sequential(
                nn.GRU(1280, 256, batch_first=True),
            ),
            devices[3],
        )
        self.fc1 = self._place(
            nn.Sequential(
                nn.Linear(89088, 10),
            ),
            devices[3],
        )

    @staticmethod
    def _place(obj, device):
        """Move a module or tensor to CUDA device ``device`` (CPU if None)."""
        if device is None:
            return obj.cpu()
        return obj.cuda(device)

    def forward(self, x):
        """Return per-class probabilities of shape (batch, 10).

        Dropout (default probability) is applied after every convolutional
        stage and is active only while the module is in training mode.
        """
        x = self._place(x, self.devices[0])
        x = F.dropout(self.layer1(x), training=self.training)

        x = self._place(x, self.devices[1])
        x = F.dropout(self.layer2(x), training=self.training)

        x = self._place(x, self.devices[2])
        x = F.dropout(self.layer3(x), training=self.training)

        # (N, C, H, W) -> (N, W, H, C) -> (N, W, H * C): every column of the
        # feature map becomes one time step for the GRU.
        x = torch.transpose(x, 1, 3).contiguous()
        x = x.view(x.size(0), x.size(1), x.size(2) * x.size(3))

        x = self._place(x, self.devices[3])
        # The GRU returns (output, h_n); only the per-step outputs are used.
        steps = self.recurrent_layer1(x)[0]
        flat = steps.contiguous().view(steps.size(0), -1)
        logits = self.fc1(flat)
        # Explicit class dimension; the implicit form is deprecated.
        return F.softmax(logits, dim=1)
# Build the network; Net places its own sub-modules on the GPUs.
model = Net()
# Adam over every trainable parameter of the network, learning rate 1e-3.
optimizer = Adam(model.parameters(), lr=1e-3)
And this is the training function:
def train(epoch):
model.train()
val_loss = 0
correct_pred = 0
for batch_idx, sample in enumerate(train_dataloader):
data = sample['data']
label = sample['label']
data, label = data.cuda(0, async=True), label.cuda(3, async=True)
data = Variable(data)
label = Variable(label)
optimizer.zero_grad()
output = model(data)
loss = F.binary_cross_entropy(output, label)
loss.backward()
optimizer.step()
if batch_idx % 10 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_idx),
100. * batch_idx / len(train_dataloader), loss.data[0]))
for batch_idx, sample in enumerate(validation_dataloader):
model.eval()
data = sample['data']
label = sample['label']
data, label = data.cuda(async=True), label.cuda(3, async=True)
data = Variable(data, volatile=True)
label = Variable(label)
output = model(data)
val_loss += F.binary_cross_entropy(output, label)
prediction = output.data.max(1, keepdim=True)[1] # get the index of the max probability
labels_non_oh = label.data.max(1, keepdim=True)[1]
correct_pred += prediction.eq(labels_non_oh).cpu().sum()
As you can see it is mostly code adapted from the examples.
I’m using PyTorch version 0.3.0 and CUDA version 7.
Is this a bug or an error of mine? I’m still new to pytorch, any input would be appreciated.