I am facing instability when training my model in a continual-learning setting. I have 20 tasks, and after task 12 the accuracy drops to 0.0007, i.e. the model effectively stops learning. My hypothesis is that the problem is numerical instability, so I would like to train in higher precision, i.e. float64. To do so, I am changing my code as follows:
def update_model(self, x, y, criterion, optimizer):
    """Run one float64 optimisation step on a batch and report its statistics.

    The model and the input batch are both cast to float64 here, on every
    call, so the step works no matter which dtype the caller hands in.

    Args:
        x: Input batch; cast to float64 before the forward pass.
        y: Labels. They are compared against top-k class indices below, so
            they are treated as class indices and kept integral
            (NOTE(review): confirm no caller passes soft/one-hot labels).
        criterion: Loss callable. When ``self.bayesian`` is set it must return
            a dict with ``'total_loss'`` and ``'prediction'`` entries, where
            ``'prediction'`` is averaged over ``dim=2``; otherwise it must
            return the loss tensor directly.
        optimizer: Optimizer whose gradients are reset and which is stepped.

    Returns:
        Tuple ``(loss_value, num_correct, batch_size)`` where ``num_correct``
        counts matches between the top-``self.topk`` predictions and ``y``.
    """
    optimizer.zero_grad()

    # Cast once per call instead of once per branch; this is a no-op when the
    # parameters are already float64.
    self.model.double()
    # Always bring the input to the parameter dtype. Casting only inside the
    # cutmix branch left every other batch in float32 and triggered
    # "Input type (FloatTensor) and weight type (DoubleTensor) should be the same".
    x = x.double()
    # Class indices must not be floating point: index-based losses such as
    # CrossEntropyLoss reject float targets of this shape.
    if y.is_floating_point():
        y = y.long()

    do_cutmix = self.cutmix and np.random.rand(1) < 0.5
    if do_cutmix:
        x, labels_a, labels_b, lam = cutmix_data(x=x, y=y, alpha=1.0)
        # cutmix_data is defined elsewhere; re-cast in case it changes dtype.
        x = x.double()
        output = self.model(x)
        if self.bayesian:
            # criterion is the probabilistic loss class and returns a dict.
            losses_a = criterion(output, labels_a)
            losses_b = criterion(output, labels_b)
            loss = lam * losses_a['total_loss'] + (1 - lam) * losses_b['total_loss']
            # Reuse the first result instead of calling criterion a third time.
            logit = losses_a['prediction'].mean(dim=2)
        else:
            loss = lam * criterion(output, labels_a) + (1 - lam) * criterion(
                output, labels_b
            )
            logit = output
    else:
        output = self.model(x)
        if self.bayesian:
            losses_dict = criterion(output, y)
            loss = losses_dict['total_loss']
            # (batch_size, num_classes, samples) -> (batch_size, num_classes)
            logit = losses_dict['prediction'].mean(dim=2)
        else:
            loss = criterion(output, y)
            logit = output

    # Number of correct predictions per batch (also valid for the Bayesian model).
    _, preds = logit.topk(self.topk, 1, True, True)

    loss.backward()
    # TODO: is gradient clipping necessary? It was done in the mnvi code; it is
    # probably needed for the Bayesian case, so it is only applied there.
    if self.bayesian:
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.1, norm_type='inf')
    optimizer.step()

    return loss.item(), torch.sum(preds == y.unsqueeze(1)).item(), y.size(0)
def _train(
    self, train_loader, memory_loader, optimizer, criterion
):
    """Train for one pass over the given loader(s) and return averaged statistics.

    When both loaders are given, each stream batch is concatenated with a
    batch from the (cycled) memory loader; otherwise the single available
    loader is iterated on its own. Every batch is a dict with ``"image"`` and
    ``"label"`` entries.

    Args:
        train_loader: Iterable of stream batches, or ``None``.
        memory_loader: Iterable of memory batches, or ``None``.
        optimizer: Passed through to ``self.update_model``.
        criterion: Passed through to ``self.update_model``.

    Returns:
        Tuple ``(mean_loss_per_batch, accuracy)``; ``(0.0, 0.0)`` when no
        batch was produced.

    Raises:
        NotImplementedError: If both loaders are ``None``.
    """
    total_loss, correct, num_data = 0.0, 0.0, 0.0

    self.model.train()

    # Decide from the arguments whether batches arrive as (stream, memory)
    # pairs. Checking ``len(data) == 2`` misfires on a single dict batch that
    # has exactly two keys.
    paired = memory_loader is not None and train_loader is not None
    if paired:
        data_iterator = zip(train_loader, cycle(memory_loader))
    elif memory_loader is not None:
        data_iterator = memory_loader
    elif train_loader is not None:
        data_iterator = train_loader
    else:
        raise NotImplementedError("None of dataloder is valid")

    n_batches = 0
    for data in data_iterator:
        if paired:
            stream_data, mem_data = data
            x = torch.cat([stream_data["image"], mem_data["image"]])
            y = torch.cat([stream_data["label"], mem_data["label"]])
        else:
            x = data["image"]
            y = data["label"]

        # Inputs go to float64 for the high-precision run; labels keep their
        # own dtype because they are class indices, not real-valued data.
        x = x.to(self.device, dtype=torch.float64)
        y = y.to(self.device)

        # Equivalent to the step code in the test repo.
        l, c, d = self.update_model(x, y, criterion, optimizer)
        # Running sums, equivalent to MovingAverage in the test repo.
        total_loss += l
        correct += c
        num_data += d
        n_batches += 1

    # Counting batches in the loop matches len(loader) for sized loaders and
    # avoids a ZeroDivisionError when nothing was iterated.
    if n_batches == 0 or num_data == 0:
        return 0.0, 0.0

    return total_loss / n_batches, correct / num_data
However, I get this error:
outputs_mean = F.conv2d(
RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.cuda.DoubleTensor) should be the same
I did some debugging by printing the dtype of the inputs to the first layer, and I see that the batch at iteration 5 is actually float32!
Am I doing the casting correctly? I have already deactivated the CutMix augmentation and the error still persists, so that cannot be the cause.
I would appreciate your help with both the training stability and the casting to float64.