Hi. I have a large network and wanted to free up GPU memory by using HalfTensors instead of FloatTensors for the model and the input variables, so I convert both to half precision before loading them onto the GPU. On the first iteration the forward pass works fine, but after the first optimizer update all parameters and the loss become nan. It does not seem to be the loss function itself that causes this, since the loss on the first iteration is a finite number, not nan. Any idea what might be causing this? This is my training file:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.autograd as autograd
import torch.nn.functional as F
import torch.optim as optim
from I2I3D import i2i
import scipy.io
import numpy as np
datasize = 1
epochs = 1
imgsize = 32
lr = 0.00001
batchsize = 1 # datasize should be divisible by batchsize
dtype = torch.HalfTensor
''' Load network '''
print('Loading network model.')
net = i2i()
params = list(net.parameters())
print(params)
# INITIALIZE PARAMETERS TO ZERO
for ii in params:
    nn.init.constant(ii, 0)
net.half()
if torch.cuda.device_count() > 1:
    print("%d GPUs detected. Loading network model to GPUs." % torch.cuda.device_count())
    net = nn.DataParallel(net)
else:
    print("Parallel computing on multiple GPUs not available.")
if torch.cuda.is_available():
    net.cuda()
else:
    print("Cuda not available.")
print('Network model loaded.')
''' Training '''
print('Training model.')
imageSize = 128 # Cube image side length.
# Has to be divisible by 8 due to pooling
# IF USING REAL DATA
data = scipy.io.loadmat('/home/fredrik/Dropbox/data.mat')
im = np.array(data['image'])
gt = np.array(data['gt'])
x = torch.from_numpy(im)
x = x[:imageSize,:imageSize,:imageSize]
y = torch.from_numpy(gt)
y = y[:imageSize,:imageSize,:imageSize]
# IF USING RANDOM DATA
#x = torch.rand(imageSize,imageSize,imageSize)
#y = torch.rand(imageSize,imageSize,imageSize)
x = x.type(dtype)
x = Variable(x.unsqueeze(0).unsqueeze(0))
y = y.type(dtype)
y = Variable(y.unsqueeze(0).unsqueeze(0))
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=lr)
iterations = 1
for i in range(iterations):
    optimizer.zero_grad()
    if torch.cuda.is_available():
        x, y = x.cuda(), y.cuda()
    out = net(x)
    losses = list()
    for j in range(4):
        losses.append(criterion(out[j], y))
    loss = sum(losses)
    print(loss)
    loss.backward()
    optimizer.step()
    print('loss: %f, iter: %d/%d' % (loss.data[0], i+1, iterations))
print('Done.')
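
To narrow down where the first nan shows up, a minimal check like the one below could be dropped in right after loss.backward() (just a sketch; report_bad_grads is not part of my script, and it assumes the nan originates in the half-precision gradients or the optimizer update rather than in the forward pass):

# Sketch of a per-parameter gradient check (assumption: the nan appears in the
# fp16 gradients or in Adam's update, not in the forward pass).
def report_bad_grads(model):
    for name, param in model.named_parameters():
        if param.grad is None:
            continue
        g = param.grad.data.float()                 # cast to float32 before checking
        has_nan = bool((g != g).any())              # nan is the only value unequal to itself
        has_inf = bool((g.abs() == float('inf')).any())
        if has_nan or has_inf:
            print('non-finite gradient in %s (nan=%s, inf=%s)' % (name, has_nan, has_inf))

# usage inside the training loop:
#     loss.backward()
#     report_bad_grads(net)
#     optimizer.step()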