I set up a traing data loader and testing data loader to load my data. Fairly strange thing is that, after first iteration, an out of memory error was raise in the second one. Is there something cashed I forgot to free?
optimizer=optim.SGD(net.parameters(), lr=0.01)
criterion=nn.MSELoss()
for i_batch, sample in enumerate(dataLoader):
print('read the data')
input,target=sample['tr'].type(torch.FloatTensor), sample['gt'].type(torch.FloatTensor)
if torch.cuda.is_available():
input, target=input.unsqueeze(1).cuda(), target.unsqueeze(1).cuda()
else:
input, target=input.unsqueeze(1), target.unsqueeze(1)
input, target=Variable(input), Variable(target)
#feed data into the net
optimizer.zero_grad()
print('put the data into net')
output=net(input)
#define loss function
loss = criterion(output, target)
loss = loss*10
print('back propagate')
f.write('%f\n'%loss)
loss.backward()
optimizer.step()
torch.cuda.empty_cache()
if i_batch%50==0:
#test phase
test_mse=0
t_i=1
for t_batch, t_sample in enumerate(testLoader):
print(t_batch)
t_input, t_target=t_sample['tr'].type(torch.FloatTensor), t_sample['gt'].type(torch.FloatTensor)
if torch.cuda.is_available():
t_input, t_target=t_input.unsqueeze(1).cuda(), t_target.unsqueeze(1).cuda()
else:
t_input, t_target=t_input.unsqueeze(1), t_target.unsqueeze(1)
t_input, t_target=Variable(t_input), Variable(t_target)
optimizer.zero_grad()
t_output=net(t_input)
t_loss=10*criterion(t_output, t_target)
test_mse=test_mse+t_loss
del t_input
del t_target
if t_i==t_batch:
break
#print(torch.cuda.max_memory_allocated())
torch.cuda.empty_cache()
#print(torch.cuda.max_memory_allocated())
test_mse=test_mse/(t_i+1)
#I opened a file externally to write some log
f.write('%f\n'%test_mse)
print('iter %d, training loss: mse %.4f, testing loss: %.4f' %(i_batch, loss, test_mse))
id_test=random.randint(1200, 1399)
testsample=dataSet.__getitem__(id_test)
test_input=Variable(testsample['tr'].unsqueeze(0).unsqueeze(1).type(torch.FloatTensor).cuda())
test_output=net(test_input)
optimizer.zero_grad()
save_result_path='/Deconv/testresult/test%d'%int(time.time())
save_blur_path='/Deconv/blurresult/blur%d'%int(time.time())
np.save(save_result_path,test_output.squeeze().cpu().data.numpy())
np.save(save_blur_path,test_input.squeeze().cpu().data.numpy())
print('save test sample id %d blur to %s, result to path %s'%(id_test, save_blur_path, save_result_path))
torch.cuda.empty_cache()
this is my net class:
class Net(nn.Module):
# should be indent
def __init__(self):
super(Net, self).__init__()
# 1 input image channel, 6 output channels, 5x5 square convolution
# kernel
self.conv1 = nn.Conv3d(1, 16, 5, padding=2)
self.conv2 = nn.Conv3d(16, 32, 5, padding=2)
self.conv3 = nn.Conv3d(32, 32, 5, padding=2)
self.conv4 = nn.Conv3d(32, 32, 5, padding=2)
self.conv5 = nn.Conv3d(32, 32, 5, padding=2)
self.conv6 = nn.Conv3d(32, 32, 5, padding=2)
self.conv7 = nn.Conv3d(32, 16, 5, padding=2)
self.conv8 = nn.Conv3d(16, 1, 5, padding=2)
def forward(self, x):
x1 = F.relu(self.conv1(x))
x2 = F.relu(self.conv2(x1))
x3 = F.relu(self.conv3(x2))
x4 = F.relu(self.conv4(x3))
x4 = x4 + x3
x5 = F.relu(self.conv5(x4))
x5 = x5 + x4
x6 = F.relu(self.conv6(x5))
x6 = x6 + x5
x7 = F.relu(self.conv7(x6))
x8 = F.relu(self.conv8(x7))
return x8
def num_flat_features(self, x):
size = x.size()[1:] # all dimensions except the batch dimension
num_features = 1
for s in size:
num_features *= s
return num_features
my input is 101*101*101
loading dataset
start training…
start epoch 0
read the data
put the data into net
back propagateread the data
put the data into net
THCudaCheck FAIL file=/pytorch/torch/lib/THC/generic/THCStorage.cu line=58 error=2 : out of memory
Traceback (most recent call last):
File “deconv.py”, line 163, in
output=net(input)
File “/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py”, line 325, in call
result = self.forward(*input, **kwargs)
File “deconv.py”, line 97, in forward
x6 = x6 + x5
RuntimeError: cuda runtime error (2) : out of memory at /pytorch/torch/lib/THC/generic/THCStorage.cu:58