Hallo,
I tried to train a combination model of RestNet and LSTM on several GPUs, but the loss and the weights did not change. I got confused and have no idea why. May some have an idea.
class ResNet(nn.Module):
def __init__(self):
super(ResNet, self).__init__()
resnet = load_pretrainednet()
modules = list(resnet.children())[:-1]
self.resnet = nn.Sequential(*modules)
def forward(self, x):
x1 = self.resnet(x)
x1 = x1.view(x1.size(0), -1)
print("Outside: input size", x.size(), "outputs_size", x1.size())
return x1
class Combine(nn.Module):
def __init__(self):
super(Combine, self).__init__()
self.cnn = ResNet()
self.rnn = nn.LSTM(input_size=2048, hidden_size=21, num_layers=1, batch_first=True)
self.linear = nn.Linear(21,21)
def forward(self, x):
batch_size, C, H, W = x.size()
c_in = x.view(batch_size, C, H, W)
c_out = self.cnn(c_in)
r_in = c_out.view(batch_size, 1, -1)
self.rnn.flatten_parameters()
r_out, (h_n, h_c) = self.rnn(r_in)
r_out2 = self.linear(r_out[:, -1, :])
return r_out2
def train_net(net, data_loader, num_images):
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(net.parameters(), lr=0.000001)
running_loss = 0.0
running_corrects = 0.0
net.train() # Set model to training mode
run_count = 0
current_images = 0
name, old_lstm_weight = list(net.module.rnn.named_parameters())[0]
old_linear_weight = net.module.linear.weight
for i, (inputs, labels, masks) in enumerate(data_loader, 1):
print (run_count)
torch.cuda.empty_cache()
gc.collect()
#input = input.unsqueeze()
inputs = torch.cat((inputs),0)
labels = torch.cat((labels),0)
inputs = inputs.to(device)
labels = labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward
# track history if only in train
print ('Current input length in total: {}'.format(inputs.size()[0]))
if torch.cuda.is_available():
print ('Current inputsize per GPU: {}'.format(np.ceil(inputs.size()[0]/torch.cuda.device_count())))
preds = net(inputs)
for elem in nvsmi.get_gpu_processes():
print(elem)
labels[labels >= 0.5] = 1.0
labels[labels < 0.5] = 0.0
loss = criterion(preds, labels)
loss.backward()
for elm in net.module.parameters():
print(elm[0][0])
break
optimizer.step()
for elm in net.module.parameters():
print(elm[0][0])
break
preds = torch.sigmoid(preds) # torch.Size([N, C]) e.g. tensor([[0., 0.5, 0.]])
preds[preds >= 0.5] = 1.0
preds[preds < 0.5] = 0.0
accuracy = (preds == labels).sum()/(labels.size()[0] * labels.size()[1] *100.0)
# zero the parameter gradients
optimizer.zero_grad()
# statistics
running_loss += loss.item() * inputs.size(0)
running_corrects += accuracy
del preds
torch.cuda.empty_cache()
gc.collect()
num_images += inputs.size(0)
current_images += inputs.size(0)
if run_count % 10 == 9: # every 1000 mini-batches...
# ...log the running loss
writer.add_scalar('Training/NEW_Runs_Loss', running_loss/current_images, num_images)
writer.add_scalar('Training/NEW_Runs_Acuracy', running_corrects.double()/current_images, num_images)
run_count += 1
#print ('input size: ', len(inputs))
#print ('label size: ', len(labels))
#plot_classes_preds(net, inputs, labels, mean, std)
#print(list(net.module.rnn)[0])
name, lstm_weight = list(net.module.rnn.named_parameters())[0]
linear_weight = net.module.linear.weight
if torch.equal(lstm_weight, old_lstm_weight):
print(colored("LSTM weight didn't changed", 'red'))
else:
print(colored("LSTM weight changed", 'green'))
if torch.equal(linear_weight, old_linear_weight):
print(colored("Linear weight didn't changed", 'red'))
else:
print(colored("Linear weight changed", 'green'))
old_lstm_weight = lstm_weight
old_linear_weight = linear_weight
#print(net.module.linear.weight)
return net, running_loss, running_corrects, optimizer, num_images
Print output::
Current input length in total: 90
Current inputsize per GPU: 45.0
Outside: input size torch.Size([45, 3, 224, 224]) outputs_size torch.Size([45, 2048])
Outside: input size torch.Size([45, 3, 224, 224]) outputs_size torch.Size([45, 2048])
pid: 3375727 | gpu_id: 0 | gpu_uuid: GPU-96de9d91-de41-4be2-6c12-280909e98722 | gpu_name: Tesla V100-SXM2-32GB | used_memory: 9689.0MB
pid: 3375727 | gpu_id: 1 | gpu_uuid: GPU-bbcd5e54-bdfe-7c4c-3d6b-8312ba354811 | gpu_name: Tesla V100-SXM2-32GB | used_memory: 8939.0MB
tensor([[-0.0124, -0.0049, -0.0047, -0.0125, 0.0765, -0.0013, -0.0930],
[-0.0035, -0.0379, -0.0086, 0.1207, 0.1172, 0.2363, 0.0651],
[ 0.0040, 0.0597, 0.0610, 0.0591, 0.0746, 0.1351, 0.1906],
[ 0.1521, -0.0442, -0.1501, -0.2492, -0.2439, -0.1416, 0.1227],
[ 0.0078, 0.0360, -0.0127, -0.2912, -0.3637, -0.2218, 0.0186],
[ 0.0095, 0.0808, 0.2047, 0.1493, 0.0226, -0.0785, -0.0541],
[-0.0052, 0.0481, 0.1400, 0.3045, 0.2305, 0.0612, 0.1152]],
device='cuda:0', dtype=torch.float64, grad_fn=<SelectBackward>)
tensor([[-0.0124, -0.0049, -0.0047, -0.0125, 0.0765, -0.0013, -0.0930],
[-0.0035, -0.0379, -0.0086, 0.1207, 0.1172, 0.2363, 0.0651],
[ 0.0040, 0.0597, 0.0610, 0.0591, 0.0746, 0.1351, 0.1906],
[ 0.1521, -0.0442, -0.1501, -0.2492, -0.2439, -0.1416, 0.1227],
[ 0.0078, 0.0360, -0.0127, -0.2912, -0.3637, -0.2218, 0.0186],
[ 0.0095, 0.0808, 0.2047, 0.1493, 0.0226, -0.0785, -0.0541],
[-0.0052, 0.0481, 0.1400, 0.3045, 0.2305, 0.0612, 0.1152]],
device='cuda:0', dtype=torch.float64, grad_fn=<SelectBackward>)
LSTM weight didn't changed
Linear weight didn't changed
Thanks