Hello,
I’ve been struggling for a few weeks to train a fairly complex model on different computers.
The training runs smoothly on my local machine, with nice loss and accuracy curves, but on the more powerful multi-GPU machine the training is very poor: the accuracy, even on the training dataset, doesn’t rise beyond a certain point, and the accuracy on the validation dataset is all over the place compared to the results from my local machine.
I installed the same clean conda environment on both machines. The only relevant differences I could find between the machines are the NVIDIA driver and CUDA versions.
On the local machine: Cuda compilation tools, release 10.1, V10.1.105
Driver Version: 418.40.04
On the multi-GPU machine: Cuda compilation tools, release 9.2, V9.2.148
Driver Version: 396.54
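(The toolkit and driver versions above come from the system tools; what PyTorch itself was built against and is using at runtime can be double-checked in both environments with something like the snippet below, a minimal sketch:)

import torch

print(torch.__version__)                # PyTorch build
print(torch.version.cuda)               # CUDA version the binary was built with
print(torch.backends.cudnn.version())   # cuDNN version in use
print(torch.cuda.get_device_name(0))    # GPU model
print(torch.cuda.device_count())        # number of visible GPUs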
After very extensive and exhaustive testing I pinned the problem down to one of the models training differently (the other models trained the same on both machines; only this model diverged, with the same code, same data, same random seeds, etc.).
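(As an aside, I understand the manual seeds in the script below fix the RNG state but not cuDNN’s convolution algorithm selection; if bit-exact runs were required, the cuDNN flags would presumably also need pinning. A minimal sketch, not something I have verified resolves the difference:)

import torch

# Pin cuDNN behaviour in addition to the RNG seeds (not in the test script below)
torch.backends.cudnn.deterministic = True   # prefer deterministic kernels where available
torch.backends.cudnn.benchmark = False      # disable autotuning, which can pick different algorithms per machine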
I created a simple script to isolate and reproduce the problem; it is posted below.
The outputs I get on the different machines with the exact same code and seeds:
Local Machine output
$ python Test_different_training.py
Results of the forward pass on the first batch is same on both machines:
Same input: tensor([[0.6349, 0.0771, 0.4478],
        [0.0277, 0.4497, 0.6643],
        [0.4654, 0.3515, 0.3045],
        [0.1548, 0.5315, 0.2011],
        [0.5183, 0.2718, 0.5145]], device='cuda:0')
Same output tensor([[-0.0634, -0.0145],
        [-0.0023, 0.0131],
        [-0.0695, -0.0140],
        [-0.0643, -0.0045],
        [-0.0902, -0.0123]], device='cuda:0', grad_fn=)

Results of the forward pass after 10 batches is different:
Same input: tensor([[0.9786, 0.8589, 0.1811],
        [0.3121, 0.1688, 0.7962],
        [0.5744, 0.4271, 0.6725],
        [0.3887, 0.4706, 0.2278],
        [0.2610, 0.0231, 0.3505]], device='cuda:0')
Different output tensor([[-0.0148, 0.0174],
        [-0.1467, 0.1211],
        [-0.3497, 0.3899],
        [-0.0969, 0.0811],
        [ 0.0293, -0.0280]], device='cuda:0', grad_fn=)
Multi-GPU Machine output
$ python Test_different_training.py
Results of the forward pass on the first batch is same on both machines:
Same input: tensor([[0.6349, 0.0771, 0.4478],
        [0.0277, 0.4497, 0.6643],
        [0.4654, 0.3515, 0.3045],
        [0.1548, 0.5315, 0.2011],
        [0.5183, 0.2718, 0.5145]], device='cuda:0')
Same output tensor([[-0.0634, -0.0145],
        [-0.0023, 0.0131],
        [-0.0695, -0.0140],
        [-0.0643, -0.0045],
        [-0.0902, -0.0123]], device='cuda:0', grad_fn=)

Results of the forward pass after 10 batches is different:
Same input: tensor([[0.9786, 0.8589, 0.1811],
        [0.3121, 0.1688, 0.7962],
        [0.5744, 0.4271, 0.6725],
        [0.3887, 0.4706, 0.2278],
        [0.2610, 0.0231, 0.3505]], device='cuda:0')
Different output tensor([[-1.2169, 1.2565],
        [-0.1790, 0.1972],
        [-0.7532, 0.7486],
        [-0.1011, 0.1140],
        [ 0.0452, -0.0452]], device='cuda:0', grad_fn=)
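(To compare the two runs more rigorously than eyeballing the printed tensors, the final outputs could be dumped to disk on each machine and diffed offline. A minimal sketch, not part of the test script; the file names are just placeholders and the saved file would have to be copied from one machine to the other:)

import torch

# On each machine, after running the script:
#   torch.save(output.detach().cpu(), "output_local.pt")    # or "output_multigpu.pt"
# Then, with both files on one machine:
a = torch.load("output_local.pt")
b = torch.load("output_multigpu.pt")
print((a - b).abs().max())               # largest element-wise difference
print(torch.allclose(a, b, atol=1e-6))   # True only if the runs match closely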
Code to test:
The input to the model is structured in a weird way, but that’s because it’s only one part of the original architecture; maybe that strange structure and the manipulation I perform on the input cause the difference? I hope not. (A small illustration of just that reshaping follows the script.)
import numpy as np
import torch
import torch.nn as nn


class RNNModel_classifier(nn.Module):
    def __init__(self, nClasses=2):
        super(RNNModel_classifier, self).__init__()
        # "_r" branch: sees the reshaped hidden state as [batch, 4096, 32]
        self.conv1_r = nn.Conv1d(in_channels=4096, out_channels=1024, kernel_size=5, padding=1, stride=1)
        self.batchnorm1_r = nn.BatchNorm1d(1024)
        self.conv2_r = nn.Conv1d(in_channels=1024, out_channels=256, kernel_size=7, padding=0, stride=1)
        self.batchnorm2_r = nn.BatchNorm1d(256)
        self.conv3_r = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=9, padding=0, stride=2)
        self.batchnorm3_r = nn.BatchNorm1d(128)
        self.conv4_r = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=7, padding=0, stride=1)
        self.batchnorm4_r = nn.BatchNorm1d(64)
        self.conv5_r = nn.Conv1d(in_channels=64, out_channels=64, kernel_size=2, padding=0, stride=1)
        self.batchnorm5_r = nn.BatchNorm1d(64)
        # "_s" branch: sees the same hidden state transposed, as [batch, 32, 4096]
        self.conv1_s = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=3, padding=0, stride=1)
        self.batchnorm1_s = nn.BatchNorm1d(32)
        self.conv2_s = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=7, padding=0, stride=2)
        self.batchnorm2_s = nn.BatchNorm1d(64)
        self.conv3_s = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=9, padding=0, stride=4)
        self.batchnorm3_s = nn.BatchNorm1d(128)
        self.conv4_s = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=17, padding=0, stride=8)
        self.batchnorm4_s = nn.BatchNorm1d(128)
        self.conv5_s = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=15, padding=0, stride=4)
        self.batchnorm5_s = nn.BatchNorm1d(64)
        self.conv6_s = nn.Conv1d(in_channels=64, out_channels=64, kernel_size=12, padding=0, stride=1)
        self.batchnorm6_s = nn.BatchNorm1d(64)
        # classifier head combining the two branches
        self.classifier2 = nn.Bilinear(64, 64, nClasses * 32)
        self.batchnorm2 = nn.BatchNorm1d(nClasses * 32)
        self.classifier3 = nn.Linear(nClasses * 32, nClasses * 32)
        self.classifier4 = nn.Linear(nClasses * 32, nClasses)
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.classifier2.bias.data.fill_(0)
        self.classifier2.weight.data.uniform_(-initrange, initrange)
        self.classifier3.bias.data.fill_(0)
        self.classifier3.weight.data.uniform_(-initrange, initrange)
        self.classifier4.bias.data.fill_(0)
        self.classifier4.weight.data.uniform_(-initrange, initrange)

    def forward(self, hidden, batch_size_of_sample=1):
        # hidden[-1]: [1, batch*batch_size_of_sample, 32] viewed as [batch, batch_size_of_sample, 32] for the "_r" branch
        res_r = torch.relu(self.batchnorm1_r(self.conv1_r(hidden[-1].transpose(0, 1).contiguous().view(
            int(hidden[-1].transpose(0, 1).size(0) / batch_size_of_sample), -1, 32))))
        res_r = torch.relu(self.batchnorm2_r(self.conv2_r(res_r)))
        res_r = torch.relu(self.batchnorm3_r(self.conv3_r(res_r)))
        res_r = torch.relu(self.batchnorm4_r(self.conv4_r(res_r)))
        res_r = torch.relu(self.batchnorm5_r(self.conv5_r(res_r)).squeeze(2))
        # same view, then transposed to [batch, 32, batch_size_of_sample] for the "_s" branch
        res_s = torch.relu(self.batchnorm1_s(self.conv1_s(hidden[-1].transpose(0, 1).contiguous().view(
            int(hidden[-1].transpose(0, 1).size(0) / batch_size_of_sample), -1, 32).transpose(1, 2).contiguous())))
        res_s = torch.relu(self.batchnorm2_s(self.conv2_s(res_s)))
        res_s = torch.relu(self.batchnorm3_s(self.conv3_s(res_s)))
        res_s = torch.relu(self.batchnorm4_s(self.conv4_s(res_s)))
        res_s = torch.relu(self.batchnorm5_s(self.conv5_s(res_s)))
        res_s = torch.relu(self.batchnorm6_s(self.conv6_s(res_s)).squeeze(2))
        result = self.classifier2(res_r, res_s)
        result = self.classifier3(torch.relu(self.batchnorm2(result)))
        result = self.classifier4(torch.relu(result))
        return result


np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)

batch_number = 10
model_classifier = RNNModel_classifier().cuda()
criterion = nn.CrossEntropyLoss().cuda()
params = model_classifier.parameters()
optimizer = torch.optim.Adam(params, lr=0.1, weight_decay=1.2e-6)

for i in range(10):
    optimizer.zero_grad()
    hidden_size = [1, 4096 * batch_number, 32]
    hidden = [torch.rand(hidden_size).cuda()]
    targets = torch.randint(0, 2, [batch_number]).cuda()
    output = model_classifier(hidden, batch_size_of_sample=4096)
    if i == 0:
        print("Results of the forward pass on the first batch is same on both machines:")
        print("Same input: ", hidden[-1][0][0:5, 0:3])
        print("Same output", output[0:5])
    loss = criterion(output, targets)
    loss.backward()
    optimizer.step()

output = model_classifier(hidden, batch_size_of_sample=4096)
print()
print("Results of the forward pass after 10 batches is different:")
print("Same input: ", hidden[-1][0][0:5, 0:3])
print("Different output", output[0:5])