Transfers to the GPU seem to be slow, and I'm wondering whether I can do anything with the snippet below to speed things up.
Gpu.IDS is just an array of GPU indices, e.g. IDS = [4, 5, 6]
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Run on the first configured GPU, falling back to CPU.
device = torch.device('cuda:' + str(Gpu.IDS[0]) if torch.cuda.is_available() else 'cpu')
print('running on', device)
model = model.to(device)

if torch.cuda.device_count() > 1:
    print("Let's use multiple GPUs:", len(Gpu.IDS), "out of", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model, device_ids=Gpu.IDS)

tokens = train
dataset = NuscenesDataset(tokens, helper)
dataloader = DataLoader(dataset, batch_size=16 * len(Gpu.IDS), num_workers=4 * len(Gpu.IDS))

n_iter = 0
minimum_loss = 0
loss_function = MTPLoss(NUM_MODES, 1, 5)
current_loss = 10000
model_dir = make_model_dir()

learning_rates = [0.1]
for lr in learning_rates:
    # optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    print('--------- LEARNING RATE', lr, '--------------')
    for epoch in range(1, 10001):
        # print below - only for large dataset
        # print('-> epoch:', epoch)
        for img, agent_state_vector, ground_truth, _, _ in dataloader:
            # imshow(torchvision.utils.make_grid(img))
            img = img.to(device)
            agent_state_vector = agent_state_vector.to(device)
            # Move and cast in one call rather than two separate .to() copies.
            ground_truth = ground_truth.to(device, dtype=torch.float32)

            optimizer.zero_grad()
            prediction = model(img, agent_state_vector)
            # print('prediction.shape', prediction.shape, 'ground_truth.shape', ground_truth.shape)
            loss = loss_function(prediction, ground_truth)
            loss.backward()
            optimizer.step()

            current_loss = loss.detach().cpu().numpy()
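
Two things I've been looking at but haven't benchmarked properly yet. First, timing the copies correctly: CUDA work is queued asynchronously, so without a synchronize the numbers can be misleading. A minimal sketch of how I'd measure one host-to-device copy (torch.cuda.synchronize and time.perf_counter are standard APIs; the variable names match my snippet above):

import time
import torch

# Grab one batch and time just the H2D copy, synchronizing before and
# after so queued CUDA work doesn't distort the measurement.
img, agent_state_vector, ground_truth, _, _ = next(iter(dataloader))

torch.cuda.synchronize(device)
start = time.perf_counter()
img = img.to(device)
torch.cuda.synchronize(device)
print(f'H2D copy of {tuple(img.shape)} took {time.perf_counter() - start:.4f}s')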
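Second, pinned (page-locked) host memory plus non-blocking copies, which from the PyTorch docs is the usual first fix for slow transfers. pin_memory is a standard DataLoader argument and non_blocking a standard Tensor.to argument, though I haven't confirmed the speedup on my setup:

from torch.utils.data import DataLoader

# pin_memory=True makes workers return batches in page-locked host memory,
# which lets .to(device, non_blocking=True) overlap the copy with compute.
dataloader = DataLoader(dataset,
                        batch_size=16 * len(Gpu.IDS),
                        num_workers=4 * len(Gpu.IDS),
                        pin_memory=True)

for img, agent_state_vector, ground_truth, _, _ in dataloader:
    img = img.to(device, non_blocking=True)
    agent_state_vector = agent_state_vector.to(device, non_blocking=True)
    ground_truth = ground_truth.to(device, dtype=torch.float32, non_blocking=True)
    # ... rest of the training step as above ...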