I’ve been experimenting with training a toy LSTM example on multiple GPUs and noticed there is no training speedup compared to the single-GPU version. I am using a GTX 1070 Ti and a GTX 1070; the 1070 Ti is in a 16x PCIe slot while the 1070 is in a 4x PCIe slot (could this be a limiting factor?).
Here are my training scripts:
Multi GPU (I’m using DistributedDataParallel because pack_padded_sequence doesn’t work properly with DataParallel, see https://github.com/pytorch/pytorch/issues/13214):
import argparse
import torch
import torch.distributed
import torch.nn as nn
from datetime import datetime

# Run this program with CUDA_VISIBLE_DEVICES=0,1

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)

    def forward(self, x, lens):
        self.lstm.flatten_parameters()
        # pack_padded_sequence expects the lengths as a CPU int64 tensor
        lens = torch.tensor(lens, dtype=torch.int64, device=torch.device('cpu'))
        x = torch.nn.utils.rnn.pack_padded_sequence(x, lens, batch_first=True)
        out, _ = self.lstm(x)
        # pad_packed_sequence returns (padded_output, lengths); keep only the output
        out, _ = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        return out

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.lstm = LSTM(512, 512, 3)

    def forward(self, x, x_lens):
        x = self.lstm(x, x_lens)
        return x

def train(model):
    # Forward passes only, just to time the model
    for _ in range(800):
        inputs = torch.randn(64, 245, 512).cuda()  # (B, T, C)
        lens = list(range(64, 0, -1))
        outputs = model(inputs, lens)

startTime = datetime.now()
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int)
args = parser.parse_args()
torch.cuda.set_device(args.local_rank)
model = Net()
torch.distributed.init_process_group(backend="nccl")
model = model.cuda()
model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)
train(model)
print("### Total training time: ", datetime.now() - startTime)
Single GPU:
import torch
import torch.nn as nn
from datetime import datetime

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)

    def forward(self, x, lens):
        self.lstm.flatten_parameters()
        # pack_padded_sequence expects the lengths as a CPU int64 tensor
        lens = torch.tensor(lens, dtype=torch.int64, device=torch.device('cpu'))
        x = torch.nn.utils.rnn.pack_padded_sequence(x, lens, batch_first=True)
        out, _ = self.lstm(x)
        # pad_packed_sequence returns (padded_output, lengths); keep only the output
        out, _ = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        return out

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.lstm = LSTM(512, 512, 3)

    def forward(self, x, x_lens):
        x = self.lstm(x, x_lens)
        return x

def train(model):
    # Forward passes only, just to time the model
    for _ in range(800):
        inputs = torch.randn(64, 245, 512).cuda()  # (B, T, C)
        lens = list(range(64, 0, -1))
        outputs = model(inputs, lens)

startTime = datetime.now()
model = Net()
model = model.cuda()
train(model)
print("### Total training time: ", datetime.now() - startTime)