No speedup from multi-GPU training with DistributedDataParallel vs. single GPU

I’ve been experimenting with training a toy LSTM example on multiple GPUs and noticed there is no training speedup compared to the single-GPU version. I am using a GTX 1070 Ti and a GTX 1070; the 1070 Ti sits in an x16 PCIe slot while the 1070 sits in an x4 PCIe slot (could this be a limiting factor?).
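
To check how many lanes each card is actually negotiating, I can query the PCIe link width with pynvml (just a quick sketch; it assumes the nvidia-ml-py / pynvml package is installed):

import pynvml  # quick PCIe link-width check; assumes the nvidia-ml-py (pynvml) package is installed

pynvml.nvmlInit()
for i in range(pynvml.nvmlDeviceGetCount()):
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    curr = pynvml.nvmlDeviceGetCurrPcieLinkWidth(handle)  # lanes currently negotiated
    maxw = pynvml.nvmlDeviceGetMaxPcieLinkWidth(handle)   # lanes the card supports
    print("GPU %d: PCIe x%d (max x%d)" % (i, curr, maxw))
pynvml.nvmlShutdown()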

Here are my training scripts:

Multi GPU (I’m using DistributedDataParallel because pack_padded_sequence doesn’t work properly with DataParallel, see https://github.com/pytorch/pytorch/issues/13214):

import argparse, torch
import torch.distributed  # needed for init_process_group below
import torch.nn as nn
from datetime import datetime

# Run this program on both GPUs, e.g.:
# CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 <this file>
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)

    def forward(self, x, lens):
        self.lstm.flatten_parameters()
        lens = torch.tensor(lens, dtype=torch.int64, device=torch.device('cpu'))
        x = torch.nn.utils.rnn.pack_padded_sequence(x, lens, batch_first=True)
        out, _ = self.lstm(x)
        out, _ = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)  # pad_packed_sequence returns (padded output, lengths)

        return out

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.lstm = LSTM(512, 512, 3)

    def forward(self, x, x_lens):
        x = self.lstm(x, x_lens)
        return x

def train(model):
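    # Forward passes only: no loss, backward(), or optimizer step. Each DDP process runs all 800 iterations with its own batch of 64.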
    for _ in range(800):
        inputs = torch.randn(64, 245, 512).cuda() # (B, T, C)
        lens = list(range(64, 0, -1))
        outputs = model(inputs, lens)

startTime = datetime.now()
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int)
args = parser.parse_args()
torch.cuda.set_device(args.local_rank)
    
model = Net()
torch.distributed.init_process_group(backend="nccl")
model = model.cuda()
model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)
train(model)
print("### Total training time: ", datetime.now() - startTime)

Single GPU:

import torch 
import torch.nn as nn
from datetime import datetime

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)

    def forward(self, x, lens):
        self.lstm.flatten_parameters()
        lens = torch.tensor(lens, dtype=torch.int64, device=torch.device('cpu'))
        x = torch.nn.utils.rnn.pack_padded_sequence(x, lens, batch_first=True)
        out, _ = self.lstm(x)
        out, _ = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)  # pad_packed_sequence returns (padded output, lengths)

        return out

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.lstm = LSTM(512, 512, 3)

    def forward(self, x, x_lens):
        x = self.lstm(x, x_lens)
        return x

def train(model):
    for _ in range(800):
        inputs = torch.randn(64, 245, 512).cuda() # (B, T, C)
        lens = list(range(64, 0, -1))
        outputs = model(inputs, lens)

startTime = datetime.now()
    
model = Net()
model = model.cuda()
train(model)
print("### Total training time: ", datetime.now() - startTime)