DataParallel does not split data memory; instead it seems to copy it

I am running a batched LSTM model (batch size of 32) in PyTorch on 8x V100 GPUs with 16 GB of memory each.

I have an LSTM model with the following structure:

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F

# Class containing the LSTM model initialization and feed-forward logic
class LSTMClassifier(nn.Module):
    # LSTM initialization
    def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size, static_size):
        super(LSTMClassifier, self).__init__()

        # Setting the hidden layer dimension of the LSTM
        self.hidden_dim = hidden_dim
        # Initializing the embedding layer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim-2)
        # Initializing the LSTM layer with one hidden layer 
        self.lstm = nn.LSTM(((embedding_dim*vocab_size)+static_size), hidden_dim, num_layers = 1, batch_first=True)
        # Initializing the linear layer that takes the hidden layer output
        self.hidden2label = nn.Linear(hidden_dim, label_size)


    # Defining the hidden state of the LSTM
    def init_hidden(self, batch_size):
        # the first is the hidden h
        # the second is the cell  c
        return [autograd.Variable(torch.zeros(batch_size, 1, self.hidden_dim).cuda()),
                autograd.Variable(torch.zeros(batch_size, 1, self.hidden_dim).cuda())]

    # Defining the feed forward logic of the LSTM. It contains:
    # 1. The embedding layer
    # 2. The LSTM layer with one hidden layer
    # 3. The softmax layer
    def forward(self, seq, freq, time, static):
        print(seq.size())

        # Grab the mini-batch length and max sequence length (sequences are pre-ordered by length)
        # (this must be done inside forward() because DataParallel splits the batch across the GPUs)
        sequence_length = seq.size()[1]
        batch_length = seq.size()[0]

        # reset the LSTM hidden state. 
        # Must be done before you run a new batch. Otherwise the LSTM will treat a new batch as a continuation of a sequence
        self.hidden = self.init_hidden(batch_length)

        # Permute the hidden and cell states. When using batch_first=True with DataParallel,
        # the LSTM still expects the hidden state as (num_layers, batch size, hidden dim), but we build it
        # as (batch size, num_layers, hidden dim) so that DataParallel can split it along the batch dimension.
        # So we swap the first two dimensions before feeding it to the LSTM (see the sketch after the class).
        self.hidden[0] = self.hidden[0].permute(1, 0, 2).contiguous()
        self.hidden[1] = self.hidden[1].permute(1, 0, 2).contiguous()

        # This is the pass to the embedding layer. 
        # The sequence is of dimension N and the output is N x Demb
        embeds = self.embeddings(seq)

        # Concatenate the embedding output with the time and frequency vectors
        embeds = torch.cat((embeds,freq), dim=3)
        embeds = torch.cat((embeds,time), dim=3)

        # Flatten the tensor
        x = embeds.view(batch_length, sequence_length, -1) 

        # Concatenate the static information
        x = torch.cat((x, static), dim=2)

        # Grab the list of sequence lengths, for the purpose of packing the padded sequences
        seq_lengths = torch.LongTensor(list(map(len, seq)))

        # pack the padded sequence so that paddings are ignored
        packed_x = torch.nn.utils.rnn.pack_padded_sequence(x, seq_lengths, batch_first=True)

        # Feed to the LSTM layer
        self.lstm.flatten_parameters()
        lstm_out, self.hidden = self.lstm(packed_x, self.hidden)

        # Swap the first two dimensions of the hidden state back to the original batch-first layout
        self.hidden = list(self.hidden)
        self.hidden[0] = self.hidden[0].permute(1, 0, 2).contiguous()
        self.hidden[1] = self.hidden[1].permute(1, 0, 2).contiguous()

        # Unpack the packed padded sequence so that it is ready for prediction
        unpacked_lstm_out, input_sizes = torch.nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)

        # Feed the LSTM output at the last time step into the linear layer
        y = self.hidden2label(unpacked_lstm_out[:,-1,:])

        # Produce the log-softmax probabilities
        log_probs = F.log_softmax(y, dim=1)

        return log_probs
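
To illustrate why the hidden state is built batch-first and permuted inside forward(): nn.DataParallel scatters every tensor argument along dim 0, so anything that should be split per GPU has to have the batch dimension first. Here is a minimal, self-contained sketch (separate from the model above) that makes the splitting visible:

import torch
import torch.nn as nn

class EchoSize(nn.Module):
    def forward(self, x):
        # Each replica prints the size of its own chunk of the batch
        print(x.size())
        return x

if torch.cuda.device_count() > 1:
    echo = nn.DataParallel(EchoSize()).cuda()
    # With 8 GPUs and a batch of 32, this prints torch.Size([4, 10]) eight times
    echo(torch.zeros(32, 10).cuda())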

Then I run a single iteration on one batch of the dataset (batch size of 32):

import numpy as np
import torch.nn.utils.rnn as rnn_utils
import torch.optim as optim

torch.cuda.empty_cache()
EMBEDDING_DIM = 32
HIDDEN_DIM = 50
EPOCH = 10
BATCH_SIZE = 16
best_val_auc = 0.0

model = LSTMClassifier(embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, vocab_size=len(events_to_ix), label_size=len(targets_to_ix), static_size=(len(gender_to_ix)+1)).cuda() 
model = torch.nn.DataParallel(model).cuda()

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

no_up = 0

model.train()
avg_loss = 0.0
count = 0
truth_res = []
pred_res = []

# Group the training data into batches of 32 rows
g = train_data.groupby(np.arange(len(train_data)) // 32)

rows = g.get_group(list(g.groups)[0])  # Grab the first group (batch size of 32)

torch.cuda.empty_cache()
# Grab the targets into a list and append it into the truth_res list in order to measure AUC performance
target = [targets_to_ix[target] for target in rows['event_target']]
truth_res.extend(target)

# Encode the data and output to tensors (based on the previous description)
seq, freq, time_data, static = encode_data(rows, events_to_ix)
print(len(seq))

# Pad the sequences
seq = rnn_utils.pad_sequence(seq, batch_first = True)
freq = rnn_utils.pad_sequence(freq, batch_first = True)
time_data = rnn_utils.pad_sequence(time_data, batch_first = True)
static = rnn_utils.pad_sequence(static, batch_first = True)

# Wrap the padded sequences in Variables and move them to the GPU
seq = autograd.Variable(seq).cuda()
freq = autograd.Variable(freq).cuda()
time_data = autograd.Variable(time_data).cuda()
static = autograd.Variable(static).cuda()
target = autograd.Variable(torch.LongTensor(target)).cuda()

# Feed the tensor Variables into the model
pred = model(seq,freq,time_data,static)

This is my nvidia-smi output right before feeding the batch to the model:

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.104      Driver Version: 410.104      CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla V100-SXM2...  On   | 00000000:00:17.0 Off |                    0 |
| N/A   48C    P0    64W / 300W |   2438MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:00:18.0 Off |                    0 |
| N/A   42C    P0    44W / 300W |     11MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM2...  On   | 00000000:00:19.0 Off |                    0 |
| N/A   41C    P0    46W / 300W |     11MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   3  Tesla V100-SXM2...  On   | 00000000:00:1A.0 Off |                    0 |
| N/A   44C    P0    44W / 300W |     11MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   4  Tesla V100-SXM2...  On   | 00000000:00:1B.0 Off |                    0 |
| N/A   44C    P0    42W / 300W |     11MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   5  Tesla V100-SXM2...  On   | 00000000:00:1C.0 Off |                    0 |
| N/A   44C    P0    45W / 300W |     11MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   6  Tesla V100-SXM2...  On   | 00000000:00:1D.0 Off |                    0 |
| N/A   42C    P0    44W / 300W |     11MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   7  Tesla V100-SXM2...  On   | 00000000:00:1E.0 Off |                    0 |
| N/A   45C    P0    47W / 300W |     11MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0     93165      C   ...r/anaconda3/envs/pytorch_p36/bin/python  2427MiB |
+-----------------------------------------------------------------------------+

When I call nvidia-smi at the start of the forward pass (right after the batch is dispatched to the model), the output is:

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.104      Driver Version: 410.104      CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla V100-SXM2...  On   | 00000000:00:17.0 Off |                    0 |
| N/A   50C    P0    59W / 300W |   3040MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:00:18.0 Off |                    0 |
| N/A   44C    P0    58W / 300W |   1342MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM2...  On   | 00000000:00:19.0 Off |                    0 |
| N/A   44C    P0    62W / 300W |   1342MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   3  Tesla V100-SXM2...  On   | 00000000:00:1A.0 Off |                    0 |
| N/A   47C    P0    57W / 300W |   1342MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   4  Tesla V100-SXM2...  On   | 00000000:00:1B.0 Off |                    0 |
| N/A   47C    P0    57W / 300W |   1342MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   5  Tesla V100-SXM2...  On   | 00000000:00:1C.0 Off |                    0 |
| N/A   47C    P0    59W / 300W |   1342MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   6  Tesla V100-SXM2...  On   | 00000000:00:1D.0 Off |                    0 |
| N/A   44C    P0    59W / 300W |   1342MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   7  Tesla V100-SXM2...  On   | 00000000:00:1E.0 Off |                    0 |
| N/A   48C    P0    62W / 300W |   1342MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

I would expect the memory across the 8 GPUs to be more uniform and much smaller, given that the batched data I passed in only takes around ~2 GB of space. Second, the amount of memory used seems to completely blow up after just one batch iteration (each GPU ends up using ~9 GB). Is there something I am doing wrong? I feel like I am duplicating a bunch of tensors and somehow storing them all on the GPU.
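
A quick way to check whether tensors are really being duplicated is to read PyTorch's own per-device counters; here is a minimal sketch (not part of my training code). Note that nvidia-smi additionally counts the CUDA context and the caching allocator's reserved pool, so its numbers are always larger than what the tensors themselves occupy:

import torch

for i in range(torch.cuda.device_count()):
    alloc = torch.cuda.memory_allocated(i) / 1024 ** 2   # memory held by live tensors (MiB)
    cached = torch.cuda.memory_cached(i) / 1024 ** 2     # caching allocator's reserved pool (MiB)
    print("GPU %d: allocated=%.0f MiB, cached=%.0f MiB" % (i, alloc, cached))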

That explains the imbalanced part but doesn’t explain why the memory seems to be duplicated in each GPU. Any thoughts?

Well, DataParallel duplicates the model on each GPU by definition; I think that's explained in the docs. Gradients are also computed per GPU. The thing is that the final gradients used to update all the replicas are the average of the per-GPU gradients, but in the end it uses the same memory as it would use if you had a single GPU.
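
For instance, here is a rough sketch (reusing the model variable from the question) of the per-replica parameter footprint, which is the floor each GPU pays no matter how many GPUs the batch is split across:

# Every DataParallel replica carries a full copy of the parameters
param_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
print("parameters per replica: %.1f MiB" % (param_bytes / 1024 ** 2))
# Only the input batch and the activations are divided across the GPUs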