Memory usage of DataParallel

I’m running the following code on multiple GPUs:

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os
from torch import optim


# Problem dimensions and DataLoader settings.
input_size = output_size = 5000  # feature width fed to / produced by the model

data_size = 100000  # number of random samples held by the dataset
batch_size = 1000   # samples drawn per iteration

# Restrict this process to the listed GPU ids.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

# Primary device: the first visible GPU, or the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

class RandomDataset(Dataset):
    """In-memory dataset of standard-normal random samples.

    Args:
        size: Number of features in each sample.
        length: Number of samples to generate.
    """

    def __init__(self, size, length):
        # Every sample is generated up front as one (length, size) tensor.
        self.data = torch.randn((length, size))
        self.len = length

    def __len__(self):
        """Return the number of samples."""
        return self.len

    def __getitem__(self, index):
        """Return the entry of the sample tensor selected by ``index``."""
        return self.data[index]

# Shuffled mini-batch loader over freshly generated random samples.
dataset = RandomDataset(input_size, data_size)
rand_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

class Model(nn.Module):
    """Five linear layers applied one after another.

    Args:
        input_size: Number of features in each input row.
        output_size: Number of features produced by every layer.
    """

    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(input_size, output_size)
        # fc2..fc5 consume the previous layer's output, so their input width
        # must be output_size. Declaring them with input_size only works when
        # both sizes happen to be equal.
        self.fc2 = nn.Linear(output_size, output_size)
        self.fc3 = nn.Linear(output_size, output_size)
        self.fc4 = nn.Linear(output_size, output_size)
        self.fc5 = nn.Linear(output_size, output_size)

    def forward(self, input):
        """Run ``input`` through fc1..fc5 and return the final activation.

        Args:
            input: Tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor whose last dimension equals ``output_size``.
        """
        # Keep ``input`` untouched so the log line below reports the size of
        # the tensor that was actually passed in, not of an intermediate.
        hidden = self.fc1(input)
        hidden = self.fc2(hidden)
        hidden = self.fc3(hidden)
        hidden = self.fc4(hidden)
        output = self.fc5(hidden)
        print("\tIn Model: input size", input.size(),
              "output size", output.size())

        return output

# Build the network; wrap it in DataParallel when several GPUs are visible.
model = Model(input_size, output_size)
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    print("Let's use", gpu_count, "GPUs!")
    model = nn.DataParallel(model)

# Move the parameters to the primary device before creating the optimizer.
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Train forever (no exit condition); the loop only ends when interrupted.
while True:
    for batch in rand_loader:
        optimizer.zero_grad()
        # Scalar loss: the sum of every output element.
        loss = model(batch.to(device)).sum()
        loss.backward()
        # Clip the total gradient norm to 0.5 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

With this setup, GPUs 0, 1, 2, and 3 use 3399 MB, 1109 MB, 1109 MB, and 1109 MB of memory, respectively.

However, when I run the same code on a single GPU:

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os
from torch import optim


# Problem dimensions and DataLoader settings.
input_size = output_size = 5000  # feature width fed to / produced by the model

data_size = 100000  # number of random samples held by the dataset
batch_size = 1000   # samples drawn per iteration

# Restrict this process to the listed GPU ids.
# NOTE(review): this value lists four ids (with spaces after the commas),
# while the surrounding text describes a single-GPU run -- confirm which
# devices were actually visible when the measurement was taken.
os.environ["CUDA_VISIBLE_DEVICES"] = "4, 5, 6, 7"

# Primary device: the first visible GPU, or the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

class RandomDataset(Dataset):
    """In-memory dataset of standard-normal random samples.

    Args:
        size: Number of features in each sample.
        length: Number of samples to generate.
    """

    def __init__(self, size, length):
        # Every sample is generated up front as one (length, size) tensor.
        self.data = torch.randn((length, size))
        self.len = length

    def __len__(self):
        """Return the number of samples."""
        return self.len

    def __getitem__(self, index):
        """Return the entry of the sample tensor selected by ``index``."""
        return self.data[index]

# Shuffled mini-batch loader over freshly generated random samples.
dataset = RandomDataset(input_size, data_size)
rand_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

class Model(nn.Module):
    """Five linear layers applied one after another.

    Args:
        input_size: Number of features in each input row.
        output_size: Number of features produced by every layer.
    """

    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(input_size, output_size)
        # fc2..fc5 consume the previous layer's output, so their input width
        # must be output_size. Declaring them with input_size only works when
        # both sizes happen to be equal.
        self.fc2 = nn.Linear(output_size, output_size)
        self.fc3 = nn.Linear(output_size, output_size)
        self.fc4 = nn.Linear(output_size, output_size)
        self.fc5 = nn.Linear(output_size, output_size)

    def forward(self, input):
        """Run ``input`` through fc1..fc5 and return the final activation.

        Args:
            input: Tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor whose last dimension equals ``output_size``.
        """
        # Keep ``input`` untouched so the log line below reports the size of
        # the tensor that was actually passed in, not of an intermediate.
        hidden = self.fc1(input)
        hidden = self.fc2(hidden)
        hidden = self.fc3(hidden)
        hidden = self.fc4(hidden)
        output = self.fc5(hidden)
        print("\tIn Model: input size", input.size(),
              "output size", output.size())

        return output

# Build the network; wrap it in DataParallel when several GPUs are visible.
model = Model(input_size, output_size)
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    print("Let's use", gpu_count, "GPUs!")
    model = nn.DataParallel(model)

# Move the parameters to the primary device before creating the optimizer.
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)


# Train forever (no exit condition); the loop only ends when interrupted.
while True:
    for batch in rand_loader:
        optimizer.zero_grad()
        # Scalar loss: the sum of every output element.
        loss = model(batch.to(device)).sum()
        loss.backward()
        # Clip the total gradient norm to 0.5 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

In this case GPU 0 uses only 2629 MB of memory. Why? I thought DataParallel splits each input batch across the GPUs, so each GPU should use less memory than in the single-GPU case.