I'm running the following code on multiple GPUs:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os
from torch import optim
# Experiment configuration: layer widths, batch size and dataset length.
input_size = output_size = 5000
batch_size = 1000
data_size = 100000

# Restrict the CUDA devices this process may see, then pick the primary
# device (the first visible GPU, or the CPU when CUDA is unavailable).
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
class RandomDataset(Dataset):
    """In-memory dataset of random vectors drawn with ``torch.randn``.

    Args:
        size: Number of features in each sample.
        length: Number of samples held by the dataset.
    """

    def __init__(self, size: int, length: int) -> None:
        self.len = length
        # All samples are generated up front as one (length, size) tensor.
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        """Return the sample stored at ``index``."""
        return self.data[index]

    def __len__(self) -> int:
        """Return the number of samples."""
        return self.len
# Shuffled mini-batch loader over a freshly generated random dataset.
rand_loader = DataLoader(
    RandomDataset(input_size, data_size),
    batch_size=batch_size,
    shuffle=True,
)
class Model(nn.Module):
    """Five fully connected layers applied one after another.

    Args:
        input_size: Number of features in each input sample.
        output_size: Number of features produced by every layer.
    """

    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(input_size, output_size)
        # Layers 2-5 consume the previous layer's output, so their input
        # width must be ``output_size``.  This is identical to the former
        # ``Linear(input_size, output_size)`` when both sizes are equal and
        # additionally makes the model usable when they differ.
        self.fc2 = nn.Linear(output_size, output_size)
        self.fc3 = nn.Linear(output_size, output_size)
        self.fc4 = nn.Linear(output_size, output_size)
        self.fc5 = nn.Linear(output_size, output_size)

    def forward(self, input):
        """Run ``input`` through fc1..fc5 and return the fc5 output."""
        hidden = self.fc1(input)
        hidden = self.fc2(hidden)
        hidden = self.fc3(hidden)
        hidden = self.fc4(hidden)
        output = self.fc5(hidden)
        # The "input size" reported here is the size of the fc4 activation
        # (the tensor fed into fc5), not of the original argument.
        print("\tIn Model: input size", hidden.size(),
              "output size", output.size())
        return output
model = Model(input_size, output_size)
# Wrap the model in nn.DataParallel when more than one CUDA device is visible.
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop with no exit condition: it keeps cycling over the loader
# until the process is interrupted.
while True:
    for data in rand_loader:
        optimizer.zero_grad()
        # The "loss" is simply the sum of all model outputs for the batch.
        loss = torch.sum(model(data.to(device)))
        loss.backward()
        # Clip the total gradient norm to 0.5 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
With this setup, GPUs 0, 1, 2 and 3 use 3399 MB, 1109 MB, 1109 MB and 1109 MB of memory, respectively.
However, when I run the same code on a single GPU:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os
from torch import optim
# Experiment configuration: layer widths, batch size and dataset length.
input_size = output_size = 5000
batch_size = 1000
data_size = 100000

# Restrict the CUDA devices this process may see, then pick the primary
# device (the first visible GPU, or the CPU when CUDA is unavailable).
# NOTE(review): this value has spaces after the commas; confirm how the CUDA
# runtime parses it, since that determines how many GPUs are actually visible.
os.environ["CUDA_VISIBLE_DEVICES"] = "4, 5, 6, 7"
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
class RandomDataset(Dataset):
    """In-memory dataset of random vectors drawn with ``torch.randn``.

    Args:
        size: Number of features in each sample.
        length: Number of samples held by the dataset.
    """

    def __init__(self, size: int, length: int) -> None:
        self.len = length
        # All samples are generated up front as one (length, size) tensor.
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        """Return the sample stored at ``index``."""
        return self.data[index]

    def __len__(self) -> int:
        """Return the number of samples."""
        return self.len
# Shuffled mini-batch loader over a freshly generated random dataset.
rand_loader = DataLoader(
    RandomDataset(input_size, data_size),
    batch_size=batch_size,
    shuffle=True,
)
class Model(nn.Module):
    """Five fully connected layers applied one after another.

    Args:
        input_size: Number of features in each input sample.
        output_size: Number of features produced by every layer.
    """

    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(input_size, output_size)
        # Layers 2-5 consume the previous layer's output, so their input
        # width must be ``output_size``.  This is identical to the former
        # ``Linear(input_size, output_size)`` when both sizes are equal and
        # additionally makes the model usable when they differ.
        self.fc2 = nn.Linear(output_size, output_size)
        self.fc3 = nn.Linear(output_size, output_size)
        self.fc4 = nn.Linear(output_size, output_size)
        self.fc5 = nn.Linear(output_size, output_size)

    def forward(self, input):
        """Run ``input`` through fc1..fc5 and return the fc5 output."""
        hidden = self.fc1(input)
        hidden = self.fc2(hidden)
        hidden = self.fc3(hidden)
        hidden = self.fc4(hidden)
        output = self.fc5(hidden)
        # The "input size" reported here is the size of the fc4 activation
        # (the tensor fed into fc5), not of the original argument.
        print("\tIn Model: input size", hidden.size(),
              "output size", output.size())
        return output
model = Model(input_size, output_size)
# Wrap the model in nn.DataParallel when more than one CUDA device is visible.
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop with no exit condition: it keeps cycling over the loader
# until the process is interrupted.
while True:
    for data in rand_loader:
        optimizer.zero_grad()
        # The "loss" is simply the sum of all model outputs for the batch.
        loss = torch.sum(model(data.to(device)))
        loss.backward()
        # Clip the total gradient norm to 0.5 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
GPU 0 uses only 2629 MB of memory. Why? I thought DataParallel splits the input tensor across multiple GPUs, so each GPU should use less memory than in the single-GPU case.