Hi,
I was playing around with PyTorch's distributed training approaches, and I'm wondering whether there is any comparison documentation or tutorial available.
It appears that using DataParallel (DP) actually takes about 2x as long on a FashionMNIST classification task as not using it.
For example, in the reference code below, the DataParallel model (net1) took 367.25 s to train, compared to 149.9 s for the plain model, on a single NVIDIA RTX A6000 GPU.
I'm wondering why this is, or whether I'm doing something wrong.
Ref:
- Sample code
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 4 * 4)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
def train(net: nn.Module, is_dp=False):
    ...
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net1 = nn.DataParallel(Net())
net1.to(device)
train(net1, True)
"""
DataParallel(
(module): Net(
(conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
(pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
(fc1): Linear(in_features=256, out_features=120, bias=True)
(fc2): Linear(in_features=120, out_features=84, bias=True)
(fc3): Linear(in_features=84, out_features=10, bias=True)
)
)
"""
net2_single_gpu = Net().to(device)
train(net2_single_gpu, False)
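For context, here is a minimal sketch of the kind of training loop and timing behind the numbers above; the batch size, epoch count, and optimizer settings are illustrative placeholders rather than the exact values used:

import time
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

def train(net: nn.Module, is_dp=False):
    # Standard FashionMNIST loader; the batch size is an illustrative placeholder.
    loader = torch.utils.data.DataLoader(
        torchvision.datasets.FashionMNIST(
            "./data", train=True, download=True,
            transform=transforms.ToTensor()),
        batch_size=64, shuffle=True)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    start = time.perf_counter()
    for epoch in range(5):  # epoch count is a placeholder
        for inputs, labels in loader:
            # uses the module-level `device` defined above
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            loss = criterion(net(inputs), labels)
            loss.backward()
            optimizer.step()
    if torch.cuda.is_available():
        torch.cuda.synchronize()  # include any queued GPU work in the timing
    elapsed = time.perf_counter() - start
    print(f"{'DP' if is_dp else 'single-GPU'} training took {elapsed:.2f} s")

Both the DP and single-GPU timings come from this same loop, so the comparison should be apples-to-apples apart from the DataParallel wrapper itself.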