Hi there!
I’m using PyTorch as an autograd library. Can someone provide a simple tutorial or a short snippet showing a simple example of multi-GPU processing?
If no gradients have to be computed, the example in “PyTorch: How to parallelize over multiple GPU using multiprocessing.pool”
seems reasonable, but what should be done if the gradients are needed?
Is there a tutorial on the simple “MPI-like” calls mentioned in the docs? https://pytorch.org/tutorials/beginner/former_torchies/parallelism_tutorial.html
How to use the data_parallel function reported there?
I tried to apply the concepts in the DataParallel guide, but the code is actually slower when the model is wrapped in nn.DataParallel. Clearly I am missing something.
import torch
from torch import Tensor
import torch.nn as nn
from torch.nn.parameter import Parameter
import time
class ModDef(nn.Module):
    """Toy element-wise module with two learnable 1024x1024 weight matrices.

    The forward pass computes, element-wise (with broadcasting),
    ``exp(100 + (w1 * X - X.mean() / (w2 + 1)) ** 2)``.

    NOTE(review): the exponent is always >= 100 and exp(100) ~ 2.7e43 is
    above the float32 maximum (~3.4e38), so with float32 parameters the
    output is never finite. That is harmless for a pure throughput
    benchmark but makes the values (and gradients) meaningless.
    """

    def __init__(self, input_size: int = 100, output_size: int = 100) -> None:
        """Create the two weight matrices.

        Args:
            input_size: Accepted for interface compatibility; not used.
            output_size: Accepted for interface compatibility; not used.
        """
        super().__init__()
        # Both weights are drawn from a standard normal distribution and
        # have a fixed shape, independent of the constructor arguments.
        self.w1 = Parameter(torch.randn(1024, 1024))
        self.w2 = Parameter(torch.randn(1024, 1024))

    def forward(self, X: Tensor) -> Tensor:
        """Apply the element-wise transform to ``X``.

        Args:
            X: Tensor whose trailing dimensions broadcast against
                ``(1024, 1024)``.

        Returns:
            Tensor with the broadcast shape of ``X`` and the weights.
        """
        # X.mean() is a scalar over *all* elements of the tensor passed in;
        # it is divided by (w2 + 1) before being subtracted from w1 * X.
        output = torch.exp(100 + (self.w1 * X - X.mean() / (self.w2 + 1)) ** 2)
        return output
if __name__ == '__main__':
    # Benchmark: time 100 SGD steps of ModDef wrapped in nn.DataParallel.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def _wait_for_gpus() -> None:
        """Block until all queued work on every visible GPU has finished."""
        # CUDA kernels are launched asynchronously, so reading the clock
        # without synchronizing does not wait for work still queued on the GPUs.
        if device.type == "cuda":
            for gpu_index in range(torch.cuda.device_count()):
                torch.cuda.synchronize(gpu_index)

    model = ModDef()
    print('send')
    # DataParallel splits the input along dim 0 across the visible GPUs,
    # replicates the module on every forward call and gathers the outputs
    # on the first device. For a cheap element-wise forward like this one,
    # that per-step overhead is likely to outweigh the parallel speed-up.
    # NOTE(review): X.mean() inside forward is then computed per shard, not
    # over the whole batch.
    my = nn.DataParallel(model).to(device)
    optimizer = torch.optim.SGD(my.parameters(), lr=1e-4)
    datain = torch.randn(100, 1024, 1024).to(device)
    d2 = torch.randn(100, 1024, 1024).to(device)
    print('start')
    # Make sure the host-to-device copies are done before starting the clock.
    _wait_for_gpus()
    ta = time.perf_counter()
    for ii in range(100):
        optimizer.zero_grad()
        out = my(datain)
        loss = (out - d2).sum()
        loss.backward()
        optimizer.step()
    # Wait for the last step to actually finish on the GPU(s).
    _wait_for_gpus()
    tb = time.perf_counter()
    print(tb-ta)
    print('end')