I am trying to run network training across multiple processes. However, the gradients computed when the loss is back-propagated are only available inside the worker process, not in the parent, even after the process joins.
Here is a minimal reproducible example:
import torch
import torch.nn as nn
import torch.multiprocessing as mp


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(3, 4),
            nn.ReLU(),
            nn.Linear(4, 1)
        )

    def forward(self, x):
        return self.net(x)


def compute_something(network: nn.Module, pid: int):
    print(f'Running PID: {pid}')
    inputs = torch.randn(size=(5, 3))
    logits = network(inputs)
    loss = -1 * torch.mean(torch.pow(logits, 2))
    loss.backward()

    # Gradients are populated here, inside the worker process
    print('INSIDE' + '#' * 10)
    for param in network.parameters():
        print(f'Param: {param}')
        print(f'Grad: {param.grad}')
    print('#' * 10)


def main():
    net = Net()
    net = net.share_memory()  # move the parameters into shared memory
    processes = [mp.Process(target=compute_something, args=(net, pid))
                 for pid in range(2)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()

    # After joining, param.grad is None in the parent process
    for param in net.parameters():
        print(f'Param: {param}')
        print(f'Grad: {param.grad}')


if __name__ == '__main__':
    main()
Inside the worker, the gradients are computed and attached to the parameter tensors, but outside, in the parent process, param.grad is None for every parameter.
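For what it's worth, my current guess is that share_memory() only moves the parameter data into shared memory, while the .grad tensors are created lazily by backward() inside each worker, in process-local memory, so the parent never sees them. Below is a sketch of the workaround I am considering, reusing Net and compute_something from above: pre-allocating the gradient buffers and sharing them before the workers start. The share_grads helper is my own invention, and I am assuming the default fork start method on Linux; I have not verified that this is the intended pattern:

def share_grads(network: nn.Module):
    # Pre-allocate .grad for every parameter and move it into shared
    # memory, so that backward() in a worker accumulates in place into
    # a buffer the parent process can also read.
    for param in network.parameters():
        param.grad = torch.zeros_like(param.data)
        param.grad.share_memory_()


def main():
    net = Net()
    net.share_memory()
    share_grads(net)  # must happen before the workers are started

    # Note: both workers now accumulate into the *same* buffers, so the
    # parent would see the sum of their gradients, and unsynchronized
    # writes could race.
    processes = [mp.Process(target=compute_something, args=(net, pid))
                 for pid in range(2)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()

    for param in net.parameters():
        print(f'Grad: {param.grad}')

Is explicitly sharing the gradient buffers like this the right approach, or am I misunderstanding what share_memory() is supposed to do? I appreciate any help.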