When I use RRef.remote() to run functions on the object referenced by the RRef, I get:
RuntimeError: RPCErr:1:RPC ran for more than set timeout (60000 ms) and will now be marked with an error
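As far as I understand, 60000 ms is just the default RPC timeout, and it can be raised globally through the backend options, roughly like this (an untested sketch with an arbitrary 300-second value); but a call this small should not need anywhere near 60 s, so I suspect something else is wrong:

import torch.distributed.rpc as rpc

# Sketch: raise the default RPC timeout (in seconds) for every call via the backend options.
options = rpc.TensorPipeRpcBackendOptions(rpc_timeout=300)
rpc.init_rpc("master", rank=0, world_size=2, rpc_backend_options=options)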
At first I just tested some simple code like this.
Master node code:
import os
import torch
import torch.distributed.rpc as rpc
import torch.nn as nn
os.environ['MASTER_ADDR'] = '192.168.31.181'
os.environ['MASTER_PORT'] = '1999'
os.environ['GLOO_SOCKET_IFNAME'] = 'eth0'
os.environ['TP_SOCKET_IFNAME'] = 'eth0'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc = nn.Linear(10, 10).to(device)

    def forward(self, x):
        return self.fc(x)

    def test(self):
        print("test remote")
rpc.init_rpc("master", rank=0, world_size=2)
r_net = rpc.remote("worker", Net)
r_net.remote().test()
rpc.shutdown()
Worker node code:
rpc.init_rpc("worker", rank=1, world_size=2)
rpc.shutdown()
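For completeness, the whole worker script is essentially just those two lines plus the same imports and environment variables as on the master, roughly:

import os
import torch.distributed.rpc as rpc

os.environ['MASTER_ADDR'] = '192.168.31.181'
os.environ['MASTER_PORT'] = '1999'
os.environ['GLOO_SOCKET_IFNAME'] = 'eth0'
os.environ['TP_SOCKET_IFNAME'] = 'eth0'

rpc.init_rpc("worker", rank=1, world_size=2)
rpc.shutdown()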
This raised the error:
RuntimeError: RPCErr:1:RPC ran for more than set timeout (60000 ms) and will now be marked with an error
After I rebooted the nodes, the error in the code above disappeared.
Next I tried to run the following code:
import os
import torch
import torch.distributed.rpc as rpc
from torch.distributed.rpc import RRef
import torch.nn as nn
import torch.nn.functional as fun
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
import time
import torch.distributed.autograd as dist_autograd
from torch.distributed.optim import DistributedOptimizer
os.environ['MASTER_ADDR'] = '192.168.31.181'
os.environ['MASTER_PORT'] = '1999'
os.environ['GLOO_SOCKET_IFNAME'] = 'eth0'
os.environ['TP_SOCKET_IFNAME'] = 'eth0'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5).to(device)
        self.conv2 = nn.Conv2d(6, 16, 5).to(device)
        self.fc1 = nn.Linear(16*5*5, 120).to(device)
        self.fc2 = nn.Linear(120, 84).to(device)
        self.fc3 = nn.Linear(84, 10).to(device)
        self.fc = nn.Linear(10, 10).to(device)

    def forward(self, x):
        x = x.to(device)
        x = fun.max_pool2d(fun.relu(self.conv1(x)), 2)
        x = fun.max_pool2d(fun.relu(self.conv2(x)), 2)
        x = torch.flatten(x, 1)
        x = fun.relu(self.fc1(x))
        x = fun.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def parameter_rrefs(self):
        return [RRef(p) for p in self.parameters()]
def run_training():
    batch_size = 4
    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    trainset = torchvision.datasets.CIFAR10(root="./data", train=True,
                                            download=True, transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                              shuffle=True, num_workers=4)
    r_net = rpc.remote("worker", Net)
    criterion = nn.CrossEntropyLoss()
    opt = DistributedOptimizer(
        optim.SGD,
        r_net.remote().parameter_rrefs().to_here(),
        lr=0.05,
    )
    time_start = time.time()
    for epoch in range(2):
        running_loss = 0.0
        for i, data in enumerate(trainloader):
            inputs, labels = data[0], data[1]
            with dist_autograd.context() as context_id:
                outputs = r_net.remote().forward(inputs).to_here()
                loss = criterion(outputs, labels)
                dist_autograd.backward(context_id, [loss])
                opt.step(context_id)
            running_loss += loss.item()
            if i % 2000 == 1999:
                print("epoch ", epoch + 1, " ", i + 1, " running_loss:", running_loss / 2000)
                running_loss = 0.0
    time_end = time.time()
    print("Finished Training in ", (time_end - time_start), " s")
rpc.init_rpc("master", rank=0, world_size=2)
run_training()
rpc.shutdown()
The same problem reappeared, and this time rebooting did not help:
Traceback (most recent call last):
File "/home/nano/rpc_training_master.py", line 89, in <module>
run_training()
File "/home/nano/rpc_training_master.py", line 71, in run_training
outputs = r_net.remote().forward(inputs).to_here()
RuntimeError: RPCErr:1:RPC ran for more than set timeout (60000 ms) and will now be marked with an error
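One CIFAR-10 batch here is only 4×3×32×32, so 60 seconds looks like a huge margin for a single forward call. To narrow things down I was going to try (a) a plain RPC that just ships a tensor, and (b) a longer timeout for this specific call; a rough sketch of what I mean (I believe the RRef proxies accept a per-call timeout in recent versions, but I'm not sure):

# (a) Sanity check: does a plain RPC carrying a tensor complete at all?
x = torch.randn(4, 3, 32, 32)
y = rpc.rpc_sync("worker", torch.add, args=(x, x), timeout=300)

# (b) Run the remote forward synchronously with a longer per-call timeout (seconds).
outputs = r_net.rpc_sync(timeout=300).forward(x)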
How can I fix this problem? Thank you.