I tried to use torch.no_grad() with DDP, but it would throw:
This error indicates that your
module has parameters that were not used in producing its output (the
return value of forward). You can enable unused parameter detection
by passing the keyword argument find_unused_parameters=True to
torch.nn.parallel.DistributedDataParallel
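
For reference, enabling that flag is just an extra keyword argument on the DDP wrapper. A minimal sketch (model and local_rank are assumed to come from your own setup, and torch.distributed.init_process_group() must already have been called):

    # Sketch: enable unused-parameter detection in DDP
    ddp_model = torch.nn.parallel.DistributedDataParallel(
        model,
        device_ids=[local_rank],
        find_unused_parameters=True,  # tolerate parameters that receive no gradient
    )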
The pseudo code of my model is as follows:
import torch
import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Linear(256, 256)  # stands in for the frozen sub-network

    def forward(self, x):
        # run the frozen part without building an autograd graph
        with torch.no_grad():
            x = self.layers(x)
        return x

class WholeFlow(nn.Module):
    def __init__(self):
        super().__init__()
        self.f = MyModel()
        self.g = nn.Linear(256, 256)

    def forward(self, x):
        x = self.f(x)
        x = self.g(x)
        return x

model = WholeFlow()
optimizer = torch.optim.SGD(model.g.parameters(), ...)  # only g is trained
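
If the parameters of f are never trained at all, another option (not in the original post, just a sketch) is to freeze them before wrapping the model, so DDP never expects gradients for them and find_unused_parameters=True is unnecessary:

    # Sketch: freeze the no-grad sub-network before wrapping in DDP
    model = WholeFlow()
    for p in model.f.parameters():
        p.requires_grad_(False)  # DDP will not track gradients for these
    ddp_model = torch.nn.parallel.DistributedDataParallel(
        model.cuda(),  # assumes the current CUDA device matches local_rank
        device_ids=[local_rank],
    )
    optimizer = torch.optim.SGD(ddp_model.module.g.parameters(), lr=0.1)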
@ptrblck I met a similar problem. I have code like the following, where only the last frame runs backprop and computes the loss, but all frames use the same parameters.
It works with find_unused_parameters=True, but then the weights are not updated. I need the weights to be updated by the last frame's backward pass. So can I only use detach for this case?
import numpy as np
import torch
import torch.nn as nn
import torch.utils.data

class DummyDataset(torch.utils.data.Dataset):
    def __init__(self, seq_len=5):
        self.seq_len = seq_len

    def __len__(self):
        return 200

    def __getitem__(self, idx):
        return (np.random.rand(self.seq_len, 10).astype(np.float32),
                np.random.rand(1).astype(np.float32))

class DummyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.mlp_fea = nn.Linear(10, 10)
        self.mlp_out = nn.Linear(10, 1)

    def forward(self, inputs, labels, training=False):
        inputs = inputs.cuda()
        labels = labels.cuda()
        B, SEQ, C = inputs.shape
        print(self.mlp_fea.weight)
        for i in range(SEQ):
            x = inputs[:, i]
            if i == 0:
                fea_prev = torch.zeros_like(x)
            if i < SEQ - 1:
                # earlier frames: same layer, but no autograd graph is built
                self.eval()
                with torch.no_grad():
                    fea_prev = self.mlp_fea(x + fea_prev)
                self.train()
            else:
                # last frame: keep the graph so backprop reaches mlp_fea
                fea_prev = self.mlp_fea(x + fea_prev)
            out = self.mlp_out(fea_prev)
            if i == SEQ - 1:
                # loss comes only from the last frame's output
                loss = nn.L1Loss(reduction='mean')(out, labels)
        return loss
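
For illustration, a detach-based variant of the loop (a sketch, not from the original post) would cut the graph at the earlier frames instead of using eval() and no_grad(). Only the last frame's backward then updates mlp_fea, and since every parameter is still reachable from the loss, DDP should not complain about unused parameters even with the default find_unused_parameters=False:

    def forward(self, inputs, labels, training=False):
        inputs = inputs.cuda()
        labels = labels.cuda()
        B, SEQ, C = inputs.shape
        for i in range(SEQ):
            x = inputs[:, i]
            if i == 0:
                fea_prev = torch.zeros_like(x)
            if i < SEQ - 1:
                # detach cuts the graph: earlier frames feed values, not gradients
                fea_prev = self.mlp_fea(x + fea_prev).detach()
            else:
                fea_prev = self.mlp_fea(x + fea_prev)
        out = self.mlp_out(fea_prev)
        loss = nn.L1Loss(reduction='mean')(out, labels)
        return loss

Note that the eval()/train() toggling in the original code only matters for layers such as BatchNorm or Dropout; with plain Linear layers it has no effect, so the sketch drops it.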