RuntimeError: No grad accumulator for a saved leaf

I am facing the following error while training a NN embedding model:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-50-c2985808eb6c> in <module>
     14 
     15         loss = criterion(anchor_out, positive_out, negative_out)
---> 16         loss.backward()
     17         optimizer.step()
     18 

/opt/conda/lib/python3.6/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    148                 products. Defaults to ``False``.
    149         """
--> 150         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    151 
    152     def register_hook(self, hook):

/opt/conda/lib/python3.6/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
     97     Variable._execution_engine.run_backward(
     98         tensors, grad_tensors, retain_graph, create_graph,
---> 99         allow_unreachable=True)  # allow_unreachable flag
    100 
    101 

RuntimeError: No grad accumulator for a saved leaf!

Here is my code:


import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm


class Network(nn.Module):
    def __init__(self, emb_dim=128):
        super(Network, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 32, 5),
            nn.PReLU(),
            nn.MaxPool2d(2, stride=2),
            nn.Dropout(0.3),
            nn.Conv2d(32, 64, 5),
            nn.PReLU(),
            nn.MaxPool2d(2, stride=2),
            nn.Dropout(0.3)
        )
        
        self.fc = nn.Sequential(
            nn.Linear(64*2*2, 512),
            nn.PReLU(),
            nn.Linear(512, emb_dim)
        )
        
    def forward(self, x):
        x = self.conv(x)
        print(x.shape)  # debug print to verify the conv output shape
        x = x.view(-1, 64*2*2)  # assumes the conv stack yields 64x2x2 feature maps
        x = self.fc(x)
        # x = nn.functional.normalize(x)
        return x

def init_weights(m):
    if isinstance(m, nn.Conv2d):
        torch.nn.init.kaiming_normal_(m.weight)

model = Network(embedding_dims)  # embedding_dims, epochs, device, and train_loader are defined elsewhere
model.apply(init_weights)
model = torch.jit.script(model).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = torch.jit.script(TripletLoss())  # TripletLoss is defined further below


model.train()
for epoch in tqdm(range(epochs), desc="Epochs"):
    running_loss = []
    for step, (anchor_img, positive_img, negative_img, anchor_label) in enumerate(tqdm(train_loader, desc="Training", leave=False)):
        anchor_img = anchor_img.to(device)
        positive_img = positive_img.to(device)
        negative_img = negative_img.to(device)
        
        optimizer.zero_grad()
        anchor_out = model(anchor_img)
        positive_out = model(positive_img)
        negative_out = model(negative_img)
        
        
        loss = criterion(anchor_out, positive_out, negative_out)
        loss.backward()
        optimizer.step()
        
        running_loss.append(loss.cpu().detach().numpy())
    print("Epoch: {}/{} - Loss: {:.4f}".format(epoch+1, epochs, np.mean(running_loss)))

I cannot reproduce the issue using your code together with nn.TripletMarginLoss(), since your TripletLoss is undefined:

import torch
import torch.nn as nn
import torch.optim as optim

embedding_dims = 10
model = Network(embedding_dims)
model.apply(init_weights)
model = torch.jit.script(model)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = torch.jit.script(nn.TripletMarginLoss())

model.train()
anchor_img = torch.randn(1, 1, 28, 28)
positive_img = torch.randn(1, 1, 28, 28)
negative_img = torch.randn(1, 1, 28, 28)
        
optimizer.zero_grad()
anchor_out = model(anchor_img)

positive_out = model(positive_img)
negative_out = model(negative_img)
        
loss = criterion(anchor_out, positive_out, negative_out)
loss.backward()
optimizer.step()

Here is the code for TripletLoss:

import torch
import torch.nn as nn

class TripletLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(TripletLoss, self).__init__()
        self.margin = margin
        
    def calc_euclidean(self, x1, x2):
        # squared Euclidean distance, summed over the embedding dimension
        return (x1 - x2).pow(2).sum(1)
    
    def forward(self, anchor: torch.Tensor, positive: torch.Tensor, negative: torch.Tensor) -> torch.Tensor:
        distance_positive = self.calc_euclidean(anchor, positive)
        distance_negative = self.calc_euclidean(anchor, negative)
        losses = torch.relu(distance_positive - distance_negative + self.margin)

        return losses.mean()
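
For what it's worth, the loss also runs in isolation with plain tensors; here is a minimal check (the batch size of 4 and the embedding size of 128 are arbitrary placeholders):

anchor = torch.randn(4, 128, requires_grad=True)
positive = torch.randn(4, 128)
negative = torch.randn(4, 128)

criterion = TripletLoss(margin=1.0)
loss = criterion(anchor, positive, negative)  # scalar: mean hinge loss over the batch
loss.backward()  # populates anchor.grad without any error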

My code snippet above still works with your custom TripletLoss.
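
I.e., the only change to my snippet above is the criterion (everything else is unchanged):

criterion = torch.jit.script(TripletLoss())

and it still runs through loss.backward() and optimizer.step() without raising the error.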

I really don’t understand why I am getting this error, then. Is it related to the PyTorch version, or to the following setup?


import random

import numpy as np
import torch

torch.manual_seed(2020)
np.random.seed(2020)
random.seed(2020)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(torch.cuda.get_device_name())
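
To help rule out a version issue, this is how I can print the environment details (torch.version.cuda is None on CPU-only builds):

print(torch.__version__)
print(torch.version.cuda)
if device.type == "cuda":
    print(torch.cuda.get_device_name(0))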

I don’t know, so could you post a minimal, executable code snippet to reproduce the issue?