Distributed: RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [2048]] is at version 4; expected version 3 instead

I have converted a repo:

to use PyTorch's distributed framework, but I get this weird error:

-- Process 3 terminated with the following error:
Traceback (most recent call last):
  File "/home/rachel/miniconda3/envs/pt_models/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
    fn(i, *args)
  File "/home/rachel/rachelve/pt_models/mshifted_mp/main.py", line 172, in main_worker
    mean_eer_arr,auc_arr,loss_arr=train_model(model, train_loader, test_loader, train_loader_1, args)
  File "/home/rachel/rachelve/pt_models/mshifted_mp/main.py", line 61, in train_model
    running_loss = run_epoch(model, train_loader_1, optimizer, center, args)
  File "/home/rachel/rachelve/pt_models/mshifted_mp/main.py", line 90, in run_epoch
    loss.backward()
  File "/home/rachel/miniconda3/envs/pt_models/lib/python3.7/site-packages/torch/_tensor.py", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/home/rachel/miniconda3/envs/pt_models/lib/python3.7/site-packages/torch/autograd/__init__.py", line 149, in backward
    allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [2048]] is at version 4; expected version 3 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

The traceback does not tell me exactly which part of the code fails inside loss.backward(), so I am finding it difficult to debug. I suspect I need to .clone() the problematic output, but I am not sure where.
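To make sure I understand what the error is even checking, here is a tiny standalone snippet (unrelated to my model) that produces the same class of error: a tensor saved for the backward pass carries a version counter, and modifying it in place afterwards makes backward refuse to use it:

import torch

w = torch.randn(3, requires_grad=True)
y = w.exp()        # exp() saves its output for the backward pass
y.add_(1.0)        # in-place op bumps y's version counter (0 -> 1)
loss = y.sum()
loss.backward()    # RuntimeError: ... [torch.FloatTensor [3]] is at version 1; expected version 0 instead

So somewhere in my code a tensor that autograd saved during the forward pass is being written in place before backward() runs; the bare [2048] shape looks like a 2048-wide vector (e.g. a BatchNorm weight/bias/running stat in the backbone, or my center vector) rather than one of the [B, 2048] outputs.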

The relevant model and loss code is as follows:

def run_epoch(model, train_loader, optimizer, center, args):
    total_loss, total_num = 0.0, 0
    for batch_idx, ((img1, img2), _) in enumerate(train_loader):#, desc='Train...'):

        if args.gpu is not None:
            img1 = img1.cuda(args.gpu, non_blocking=True)
            img2 = img2.cuda(args.gpu, non_blocking=True)

        optimizer.zero_grad()

        out_1 = model(img1)
        out_2 = model(img2)
        out_1 = out_1 - center
        out_2 = out_2 - center

        center_loss = ((out_1 ** 2).sum(dim=1).mean() + (out_2 ** 2).sum(dim=1).mean())
        loss = contrastive_loss(out_1, out_2) + center_loss

        loss.backward()

        optimizer.step()

        total_num += img1.size(0)
        total_loss += loss.item() * img1.size(0)

    return total_loss / (total_num)
def contrastive_loss(out_1, out_2):
    out_1 = F.normalize(out_1, dim=-1)#.clone()
    out_2 = F.normalize(out_2, dim=-1)#.clone()
    bs = out_1.size(0)
    temp = 0.35 #0.25   #0.35 gives increasing auroc
    # [2*B, D]
    out = torch.cat([out_1, out_2], dim=0)
    # [2*B, 2*B]
    sim_matrix = torch.exp(torch.mm(out, out.t().contiguous()) / temp)
    mask = (torch.ones_like(sim_matrix) - torch.eye(2 * bs, device=sim_matrix.device)).bool()
    # [2B, 2B-1]
    sim_matrix = sim_matrix.masked_select(mask).view(2 * bs, -1)

    # compute loss
    pos_sim = torch.exp(torch.sum(out_1 * out_2, dim=-1) / temp)
    # [2*B]
    pos_sim = torch.cat([pos_sim, pos_sim], dim=0)
    loss = (- torch.log(pos_sim / sim_matrix.sum(dim=-1))).mean()
    return loss
def train_model(model, train_loader, test_loader, train_loader_1, args):
    model.eval()
    auc_arr, loss_arr = [], []
    auc, feature_space = get_score(model, args, train_loader, test_loader)
    print('Epoch: {}, AUROC is: {}'.format(0, auc))
    auc_arr.append(auc)
    optimizer = optim.SGD(model.parameters(), lr=args.lr, weight_decay=0.00005)
    center = torch.FloatTensor(feature_space).mean(dim=0)
    center = F.normalize(center, dim=-1)
    center = center.cuda(args.gpu, non_blocking=True)
    for epoch in range(args.epochs):
        running_loss = run_epoch(model, train_loader_1, optimizer, center, args)
        print('Epoch: {}, Loss: {}'.format(epoch + 1, running_loss))
        loss_arr.append(running_loss)
        auc, _ = get_score(model, args, train_loader, test_loader)
        print('Epoch: {}, AUROC is: {}'.format(epoch + 1, auc))
        auc_arr.append(auc)
    return auc_arr,loss_arr
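For what it's worth, contrastive_loss only expects two [B, D] batches of features; a standalone call with made-up sizes looks like this (assuming the function above and its torch.nn.functional import are in scope):

import torch

B, D = 8, 2048                            # hypothetical batch size; resnet152 features are 2048-dim
out_1 = torch.randn(B, D, requires_grad=True)
out_2 = torch.randn(B, D, requires_grad=True)
loss = contrastive_loss(out_1, out_2)     # two [B, D] batches -> scalar NT-Xent-style loss
loss.backward()                           # no model involved, so no in-place/version error here
print(loss.item())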

The distributed parts of the code were adapted from here:

The model code is:

class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = models.resnet152(pretrained=True)
        self.backbone.fc = torch.nn.Identity()
        freeze_parameters(self.backbone, train_fc=False)
    def forward(self, x):
        z1 = self.backbone(x)
        z_n = F.normalize(z1, dim=-1)
        return z1

def freeze_parameters(model, train_fc=False):
    for p in model.conv1.parameters():
        p.requires_grad = False
    for p in model.bn1.parameters():
        p.requires_grad = False
    for p in model.layer1.parameters():
        p.requires_grad = False
    for p in model.layer2.parameters():
        p.requires_grad = False
    if not train_fc:
        for p in model.fc.parameters():
            p.requires_grad = False
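To see which tensors in the model actually have the bare [2048] shape from the error message, a quick listing sketch (assuming the Model class above, before any DistributedDataParallel wrapping):

import torch

model = Model()
for name, p in model.named_parameters():
    if tuple(p.shape) == (2048,):
        print("param :", name)            # e.g. layer4 BatchNorm weights/biases
for name, b in model.named_buffers():
    if tuple(b.shape) == (2048,):
        print("buffer:", name)            # e.g. layer4 BatchNorm running stats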

I think it fails here: z1 = self.backbone(x)

I tried using .clone() around it, etc., but it does not seem to help.
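Concretely, this is the kind of placement I mean by "using clone around it" (hypothetical variants; the version error stayed the same):

# Variant A: clone inside Model.forward (rest of the class unchanged)
def forward(self, x):
    z1 = self.backbone(x).clone()   # clone right after the line I suspect
    return z1

# Variant B: clone the model outputs in run_epoch instead
out_1 = model(img1).clone() - center
out_2 = model(img2).clone() - center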