RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [1, 1, 256, 256]], which is output 0 of ReluBackward0, is at version 1; expected version 0 instead

Hi,

Frankly speaking, I am new to PyTorch and more familiar with TensorFlow. While reproducing the code available Here, using PyCharm as the IDE, I am running into the following error. Could you please help me resolve it? I would really appreciate your help. Thanks in advance.

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [1, 1, 256, 256]], which is output 0 of ReluBackward0, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

I have only edited the dataset file (to load the dataset in an unsupervised way from the directories) and created a new train.py file to run the code in PyCharm. All of the remaining code is exactly the same as in the mentioned repository.

The dataset.py file is edited to this:

import os
import glob
import torch
import random
import torch.utils.data as data
from PIL import Image
import torchvision.transforms as transforms


class Images_with_Names(data.Dataset):
    """ can act both as Supervised or Un-supervised """

    def __init__(self, directory_A, directory_B, unsupervised=True, transform=None):
        self.directory_A = directory_A
        self.directory_B = directory_B
        self.unsupervised = unsupervised
        self.transform = transform

        self.imageList_A = sorted(glob.glob(f"{directory_A}/*.jpg*"))
        self.imageList_B = sorted(glob.glob(f"{directory_B}/*.jpg*"))

    def __getitem__(self, index):
        image_A = Image.open(self.imageList_A[index])
        if self.unsupervised:
            image_B = Image.open(self.imageList_B[random.randint(0, len(self.imageList_B) - 1)])
        else:
            image_B = Image.open(self.imageList_B[index])

        if self.transform is not None:
            image_A = self.transform(image_A)
            image_B = self.transform(image_B)

        return image_A, image_B

    def __len__(self):
        return max(len(self.imageList_A), len(self.imageList_B))

def preprocessing(x):
    # scale pixel values from [0, 255] to [-1, 1] and prepend a batch dimension
    x = (x / 127.5) - 1
    x = torch.reshape(x, (-1, x.shape[0], x.shape[1], x.shape[2]))
    return x

The train.py file is:

import os
import torch
import torchvision.transforms as transforms
from torchsummary import summary

from utils import train_UGAC
from dataset import Images_with_Names
from dataset import preprocessing
from Networks import CasUNet_3head, NLayerDiscriminator


# First instantiate the generators and discriminators
netG_A = CasUNet_3head(3, 3)
netD_A = NLayerDiscriminator(3, n_layers=4)
netG_B = CasUNet_3head(3, 3)
netD_B = NLayerDiscriminator(3, n_layers=4)

data_directory = "../code/UncertaintyAwareCycleConsistency/data/"
directory_A = os.path.join(data_directory, "A")
directory_B = os.path.join(data_directory, "B")

data_transformer = transforms.Compose([transforms.PILToTensor(),
                                       transforms.Lambda(lambda x: preprocessing(x))])

train_loader = Images_with_Names(directory_A=directory_A, directory_B=directory_B, unsupervised=True,
                                 transform=data_transformer)

# summary(netG_A.cuda(), input_size=(3, 256, 256))
train_UGAC(netG_A, netG_B, netD_A, netD_B, train_loader, dtype=torch.cuda.FloatTensor, device='cuda',
           num_epochs=10, init_lr=1e-5, ckpt_path='..saved_models/checkpoints/UGAC',
           list_of_hp=[1, 0.015, 0.01, 0.001, 1, 0.015, 0.01, 0.001, 0.05, 0.05, 0.01])

Attempts that I have tried to resolve the issue are:

  1. Setting inplace=False on all ReLU and LeakyReLU activations, following this, but that failed.
  2. Getting the traceback of the forward call that caused the error with torch.autograd.set_detect_anomaly(True); it reports the following:
[W python_anomaly_mode.cpp:104] Warning: Error detected in ReluBackward0. Traceback of forward call that caused the error:

File "/home/xyz/code/UncertaintyAwareCycleConsistency/src/train.py", line 29, in <module>
    netG_A, netG_B, netD_A, netD_B = train_UGAC(netG_A, netG_B, netD_A, netD_B, train_loader, dtype=torch.cuda.FloatTensor,
  File "/home/xyz/code/UncertaintyAwareCycleConsistency/src/utils.py", line 69, in train_UGAC
    t0, t0_alpha, t0_beta = netG_B(xA)
  File "/home/xyz/.conda/envs/pytorch/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/xyz/code/UncertaintyAwareCycleConsistency/src/Networks.py", line 205, in forward
    y = self.unet_list[i](y + x)
  File "/home/xyz/.conda/envs/pytorch/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/xyz/code/UncertaintyAwareCycleConsistency/src/Networks.py", line 181, in forward
    y_mean, y_alpha, y_beta = self.out_mean(x), self.out_alpha(x), self.out_beta(x)

Looking forward to hearing from you soon. Thanks.

@ptrblck

It’s hard to tell where the error is coming from without seeing the model definition. Check the forward implementation of your model(s) and remove all inplace operations (e.g. tensor += a) and replace them with their out-of-place versions.
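As a toy illustration of the difference (a standalone snippet, not taken from the repository): the commented-out in-place add is exactly the kind of operation that triggers this error, while the out-of-place version works. The same idea applies to swapping nn.ReLU(inplace=True) for inplace=False.

import torch

x = torch.randn(1, 8, requires_grad=True)
residual = torch.randn(1, 8)

y = torch.relu(x)
# y += residual       # in-place: ReluBackward0 still needs y -> RuntimeError at backward()
y = y + residual      # out-of-place: builds a new tensor, so the saved activation stays intact
y.sum().backward()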

@zeeshannisar Have you resolved it? I ran into this problem too. Setting inplace=False on all ReLUs failed.

Dawg, I'm throwing together a bare-bones implementation of PerceiverIO and your suggestion got my training loop to work. Thanks man.

I also had a similar error, and my problem was having two optimizers updating different subsections of my model. Many optimizers keep track of previous passes to change how the weights are modified. My first optimizer would step the weights, leaving a different version in those weights. When my second optimizer came around and tried to update those weights alongside other weights that had not been stepped yet, it threw the error.

I found the solution was to add up the losses from the two different ways I wanted to train the model and call backward once for both. Then I put the entire model on one optimizer and called step right after that backward call. This fixed the problem.
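A minimal, runnable sketch of that pattern (the model, losses, and data below are throwaway placeholders, not my actual code):

import torch
import torch.nn as nn

# placeholder model and data, just to make the sketch self-contained
model = nn.Linear(4, 2)
batch_a, target_a = torch.randn(8, 4), torch.randn(8, 2)
batch_b, target_b = torch.randn(8, 4), torch.randn(8, 2)
criterion_a, criterion_b = nn.MSELoss(), nn.L1Loss()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)  # one optimizer for the whole model

optimizer.zero_grad()
loss_a = criterion_a(model(batch_a), target_a)  # first training objective
loss_b = criterion_b(model(batch_b), target_b)  # second training objective
loss = loss_a + loss_b                          # add the losses up...
loss.backward()                                 # ...call backward once for both...
optimizer.step()                                # ...and step once after that backward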

For me this error was really misleading, but I was doing something pretty weird haha. Hopefully this helps someone

Thank you for sharing. I have also encountered this problem recently while doing similar work. If it is convenient, could you share the code? Thank you very much.

It would be great if you could help me.

I met the same problem while calculating an MLM loss and a contrastive loss on the same input.

The simple solution is to clone the input so each loss works on its own copy:

text_mlm = text.clone()          # separate copy for the MLM branch
loss_itc = itc(text, image)
loss_mlm = mlm(text_mlm)

For your program, I think you should check the forward passes in train_UGAC to see if a similar problem exists.

Thanks a lot. It worked for me.

@ptrblck I also hit this problem when I want to select part of a tensor to do self-attention. Here is my code :slight_smile:

for idx in range(10):
    mask = (indices == idx)
    x[mask] = self.selfattention(x[mask], x[mask], x[mask])

I tried copying x, but that didn't solve the problem. Could you please help me solve it?

Try to .clone() the output of the computation before assigning it back.
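For what it's worth, a sketch of one way to apply that advice (a slight variation: writing the results into a clone of x rather than back into x, assuming x, indices, and self.selfattention as in the snippet above), so the tensor that feeds the self-attention is never modified in place:

out = x.clone()  # out-of-place copy; x itself stays at its original version
for idx in range(10):
    mask = (indices == idx)
    # read from the untouched x, write the result into the clone
    out[mask] = self.selfattention(x[mask], x[mask], x[mask])
x = out  # use the rebuilt tensor downstream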


RuntimeError                              Traceback (most recent call last)
Cell In[177], line 52
     50 print(loss)
     51 loss = loss * (-reward)
---> 52 loss.backward()
     53 #progbar.set_description("Loss : {:.3f} ".format(loss))
     54 #player.train_model(reward)
     55 player.finalize_episode(ans['ans'])

File /opt/homebrew/anaconda3/envs/myenv/lib/python3.9/site-packages/torch/_tensor.py:492, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
    482 if has_torch_function_unary(self):
    483     return handle_torch_function(
    484         Tensor.backward,
    485         (self,),
   (...)
    490         inputs=inputs,
    491     )
--> 492 torch.autograd.backward(
    493     self, gradient, retain_graph, create_graph, inputs=inputs
    494 )

File /opt/homebrew/anaconda3/envs/myenv/lib/python3.9/site-packages/torch/autograd/__init__.py:251, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    246     retain_graph = create_graph
    248 # The reason we repeat the same comment below is that
    249 # some Python versions print out the first line of a multi-line function
    250 # calls in the traceback and some print out the last line
--> 251 Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
    252     tensors,
    253     grad_tensors_,
    254     retain_graph,
    255     create_graph,
    256     inputs,
    257     allow_unreachable=True,
    258     accumulate_grad=True,
    259 )

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 26]], which is output 0 of SigmoidBackward0, is at version 1; expected version 0 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

This is my training loop:

for episode_set in progbar:
    for _ in range(update_episode):
        state = env.reset()
        done = False
        correct_count = 0
        probs_ = []
        while not done:
            # print("Train")
            guessed = player.get_guessed_mat(state)
            guess, probs = player.select_action(state)
            state, reward, done, ans = env.step(guess)
            correct_mask_answer = np.array([1 if l in env.guess_word else 0 for l in letters])
            correct_mask_guessed = np.array([1 if l in guessed else 0 for l in letters])
            obj = (1 - correct_mask_guessed) * correct_mask_answer
            obj = obj[:-1]
            # print(probs.shape)
            # print(state)
            if reward > 0:
                correct_count += 1.0
            if reward == env.win_reward:
                wins_avg += 1.0
            target_tensor = torch.from_numpy(obj)
            target = target_tensor.unsqueeze(0)
            target = torch.tensor(target, dtype=torch.float32)
            probs_ = probs.clone()
            target.detach()
            loss = loss_func(probs, target)
            print(loss)
            loss = loss * (-reward)
            loss.backward()
            # progbar.set_description("Loss : {:.3f} ".format(loss))
            # player.train_model(reward)
        player.finalize_episode(ans['ans'])
        avg_correct += correct_count

Can you help me with my query?

It’s unclear where the error comes from in your code so could you post a minimal and executable code snippet reproducing the issue, please? The posted stacktrace does not show enough information to see where a disallowed inplace manipulation is done.
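For reference, this class of error can be reproduced with a toy snippet of just a few lines (purely illustrative, not the poster's code): an in-place op modifies a tensor that a backward function still needs.

import torch

x = torch.randn(1, 26, requires_grad=True)
y = torch.sigmoid(x)   # SigmoidBackward0 saves y to compute its gradient
y += 1                 # in-place add bumps y from version 0 to version 1
y.sum().backward()     # RuntimeError: ... output 0 of SigmoidBackward0, is at version 1; expected version 0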

I am having the same error.
This is my code:

    for epoch in range(1, epochs):

        with tqdm(train_loader, unit="batch") as tepoch:

            for i, batch in enumerate(tepoch):

                tepoch.set_description(f"Epoch {epoch}")

                query, passages = batch["query"], batch["passage"]

                q_reps = encode(model, query.to(device))
                p_reps = encode(model, passages.to(device))

                sim_scores = compute_similarity(q_reps, p_reps)

                scores = sim_scores.view(q_reps.size(0), -1)
                

                target = torch.arange(batch_size) * group_size
                target = target.to(device)
                loss = ce_loss(scores, target)
                
                # target2 = torch.arange(batch_size, device=device, dtype=torch.long)
                # s_clone = scores.clone()
                # loss2 = ce_loss(s_clone[:,target].transpose(0,1), target2)
                
                # loss = 0.5*(loss1 + loss2 )
                
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

def main_worker(gpu, ngpus_per_node, args):

    model_name_or_path = args.model_name_or_path
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True)

    # for n,p in model.named_parameters():
    #     if "attention"  in n or "attn_ln"  in n:#if "embeddings" in n:
    #         p.requires_grad = True
    #         print(f"setting grad :{n} True")
    #     else:
    #         p.requires_grad = False
    #         print(f"setting grad :{n} False")
            
    args.gpu = gpu
    ngpus_per_node = torch.cuda.device_count()

    print("Use GPU: {} for training".format(args.gpu))

    args.rank = args.rank * ngpus_per_node + gpu

    dist.init_process_group(
        backend=args.dist_backend,
        init_method=args.dist_url,
        world_size=args.world_size,
        rank=args.rank,
    )

    print("==> Making model..")

    torch.cuda.set_device(args.gpu)

    model.cuda(args.gpu)

    args.batch_size = int(args.batch_size / ngpus_per_node)
    args.num_workers = int(args.num_workers / ngpus_per_node)

    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])

    train_dataset = TrainDatasetForEmbedding(args=data_args, tokenizer=tokenizer)
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=EmbedCollator(tokenizer),
        sampler=train_sampler,
    )

Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.LongTensor [8, 30]] is at version 3; expected version 2 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).