RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [200]] is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

Traceback (most recent call last):
  File "E:\predict_ddi-master\predict_ddi-master\src\run.py", line 42, in <module>
    model.fit(dataloader, i)
  File "E:\predict_ddi-master\predict_ddi-master\src\model.py", line 94, in fit
    DNN_loss.backward()
  File "C:\Users\mahmoud\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\torch\_tensor.py", line 488, in backward
    torch.autograd.backward(
  File "C:\Users\mahmoud\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\torch\autograd\__init__.py", line 197, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [200]] is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
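
The hint at the end of the error refers to PyTorch's anomaly-detection mode. A minimal sketch of turning it on, assuming it is placed near the top of run.py before the training loop starts:

import torch

# Record a traceback for every forward operation so that the backward
# error also reports where the offending tensor was created. This adds
# significant overhead, so enable it only while debugging.
torch.autograd.set_detect_anomaly(True)

With this enabled, the RuntimeError is preceded by a second traceback that points at the forward operation whose output was later modified inplace.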

run.py code

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.model_selection import RepeatedStratifiedKFold
import json

from model import build_model
from utils import index_data, convert_tensor

SS_mat = pd.read_pickle('../data/structural_similarity_matrix.pkl')
TS_mat = pd.read_pickle('../data/target_similarity_matrix.pkl')
GS_mat = pd.read_pickle('../data/GO_similarity_matrix.pkl')

mlb, _, idx2label, drugPair2effectIdx = index_data()
pd.to_pickle(mlb, '../data/mlb.pkl')
pd.to_pickle(idx2label, '../data/idx2label.pkl')
    
x_idx = []
y_idx = []
for k, v in drugPair2effectIdx.items():
    x_idx.append(k)
    y_idx.append(v)
x_idx, y_idx = np.array(x_idx), np.array(y_idx)

with open('../data/hyperparameter.json') as fp:
    hparam = json.load(fp)

kf = RepeatedStratifiedKFold(n_splits=hparam['n_splits'], n_repeats=hparam['n_repeats'], random_state=2020)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for i, (train_idx, test_idx) in enumerate(kf.split(x_idx, y_idx)):    
    x_train = x_idx[train_idx]
    y_train = y_idx[train_idx]    
    
    SS, TS, GS, y = convert_tensor(x_train, y_train, SS_mat, TS_mat, GS_mat, mlb, idx2label)
    dataset = torch.utils.data.TensorDataset(SS, TS, GS, y)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=256, shuffle=True)
    
    model = build_model(hparam)
    model.to(device)
    model.fit(dataloader, i)
        
    x_test = x_idx[test_idx]
    y_test = y_idx[test_idx]
    pd.to_pickle([x_test, y_test], model.path+'test_data.pkl')
    del x_test, y_test
    

model.py code

import torch
import torch.nn as nn
import os
import pandas as pd
import numpy as np
from math import ceil

class build_model(nn.Module):
    def __init__(self, hyperparameter):
        super(build_model, self).__init__()       
        
        input_size = hyperparameter['input_size']
        output_size = hyperparameter['output_size']
        code_size = hyperparameter['code_size']
        AE_lr = hyperparameter['AE_lr']
        DNN_lr = hyperparameter['DNN_lr']
        drop_rate = hyperparameter['drop_rate']
        
        self.epoch = hyperparameter['epoch']
        self.n_repeats = hyperparameter['n_repeats']
        self.n_splits = hyperparameter['n_splits']
        self.save_path = hyperparameter['save_path']
        self.patience = hyperparameter['patience']
        
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        self.encoder1 = self.build_encoder(input_size, code_size, drop_rate)
        self.decoder1 = self.build_decoder(input_size, code_size)
        AE_params1 = list(self.encoder1.parameters()) + list(self.decoder1.parameters())
        self.AE_opt1 = torch.optim.RMSprop(AE_params1, lr=AE_lr)
        
        self.encoder2 = self.build_encoder(input_size, code_size, drop_rate)
        self.decoder2 = self.build_decoder(input_size, code_size)
        AE_params2 = list(self.encoder2.parameters()) + list(self.decoder2.parameters())
        self.AE_opt2 = torch.optim.RMSprop(AE_params2, lr=AE_lr)
        
        self.encoder3 = self.build_encoder(input_size, code_size, drop_rate)
        self.decoder3 = self.build_decoder(input_size, code_size)
        AE_params3 = list(self.encoder3.parameters()) + list(self.decoder3.parameters())
        self.AE_opt3 = torch.optim.RMSprop(AE_params3, lr=AE_lr)
        
        self.AE_criterion = nn.MSELoss()        
        
        self.DNN = self.build_DNN(code_size*3, output_size, drop_rate)
        DNN_params = list(self.encoder1.parameters()) + list(self.encoder2.parameters()) + list(self.encoder3.parameters()) + list(self.DNN.parameters())        
        self.DNN_opt = torch.optim.Adam(DNN_params, lr=DNN_lr)
        
        self.DNN_criterion = nn.BCEWithLogitsLoss()
        
    
    def forward(self, x1, x2, x3):
        x1 = self.encoder1(x1)
        x1_de = self.decoder1(x1)
        
        x2 = self.encoder2(x2)
        x2_de = self.decoder2(x2)
        
        x3 = self.encoder3(x3)
        x3_de = self.decoder3(x3)
        
        x_dnn = torch.cat((x1, x2, x3), 1)
        pred = self.DNN(x_dnn)
        
        return x1_de, x2_de, x3_de, pred
    
    def fit(self, dataloader, repeat):        
        n = ceil(len(dataloader.dataset)/dataloader.batch_size)
        loss_per_epoch = [] # ['DNN', 'SSP', 'TSP', 'GSP']        
        previous = -1
        for i in range(1, self.epoch+1):
            dnn, ae1, ae2, ae3, j = 0, 0, 0, 0, 1
            
            for x1, x2, x3, y in dataloader:
                x1, x2, x3, y = x1.to(self.device), x2.to(self.device), x3.to(self.device), y.to(self.device)
                o1, o2, o3, pred = self(x1, x2, x3)
                
                self.AE_opt1.zero_grad()
                AE_loss1 = self.AE_criterion(o1, x1) 
                AE_loss1.backward(retain_graph=True)
                self.AE_opt1.step()
                
                self.AE_opt2.zero_grad()
                AE_loss2 = self.AE_criterion(o2, x2)
                AE_loss2.backward(retain_graph=True)
                self.AE_opt2.step()
                
                self.AE_opt3.zero_grad()
                AE_loss3 = self.AE_criterion(o3, x3)
                AE_loss3.backward(retain_graph=True)
                self.AE_opt3.step()
                
                self.DNN_opt.zero_grad()
                DNN_loss = self.DNN_criterion(pred, y)
                DNN_loss.backward()
                self.DNN_opt.step()
                
                tmp_loss = list(map(lambda x: round(float(x), 6), [DNN_loss, AE_loss1, AE_loss2, AE_loss3]))
                dnn += tmp_loss[0]
                ae1 += tmp_loss[1]
                ae2 += tmp_loss[2]
                ae3 += tmp_loss[3]
                
                if j % 50 == 0:
                    print(f'Repeat {repeat+1}/{self.n_repeats*self.n_splits}  Epoch {i}/{self.epoch}  Iter {j}/{n} \n Loss:  DNN {dnn/j:.6f}  SSP {ae1/j:.6f}  TSP {ae2/j:.6f}  GSP {ae3/j:.6f}')
                    print()
                    
                j += 1
                
            j -= 1
            print(f'Repeat {repeat+1}/{self.n_repeats*self.n_splits}  Epoch {i}/{self.epoch}  Iter {j}/{n} \n Loss:  DNN {dnn/j:.6f}  SSP {ae1/j:.6f}  TSP {ae2/j:.6f}  GSP {ae3/j:.6f}')
            print()
            loss_per_epoch.append(list(map(lambda x: x/(j), [dnn, ae1, ae2, ae3])))            
            
            
            if len(loss_per_epoch) > self.patience:
                sum_loss = np.sum(np.array(loss_per_epoch), 1)
                current = np.argmin(sum_loss)
                
                if previous != current:
                    previous = current
                    self.save_model(repeat, loss_per_epoch)
                    
                elif (previous == current) and (previous + self.patience == len(loss_per_epoch)):
                    print('===================Early Stopping===================')
                    break
                                            
    
    def build_encoder(self, input_size, code_size, drop_rate):        
        encoder = nn.Sequential(            
            nn.Linear(input_size, 1000),
            nn.BatchNorm1d(1000),
            nn.Dropout(p=drop_rate, inplace=True),
            nn.ReLU(True),            
            nn.Linear(1000, code_size),
            nn.BatchNorm1d(code_size),
            nn.Dropout(p=drop_rate, inplace=True),
            nn.ReLU(True))
        
        return encoder
    
    def build_decoder(self, input_size, code_size):
        decoder = nn.Sequential(
            nn.Linear(code_size, 1000),
            nn.BatchNorm1d(1000),
            nn.ReLU(True),            
            nn.Linear(1000, input_size),            
            nn.Sigmoid())
        
        return decoder
    
    def build_DNN(self, input_size, output_size, drop_rate):
        DNN = nn.Sequential(            
            nn.Linear(input_size, 2000),
            nn.BatchNorm1d(2000),
            nn.Dropout(p=drop_rate, inplace=True),
            nn.ReLU(True),            
            nn.Linear(2000, 2000),
            nn.BatchNorm1d(2000),
            nn.Dropout(p=drop_rate, inplace=True),
            nn.ReLU(True),
            nn.Linear(2000, 2000),
            nn.BatchNorm1d(2000),
            nn.Dropout(p=drop_rate, inplace=True),
            nn.ReLU(True),
            nn.Linear(2000, 2000),
            nn.BatchNorm1d(2000),
            nn.Dropout(p=drop_rate, inplace=True),
            nn.ReLU(True),
            nn.Linear(2000, 2000),
            nn.BatchNorm1d(2000),
            nn.Dropout(p=drop_rate, inplace=True),
            nn.ReLU(True),
            nn.Linear(2000, 2000),
            nn.BatchNorm1d(2000),
            nn.Dropout(p=drop_rate, inplace=True),
            nn.ReLU(True),
            nn.Linear(2000, 2000),
            nn.BatchNorm1d(2000),
            nn.Dropout(p=drop_rate, inplace=True),
            nn.ReLU(True),
            nn.Linear(2000, output_size))
        
        return DNN
    
    
    def save_model(self, repeat, loss_per_epoch):
        self.path = self.save_path + str(repeat) + '/'
        if not os.path.isdir(self.path):
            os.mkdir(self.path)
        torch.save(self.state_dict(), self.path+'model_checkpoint')
        pd.to_pickle(loss_per_epoch, self.path+'loss_per_epoch.pkl')
    
    def load_model(self, path):
        weights = torch.load(path, map_location=self.device)
        self.load_state_dict(weights)        

utils.py code

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import recall_score, precision_score, accuracy_score

def index_data():    
    drugPair2effect = pd.read_pickle('../data/drugPair2effect_idx.pkl')
    y_all = list(drugPair2effect.values())
    
    mlb = MultiLabelBinarizer()
    mlb.fit_transform(y_all)
    
    labels = sorted(list(set(y_all)))
    
    label2idx = {}
    for i, j in enumerate(labels):
        label2idx[j] = i
        
    drugPair2effectIdx = {}
    for k, v in drugPair2effect.items():
        drugPair2effectIdx[k] = label2idx[v]
        
    idx2label = np.zeros(len(label2idx), dtype='O')
    for k, v in label2idx.items():
        idx2label[v] = np.array(k)
    
    return mlb, label2idx, idx2label, drugPair2effectIdx


def convert_tensor(x_idx, y_idx, SS_mat, TS_mat, GS_mat, mlb, idx2label):    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    SS = torch.tensor(SS_mat[x_idx].reshape(len(x_idx), len(SS_mat)*2)).float()
    TS = torch.tensor(TS_mat[x_idx].reshape(len(x_idx), len(TS_mat)*2)).float()
    GS = torch.tensor(GS_mat[x_idx].reshape(len(x_idx), len(GS_mat)*2)).float()
    y = torch.tensor(mlb.transform(idx2label[y_idx])).float()
    
    return SS, TS, GS, y

def evaluate_model(answer, prediction):
    accuracy = accuracy_score(answer, prediction)
    macro_recall = recall_score(answer, prediction, average='macro')
    macro_precision = precision_score(answer, prediction, average='macro')
    micro_recall = recall_score(answer, prediction, average='micro')
    micro_precision = precision_score(answer, prediction, average='micro')
    
    return accuracy, macro_recall, macro_precision, micro_recall, micro_precision

I need help, please.

Hi Mahmoud!

For some suggestions about how to debug such inplace-modification errors,
see this post:

As discussed in the linked post, these backward (retain_graph = True) calls often cause inplace-modification errors: each optimizer step() in between modifies parameters inplace, while the retained graph still expects their old values for the next backward pass. So you should make a particular point of looking at that part of your fit() loop while debugging your issue. A sketch of one possible reordering follows.
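
As an illustration only, and not necessarily the training scheme you want: one common way to avoid the problem is to run all of the backward() calls before any of the step() calls, so that no parameter is modified inplace while a retained graph still needs its old value. A sketch of your inner loop reordered that way (note that gradients from the autoencoder losses and the DNN loss now accumulate on the shared encoder parameters before each step, which slightly changes what each optimizer sees):

# Zero all gradients up front; the optimizers share encoder parameters.
self.AE_opt1.zero_grad()
self.AE_opt2.zero_grad()
self.AE_opt3.zero_grad()
self.DNN_opt.zero_grad()

AE_loss1 = self.AE_criterion(o1, x1)
AE_loss2 = self.AE_criterion(o2, x2)
AE_loss3 = self.AE_criterion(o3, x3)
DNN_loss = self.DNN_criterion(pred, y)

# All backward passes run before any inplace parameter update.
AE_loss1.backward(retain_graph=True)
AE_loss2.backward(retain_graph=True)
AE_loss3.backward(retain_graph=True)
DNN_loss.backward()  # the last backward may free the graph

# Only now modify parameters inplace.
self.AE_opt1.step()
self.AE_opt2.step()
self.AE_opt3.step()
self.DNN_opt.step()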

Best.

K. Frank