Simple training but got error: one of the variables needed for gradient computation has been modified by an inplace operation

Hi everyone,
I get the following error when I try to retrain my network.

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [64, 10]], which is output 0 of AsStridedBackward0, is at version 3; expected version 2 instead.

I created and trained two simple networks on the MNIST dataset (both use the architecture below), and both of them trained successfully.

import torch
import torch.nn as nn


class Net1(nn.Module):
    def __init__(self):
        super(Net1, self).__init__()
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 64)
        self.fc3 = nn.Linear(64, 10)
        self.r1 = nn.ReLU(inplace=False)
        self.r2 = nn.ReLU(inplace=False)

    def forward(self, x):
        x = torch.flatten(x, 1)
        x = self.r1(self.fc1(x))
        x = self.r2(self.fc2(x))
        x = self.fc3(x)
        return x
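
The original training loop is not shown here; it was a standard MNIST classification setup, roughly along these lines (the optimizer and hyperparameters in this sketch are illustrative, not necessarily my exact settings):

import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# standard MNIST loading with the same (x - 0.5) / 0.5 normalization used later in the post
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
train_set = datasets.MNIST('./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_set, batch_size=64, shuffle=True)

net = Net1()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=1e-3)

net.train()
for epoch in range(5):
    for images, targets in train_loader:
        optimizer.zero_grad()
        loss = criterion(net(images), targets)
        loss.backward()
        optimizer.step()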

But when I reload the networks and retrain the smaller one, I get the above error. Below is my main code for re-training.

import numpy as np
import imageio
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


class MyDataset(Dataset):
    def __init__(self, index, input_x, label):
        self.index = index
        self.size = len(index)
        self.input_list = input_x
        self.label_list = label

    def __len__(self):
        return self.size

    def __getitem__(self, item):
        image = self.input_list[self.index[item]]
        labels = self.label_list[self.index[item]]
        return image, labels


if __name__ == '__main__':
    small_path = './mnist_fnn_s.pth'
    large_path = './mnist_fnn_l.pth'
    temp_path = './mnist_temp.pth'
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    net1 = Net1()
    net1.load_state_dict(torch.load(large_path, map_location=torch.device('cpu')))
    net2 = Net1()
    net2.load_state_dict(torch.load(small_path, map_location=torch.device('cpu')))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net2.parameters(), lr=0.01)

    image_path = './14.jpg'
    IM = imageio.v2.imread(image_path)
    IM = IM[:, :, np.newaxis]
    IM = IM / 255
    IM = (IM - 0.5) / 0.5
    lb = -0.05
    ub = 0.05
    max_range = [[1.5, 1.], [1.5, 1.], [1.5, 1.], [1.5, 1.], [1.5, 1.], [1.5, 1.], [1.5, 1.], [1.5, 1.], [1.5, 1.], [1.5, 1.]]
    max_range = np.array(max_range)

    # create original dataset
    n_dataset = 5000
    e = 0.0001

    for k in range(10):
        print("%d iteration" % k)
        input_list = []
        ref_list = []
        output_l_list = []
        output_s_list = []
        index = []

        for i in range(n_dataset):
            if (i + 1) % 1000 == 0:
                print("%d train sample created" % (i + 1))
            noise = lb + (ub - lb) * np.random.random()
            IM_sample = IM.copy()
            IM_sample[13:15, 13:15, :] += noise
            input_list.append(IM_sample.transpose(2, 0, 1))
            outputs1 = net1(torch.from_numpy(IM_sample.transpose(2, 0, 1)).unsqueeze_(0).to(torch.float32))
            outputs2 = net2(torch.from_numpy(IM_sample.transpose(2, 0, 1)).unsqueeze_(0).to(torch.float32))
            output_l_list.append(outputs1)
            output_s_list.append(outputs2)
            ref_list.append(outputs2.squeeze() + (outputs1.squeeze() - outputs2.squeeze() + e) / 2)
            for label in range(10):
                label_diff = outputs1[0, label] - outputs2[0, label]
                max_label = max(abs(max_range[label]))
                if (abs(label_diff) > max_label * 2 / 3) & (i not in index):
                    index.append(i)

        print(output_s_list[0:10])
        train_dataset = MyDataset(index, input_list, ref_list)
        train_dataloader = DataLoader(dataset=train_dataset, batch_size=1, shuffle=False, num_workers=0)

        print("Start re-train")
        net2.train()
        for epoch in range(20):
            if (epoch + 1) % 5 == 0:
                print("%d epoch re-train" % (epoch + 1))
            for i, data in enumerate(train_dataloader, 0):
                inputs, labels = data[0], data[1]
                outputs = net2(inputs.to(torch.float32))
                optimizer.zero_grad()
                loss = criterion(outputs, labels)
                # loss.backward(retain_graph=True)
                # loss.backward(inputs=list(net2.parameters()))
                loss.backward()
                optimizer.step()

        net2.eval()
        torch.save(net2.state_dict(), temp_path)

I know this kind of problem has been discussed many times in other topics, but none of the suggested solutions works for my case, so I would appreciate your help on this. Thank you!

Based on your code, it seems you are creating the dataset from differentiable outputs of your models. Do you really want to backpropagate through the data creation, or would it make sense to create these samples in a no_grad context?
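
A minimal sketch of that idea, adapted from your data-creation loop and assuming the variables from your script (net1, net2, IM, lb, ub, e, n_dataset, input_list, ref_list) are already defined:

# create the reference targets without recording an autograd graph
with torch.no_grad():
    for i in range(n_dataset):
        noise = lb + (ub - lb) * np.random.random()
        IM_sample = IM.copy()
        IM_sample[13:15, 13:15, :] += noise
        input_list.append(IM_sample.transpose(2, 0, 1))
        x = torch.from_numpy(IM_sample.transpose(2, 0, 1)).unsqueeze(0).to(torch.float32)
        outputs1 = net1(x)  # plain tensors, no graph is stored
        outputs2 = net2(x)
        ref_list.append(outputs2.squeeze() + (outputs1.squeeze() - outputs2.squeeze() + e) / 2)
        # the label-difference / index selection from your loop can stay unchanged here

The re-training loop afterwards would still backpropagate through net2, since its forward pass inside the training loop runs outside of the no_grad context.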

Those created data are for my re-training, so I do want to backpropagate through them to train my model closer to what I want.

Hello, @ptrblck. Could you give me some advice on this problem?