RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [32, 768]], which is output 0 of TanhBackward, is at version 2; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
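
The hint can be followed directly: wrapping one forward/backward pass in anomaly mode produces a second stack trace that points at the forward op which created the failing tensor (it is slow, so use it only while debugging). A minimal sketch, where model, batch, and criterion are placeholder names:

import torch

with torch.autograd.detect_anomaly():  # context-manager form of set_detect_anomaly(True)
    data, logits, labels = model(batch, labels)
    loss = criterion(logits, labels)
    loss.backward()

The forward pass that raises the error: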

def forward(self, input, labels):
    result = self.bert(**input).pooler_output

    label_one = [idx for idx, dat in enumerate(labels) if dat.item() == 0]
    label_two = [idx for idx, dat in enumerate(labels) if dat.item() == 1]
    alpha = 0.5
    lam = np.random.beta(alpha, alpha)
    temp_result = result.detach()  # shares storage with result but is cut from the graph; the mixed-in features must not carry gradients
    # replace each original node's features with the mixed node's features
    if label_one:
        label_one_tensor = torch.tensor(label_one).to(device)
        deep_label_one = copy.deepcopy(label_one)
        random.shuffle(deep_label_one)
        label_one_tensor_shuffle = torch.tensor(deep_label_one).to(device)
        # with torch.no_grad():
        result[label_one_tensor] = lam * temp_result[label_one_tensor, :] + (1 - lam) * temp_result[label_one_tensor_shuffle, :]

    print(result)
    if label_two:
        label_two_tensor = torch.tensor(label_two).to(device)
        deep_label_two = copy.deepcopy(label_two)
        random.shuffle(deep_label_two)
        label_two_tensor_shuffle = torch.tensor(deep_label_two).to(device)
        # with torch.no_grad():
        result[label_two_tensor] = lam * temp_result[label_two_tensor, :] + (1 - lam) * temp_result[label_two_tensor_shuffle, :]

    data = self.head(result)
    return data, self.fc(data), labels

I suspect these two in-place assignments are what breaks the backward pass:

result[label_one_tensor] = lam * temp_result[label_one_tensor, :] + (1 - lam) * temp_result[label_one_tensor_shuffle, :]

result[label_two_tensor] = lam * temp_result[label_two_tensor, :] + (1 - lam) * temp_result[label_two_tensor_shuffle, :]
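
pooler_output is produced by a tanh, and TanhBackward computes its gradient from the saved output (1 - y**2), so overwriting rows of result in place bumps the tensor's version counter and invalidates exactly that saved output. A minimal repro of the same failure:

import torch

x = torch.randn(4, 3, requires_grad=True)
y = torch.tanh(x)   # TanhBackward saves y itself to compute 1 - y**2
y[0] = 0.0          # in-place write: y is now at version 1
y.sum().backward()  # RuntimeError: ... expected version 0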

To avoid it, you could store the mixed rows in a temporary tensor and assemble the final batch with torch.cat or torch.stack; a lighter-touch alternative is to clone result and write into the clone, since CloneBackward does not save its output and in-place writes to the clone are therefore safe.
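
A minimal sketch of the clone-based rewrite, keeping the per-class mixup and the detached mixing sources from the original (result, labels, self.head, self.fc, and the numpy/torch imports as in the question; the manual index/shuffle bookkeeping is replaced by torch.randperm):

def forward(self, input, labels):
    result = self.bert(**input).pooler_output
    alpha = 0.5
    lam = np.random.beta(alpha, alpha)
    mixed = result.clone()  # new autograd node: writes below no longer touch the tensor saved by tanh
    temp = result.detach()  # as in the original, the mixed-in features carry no gradient
    for cls in (0, 1):
        idx = (labels == cls).nonzero(as_tuple=True)[0]
        if idx.numel() > 0:
            perm = idx[torch.randperm(idx.numel(), device=idx.device)]  # shuffle within the class
            mixed[idx] = lam * temp[idx] + (1 - lam) * temp[perm]
    data = self.head(mixed)
    return data, self.fc(data), labels

The index assignment's backward only routes gradients by position and never needs the overwritten values, so the version-counter check passes. Note that, as in the original code, any row rebuilt from detached features sends no gradient back into BERT; drop the detach() if that is not intended.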