I’m new to PyTorch. I’m implementing a simple VAE-GAN model based on this great notebook: https://www.kaggle.com/carloalbertobarbano/faceswap-trump-in-a-cage/notebook. While training the autoencoder (generator) model, I always get the same error:
“RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [2048, 1024]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).”
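Following the hint in the message, this is how I enable anomaly detection before the training loop (it slows training down, so I only switch it on while debugging):

import torch

# Make autograd record forward-pass tracebacks, so the backward error above
# also reports which forward operation created the offending tensor.
torch.autograd.set_detect_anomaly(True)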
Here is the part of my code that raises this error (the single-step training functions):
def train_discriminator(D, criterion, optimizer, real, fake):
    optimizer.zero_grad()
    with torch.set_grad_enabled(True):
        pred_real = D(real)
        pred_fake = D(fake)
        loss_real = criterion(pred_real, torch.ones(real.size(0), 1).to(device))
        loss_fake = criterion(pred_fake, torch.zeros(fake.size(0), 1).to(device))
        loss = loss_real + loss_fake
        loss.backward(retain_graph=True)
        optimizer.step()
    return loss.item()
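For completeness, one of the variants I already tried (see the list of attempts at the end of the post) was detaching the generated batch inside the discriminator step, so that the discriminator update does not retain the generator’s graph. A sketch (the name train_discriminator_detached is mine, and it assumes the same global device as above):

def train_discriminator_detached(D, criterion, optimizer, real, fake):
    optimizer.zero_grad()
    pred_real = D(real)
    pred_fake = D(fake.detach())  # cut the graph back to the generator here
    loss_real = criterion(pred_real, torch.ones(real.size(0), 1).to(device))
    loss_fake = criterion(pred_fake, torch.zeros(fake.size(0), 1).to(device))
    loss = loss_real + loss_fake
    loss.backward()  # no retain_graph needed once fake is detached
    optimizer.step()
    return loss.item()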
def train_generator(D, criterion_G, criterion_D, optimizer, x, fake, mu, logvar):
    optimizer.zero_grad()
    with torch.set_grad_enabled(True):
        prediction = D(fake)
        #before = list(D.parameters())[0].clone()
        target = torch.ones(x.size(0), 1).to(device)
        d_loss = criterion_D(prediction, target)
        d_loss.backward(retain_graph=True)  # EXCEPTION raises here
        d_loss = d_loss.item()
        if criterion_G is not None:
            g_loss = criterion_G(fake, x, mu, logvar)
            grads = torch.ones_like(g_loss)
            g_loss.backward(grads, retain_graph=True)
            d_loss += g_loss.mean().item()
        optimizer.step()
    #after = list(D.parameters())[0].clone()
    #print(torch.equal(before.data, after.data))
    return d_loss
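As a side note, I pass grads = torch.ones_like(g_loss) to backward because criterion_G returns a non-scalar (per-element) loss; as far as I understand, this is equivalent to reducing the loss to a scalar sum first, e.g.:

import torch

x = torch.randn(3, requires_grad=True)
loss = x ** 2                                  # non-scalar, per-element loss

# Backward with an explicit "gradient" tensor of ones...
loss.backward(torch.ones_like(loss), retain_graph=True)
g1 = x.grad.clone()

# ...gives the same gradient as reducing to a scalar first.
x.grad = None
loss.sum().backward()
print(torch.equal(g1, x.grad))                 # True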
Training loop:
model.train()
D_A.train()
D_B.train()

loss_hist = {'D': list(), 'G': list()}
for epoch in range(epochs):
    print(f'Epoch {epoch+1}/{epochs}')
    g_loss = 0
    d_loss = 0
    for i, batch in enumerate(tqdm(dataloader, leave=False), 1):
        x_A, x_B = batch
        fake_A, mu_A, logvar_A = model(x_A)
        fake_B, mu_B, logvar_B = model(x_B, select='B')

        d_loss += train_discriminator(D_A, criterion_D, optimizerD_A, x_A, fake_A)
        d_loss += train_discriminator(D_B, criterion_D, optimizerD_B, x_B, fake_B)

        g_loss += train_generator(D_A, criterion, criterion_D, optimizerA, x_A, fake_A, mu_A, logvar_A)
        g_loss += train_generator(D_B, criterion, criterion_D, optimizerB, x_B, fake_B, mu_B, logvar_B)  # EXCEPTION raises here, inplace op error

        fake_A2, mu_A2, logvar_A2 = model(x_A, select='B')
        fake_B2, mu_B2, logvar_B2 = model(x_B, select='A')

        g_loss += train_generator(D_B, None, criterion_D, optimizerB, x_A, fake_A2, mu_A2, logvar_A2)
        g_loss += train_generator(D_A, None, criterion_D, optimizerA, x_B, fake_B2, mu_B2, logvar_B2)

    d_loss /= i*2
    g_loss /= i*4
    loss_hist['D'].append(d_loss)
    loss_hist['G'].append(g_loss)

    print(f'Epoch g_loss: {g_loss:.4f}, d_loss: {d_loss:.4f}')
    print(50*'-')

    early(d_loss+g_loss, epoch=epoch, model=model, D_A=D_A, D_B=D_B,
          optimizerA=optimizerA, optimizerB=optimizerB,
          optimizerD_A=optimizerD_A, optimizerD_B=optimizerD_B)

    if early.early_stop:
        print(f'Train loss did not improve for {early.patience} epochs. Training stopped.')
        model, D_A, D_B, optimizerA, optimizerB, optimizerD_A, optimizerD_B, _, early = load_model(PATH)
        break
I suppose the problem might be that the computation graph is reused multiple times. I’ve tried almost everything (setting retain_graph=False, using .clone() on different tensors, detaching different tensors, etc.), but I still can’t figure out where this in-place operation happens and how to avoid it.
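Here is the smallest standalone snippet I could come up with that raises what I believe is the same class of error: an optimizer.step() (an in-place weight update) happening between building a graph and a later backward pass through it:

import torch
import torch.nn as nn

lin = nn.Linear(4, 4)
opt = torch.optim.SGD(lin.parameters(), lr=0.1)

x = torch.randn(2, 4, requires_grad=True)
y = lin(x)                   # the graph saves lin.weight to compute x.grad

y.sum().backward(retain_graph=True)
opt.step()                   # in-place update bumps the weight's version counter

# The second backward through the same graph still needs the *old* weight,
# so it fails with the same "modified by an inplace operation" RuntimeError.
y.mean().backward()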
For interested readers, here is the full code (if you have trouble opening it, I can upload it anywhere you prefer): Colab notebook
I’m stuck and confused. I would appreciate any suggestions; this is my Everest for now. Please help me conquer it!