PyTorch 1.5.1: "RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation" when I call backward twice on losses that share a graph

Here is my network
class Encoder(nn.Module):
    """Convolutional encoder that maps an image batch to (mean, logvar).

    Args:
        input_channels: Number of channels in the input images.
        output_channels: Length of the mean and log-variance vectors.
        representation_size: Channel count of the first convolution; the
            second and third convolutions use 2x and 4x this value.

    The linear heads are sized for an 8 x 8 feature map after the three
    stride-2 convolutions, i.e. for 64 x 64 input images.
    """

    def __init__(self, input_channels, output_channels, representation_size=64):
        super(Encoder, self).__init__()
        # input parameters
        self.input_channels = input_channels
        self.output_channels = output_channels

        self.features = nn.Sequential(
            # nc x 64 x 64
            nn.Conv2d(self.input_channels, representation_size, 5, stride=2, padding=2),
            nn.BatchNorm2d(representation_size),
            nn.ReLU(),
            # hidden_size x 32 x 32
            nn.Conv2d(representation_size, representation_size * 2, 5, stride=2, padding=2),
            nn.BatchNorm2d(representation_size * 2),
            nn.ReLU(),
            # hidden_size*2 x 16 x 16
            nn.Conv2d(representation_size * 2, representation_size * 4, 5, stride=2, padding=2),
            nn.BatchNorm2d(representation_size * 4),
            nn.ReLU())
        # hidden_size*4 x 8 x 8

        self.mean = nn.Sequential(
            nn.Linear(representation_size * 4 * 8 * 8, 2048),
            nn.BatchNorm1d(2048),
            nn.ReLU(),
            nn.Linear(2048, output_channels))

        self.logvar = nn.Sequential(
            nn.Linear(representation_size * 4 * 8 * 8, 2048),
            nn.BatchNorm1d(2048),
            nn.ReLU(),
            nn.Linear(2048, output_channels))

    def forward(self, x):
        """Return ``(mean, logvar)``, each of shape ``(batch, output_channels)``."""
        batch_size = x.size()[0]

        # Both heads consume the same flattened feature map.
        flat_representation = self.features(x).view(batch_size, -1)

        mean = self.mean(flat_representation)
        logvar = self.logvar(flat_representation)

        return mean, logvar

    def hidden_layer(self, x):
        """Return the convolutional feature map without applying the heads."""
        return self.features(x)

class Decoder(nn.Module):
    """Transposed-convolution decoder mapping a latent code to an image.

    Args:
        input_size: Dimensionality of the latent code.
        representation_size: ``(channels, height, width)`` of the feature
            map produced by the linear pre-processing layer.

    The spatial size is doubled three times, so the output has shape
    ``(batch, 3, 8 * height, 8 * width)``; the final tanh keeps values in
    ``[-1, 1]``.
    """

    def __init__(self, input_size, representation_size):
        super(Decoder, self).__init__()
        self.input_size = input_size
        self.representation_size = representation_size
        dim = representation_size[0] * representation_size[1] * representation_size[2]

        self.preprocess = nn.Sequential(
            nn.Linear(input_size, dim),
            nn.BatchNorm1d(dim),
            nn.ReLU())

        # 256 x 8 x 8
        self.deconv1 = nn.ConvTranspose2d(representation_size[0], 256, 5, stride=2, padding=2)
        self.act1 = nn.Sequential(nn.BatchNorm2d(256),
                                  nn.ReLU())
        # 256 x 16 x 16
        self.deconv2 = nn.ConvTranspose2d(256, 128, 5, stride=2, padding=2)
        self.act2 = nn.Sequential(nn.BatchNorm2d(128),
                                  nn.ReLU())
        # 128 x 32 x 32
        self.deconv3 = nn.ConvTranspose2d(128, 32, 5, stride=2, padding=2)
        self.act3 = nn.Sequential(nn.BatchNorm2d(32),
                                  nn.ReLU())
        # 32 x 64 x 64
        self.deconv4 = nn.ConvTranspose2d(32, 3, 5, stride=1, padding=2)
        # 3 x 64 x 64
        self.activation = nn.Tanh()

    def forward(self, code):
        """Decode ``code`` of shape ``(batch, input_size)`` into images."""
        channels, height, width = self.representation_size[:3]
        output = self.preprocess(code).view(-1, channels, height, width)

        # With kernel 5, stride 2 and padding 2 a transposed convolution may
        # produce either 2n - 1 or 2n pixels; ``output_size`` selects the
        # exact doubling.  The sizes are derived from ``representation_size``
        # rather than hard-coded to 16/32/64, so other starting resolutions
        # work as well.
        output = self.deconv1(output, output_size=(2 * height, 2 * width))
        output = self.act1(output)
        output = self.deconv2(output, output_size=(4 * height, 4 * width))
        output = self.act2(output)
        output = self.deconv3(output, output_size=(8 * height, 8 * width))
        output = self.act3(output)
        # Stride 1: the last layer keeps the spatial size.
        output = self.deconv4(output, output_size=(8 * height, 8 * width))
        output = self.activation(output)
        return output

class VAE_GAN_Generator(nn.Module):
    """VAE generator: encoder, reparameterised sampling, then decoder.

    Args:
        input_channels: Number of channels in the input images.
        hidden_size: Dimensionality of the latent code.
        representation_size: ``(channels, height, width)`` of the feature
            map the decoder starts from.
    """

    def __init__(self, input_channels, hidden_size, representation_size=(256, 8, 8)):
        super(VAE_GAN_Generator, self).__init__()
        self.input_channels = input_channels
        self.hidden_size = hidden_size
        self.representation_size = representation_size

        self.encoder = Encoder(input_channels, hidden_size)
        self.decoder = Decoder(hidden_size, representation_size)

    def forward(self, x):
        """Encode ``x``, sample a latent code and decode it.

        Returns:
            Tuple ``(mean, logvar, rec_images)``.
        """
        mean, logvar = self.encoder(x)
        std = torch.exp(0.5 * logvar)

        # Sample the noise on the same device/dtype as the encoder output
        # instead of hard-coding ``.cuda()``, so the module also runs on CPU.
        noise = torch.randn_like(std)
        reparametrized_noise = mean + std * noise

        rec_images = self.decoder(reparametrized_noise)

        return mean, logvar, rec_images

class Discriminator(nn.Module):
    """Convolutional discriminator with an exposed intermediate feature layer.

    Args:
        input_channels: Number of channels in the input images.
        representation_size: ``(channels, height, width)`` of the feature
            map produced by ``self.main``; its product sizes the first
            linear layer.
    """

    def __init__(self, input_channels=3, representation_size=(256, 8, 8)):
        super(Discriminator, self).__init__()
        self.representation_size = representation_size
        dim = representation_size[0] * representation_size[1] * representation_size[2]

        self.main = nn.Sequential(
            nn.Conv2d(input_channels, 32, 5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.LeakyReLU(0.2),
            nn.Conv2d(32, 128, 5, stride=2, padding=2),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.2),
            nn.Conv2d(128, 256, 5, stride=2, padding=2),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.2),
            nn.Conv2d(256, 256, 5, stride=2, padding=2),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.2))

        self.lth_features = nn.Sequential(
            nn.Linear(dim, 2048),
            nn.LeakyReLU(0.2))

        self.sigmoid_output = nn.Sequential(
            nn.Linear(2048, 1),
            nn.Sigmoid())

    def forward(self, x):
        """Return the sigmoid score of shape ``(batch, 1)``."""
        # The score is the sigmoid head applied to the similarity features.
        return self.sigmoid_output(self.similarity(x))

    def similarity(self, x):
        """Return the 2048-dimensional features that feed the sigmoid head."""
        batch_size = x.size()[0]
        features = self.main(x)
        return self.lth_features(features.view(batch_size, -1))

Here is the training process:
# Training excerpt: builds the models and optimizers, then runs one pass over
# ``dataloader``.  Only the indentation has been repaired; the statement order
# is kept exactly as posted so the RuntimeError quoted at the end of the loop
# still reproduces.
G = VAE_GAN_Generator(input_channels, hidden_size).cuda()
D = Discriminator(input_channels).cuda()

criterion = nn.BCELoss()
criterion.cuda()

opt_enc = optim.RMSprop(G.encoder.parameters(), lr=lr)
opt_dec = optim.RMSprop(G.decoder.parameters(), lr=lr)
opt_dis = optim.RMSprop(D.parameters(), lr=lr * alpha)

for data, _ in dataloader:
    batch_size = data.size()[0]
    ones_label = Variable(torch.ones(batch_size)).cuda()
    zeros_label = Variable(torch.zeros(batch_size)).cuda()

    # print (data.size())
    datav = Variable(data).cuda()
    mean, logvar, rec_enc = G(datav)
    # print ("The size of rec_enc:", rec_enc.size())

    noisev = Variable(torch.randn(batch_size, hidden_size)).cuda()
    rec_noise = G.decoder(noisev)

    # train discriminator
    output = D(datav)
    errD_real = criterion(output, ones_label)
    D_real_list.append(output.data.mean())
    output = D(rec_enc)
    errD_rec_enc = criterion(output, zeros_label)
    D_rec_enc_list.append(output.data.mean())
    output = D(rec_noise)
    errD_rec_noise = criterion(output, zeros_label)
    D_rec_noise_list.append(output.data.mean())

    dis_img_loss = errD_real + errD_rec_enc + errD_rec_noise
    # print ("print (dis_img_loss)", dis_img_loss)
    D_list.append(dis_img_loss.data.mean())
    opt_dis.zero_grad()
    # retain_graph=True: rec_enc / rec_noise are reused for the decoder loss.
    dis_img_loss.backward(retain_graph=True)
    opt_dis.step()

    # train decoder
    output = D(datav)
    errD_real = criterion(output, ones_label)
    output = D(rec_enc)
    errD_rec_enc = criterion(output, zeros_label)
    output = D(rec_noise)
    errD_rec_noise = criterion(output, zeros_label)

    similarity_rec_enc = D.similarity(rec_enc)
    similarity_data = D.similarity(datav)

    dis_img_loss = errD_real + errD_rec_enc + errD_rec_noise
    # print (dis_img_loss)
    # gen_img_loss =  -dis_img_loss

    gen_img_loss = -dis_img_loss

    g_loss_list.append(gen_img_loss.data.mean())
    rec_loss = ((similarity_rec_enc - similarity_data) ** 2).mean()
    rec_loss_list.append(rec_loss.data.mean())
    err_dec = gamma * rec_loss + gen_img_loss

    # train encoder
    prior_loss = 1 + logvar - mean.pow(2) - logvar.exp()
    prior_loss = (-0.5 * torch.sum(prior_loss)) / torch.numel(mean.data)
    # print (prior_loss, mean, std)
    prior_loss_list.append(prior_loss.data.mean())
    # err_enc = prior_loss + beta * rec_loss
    # err_enc= prior_loss #it's okay

    opt_dec.zero_grad()
    err_dec.backward(retain_graph=True)
    # NOTE(review): this step updates the decoder parameters in place while
    # rec_loss (used below) still depends on the graph built with the old
    # decoder weights -- presumably what triggers the RuntimeError.
    opt_dec.step()

    err_enc = rec_loss
    opt_enc.zero_grad()
    err_enc.backward()
    # RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation:
    # [torch.cuda.FloatTensor [32, 3, 5, 5]] is at version 2; expected version 1 instead. Hint: enable anomaly detection
    # to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

    opt_enc.step()

When I swap the backward order of err_dec and err_enc like this:
# Swapped order: the encoder is updated first, then the decoder.
err_enc = rec_loss  # raises the error
opt_enc.zero_grad()
err_enc.backward(retain_graph=True)
opt_enc.step()

opt_dec.zero_grad()
err_dec.backward()
# RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [2048, 64]]
opt_dec.step()

I guess the main problem is rec_loss, but the same code works well with PyTorch 1.2, so I wonder how to make it work with PyTorch 1.5.1.

Could you add code formatting to your post by wrapping the code into three backticks ```?
This would make debugging easier, as your code is quite long and thus hard to read. :slight_smile:

Thank you so much. Do I need to delete this post first? I couldn't find the edit menu.

class Encoder(nn.Module):
    """Convolutional encoder producing a mean and a log-variance vector.

    Three stride-2 convolutions (each followed by batch norm and ReLU) feed
    two independent fully connected heads.

    Args:
        input_channels: Number of channels in the input images.
        output_channels: Length of each of the two output vectors.
        representation_size: Channel count of the first convolution; the
            next two convolutions use 2x and 4x this value.

    The heads expect ``representation_size * 4 * 8 * 8`` flattened features,
    i.e. an 8 x 8 feature map (64 x 64 inputs).
    """

    def __init__(self, input_channels, output_channels, representation_size=64):
        super(Encoder, self).__init__()
        self.input_channels = input_channels
        self.output_channels = output_channels

        # Channel widths of the three conv stages: 64x64 -> 32x32 -> 16x16 -> 8x8.
        widths = (representation_size, representation_size * 2, representation_size * 4)

        stages = []
        in_channels = self.input_channels
        for out_channels in widths:
            stages.append(nn.Conv2d(in_channels, out_channels, 5, stride=2, padding=2))
            stages.append(nn.BatchNorm2d(out_channels))
            stages.append(nn.ReLU())
            in_channels = out_channels
        self.features = nn.Sequential(*stages)

        flat_features = representation_size * 4 * 8 * 8
        self.mean = self._make_head(flat_features, output_channels)
        self.logvar = self._make_head(flat_features, output_channels)

    @staticmethod
    def _make_head(in_features, out_features):
        """Build one Linear -> BatchNorm1d -> ReLU -> Linear head."""
        return nn.Sequential(
            nn.Linear(in_features, 2048),
            nn.BatchNorm1d(2048),
            nn.ReLU(),
            nn.Linear(2048, out_features))

    def forward(self, x):
        """Return ``(mean, logvar)`` for the image batch ``x``."""
        flattened = self.features(x).view(x.size(0), -1)
        return self.mean(flattened), self.logvar(flattened)

    def hidden_layer(self, x):
        """Return the raw convolutional feature map for ``x``."""
        return self.features(x)


class Decoder(nn.Module):
    """Transposed-convolution decoder mapping a latent code to an image.

    Args:
        input_size: Dimensionality of the latent code.
        representation_size: ``(channels, height, width)`` of the feature
            map produced by the linear pre-processing layer.

    Three stride-2 transposed convolutions double the spatial size each, so
    the output has shape ``(batch, 3, 8 * height, 8 * width)``; the final
    tanh keeps values in ``[-1, 1]``.
    """

    def __init__(self, input_size, representation_size):
        super(Decoder, self).__init__()
        self.input_size = input_size
        self.representation_size = representation_size
        dim = representation_size[0] * representation_size[1] * representation_size[2]

        self.preprocess = nn.Sequential(
            nn.Linear(input_size, dim),
            nn.BatchNorm1d(dim),
            nn.ReLU())

        # representation_size[0] x H x W -> 256 x 2H x 2W
        self.deconv1 = nn.ConvTranspose2d(representation_size[0], 256, 5, stride=2, padding=2)
        self.act1 = nn.Sequential(nn.BatchNorm2d(256),
                                  nn.ReLU())
        # 256 x 2H x 2W -> 128 x 4H x 4W
        self.deconv2 = nn.ConvTranspose2d(256, 128, 5, stride=2, padding=2)
        self.act2 = nn.Sequential(nn.BatchNorm2d(128),
                                  nn.ReLU())
        # 128 x 4H x 4W -> 32 x 8H x 8W
        self.deconv3 = nn.ConvTranspose2d(128, 32, 5, stride=2, padding=2)
        self.act3 = nn.Sequential(nn.BatchNorm2d(32),
                                  nn.ReLU())
        # 32 x 8H x 8W -> 3 x 8H x 8W
        self.deconv4 = nn.ConvTranspose2d(32, 3, 5, stride=1, padding=2)
        self.activation = nn.Tanh()

    def forward(self, code):
        """Decode ``code`` of shape ``(batch, input_size)`` into images."""
        channels, height, width = self.representation_size[:3]
        output = self.preprocess(code).view(-1, channels, height, width)

        # With kernel 5, stride 2 and padding 2 a transposed convolution may
        # yield either 2n - 1 or 2n pixels; ``output_size`` pins the exact
        # doubling.  Deriving it from the current feature map (instead of the
        # previously hard-coded 16/32/64) keeps the decoder valid for any
        # ``representation_size``.
        upsampling_stages = (
            (self.deconv1, self.act1),
            (self.deconv2, self.act2),
            (self.deconv3, self.act3),
        )
        for deconv, act in upsampling_stages:
            height, width = 2 * height, 2 * width
            output = act(deconv(output, output_size=(height, width)))

        # Stride 1: the last layer keeps the spatial size.
        output = self.deconv4(output, output_size=(height, width))
        return self.activation(output)


class VAE_GAN_Generator(nn.Module):
    """VAE generator: encoder, reparameterised sampling, then decoder.

    Args:
        input_channels: Number of channels in the input images.
        hidden_size: Dimensionality of the latent code.
        representation_size: ``(channels, height, width)`` of the feature
            map the decoder starts from.
    """

    def __init__(self, input_channels, hidden_size, representation_size=(256, 8, 8)):
        super(VAE_GAN_Generator, self).__init__()
        self.input_channels = input_channels
        self.hidden_size = hidden_size
        self.representation_size = representation_size

        self.encoder = Encoder(input_channels, hidden_size)
        self.decoder = Decoder(hidden_size, representation_size)

    def forward(self, x):
        """Encode ``x``, sample a latent code and decode it.

        Returns:
            Tuple ``(mean, logvar, rec_images)``.
        """
        mean, logvar = self.encoder(x)
        std = torch.exp(0.5 * logvar)

        # Reparameterisation trick.  The noise is created on the same
        # device/dtype as the encoder output rather than via a hard-coded
        # ``.cuda()``, so the module also works on CPU or another device.
        eps = torch.randn_like(std)
        latent_code = mean + std * eps

        rec_images = self.decoder(latent_code)
        return mean, logvar, rec_images


class Discriminator(nn.Module):
    """Convolutional discriminator exposing its penultimate features.

    Args:
        input_channels: Number of channels in the input images.
        representation_size: ``(channels, height, width)`` of the feature
            map produced by ``self.main``; its product sizes the first
            linear layer.
    """

    def __init__(self, input_channels=3, representation_size=(256, 8, 8)):
        super(Discriminator, self).__init__()
        self.representation_size = representation_size
        flat_features = (representation_size[0]
                         * representation_size[1]
                         * representation_size[2])

        # (in_channels, out_channels, stride) of each conv stage.
        conv_specs = (
            (input_channels, 32, 1),
            (32, 128, 2),
            (128, 256, 2),
            (256, 256, 2),
        )
        stages = []
        for in_channels, out_channels, stride in conv_specs:
            stages.append(nn.Conv2d(in_channels, out_channels, 5, stride=stride, padding=2))
            stages.append(nn.BatchNorm2d(out_channels))
            stages.append(nn.LeakyReLU(0.2))
        self.main = nn.Sequential(*stages)

        self.lth_features = nn.Sequential(
            nn.Linear(flat_features, 2048),
            nn.LeakyReLU(0.2))

        self.sigmoid_output = nn.Sequential(
            nn.Linear(2048, 1),
            nn.Sigmoid())

    def forward(self, x):
        """Return the sigmoid score of shape ``(batch, 1)``."""
        return self.sigmoid_output(self.similarity(x))

    def similarity(self, x):
        """Return the 2048-dimensional features that feed the sigmoid head."""
        conv_map = self.main(x)
        return self.lth_features(conv_map.view(x.size(0), -1))

Here is the training process:

   # Training script as posted: dataset/loader setup, model and optimizer
   # construction, and the VAE-GAN loop that raises the RuntimeError.
   dataset = 'cifar10'
    # dataset = 'mnist'
    batchSize = 64
    # imageSize = 28
    # NOTE(review): nz and nepoch are not used anywhere in the visible code.
    nz = 100
    nepoch = 20
    if not os.path.exists('./img_VAE-GAN'):
        os.mkdir('./img_VAE-GAN')
    print("Random Seed: 88")
    random.seed(88)
    torch.manual_seed(88)
    # NOTE(review): `device` is computed but the code below calls .cuda() directly.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    cudnn.benchmark = True
    # `dataset` is rebound from the name string to the dataset object here.
    if dataset == 'cifar10':
        dataset = dset.CIFAR10(root='/home/liushangqing/projects/GANS/datasets/', download=True,
                               transform=transforms.Compose([
                                   transforms.Resize(64),
                                   transforms.ToTensor(),
                                   transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
                               ])
                               )
        n_channel = 3
    elif dataset == 'mnist':
        dataset = dset.MNIST(root='./data',
                             train=True,
                             transform=transforms.Compose([transforms.ToTensor()]),
                             download=True
                             )
        n_channel = 1
    # print(dataset.size)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batchSize,
                                             shuffle=True)

    # Save a grid of one batch; `* 0.5 + 0.5` undoes the Normalize above.
    # show_and_save / make_grid are defined or imported outside this excerpt.
    data, _ = next(iter(dataloader))
    print(data.size())
    # print(data[1, :])
    show_and_save("gen", make_grid((data * 0.5 + 0.5).cpu(), 8))

    # define constant
    # NOTE(review): input_channels is hard-coded; n_channel from above is not used.
    input_channels = 3
    hidden_size = 256
    max_epochs = 250
    lr = 3e-4

    beta = 5      # weight of rec_loss in the encoder loss
    alpha = 0.1   # scales the discriminator learning rate
    gamma = 15    # weight of rec_loss in the decoder loss

    batch_size = 64

    G = VAE_GAN_Generator(input_channels, hidden_size).cuda()
    D = Discriminator(input_channels).cuda()

    criterion = nn.BCELoss()
    criterion.cuda()

    # One optimizer per sub-network: encoder, decoder and discriminator.
    opt_enc = optim.RMSprop(G.encoder.parameters(), lr=lr)
    opt_dec = optim.RMSprop(G.decoder.parameters(), lr=lr)
    opt_dis = optim.RMSprop(D.parameters(), lr=lr * alpha)

    # NOTE(review): fixed_noise / fixed_batch are not used in the visible code.
    fixed_noise = Variable(torch.randn(batch_size, hidden_size)).cuda()
    data, _ = next(iter(dataloader))
    fixed_batch = Variable(data).cuda()

    for epoch in range(max_epochs):
        # Per-epoch logs of discriminator outputs and the individual losses.
        D_real_list, D_rec_enc_list, D_rec_noise_list, D_list = [], [], [], []
        g_loss_list, rec_loss_list, prior_loss_list = [], [], []
        for data, _ in dataloader:
            print(data.dtype)
            # NOTE(review): prints the bound method, not the shape (no call parentheses).
            print(data.size)
            batch_size = data.size()[0]
            ones_label = Variable(torch.ones(batch_size)).cuda()
            zeros_label = Variable(torch.zeros(batch_size)).cuda()

            # print (data.size())
            datav = Variable(data).cuda()
            print(datav.dtype)
            # Reconstruction of the real batch (goes through encoder + decoder).
            mean, logvar, rec_enc = G(datav)
            # print ("The size of rec_enc:", rec_enc.size())

            # Images decoded from pure noise (goes through the decoder only).
            noisev = Variable(torch.randn(batch_size, hidden_size)).cuda()
            rec_noise = G.decoder(noisev)

            # train discriminator
            output = D(datav)
            errD_real = criterion(output, ones_label)
            # print(output.cuda().data.cpu())
            # print(output.cuda().data.cpu().numpy())
            # print(output.cuda().data.cpu().mean())
            # print(output.cuda().data.cpu().numpy().mean())
            D_real_list.append(output.cuda().data.cpu().numpy().mean())
            output = D(rec_enc)
            errD_rec_enc = criterion(output, zeros_label)
            D_rec_enc_list.append(output.cuda().data.cpu().numpy().mean())
            output = D(rec_noise)
            errD_rec_noise = criterion(output, zeros_label)

            D_rec_noise_list.append(output.cuda().data.cpu().numpy().mean())

            dis_img_loss = errD_real + errD_rec_enc + errD_rec_noise

            # print ("print (dis_img_loss)", dis_img_loss.cuda().data.cpu().numpy())
            D_list.append(dis_img_loss.cuda().data.cpu().numpy())
            opt_dis.zero_grad()
            # retain_graph=True: rec_enc / rec_noise are fed through D again below.
            dis_img_loss.backward(retain_graph=True)
            opt_dis.step()

            # train decoder
            # D is evaluated again, now with the parameters updated by opt_dis.step().
            output = D(datav)
            errD_real = criterion(output, ones_label)
            output = D(rec_enc)
            errD_rec_enc = criterion(output, zeros_label)
            output = D(rec_noise)
            errD_rec_noise = criterion(output, zeros_label)

            similarity_rec_enc = D.similarity(rec_enc)
            similarity_data = D.similarity(datav)

            dis_img_loss = errD_real + errD_rec_enc + errD_rec_noise
            # print (dis_img_loss)
            # The generator loss is the negated discriminator loss.
            gen_img_loss = - dis_img_loss

            g_loss_list.append(gen_img_loss.cuda().data.cpu().numpy())
            # Mean squared difference between D's features of reconstruction and data.
            rec_loss = ((similarity_rec_enc - similarity_data) ** 2).mean()
            rec_loss_list.append(rec_loss.cuda().data.cpu().numpy())
            err_dec = gamma * rec_loss + gen_img_loss

            opt_dec.zero_grad()
            # retain_graph=True: rec_loss is backpropagated again for the encoder.
            err_dec.backward(retain_graph=True)
            # NOTE(review): this step changes the decoder parameters in place while
            # rec_loss (reused in err_enc below) still depends on the graph built
            # with the old decoder weights -- presumably the cause of the error.
            opt_dec.step()

            # train encoder
            # -0.5 * sum(1 + logvar - mean^2 - exp(logvar)), averaged over all elements.
            prior_loss = 1 + logvar - mean.pow(2) - logvar.exp()
            prior_loss = (-0.5 * torch.sum(prior_loss)) / torch.numel(mean.data)
            # print ("prior loss",prior_loss)
            prior_loss_list.append(prior_loss.cuda().data.cpu().numpy())
            err_enc = prior_loss + beta * rec_loss

            opt_enc.zero_grad()
            # This is the backward call that raises the RuntimeError quoted in the post.
            err_enc.backward()
            opt_enc.step()

Then the error happened:

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [32, 3, 5, 5]] is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

When I swap the backward order of err_dec and err_enc like this:

# Swapped order: the encoder is updated first, then the decoder.
err_enc = rec_loss 
opt_enc.zero_grad()
# retain_graph=True keeps the graph alive for err_dec.backward() below.
err_enc.backward(retain_graph=True)
# NOTE(review): updates the encoder parameters in place before err_dec is
# backpropagated -- presumably the cause of the error reported for this order.
opt_enc.step()

opt_dec.zero_grad()
err_dec.backward()  
       
opt_dec.step()

Then the error happened:

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [2048, 64]]

I guess the main problem is rec_loss, but the same code works well with PyTorch 1.2, so I wonder how to make it work with PyTorch 1.5.1.

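I think the error is raised if you call the step() function on optimizers that contain (a subset of) the same parameters and then call backward() again on a graph that still needs the old parameter values, as explained in the release notes for 1.5.0. step() updates the parameters in place, so the values saved for the later backward pass are no longer valid.
Could this be the case, and if so, could you move the step() calls to the end of the iteration, after all backward() calls?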

1 Like

Thank you so much. I tried the way you said and it worked.