RuntimeError: ones needs to be contiguous

sinAshish · February 22, 2020, 12:38pm

I am trying to train a GAN, but while training the generator network the call to backward() fails with the error: RuntimeError: ones needs to be contiguous

Here is the output during debugging:

> <ipython-input-34-c3622f575644>(80)<module>()
-> fake = gen(z, thetas)
(Pdb) n
> <ipython-input-34-c3622f575644>(81)<module>()
-> h5s, h5, cont_vars, _, _, _, _ = disc(fake)
(Pdb) n
> <ipython-input-34-c3622f575644>(82)<module>()
-> aux_loss = latent_lambda * mse(z, cont_vars)
(Pdb) n
> <ipython-input-34-c3622f575644>(85)<module>()
-> gen_loss = ce_loss(h5, torch.ones_like(h5)) + aux_loss
(Pdb) n
> <ipython-input-34-c3622f575644>(86)<module>()
-> gen_loss.backward()
(Pdb) n
RuntimeError: ones needs to be contiguous

Why is this happening and what is the possible solution?

ptrblck · February 23, 2020, 8:30am

Could you post a small executable code snippet to reproduce this issue?
If I’m not mistaken, `ones` is used for the bias gradient accumulation, so I’m not sure what could have caused this issue.

sinAshish · February 23, 2020, 3:15pm

This is the discriminator network.

def calc_mean_std(feat, eps=1e-5):
    """Return the per-sample, per-channel mean and standard deviation of ``feat``.

    The first two dimensions are treated as ``(N, C)``; statistics are taken
    over all remaining dimensions flattened together.

    Args:
        feat: Tensor with at least 3 dimensions.
        eps: Small value added to the variance to avoid divide-by-zero.

    Returns:
        Tuple ``(feat_mean, feat_std)``. Each has shape ``(N, C, 1, ..., 1)``
        with the same number of dimensions as ``feat``, so both broadcast
        against it directly. ``feat_std`` is ``sqrt(var + eps)``.

    Raises:
        ValueError: If ``feat`` has fewer than 3 dimensions.
    """
    size = feat.size()
    if len(size) < 3:
        # An explicit exception instead of `assert`, which is stripped under -O.
        raise ValueError(
            "calc_mean_std expects at least 3 dimensions, got {}".format(len(size))
        )
    N, C = size[:2]
    # One trailing singleton per flattened dimension: (N, C, 1, 1) for 4-D
    # input, (N, C, 1, 1, 1) for 5-D input, and so on.
    stat_shape = (N, C) + (1,) * (len(size) - 2)
    # reshape (rather than view) also accepts non-contiguous inputs.
    flat = feat.reshape(N, C, -1)
    feat_std = (flat.var(dim=2) + eps).sqrt().view(stat_shape)
    feat_mean = flat.mean(dim=2).view(stat_shape)
    return feat_mean, feat_std

class Discriminator(nn.Module):
    """Convolutional discriminator with per-stage style heads and a latent head.

    Five stride-2 3x3 convolutions (spectrally normalised) process the input.
    After each of the last four convolutions, the per-channel mean/std of the
    feature map is fed to a small linear head that produces one logit. The
    final feature map is flattened and fed to a linear classifier (``h5``)
    and to a two-layer projector that predicts ``cont_dim`` values in
    ``(-1, 1)`` via ``tanh``.

    Args:
        cont_dim: Output size of the latent projector.
    """

    def __init__(self, cont_dim):
        super(Discriminator, self).__init__()

        self.cont_dim = cont_dim

        # Source of the additive input noise used in forward().
        self.noise_generator = Normal(loc=0.0, scale=0.02)

        self.convolve0 = self._make_conv(3, 64)

        self.convolve1 = self._make_conv(64, 128)
        self.normalize1 = nn.InstanceNorm2d(num_features=128)

        self.convolve2 = self._make_conv(128, 256)
        self.normalize2 = nn.InstanceNorm2d(num_features=256)

        self.convolve3 = self._make_conv(256, 512)
        self.normalize3 = nn.InstanceNorm2d(num_features=512)

        self.convolve4 = self._make_conv(512, 1024)
        self.normalize4 = nn.InstanceNorm2d(num_features=1024)

        # Style heads take concat(mean, std), i.e. twice the channel count of
        # the stage they are attached to.
        self.linear_classifier1 = self._make_linear(256, 1)
        self.linear_classifier2 = self._make_linear(512, 1)
        self.linear_classifier3 = self._make_linear(1024, 1)
        self.linear_classifier4 = self._make_linear(2048, 1)
        # 9216 = 1024 * 3 * 3: the last feature map must be 3x3 spatially.
        self.linear_classifier5 = self._make_linear(9216, 1)

        self.linear_projector1 = self._make_linear(9216, 128)
        self.linear_projector2 = self._make_linear(128, self.cont_dim)

    @staticmethod
    def _make_conv(in_channels, out_channels):
        """Build a 3x3 stride-2 conv, N(0, 0.02)-initialised, wrapped in spectral norm."""
        conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=2)
        # Initialise the raw parameters *before* wrapping: after spectral_norm
        # the `.weight` attribute is a derived tensor, and initialising through
        # it only works while it happens to share storage with the parameter.
        nn.init.normal_(conv.weight, std=0.02)
        nn.init.zeros_(conv.bias)
        return spectral_norm(conv)

    @staticmethod
    def _make_linear(in_features, out_features):
        """Build a linear layer with N(0, 0.02) weights and zero bias."""
        linear = nn.Linear(in_features, out_features)
        nn.init.normal_(linear.weight, std=0.02)
        nn.init.zeros_(linear.bias)
        return linear

    def _style_stage(self, h, conv, norm, classifier, negative_slope):
        """Run one conv stage; return (activated features, style logits)."""
        h = conv(h)
        # Statistics are taken before instance norm removes them.
        mean, std = calc_mean_std(h)
        h = norm(h)
        style = torch.cat((mean, std), 1)
        style = style.view(style.shape[0], -1)
        logits = classifier(style)
        return F.leaky_relu(h, negative_slope=negative_slope), logits

    def forward(self, x, negative_slope=0.2):
        """Score a batch of images.

        Args:
            x: Image batch with 3 channels. It is rescaled with
                ``x / 127.5 - 1`` -- presumably pixel values in [0, 255];
                TODO confirm against the generator, whose output is
                tanh-bounded.
            negative_slope: Slope of every leaky ReLU.

        Returns:
            Tuple ``(h5s, h5, cont_vars, d_h1_logits, d_h2_logits,
            d_h3_logits, d_h4_logits)``: sigmoid of the final logit, the raw
            final logit, the tanh-bounded latent prediction, and the four
            per-stage style logits.
        """
        x = x / 127.5 - 1.
        # Put the noise on the input's device rather than on "CUDA if
        # available", which breaks when the model runs on CPU on a CUDA machine.
        noise = self.noise_generator.sample(sample_shape=x.shape)
        x = x + noise.to(x.device)

        h0 = F.leaky_relu(self.convolve0(x), negative_slope=negative_slope)

        h1, d_h1_logits = self._style_stage(
            h0, self.convolve1, self.normalize1, self.linear_classifier1, negative_slope)
        h2, d_h2_logits = self._style_stage(
            h1, self.convolve2, self.normalize2, self.linear_classifier2, negative_slope)
        h3, d_h3_logits = self._style_stage(
            h2, self.convolve3, self.normalize3, self.linear_classifier3, negative_slope)
        h4, d_h4_logits = self._style_stage(
            h3, self.convolve4, self.normalize4, self.linear_classifier4, negative_slope)

        h4f = h4.view(h4.shape[0], -1)

        h5 = self.linear_classifier5(h4f)
        h5s = torch.sigmoid(h5)

        encoding = F.leaky_relu(self.linear_projector1(h4f), negative_slope=negative_slope)
        cont_vars = torch.tanh(self.linear_projector2(encoding))

        return h5s, h5, cont_vars, d_h1_logits, d_h2_logits, d_h3_logits, d_h4_logits

And this is the code while updating the loss:

# Generator update: score the generated batch with the discriminator.
h5s, h5, cont_vars, _, _, _, _ = disc(fake)
# Auxiliary term: the discriminator's latent prediction is regressed onto z,
# weighted by latent_lambda.
aux_loss = latent_lambda * mse(z, cont_vars)
# ceLoss is BCE-with-logits, so the raw logits h5 (not the sigmoid output h5s)
# are compared against all-ones targets.
gen_loss = ceLoss(h5, torch.ones_like(h5)) + aux_loss
gen_loss.backward()

I tried adding .contiguous() to some bits of the code, but I can’t seem to figure out why the error occurs!

ptrblck · February 24, 2020, 1:34am

Thanks for the code.
How are fake, latent_lambda, z defined?

sinAshish · February 24, 2020, 11:18am

z is defined as:

# Draw one 128-dim row from U(0, 1) and copy it batch_size times, so every
# sample in the batch shares the same latent code.
a = np.repeat(np.random.uniform(0., 1., (1, 128)), batch_size, axis=0)
# from_numpy yields float64; cast to float32.
z = torch.from_numpy(a).float()

fake is the output of the generator, with shape B x C x W x H.
latent_lambda is a penalty weight, set to 10.

ptrblck · February 25, 2020, 6:01am

Thanks for the information.

I had to use some workarounds to get the code working, but this code snippet runs fine:

# Same latent construction as in the post above: one U(0, 1) row repeated
# batch_size times.
a = np.random.uniform(0., 1., (1, 128))
a = np.repeat(a, batch_size, axis=0)
z = torch.from_numpy(a).float()

# cont_dim=1, so cont_vars comes out as (batch, 1) while z is (batch, 128).
disc = Discriminator(1)
# Random stand-in for the generator output.
fake = torch.randn(batch_size, 3, 150, 150)
h5s, h5, cont_vars, _, _, _, _ = disc(fake)
# NOTE(review): z and cont_vars differ in their last dimension here, so
# mse_loss presumably relies on broadcasting -- confirm this is intended.
aux_loss = 10 * F.mse_loss(z, cont_vars)
# Workaround: F.cross_entropy with all-zero class targets stands in for the
# BCE-with-logits ceLoss of the original post, so this is not the same loss.
gen_loss = F.cross_entropy(h5, torch.zeros_like(h5).squeeze().long()) + aux_loss
gen_loss.backward()

Could you check that and let me know what would need to be changed in order to run into the issue?

sinAshish · February 25, 2020, 3:27pm

I am not able to figure out why the error occurs. But when I use the code as suggested by you, I get CUDA error: device-side assert triggered.

If it helps to reproduce the issue, here is the generator network:

def adaptive_instance_normalization(features, style_feat):
    """Normalise ``features`` per sample/channel, then apply a style-driven affine map.

    Args:
        features: Feature map whose first two dimensions are ``(N, C)``;
            statistics are taken over the remaining dimensions.
        style_feat: Tensor whose dimension 1 is split in half: the first half
            is the per-channel scale, the second half the per-channel bias.
            Each half must reshape to the shape of the per-channel mean.

    Returns:
        ``scale * (features - mean) / std + bias``, same shape as ``features``.
    """
    partition = style_feat.size()[1] // 2
    scale, bias = style_feat[:, :partition], style_feat[:, partition:]
    # calc_mean_std returns the standard deviation (already eps-stabilised),
    # not the variance, so divide by it directly. Applying rsqrt to it would
    # divide by sqrt(std) and leave the features un-normalised.
    mean, std = calc_mean_std(features)  # Only consider spatial dimension
    normalized = (features - mean) / std
    scale_broadcast = scale.reshape(mean.size())
    bias_broadcast = bias.reshape(mean.size())
    # Out-of-place ops keep autograd free of in-place modification concerns.
    return scale_broadcast * normalized + bias_broadcast

class Generator(nn.Module):
    """Generator that lifts a learned 3-D constant to a 2-D image.

    A learned ``(1, 512, 4, 4, 4)`` constant is upsampled by 3-D transposed
    convolutions, resampled through ``F.affine_grid`` / ``F.grid_sample``
    using ``pose``, collapsed to 2-D by merging the channel and depth
    dimensions, and upsampled by 2-D transposed convolutions to a 3-channel,
    tanh-bounded output. The latent ``z`` enters only through linear layers
    that produce the scale/bias for adaptive instance normalisation.
    """

    def __init__(self):
        super(Generator, self).__init__()

        # Learned constant the synthesis starts from.
        self.const = nn.Parameter(torch.empty(1, 512, 4, 4, 4))
        nn.init.normal_(self.const, std=0.02)

        self.mlp0 = nn.Linear(128, 512 * 2)
        self._normal_init(self.mlp0)

        self.mlp1 = nn.Linear(128, 256 * 2)
        self.dcv3d1 = nn.ConvTranspose3d(
            in_channels=512, out_channels=256, kernel_size=3,
            stride=2, padding=1, output_padding=1)
        self._normal_init(self.mlp1)
        self._normal_init(self.dcv3d1)

        self.mlp2 = nn.Linear(128, 128 * 2)
        self.dcv3d2 = nn.ConvTranspose3d(
            in_channels=256, out_channels=128, kernel_size=3,
            stride=2, padding=1, output_padding=1)
        self._normal_init(self.mlp2)
        self._normal_init(self.dcv3d2)

        self.dcv3d3 = nn.ConvTranspose3d(
            in_channels=128, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.dcv3d4 = nn.ConvTranspose3d(
            in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1)
        self._normal_init(self.dcv3d3)
        self._normal_init(self.dcv3d4)

        self.dcv2d1 = nn.ConvTranspose2d(
            in_channels=1024, out_channels=512, kernel_size=1, stride=1, padding=1)
        self._normal_init(self.dcv2d1)

        self.mlp3 = nn.Linear(128, 256 * 2)
        self.dcv2d2 = nn.ConvTranspose2d(
            in_channels=512, out_channels=256, kernel_size=4, stride=2)
        self._normal_init(self.mlp3)
        self._normal_init(self.dcv2d2)

        self.mlp4 = nn.Linear(128, 64 * 2)
        self.dcv2d3 = nn.ConvTranspose2d(
            in_channels=256, out_channels=64, kernel_size=4, stride=2)
        self._normal_init(self.mlp4)
        self._normal_init(self.dcv2d3)

        self.mlp5 = nn.Linear(128, 32 * 2)
        self.dcv2d4 = nn.ConvTranspose2d(
            in_channels=64, out_channels=32, kernel_size=4,
            stride=2, padding=1, output_padding=1)
        self._normal_init(self.mlp5)
        self._normal_init(self.dcv2d4)

        self.dcv2d5 = nn.ConvTranspose2d(
            in_channels=32, out_channels=3, kernel_size=4, stride=1)
        self._normal_init(self.dcv2d5)

    @staticmethod
    def _normal_init(layer):
        """Set ``layer.weight`` to N(0, 0.02) and ``layer.bias`` to zero."""
        nn.init.normal_(layer.weight, std=0.02)
        torch.nn.init.zeros_(layer.bias)

    def _styled(self, feats, mlp, z, slope):
        """Apply AdaIN with the style ``leaky_relu(mlp(z))``, then a leaky ReLU."""
        style = F.leaky_relu(mlp(z), negative_slope=slope)
        modulated = adaptive_instance_normalization(feats, style)
        return F.leaky_relu(modulated, negative_slope=slope)

    def forward(self, z, pose, negative_slope=0.2):
        """Generate a batch of images.

        Args:
            z: Latent batch; its last dimension must be 128 to match the
                style MLPs, and ``z.shape[0]`` sets the batch size.
            pose: Affine parameters passed to ``F.affine_grid`` for the 3-D
                feature volume (presumably ``(N, 3, 4)`` -- TODO confirm).
            negative_slope: Slope of every leaky ReLU.

        Returns:
            Tensor with 3 channels and values in ``(-1, 1)``.
        """
        slope = negative_slope

        # 3-D stage: tile the constant over the batch, then style and upsample.
        tiled_const = self.const.repeat((z.shape[0], 1, 1, 1, 1))
        vol = self._styled(tiled_const, self.mlp0, z, slope)
        vol = self._styled(self.dcv3d1(vol), self.mlp1, z, slope)
        vol = self._styled(self.dcv3d2(vol), self.mlp2, z, slope)

        # Resample the volume under the requested pose.
        grid = F.affine_grid(pose, vol.size(), align_corners=True)
        rotated = F.grid_sample(vol, grid, align_corners=True)

        proj = F.leaky_relu(self.dcv3d3(rotated), negative_slope=slope)
        proj = F.leaky_relu(self.dcv3d4(proj), negative_slope=slope)

        # Fold the depth dimension into the channels to obtain a 2-D feature map.
        batch, channels, depth, height, width = proj.shape
        flat2d = proj.view(batch, channels * depth, height, width)

        # 2-D stage.
        img = F.leaky_relu(self.dcv2d1(flat2d), negative_slope=slope)
        img = self._styled(self.dcv2d2(img), self.mlp3, z, slope)
        img = self._styled(self.dcv2d3(img), self.mlp4, z, slope)
        img = self._styled(self.dcv2d4(img), self.mlp5, z, slope)

        return torch.tanh(self.dcv2d5(img))