100x more time cost of DCGAN Hessian vector product

Hongkai_Zheng · September 18, 2019, 7:25am

I found a weird computation cost when I was trying to compute Hessian vector product of DCGAN.
The Hessian vector product for DCGAN costs 100x more than for the other GAN, even though DCGAN has fewer parameters. I’m confused because the problem does not seem to be caused by a single module: the other GAN contains linear, pooling, and convolution layers, while DCGAN only has convolutions. Could anyone figure out what’s going on here?

I use CUDA events to measure time cost of backward of each model and Hessian vector product. And here is the result.

=====Test1=====
Discriminator backward :  10.1550083ms
Generator backward :  6.1214719ms
Hessian vector product :  13.5014400ms
=====Test2=====
Discriminator backward :  18.8948479ms
Generator backward :  31.2074242ms
Hessian vector product :  1206.1020508ms

The batchnorms and activation functions don’t make a difference so I removed all of them to make the code clear. The math form of hessian vector product here is
$\frac{\partial^2f}{\partial G\partial D} \frac{\partial f}{\partial D}$

class DC_generator(nn.Module):
    """DCGAN generator reduced to its five transposed convolutions.

    Takes a latent tensor of shape (N, z_dim, 1, 1) and produces an image of
    shape (N, channel_num, 64, 64). No normalisation or activation layers
    are included.
    """

    def __init__(self, z_dim=100, channel_num=3, feature_num=64):
        super().__init__()
        # Hidden widths in order: 8f @ 4x4, 4f @ 8x8, 2f @ 16x16, f @ 32x32.
        hidden = [feature_num * mult for mult in (8, 4, 2, 1)]
        # Projection layer: 1x1 latent -> 4x4 feature map.
        stack = [
            nn.ConvTranspose2d(z_dim, hidden[0], kernel_size=4, stride=1, padding=0, bias=True)
        ]
        # Each following layer doubles the spatial size; the last one emits
        # channel_num @ 64x64.
        for c_in, c_out in zip(hidden, hidden[1:] + [channel_num]):
            stack.append(
                nn.ConvTranspose2d(c_in, c_out, kernel_size=4, stride=2, padding=1, bias=True)
            )
        self.main = nn.Sequential(*stack)

    def forward(self, input):
        """Run the latent batch through the transposed-convolution stack."""
        return self.main(input)


class DC_discriminator(nn.Module):
    """DCGAN discriminator reduced to its five strided convolutions.

    Takes an image batch of shape (N, channel_num, 64, 64) and returns a
    tensor of shape (N, 1, 1, 1). No normalisation or activation layers are
    included.
    """

    def __init__(self, channel_num=3, feature_num=64):
        super().__init__()
        # Channel widths: input, then f @ 32x32, 2f @ 16x16, 4f @ 8x8, 8f @ 4x4.
        widths = [channel_num] + [feature_num * mult for mult in (1, 2, 4, 8)]
        # Four stride-2 convolutions, each halving the spatial size.
        stack = [
            nn.Conv2d(c_in, c_out, kernel_size=4, stride=2, padding=1, bias=True)
            for c_in, c_out in zip(widths, widths[1:])
        ]
        # Final 4x4 convolution collapses the 4x4 map to a single value: 1 @ 1x1.
        stack.append(nn.Conv2d(widths[-1], 1, kernel_size=4, stride=1, padding=0, bias=True))
        self.main = nn.Sequential(*stack)

    def forward(self, input):
        """Run the image batch through the convolution stack."""
        return self.main(input)


class dc_D(nn.Module):
    """Small convolutional discriminator for 3-channel 32x32 images.

    Two conv -> LeakyReLU -> max-pool stages feed a two-layer MLP that
    produces one output value per sample.
    """

    def __init__(self):
        super().__init__()
        stages = []
        # 3 @ 32x32 -> 128 @ 14x14 -> 128 @ 5x5
        for in_ch in (3, 128):
            stages += [
                nn.Conv2d(in_channels=in_ch, out_channels=128, kernel_size=5, stride=1),
                nn.LeakyReLU(0.01),
                nn.MaxPool2d(2, 2),
            ]
        self.conv = nn.Sequential(*stages)
        # 128 * 5 * 5 = 3200 flattened features go into the classifier head.
        head = [
            nn.Linear(1600 * 2, 1024),
            nn.LeakyReLU(0.01),
            nn.Linear(1024, 1),
        ]
        self.fc = nn.Sequential(*head)

    def forward(self, x):
        """Extract conv features, flatten per sample, and apply the MLP head."""
        features = self.conv(x)
        flat = features.view(features.shape[0], -1)
        return self.fc(flat)


class dc_G(nn.Module):
    """Small generator: two linear layers followed by two transposed convolutions.

    Maps a latent batch of shape (N, z_dim) to images of shape (N, 3, 32, 32).
    No activation or normalisation layers are included.
    """

    def __init__(self, z_dim=96):
        super().__init__()
        # Project the latent vector up to 128 * 8 * 8 values.
        projection = [nn.Linear(z_dim, 1024), nn.Linear(1024, 8 * 8 * 128)]
        self.fc = nn.Sequential(*projection)
        # 128 @ 8x8 -> 64 @ 16x16 -> 3 @ 32x32
        upsample = [
            nn.ConvTranspose2d(in_channels=c_in, out_channels=c_out, kernel_size=4, stride=2, padding=1)
            for c_in, c_out in ((128, 64), (64, 3))
        ]
        self.convt = nn.Sequential(*upsample)

    def forward(self, x):
        """Project the latent batch, reshape to (N, 128, 8, 8), then upsample."""
        hidden = self.fc(x)
        feature_map = hidden.view(hidden.shape[0], 128, 8, 8)
        return self.convt(feature_map)

def hvptest(D, G, z):
    """Time the D backward, the G backward and the mixed Hessian-vector product.

    The loss is BCE-with-logits of ``D(G(z))`` against an all-zero target.
    The Hessian-vector product is the gradient w.r.t. G's parameters of
    ``grad_D . v``, where ``v`` is a detached copy of the flattened gradient
    w.r.t. D's parameters, i.e. (d^2 f / dG dD) (df / dD).

    Args:
        D: discriminator module, applied to ``G(z)``.
        G: generator module, applied to ``z``.
        z: latent batch; its device decides which timer is used.

    Returns:
        None. The three timings are printed in milliseconds.
    """
    # Function-scope import: the wall-clock fallback below needs `time`.
    import time

    on_gpu = z.is_cuda

    def timed_ms(fn):
        """Run ``fn()`` and return how long it took in milliseconds."""
        if on_gpu:
            # CUDA events measure the time the work spends on the GPU.
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
            fn()
            end.record()
            torch.cuda.synchronize(device=z.device)
            return start.elapsed_time(end)
        # CPU fallback: CUDA events cannot be used without a GPU.
        begin = time.perf_counter()
        fn()
        return (time.perf_counter() - begin) * 1000.0

    d_params = tuple(D.parameters())
    g_params = tuple(G.parameters())

    criterion = nn.BCEWithLogitsLoss()
    d_fake = D(G(z))
    # The target follows d_fake's device, so no module-level `device` is needed.
    loss = criterion(d_fake, torch.zeros_like(d_fake))

    grads = {}

    def backward_d():
        # create_graph=True keeps the gradient differentiable for the HVP below.
        grads['d'] = torch.autograd.grad(loss, d_params, create_graph=True, retain_graph=True)

    print('Discriminator backward :  %.7fms' % timed_ms(backward_d))
    grad_d_vec = torch.cat([g.contiguous().view(-1) for g in grads['d']])

    def backward_g():
        torch.autograd.grad(loss, g_params, create_graph=True, retain_graph=True)

    print('Generator backward :  %.7fms' % timed_ms(backward_g))

    # The vector of the product is the D-gradient itself, treated as a constant.
    vec_d = grad_d_vec.clone().detach()

    def hessian_vector_product():
        torch.autograd.grad(grad_d_vec, g_params, grad_outputs=vec_d, retain_graph=True)

    print('Hessian vector product :  %.7fms' % timed_ms(hessian_vector_product))


if __name__ == '__main__':
    # Prefer the first GPU; otherwise run on the CPU.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    batch_size = 128

    # Test 1: the small conv + linear GAN on 32x32 inputs.
    print('=====Test1=====')
    D, G = dc_D().to(device), dc_G(z_dim=96).to(device)
    # Random image batch scaled to [-1, 1); not used by hvptest.
    img_cifar = (torch.rand(batch_size, 3, 32, 32, device=device) - 0.5) / 0.5
    z_mnist = torch.randn(batch_size, 96, device=device)
    hvptest(D, G, z_mnist)

    # Test 2: the all-convolutional DCGAN on 64x64 inputs.
    print('=====Test2=====')
    D, G = (
        DC_discriminator(channel_num=3, feature_num=64).to(device),
        DC_generator(z_dim=100, channel_num=3, feature_num=64).to(device),
    )
    z_celeba = torch.randn(batch_size, 100, 1, 1, device=device)
    hvptest(D, G, z_celeba)

PS: Pytorch version: 1.1.0, GPU: Tesla V100

albanD · September 18, 2019, 2:30pm

Hi,
Thanks for the repro code !

I ran this on cpu (I don’t have a gpu install at hand) with the following modifications:

import time
def hvptest(D, G, z):
    """Time the D backward, the G backward and the mixed Hessian-vector product.

    Wall-clock version meant for CPU runs. The loss is BCE-with-logits of
    ``D(G(z))`` against an all-zero target; the Hessian-vector product is the
    gradient w.r.t. G's parameters of ``grad_D . v``, where ``v`` is a
    detached copy of the flattened gradient w.r.t. D's parameters.

    Args:
        D: discriminator module, applied to ``G(z)``.
        G: generator module, applied to ``z``.
        z: latent batch fed to ``G``.

    Returns:
        None. The three timings are printed in milliseconds.
    """
    d_params = tuple(D.parameters())
    g_params = tuple(G.parameters())

    criterion = nn.BCEWithLogitsLoss()
    d_fake = D(G(z))
    # The target follows d_fake's device, so no module-level `device` is needed.
    loss = criterion(d_fake, torch.zeros_like(d_fake))

    # The timer returns seconds; scale by 1000 so the value matches the
    # "ms" label in the output.
    start = time.perf_counter()
    grad_d = torch.autograd.grad(loss, d_params, create_graph=True, retain_graph=True)
    elapsed_ms = (time.perf_counter() - start) * 1000.0
    print('Discriminator backward :  %.7fms' % elapsed_ms)
    grad_d_vec = torch.cat([g.contiguous().view(-1) for g in grad_d])

    start = time.perf_counter()
    torch.autograd.grad(loss, g_params, create_graph=True, retain_graph=True)
    elapsed_ms = (time.perf_counter() - start) * 1000.0
    print('Generator backward :  %.7fms' % elapsed_ms)

    # The vector of the product is the D-gradient itself, treated as a constant.
    vec_d = grad_d_vec.clone().detach()

    start = time.perf_counter()
    torch.autograd.grad(grad_d_vec, g_params, grad_outputs=vec_d, retain_graph=True)
    elapsed_ms = (time.perf_counter() - start) * 1000.0
    print('Hessian vector product :  %.7fms' % elapsed_ms)


if __name__ == '__main__':
    # CPU-only run of the same two benchmarks.
    device = torch.device('cpu')
    batch_size = 128

    # Test 1: the small conv + linear GAN on 32x32 inputs.
    print('=====Test1=====')
    D, G = dc_D().to(device), dc_G(z_dim=96).to(device)
    # Random image batch scaled to [-1, 1); not used by hvptest.
    img_cifar = (torch.rand(batch_size, 3, 32, 32, device=device) - 0.5) / 0.5
    z_mnist = torch.randn(batch_size, 96, device=device)
    hvptest(D, G, z_mnist)

    # Test 2: the all-convolutional DCGAN on 64x64 inputs.
    print('=====Test2=====')
    D, G = (
        DC_discriminator(channel_num=3, feature_num=64).to(device),
        DC_generator(z_dim=100, channel_num=3, feature_num=64).to(device),
    )
    z_celeba = torch.randn(batch_size, 100, 1, 1, device=device)
    hvptest(D, G, z_celeba)

And I get:

=====Test1=====
Discriminator backward :  0.2947176ms
Generator backward :  0.4657667ms
Hessian vector product :  1.2660155ms
=====Test2=====
Discriminator backward :  0.1807017ms
Generator backward :  0.9121838ms
Hessian vector product :  2.6689870ms

Do you see similar results on cpu?
If you still see the problem, which version of pytorch are you using?

Hongkai_Zheng · September 18, 2019, 4:40pm

Thank you for the reply. I got similar results on CPU, but I don’t get it. Could you explain a bit more? I still don’t understand why the Hessian vector product costs so much more time for DCGAN. (BTW, my PyTorch version is 1.1.0.)

albanD · September 18, 2019, 5:43pm

Hi,

Your results are similar to mine? Or similar to your GPU results?

Hongkai_Zheng · September 18, 2019, 5:47pm

I tried on CPU too. The result on CPU is similar to yours. But for GPU, we need CUDA event to do accurate time measurement.

albanD · September 18, 2019, 7:10pm

You can use

# Wait for queued GPU work to finish so the wall-clock timestamp is not taken early.
torch.cuda.synchronize()
start = time.time()

everywhere there is a time.time() call to avoid using events. Can you check what is the behavior of this when you run on gpu? Does it give the same result as cpu or the same result as with events?

Hongkai_Zheng · September 18, 2019, 7:20pm

I got the same result as with events.

=====Test1=====
Discriminator backward :  0.0120358 s
Discriminator backward :  0.0067697 s
Hessian vector product :  0.0130763 s
=====Test2=====
Discriminator backward :  0.0188158 s
Discriminator backward :  0.0291374 s
Hessian vector product :  1.1108479 s

albanD · September 18, 2019, 7:32pm

Interesting, maybe @ngimel has some insight in this? Otherwise I’ll set up a gpu install to test this.

Hongkai_Zheng · September 23, 2019, 1:12am

Hi, I messaged her but she didn’t reply. could you help me ask @ngimel for help?

ngimel · September 23, 2019, 2:17am

Sorry, did not get notification, and sorry, don’t have insight on this. @albanD can you take a look at this or should I?

ptrblck · September 23, 2019, 10:36am

In case that helps: Repro on TitanV with Pytorch 1.3.0.dev20190910 (nightly) with CUDA10.0 and NVIDIA driver 418.56:

=====Test1=====
Discriminator backward :  0.0056169ms
Generator backward :  0.0103526ms
Hessian vector product :  0.0136104ms
=====Test2=====
Discriminator backward :  0.0149076ms
Generator backward :  0.0242238ms
Hessian vector product :  1.0203860ms

(just added torch.cuda.synchronize() before starting and stopping the timers, no warmup etc.)

SimonW · September 23, 2019, 1:16pm

I suspect that double backward for conv T is really slow.

ngimel · September 23, 2019, 3:21pm

@simonw but there should be double backward for ConvT both in test1 and test2, and only test2 is painfully slow.
@ptrblck can you get an nvvp profile for it and maybe run it through pyprof, though I suspect pyprof will choke on double backward.

ptrblck · September 23, 2019, 5:38pm

Sure.
Rerunning on a V100 32GB (Titan V is currently busy):
Reference (using @albanD’s code + adding torch.cuda.synchronize()):

 =====Test1=====
Discriminator backward :  0.0079403s
Generator backward :  0.0082126s
Hessian vector product :  0.0118001s
=====Test2=====
Discriminator backward :  0.0121739s
Generator backward :  0.0269434s
Hessian vector product :  0.7245934s

I’ve changed the output to seconds, as I believe time.time() calculates the seconds not ms.
Attaching nvprof.
Let me know, if you need more profiling output etc.

SimonW · September 23, 2019, 11:15pm

You are right. I just saw that DC_generator has considerably more convTs and was wondering.

ngimel · September 24, 2019, 2:25am

The solution is simple: use torch.backends.cudnn.benchmark=True. The cudnn heuristics pick a truly atrocious algorithm for a couple of layers (one of the kernels takes 600 ms, another 50 ms). When cudnn is actually forced to benchmark and pick the best algo, the Hessian computation speed becomes reasonable.

Hongkai_Zheng · September 24, 2019, 7:05pm

Hi, I set benchmark=True but got worse results. Could you share your code?

ngimel · September 24, 2019, 8:05pm

import torch
import time
import torch.nn as nn
# Make cuDNN benchmark the candidate algorithms and keep the fastest one instead of
# trusting its heuristics; the first iteration is slower because of the benchmarking.
torch.backends.cudnn.benchmark=True
class DC_generator(nn.Module):
    """DCGAN generator reduced to its five transposed convolutions.

    Takes a latent tensor of shape (N, z_dim, 1, 1) and produces an image of
    shape (N, channel_num, 64, 64). No normalisation or activation layers
    are included.
    """

    def __init__(self, z_dim=100, channel_num=3, feature_num=64):
        super().__init__()
        # Hidden widths in order: 8f @ 4x4, 4f @ 8x8, 2f @ 16x16, f @ 32x32.
        hidden = [feature_num * mult for mult in (8, 4, 2, 1)]
        # Projection layer: 1x1 latent -> 4x4 feature map.
        stack = [
            nn.ConvTranspose2d(z_dim, hidden[0], kernel_size=4, stride=1, padding=0, bias=True)
        ]
        # Each following layer doubles the spatial size; the last one emits
        # channel_num @ 64x64.
        for c_in, c_out in zip(hidden, hidden[1:] + [channel_num]):
            stack.append(
                nn.ConvTranspose2d(c_in, c_out, kernel_size=4, stride=2, padding=1, bias=True)
            )
        self.main = nn.Sequential(*stack)

    def forward(self, input):
        """Run the latent batch through the transposed-convolution stack."""
        return self.main(input)


class DC_discriminator(nn.Module):
    """DCGAN discriminator reduced to its five strided convolutions.

    Takes an image batch of shape (N, channel_num, 64, 64) and returns a
    tensor of shape (N, 1, 1, 1). No normalisation or activation layers are
    included.
    """

    def __init__(self, channel_num=3, feature_num=64):
        super().__init__()
        # Channel widths: input, then f @ 32x32, 2f @ 16x16, 4f @ 8x8, 8f @ 4x4.
        widths = [channel_num] + [feature_num * mult for mult in (1, 2, 4, 8)]
        # Four stride-2 convolutions, each halving the spatial size.
        stack = [
            nn.Conv2d(c_in, c_out, kernel_size=4, stride=2, padding=1, bias=True)
            for c_in, c_out in zip(widths, widths[1:])
        ]
        # Final 4x4 convolution collapses the 4x4 map to a single value: 1 @ 1x1.
        stack.append(nn.Conv2d(widths[-1], 1, kernel_size=4, stride=1, padding=0, bias=True))
        self.main = nn.Sequential(*stack)

    def forward(self, input):
        """Run the image batch through the convolution stack."""
        return self.main(input)

class dc_D(nn.Module):
    """Small convolutional discriminator for 3-channel 32x32 images.

    Two conv -> LeakyReLU -> max-pool stages feed a two-layer MLP that
    produces one output value per sample.
    """

    def __init__(self):
        super().__init__()
        stages = []
        # 3 @ 32x32 -> 128 @ 14x14 -> 128 @ 5x5
        for in_ch in (3, 128):
            stages += [
                nn.Conv2d(in_channels=in_ch, out_channels=128, kernel_size=5, stride=1),
                nn.LeakyReLU(0.01),
                nn.MaxPool2d(2, 2),
            ]
        self.conv = nn.Sequential(*stages)
        # 128 * 5 * 5 = 3200 flattened features go into the classifier head.
        head = [
            nn.Linear(1600 * 2, 1024),
            nn.LeakyReLU(0.01),
            nn.Linear(1024, 1),
        ]
        self.fc = nn.Sequential(*head)

    def forward(self, x):
        """Extract conv features, flatten per sample, and apply the MLP head."""
        features = self.conv(x)
        flat = features.view(features.shape[0], -1)
        return self.fc(flat)


class dc_G(nn.Module):
    """Small generator: two linear layers followed by two transposed convolutions.

    Maps a latent batch of shape (N, z_dim) to images of shape (N, 3, 32, 32).
    No activation or normalisation layers are included.
    """

    def __init__(self, z_dim=96):
        super().__init__()
        # Project the latent vector up to 128 * 8 * 8 values.
        projection = [nn.Linear(z_dim, 1024), nn.Linear(1024, 8 * 8 * 128)]
        self.fc = nn.Sequential(*projection)
        # 128 @ 8x8 -> 64 @ 16x16 -> 3 @ 32x32
        upsample = [
            nn.ConvTranspose2d(in_channels=c_in, out_channels=c_out, kernel_size=4, stride=2, padding=1)
            for c_in, c_out in ((128, 64), (64, 3))
        ]
        self.convt = nn.Sequential(*upsample)

    def forward(self, x):
        """Project the latent batch, reshape to (N, 128, 8, 8), then upsample."""
        hidden = self.fc(x)
        feature_map = hidden.view(hidden.shape[0], 128, 8, 8)
        return self.convt(feature_map)



def hvptest(D, G, z):
    """Time the D backward, the G backward and the mixed Hessian-vector product.

    The loss is BCE-with-logits of ``D(G(z))`` against an all-zero target.
    The Hessian-vector product is the gradient w.r.t. G's parameters of
    ``grad_D . v``, where ``v`` is a detached copy of the flattened gradient
    w.r.t. D's parameters. When ``z`` lives on a CUDA device, the device is
    synchronized before every timestamp.

    Args:
        D: discriminator module, applied to ``G(z)``.
        G: generator module, applied to ``z``.
        z: latent batch; its device decides whether CUDA syncs are issued.

    Returns:
        None. The three timings are printed in milliseconds.
    """
    needs_sync = z.is_cuda

    def now():
        """Return a timestamp in seconds, after draining pending GPU work."""
        if needs_sync:
            torch.cuda.synchronize()
        return time.perf_counter()

    def report(label, began):
        # Timestamps are in seconds; scale by 1000 to match the "ms" label.
        print('%s :  %.7fms' % (label, (now() - began) * 1000.0))

    d_params = tuple(D.parameters())
    g_params = tuple(G.parameters())

    criterion = nn.BCEWithLogitsLoss()
    d_fake = D(G(z))
    # The target follows d_fake's device, so no module-level `device` is needed.
    loss = criterion(d_fake, torch.zeros_like(d_fake))

    began = now()
    # create_graph=True keeps the gradient differentiable for the HVP below.
    grad_d = torch.autograd.grad(loss, d_params, create_graph=True, retain_graph=True)
    report('Discriminator backward', began)
    grad_d_vec = torch.cat([g.contiguous().view(-1) for g in grad_d])

    began = now()
    torch.autograd.grad(loss, g_params, create_graph=True, retain_graph=True)
    report('Generator backward', began)

    # The vector of the product is the D-gradient itself, treated as a constant.
    vec_d = grad_d_vec.clone().detach()

    began = now()
    torch.autograd.grad(grad_d_vec, g_params, grad_outputs=vec_d, retain_graph=True)
    report('Hessian vector product', began)


if __name__ == '__main__':
    device = torch.device('cuda')
    batch_size = 128

    # Test 1: the small conv + linear GAN on 32x32 inputs.
    print('=====Test1=====')
    D, G = dc_D().to(device), dc_G(z_dim=96).to(device)
    # Random image batch scaled to [-1, 1); not used by hvptest.
    img_cifar = (torch.rand(batch_size, 3, 32, 32, device=device) - 0.5) / 0.5
    z_mnist = torch.randn(batch_size, 96, device=device)
    # Repeat the measurement ten times on the same models and latent batch.
    for _ in range(10):
        hvptest(D, G, z_mnist)

    # Test 2: the all-convolutional DCGAN on 64x64 inputs.
    print('=====Test2=====')
    D, G = (
        DC_discriminator(channel_num=3, feature_num=64).to(device),
        DC_generator(z_dim=100, channel_num=3, feature_num=64).to(device),
    )
    z_celeba = torch.randn(batch_size, 100, 1, 1, device=device)
    for _ in range(10):
        hvptest(D, G, z_celeba)

Remember that you have to benchmark several iterations, cudnn.benchmark always slows down the first iteration.

Hongkai_Zheng · September 24, 2019, 8:11pm

Oh, I see. thank you so much.