Pytorch 1.0 nightly version is much slower than the stable 0.4.1

Hi,
I am grateful for the convenience brought by PyTorch. But after switching to the nightly version of 1.0, I found that the same code runs much slower than with the 0.4.1 version.

I actually submitted an issue on GitHub, but it may still be me misusing the new version, so I feel I could also ask my question here.

The main part of my code is like this:
The model:

import torchvision
import torch
import torch.nn as nn
import torch.nn.functional as F



class Model(nn.Module):
    """VGG16-style dilated backbone with a fully-convolutional classifier head.

    The feature stack reproduces the exact module layout (and therefore the
    exact ``nn.Sequential`` indices) of ``torchvision.models.vgg16().features``,
    so the pretrained VGG16 ``state_dict`` can be loaded directly in
    ``init_weights``.  It differs from stock VGG16 in that the last two pooling
    stages use stride 1 and the final conv stage uses dilation 2, keeping a
    denser output resolution (DeepLab-style).

    Args:
        in_dim: number of input channels (e.g. 3 for RGB).
        out_dim: number of output channels/classes of the final 1x1 conv.
    """

    def __init__(self, in_dim, out_dim, *args, **kwargs):
        super(Model, self).__init__(*args, **kwargs)

        def conv_relu(cin, cout, padding=1, dilation=1):
            # Basic VGG building block: 3x3 conv followed by in-place ReLU.
            # Returned as a 2-element list so Sequential indices stay aligned
            # with torchvision's vgg16().features for state_dict loading.
            return [
                nn.Conv2d(cin, cout, kernel_size=3, stride=1,
                          padding=padding, dilation=dilation),
                nn.ReLU(inplace=True),
            ]

        layers = []
        # Stage 1: in_dim -> 64, downsample x2.
        layers += conv_relu(in_dim, 64)
        layers += conv_relu(64, 64)
        layers.append(nn.MaxPool2d(3, stride=2, padding=1))
        # Stage 2: 64 -> 128, downsample x2.
        layers += conv_relu(64, 128)
        layers += conv_relu(128, 128)
        layers.append(nn.MaxPool2d(3, stride=2, padding=1))
        # Stage 3: 128 -> 256, downsample x2.
        layers += conv_relu(128, 256)
        layers += conv_relu(256, 256)
        layers += conv_relu(256, 256)
        layers.append(nn.MaxPool2d(3, stride=2, padding=1))
        # Stage 4: 256 -> 512; stride-1 pooling preserves resolution.
        layers += conv_relu(256, 512)
        layers += conv_relu(512, 512)
        layers += conv_relu(512, 512)
        layers.append(nn.MaxPool2d(3, stride=1, padding=1))
        # Stage 5: dilated 512 -> 512 convs; stride-1 pooling.
        layers += conv_relu(512, 512, padding=2, dilation=2)
        layers += conv_relu(512, 512, padding=2, dilation=2)
        layers += conv_relu(512, 512, padding=2, dilation=2)
        layers.append(nn.MaxPool2d(3, stride=1, padding=1))
        self.features = nn.Sequential(*layers)

        # Classifier head: large-field-of-view dilated conv (rate 12)
        # followed by two 1x1 convs, as in DeepLab-LargeFOV.
        head = []
        head.append(nn.AvgPool2d(3, stride=1, padding=1))
        head += conv_relu(512, 1024, padding=12, dilation=12)
        head.append(nn.Conv2d(1024, 1024, kernel_size=1, stride=1, padding=0))
        head.append(nn.ReLU(inplace=True))
        head.append(nn.Dropout(p=0.5))
        head.append(nn.Conv2d(1024, out_dim, kernel_size=1))
        self.classifier = nn.Sequential(*head)

        self.init_weights()

    def forward(self, x):
        """Run the backbone then the classifier head.

        Returns a score map of shape (N, out_dim, H', W') where H', W' are
        the input spatial dims reduced by the three stride-2 poolings.
        """
        x = self.features(x)
        x = self.classifier(x)
        return x

    def init_weights(self):
        """Load pretrained VGG16 weights into the backbone; Kaiming-init the head.

        NOTE: downloads the pretrained checkpoint on first use.  The load
        relies on ``self.features`` having the same Sequential index layout
        as ``torchvision.models.vgg16().features`` (pooling layers carry no
        parameters, so the kernel-size/stride differences do not matter).
        """
        vgg = torchvision.models.vgg16(pretrained = True)
        state_vgg = vgg.features.state_dict()
        self.features.load_state_dict(state_vgg)

        for ly in self.classifier.children():
            if isinstance(ly, nn.Conv2d):
                nn.init.kaiming_normal_(ly.weight, a=1)
                nn.init.constant_(ly.bias, 0)

The running process:

import torch
import torch.nn as nn
import torch.nn.functional as F
import time
from model import Model


if __name__ == "__main__":

    net = Model(3, 21)
    net.train()
    net.cuda()
    net = nn.DataParallel(net)
    Loss = nn.CrossEntropyLoss(ignore_index = 255)
    Loss.cuda()
    optim = torch.optim.SGD(net.parameters(), lr = 1e-3, momentum = 0.9, weight_decay = 5e-4)

    st = time.time()
    scale = [0.5, 0.75, 1]
    loss_avg = []
    for i in range(10000):
        in_ten = torch.randn(70, 3, 224, 224)
        label = torch.randint(0, 21, [70, 1, 224, 224])
        in_ten = in_ten.cuda()
        label = label.cuda()
        label = torch.tensor(label).long().cuda()
        optim.zero_grad()
        H, W = in_ten.size()[2:]
        for sub_i, s in enumerate(scale):
            print(time.time() - st)
            h, w = int(H * s), int(W * s)
            in_ten_s = F.interpolate(in_ten, (h, w), mode = 'bilinear')
            out = net(in_ten_s)
            out = F.interpolate(out, [H, W], mode = 'bilinear')
            label = torch.squeeze(label)
            loss = Loss(out, label)
            loss.backward()
            loss = loss.detach().cpu().numpy()
            loss_avg.append(loss)
        optim.step()

        if i % 20 == 0 and not i == 0:
            ed = time.time()
            interval = ed - st
            st = ed
            loss_avg = sum(loss_avg) / len(loss_avg)
            print('iter: {}, time: {}, loss: {}'.format(i, interval, loss_avg))
            loss_avg = []

I got to find that this same code runs much slower with pytorch 1.0 than that of 0.4.1.

My environment is python3.5 and ubuntu16.04 with two 1080ti gpus. I installed my pytorch 1.0 with pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu90/torch_nightly.html.

Is there any way to avoid this?

I also observed the same thing in PyTorch 1.0 stable. In my own program, 100 batches cost 28 seconds in 0.4.1 and 132 seconds in 1.0. I use a GTX 1070 + CUDA 9.0.

I tried compiling PyTorch from source and the problem was fixed — maybe you could also try that.