Training losses cannot be reduced

liyunfei1994 · September 11, 2019, 7:49am

Hi ! I recently found a strange problem when using the PyTorch training network.

When the model is trained, the loss does not decrease
I’m not sure what caused this
Please allow me to show my code below.
This is my train.py where I run the training code.

import torch
import torch.nn as nn
import torch.optim as optim
from make_data import train_dataloader, test_dataloader
from make_net import Net
import time

num_epochs = 900
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

criterion = nn.MSELoss()
def train_model(model, my_criterion):
    since = time.time()
    train_loader = train_dataloader
    criterion = my_criterion
    net = model()
    opt_Adam = optim.Adam(net.parameters(), lr=0.2, betas=(0.9, 0.99))

    if torch.cuda.is_available():
        print("Let's use", torch.cuda.device_count(), "GPUs")
        net = nn.DataParallel(net)
    net.to(device)

    for epoch in range(num_epochs):
        running_loss = 0.0

        for i, sample in enumerate(train_loader, 0):
            image, pressure = sample['image'], sample['pressure']

            image = image.float()
            image = image.to(device)
            # image.shape torch.Size([256, 403, 640])
            # print("image.shape", image.shape)

            pressure = pressure.float()
            pressure = pressure.to(device)

            opt_Adam.zero_grad()
            output = net(pressure)
            # output.shape torch.Size([256, 403, 640])
            # print("output.shape", output.shape)
            loss = criterion(output, image)
            loss.backward()
            opt_Adam.step()

            running_loss += loss.item()

            if i % 10 == 9:
                # print every 200 mini-batch
                print("[%d, %5d], loss: %.3f" % (epoch+1, i+1, running_loss/10))
                running_loss = 0.0

    print("Finished Training!")


train_model(Net, criterion)

And this is my net.py where I define my network

import torch
import torch.nn as nn
import torch.nn.functional as F
from make_ops import conv_out_size_same
from make_data import batch_size


s_h, s_w = 403, 640
# 403,640
s_h2, s_w2 = conv_out_size_same(s_h, 2), conv_out_size_same(s_w, 2)
# 202, 320
s_h4, s_w4 = conv_out_size_same(s_h2, 2), conv_out_size_same(s_w2, 2)
# 101, 160
s_h8, s_w8 = conv_out_size_same(s_h4, 2), conv_out_size_same(s_w4, 2)
# 51, 80
s_h16, s_w16 = conv_out_size_same(s_h8, 2), conv_out_size_same(s_w8, 2)
# 25, 40
s_h32, s_w32 = conv_out_size_same(s_h16, 2), conv_out_size_same(s_w16, 2)
# 12, 20
s_h64, s_w64 = conv_out_size_same(s_h32, 2), conv_out_size_same(s_w32, 2)
# 6, 10
s_h128, s_w128 = conv_out_size_same(s_h64, 2), conv_out_size_same(s_w64, 2)
# 3，5
s_h256, s_w256 = conv_out_size_same(s_h128, 2), conv_out_size_same(s_w128, 2)
# 2, 3


class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()

        self.CONV1_DEPTH = 2
        self.CONV2_DEPTH = 4
        self.CONV3_DEPTH = 8
        self.CONV4_DEPTH = 16
        self.CONV5_DEPTH = 32
        self.CONV6_DEPTH = 64
        self.CONV7_DEPTH = 128
        self.CONV8_DEPTH = 256
        self.f_dim = 32
        self.channel_dim = 1
        self.FC_NODE = 512
        self.IMG_HEIGHT = 403
        self.IMG_WIDTH = 640
        self.batch_size = batch_size

        self.fc1 = nn.Linear(in_features=10, out_features=self.f_dim*8)
        self.fc2 = nn.Linear(in_features=self.f_dim*8, out_features=self.f_dim*8*s_w256*s_h256)
        self.fc3 = nn.Linear(in_features=self.CONV8_DEPTH*s_h256*s_w256, out_features=self.FC_NODE)
        self.fc4 = nn.Linear(in_features=self.FC_NODE, out_features=self.IMG_HEIGHT * self.IMG_WIDTH)

        self.avg_pool = nn.AdaptiveAvgPool2d((s_h256, s_w256))

        self.deconv1 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=self.f_dim*8, out_channels=self.f_dim*4,
                               kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.f_dim*4),
            nn.ELU()
        )
        self.deconv2 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=self.f_dim * 4, out_channels=self.f_dim * 2,
                               kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.f_dim * 2),
            nn.ELU()
        )
        self.deconv3 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=self.f_dim * 2, out_channels=self.f_dim,
                               kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.f_dim),
            nn.ELU()
        )
        self.deconv4 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=self.f_dim, out_channels=self.f_dim//2,
                               kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.f_dim//2),
            nn.ELU()
        )
        self.deconv5 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=self.f_dim//2, out_channels=self.f_dim//4,
                               kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.f_dim//4),
            nn.ELU()
        )
        self.deconv6 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=self.f_dim//4, out_channels=self.f_dim//8,
                               kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.f_dim//8),
            nn.ELU()
        )
        self.deconv7 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=self.f_dim//8, out_channels=self.f_dim//16,
                               kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.f_dim//16),
            nn.ELU()
        )
        self.deconv8 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=self.f_dim//16, out_channels=self.channel_dim,
                               kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.channel_dim),
            nn.Tanh()
        )

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=self.channel_dim, out_channels=self.CONV1_DEPTH,
                      kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.CONV1_DEPTH),
            nn.ELU()
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=self.CONV1_DEPTH, out_channels=self.CONV2_DEPTH,
                      kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.CONV2_DEPTH),
            nn.ELU()
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=self.CONV2_DEPTH, out_channels=self.CONV3_DEPTH,
                      kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.CONV3_DEPTH),
            nn.ELU()
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=self.CONV3_DEPTH, out_channels=self.CONV4_DEPTH,
                      kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.CONV4_DEPTH),
            nn.ELU()
        )
        self.conv5 = nn.Sequential(
            nn.Conv2d(in_channels=self.CONV4_DEPTH, out_channels=self.CONV5_DEPTH,
                      kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.CONV5_DEPTH),
            nn.ELU()
        )
        self.conv6 = nn.Sequential(
            nn.Conv2d(in_channels=self.CONV5_DEPTH, out_channels=self.CONV6_DEPTH,
                      kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.CONV6_DEPTH),
            nn.ELU()
        )
        self.conv7 = nn.Sequential(
            nn.Conv2d(in_channels=self.CONV6_DEPTH, out_channels=self.CONV7_DEPTH,
                      kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.CONV7_DEPTH),
            nn.ELU()
        )
        self.conv8 = nn.Sequential(
            nn.Conv2d(in_channels=self.CONV7_DEPTH, out_channels=self.CONV8_DEPTH,
                      kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.CONV8_DEPTH),
            nn.ELU()
        )
        self.layer = nn.Sequential(
            nn.BatchNorm2d(num_features=self.f_dim*8),
            nn.ELU()
        )

    def forward(self, x):

        x = self.fc1(x)
        x = self.fc2(x)
        x = x.view(-1, self.f_dim*8, s_h256, s_w256)
        x = self.layer(x)

        x = self.deconv1(x)
        x = self.deconv2(x)
        x = self.deconv3(x)
        x = self.deconv4(x)
        x = self.deconv5(x)
        x = self.deconv6(x)
        x = self.deconv7(x)
        x = self.deconv8(x)

        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.conv6(x)
        x = self.conv7(x)
        x = self.conv8(x)

        x = self.avg_pool(x)
        x = x.view(-1, self.num_flat_features(x))
        x = F.elu(self.fc3(x))
        x = F.elu(self.fc4(x))
        x = x.view(-1, self.IMG_HEIGHT, self.IMG_WIDTH)

        return x

    def num_flat_features(self, x):
        size = x.size()[1:]
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

When I run the training code, I got the loss values as shown below.

[1,    10], loss: 27806.338
[1,    20], loss: 1088497.401
[1,    30], loss: 2364.557
[1,    40], loss: 2366.722
[1,    50], loss: 2368.215
[1,    60], loss: 2370.851
[1,    70], loss: 2365.583
[1,    80], loss: 2366.041
[2,    10], loss: 2368.178
[2,    20], loss: 2363.056
[2,    30], loss: 2374.572
[2,    40], loss: 2361.862
[2,    50], loss: 2364.390
[2,    60], loss: 2366.633
[2,    70], loss: 2372.771
[2,    80], loss: 2362.416
[3,    10], loss: 2369.942
[3,    20], loss: 2367.277

Sooo confused !!!
I would appreciate it if you could put forward some suggestions on this question.
Thank you very much!!!
Wish you a happy life!!

ptrblck · September 11, 2019, 10:17am

Try to play around with some hyperparameters, e.g. lowering the learning rate.