Training losses cannot be reduced

Hi ! I recently found a strange problem when using the PyTorch training network.

  • When the model is trained, the loss does not decrease
  • I’m not sure what caused this
    Please allow me to show my code below.
    This is my train.py where I run the training code.
import torch
import torch.nn as nn
import torch.optim as optim
from make_data import train_dataloader, test_dataloader
from make_net import Net
import time

num_epochs = 900
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

criterion = nn.MSELoss()
def train_model(model, my_criterion):
    since = time.time()
    train_loader = train_dataloader
    criterion = my_criterion
    net = model()
    opt_Adam = optim.Adam(net.parameters(), lr=0.2, betas=(0.9, 0.99))

    if torch.cuda.is_available():
        print("Let's use", torch.cuda.device_count(), "GPUs")
        net = nn.DataParallel(net)
    net.to(device)

    for epoch in range(num_epochs):
        running_loss = 0.0

        for i, sample in enumerate(train_loader, 0):
            image, pressure = sample['image'], sample['pressure']

            image = image.float()
            image = image.to(device)
            # image.shape torch.Size([256, 403, 640])
            # print("image.shape", image.shape)

            pressure = pressure.float()
            pressure = pressure.to(device)

            opt_Adam.zero_grad()
            output = net(pressure)
            # output.shape torch.Size([256, 403, 640])
            # print("output.shape", output.shape)
            loss = criterion(output, image)
            loss.backward()
            opt_Adam.step()

            running_loss += loss.item()

            if i % 10 == 9:
                # print every 200 mini-batch
                print("[%d, %5d], loss: %.3f" % (epoch+1, i+1, running_loss/10))
                running_loss = 0.0

    print("Finished Training!")


train_model(Net, criterion)

And this is my net.py where I define my network

import torch
import torch.nn as nn
import torch.nn.functional as F
from make_ops import conv_out_size_same
from make_data import batch_size


s_h, s_w = 403, 640
# 403,640
s_h2, s_w2 = conv_out_size_same(s_h, 2), conv_out_size_same(s_w, 2)
# 202, 320
s_h4, s_w4 = conv_out_size_same(s_h2, 2), conv_out_size_same(s_w2, 2)
# 101, 160
s_h8, s_w8 = conv_out_size_same(s_h4, 2), conv_out_size_same(s_w4, 2)
# 51, 80
s_h16, s_w16 = conv_out_size_same(s_h8, 2), conv_out_size_same(s_w8, 2)
# 25, 40
s_h32, s_w32 = conv_out_size_same(s_h16, 2), conv_out_size_same(s_w16, 2)
# 12, 20
s_h64, s_w64 = conv_out_size_same(s_h32, 2), conv_out_size_same(s_w32, 2)
# 6, 10
s_h128, s_w128 = conv_out_size_same(s_h64, 2), conv_out_size_same(s_w64, 2)
# 3,5
s_h256, s_w256 = conv_out_size_same(s_h128, 2), conv_out_size_same(s_w128, 2)
# 2, 3


class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()

        self.CONV1_DEPTH = 2
        self.CONV2_DEPTH = 4
        self.CONV3_DEPTH = 8
        self.CONV4_DEPTH = 16
        self.CONV5_DEPTH = 32
        self.CONV6_DEPTH = 64
        self.CONV7_DEPTH = 128
        self.CONV8_DEPTH = 256
        self.f_dim = 32
        self.channel_dim = 1
        self.FC_NODE = 512
        self.IMG_HEIGHT = 403
        self.IMG_WIDTH = 640
        self.batch_size = batch_size

        self.fc1 = nn.Linear(in_features=10, out_features=self.f_dim*8)
        self.fc2 = nn.Linear(in_features=self.f_dim*8, out_features=self.f_dim*8*s_w256*s_h256)
        self.fc3 = nn.Linear(in_features=self.CONV8_DEPTH*s_h256*s_w256, out_features=self.FC_NODE)
        self.fc4 = nn.Linear(in_features=self.FC_NODE, out_features=self.IMG_HEIGHT * self.IMG_WIDTH)

        self.avg_pool = nn.AdaptiveAvgPool2d((s_h256, s_w256))

        self.deconv1 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=self.f_dim*8, out_channels=self.f_dim*4,
                               kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.f_dim*4),
            nn.ELU()
        )
        self.deconv2 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=self.f_dim * 4, out_channels=self.f_dim * 2,
                               kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.f_dim * 2),
            nn.ELU()
        )
        self.deconv3 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=self.f_dim * 2, out_channels=self.f_dim,
                               kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.f_dim),
            nn.ELU()
        )
        self.deconv4 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=self.f_dim, out_channels=self.f_dim//2,
                               kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.f_dim//2),
            nn.ELU()
        )
        self.deconv5 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=self.f_dim//2, out_channels=self.f_dim//4,
                               kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.f_dim//4),
            nn.ELU()
        )
        self.deconv6 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=self.f_dim//4, out_channels=self.f_dim//8,
                               kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.f_dim//8),
            nn.ELU()
        )
        self.deconv7 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=self.f_dim//8, out_channels=self.f_dim//16,
                               kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.f_dim//16),
            nn.ELU()
        )
        self.deconv8 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=self.f_dim//16, out_channels=self.channel_dim,
                               kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.channel_dim),
            nn.Tanh()
        )

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=self.channel_dim, out_channels=self.CONV1_DEPTH,
                      kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.CONV1_DEPTH),
            nn.ELU()
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=self.CONV1_DEPTH, out_channels=self.CONV2_DEPTH,
                      kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.CONV2_DEPTH),
            nn.ELU()
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=self.CONV2_DEPTH, out_channels=self.CONV3_DEPTH,
                      kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.CONV3_DEPTH),
            nn.ELU()
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=self.CONV3_DEPTH, out_channels=self.CONV4_DEPTH,
                      kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.CONV4_DEPTH),
            nn.ELU()
        )
        self.conv5 = nn.Sequential(
            nn.Conv2d(in_channels=self.CONV4_DEPTH, out_channels=self.CONV5_DEPTH,
                      kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.CONV5_DEPTH),
            nn.ELU()
        )
        self.conv6 = nn.Sequential(
            nn.Conv2d(in_channels=self.CONV5_DEPTH, out_channels=self.CONV6_DEPTH,
                      kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.CONV6_DEPTH),
            nn.ELU()
        )
        self.conv7 = nn.Sequential(
            nn.Conv2d(in_channels=self.CONV6_DEPTH, out_channels=self.CONV7_DEPTH,
                      kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.CONV7_DEPTH),
            nn.ELU()
        )
        self.conv8 = nn.Sequential(
            nn.Conv2d(in_channels=self.CONV7_DEPTH, out_channels=self.CONV8_DEPTH,
                      kernel_size=2, stride=2),
            nn.BatchNorm2d(num_features=self.CONV8_DEPTH),
            nn.ELU()
        )
        self.layer = nn.Sequential(
            nn.BatchNorm2d(num_features=self.f_dim*8),
            nn.ELU()
        )

    def forward(self, x):

        x = self.fc1(x)
        x = self.fc2(x)
        x = x.view(-1, self.f_dim*8, s_h256, s_w256)
        x = self.layer(x)

        x = self.deconv1(x)
        x = self.deconv2(x)
        x = self.deconv3(x)
        x = self.deconv4(x)
        x = self.deconv5(x)
        x = self.deconv6(x)
        x = self.deconv7(x)
        x = self.deconv8(x)

        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.conv6(x)
        x = self.conv7(x)
        x = self.conv8(x)

        x = self.avg_pool(x)
        x = x.view(-1, self.num_flat_features(x))
        x = F.elu(self.fc3(x))
        x = F.elu(self.fc4(x))
        x = x.view(-1, self.IMG_HEIGHT, self.IMG_WIDTH)

        return x

    def num_flat_features(self, x):
        size = x.size()[1:]
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

When I run the training code, I got the loss values as shown below.

[1,    10], loss: 27806.338
[1,    20], loss: 1088497.401
[1,    30], loss: 2364.557
[1,    40], loss: 2366.722
[1,    50], loss: 2368.215
[1,    60], loss: 2370.851
[1,    70], loss: 2365.583
[1,    80], loss: 2366.041
[2,    10], loss: 2368.178
[2,    20], loss: 2363.056
[2,    30], loss: 2374.572
[2,    40], loss: 2361.862
[2,    50], loss: 2364.390
[2,    60], loss: 2366.633
[2,    70], loss: 2372.771
[2,    80], loss: 2362.416
[3,    10], loss: 2369.942
[3,    20], loss: 2367.277

Sooo confused !!!
I would appreciate it if you could put forward some suggestions on this question.
Thank you very much!!!
Wish you a happy life!!

Try to play around with some hyperparameters, e.g. lowering the learning rate.

1 Like