Hi! I recently ran into a strange problem while training a network with PyTorch.
- When the model is trained, the loss does not decrease
- I’m not sure what caused this
Please allow me to show my code below.
This is my train.py where I run the training code.
import torch
import torch.nn as nn
import torch.optim as optim
from make_data import train_dataloader, test_dataloader
from make_net import Net
import time
# Number of full passes over the training loader.
num_epochs = 900

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Mean-squared-error loss between the network output and the target image.
criterion = nn.MSELoss()
def train_model(model, my_criterion, lr=1e-3, epochs=None, log_every=10):
    """Build a network and train it on the module-level ``train_dataloader``.

    Each batch is expected to be a mapping with an ``'image'`` entry (the
    regression target) and a ``'pressure'`` entry (the network input).

    Args:
        model: Zero-argument callable (for example the ``Net`` class) that
            returns the ``nn.Module`` to train.
        my_criterion: Loss callable, invoked as ``my_criterion(output, image)``.
        lr: Learning rate for Adam. The previous hard-coded value of 0.2 is
            orders of magnitude above Adam's documented default (1e-3) and is
            the most likely reason the loss blew up and then stayed flat.
        epochs: Number of passes over the loader; ``None`` falls back to the
            module-level ``num_epochs``.
        log_every: Print the mean loss once every this many mini-batches.

    Returns:
        The trained network (wrapped in ``nn.DataParallel`` when CUDA is
        available).

    Raises:
        ValueError: If ``log_every`` is not a positive number.
    """
    if log_every < 1:
        raise ValueError("log_every must be a positive integer")
    total_epochs = num_epochs if epochs is None else epochs

    net = model()
    if torch.cuda.is_available():
        print("Let's use", torch.cuda.device_count(), "GPUs")
        net = nn.DataParallel(net)
    net.to(device)
    net.train()

    # Create the optimizer only after the model is wrapped and on its final
    # device, so it tracks the parameters that are actually being updated.
    optimizer = optim.Adam(net.parameters(), lr=lr, betas=(0.9, 0.99))

    for epoch in range(total_epochs):
        running_loss = 0.0
        for i, sample in enumerate(train_dataloader):
            # Author's note: image and output were observed with shape
            # [256, 403, 640] (batch, height, width).
            image = sample['image'].float().to(device)
            pressure = sample['pressure'].float().to(device)

            optimizer.zero_grad()
            output = net(pressure)
            loss = my_criterion(output, image)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % log_every == log_every - 1:
                # Report the mean loss over the last `log_every` mini-batches.
                print("[%d, %5d], loss: %.3f"
                      % (epoch + 1, i + 1, running_loss / log_every))
                running_loss = 0.0
    print("Finished Training!")
    return net
# Start training only when this file is executed as a script, so that
# importing it (for example from DataLoader worker processes) does not
# launch a second training run.
if __name__ == "__main__":
    train_model(Net, criterion)
And this is my net.py (imported as `make_net` in train.py), where I define my network:
import torch
import torch.nn as nn
import torch.nn.functional as F
from make_ops import conv_out_size_same
from make_data import batch_size
# Target image size in pixels (height, width).
s_h = 403
s_w = 640

# Each pair below feeds the previous pair through conv_out_size_same(size, 2).
# The "expected" values are the original author's annotations; they are not
# verified here because conv_out_size_same lives in make_ops -- TODO confirm.
s_h2 = conv_out_size_same(s_h, 2)
s_w2 = conv_out_size_same(s_w, 2)        # expected 202, 320

s_h4 = conv_out_size_same(s_h2, 2)
s_w4 = conv_out_size_same(s_w2, 2)       # expected 101, 160

s_h8 = conv_out_size_same(s_h4, 2)
s_w8 = conv_out_size_same(s_w4, 2)       # expected 51, 80

s_h16 = conv_out_size_same(s_h8, 2)
s_w16 = conv_out_size_same(s_w8, 2)      # expected 25, 40

s_h32 = conv_out_size_same(s_h16, 2)
s_w32 = conv_out_size_same(s_w16, 2)     # expected 12, 20

s_h64 = conv_out_size_same(s_h32, 2)
s_w64 = conv_out_size_same(s_w32, 2)     # expected 6, 10

s_h128 = conv_out_size_same(s_h64, 2)
s_w128 = conv_out_size_same(s_w64, 2)    # expected 3, 5

s_h256 = conv_out_size_same(s_h128, 2)
s_w256 = conv_out_size_same(s_w128, 2)   # expected 2, 3
class Net(nn.Module):
    """Map a 10-feature input vector to an ``IMG_HEIGHT x IMG_WIDTH`` map.

    The forward pass has four parts:

    1. ``fc1`` and ``fc2`` expand the input to ``f_dim * 8`` channels on an
       ``s_h256 x s_w256`` grid.
    2. Eight transposed-convolution stages (kernel 2, stride 2) each double
       the spatial size.
    3. Eight convolution stages (kernel 2, stride 2) each halve it again.
    4. Adaptive average pooling, then ``fc3`` and ``fc4``, produce the
       flattened image, which is reshaped to ``(-1, IMG_HEIGHT, IMG_WIDTH)``.

    NOTE(review): ``fc1`` and ``fc2`` are applied back-to-back with no
    activation in between, so together they form a single affine map --
    confirm this is intended.
    NOTE(review): the output passes through ELU, so it is bounded below by
    -1 -- confirm the target images fit that range.
    """

    def __init__(self):
        super().__init__()

        # Output channel counts of the eight down-sampling stages.
        (self.CONV1_DEPTH, self.CONV2_DEPTH,
         self.CONV3_DEPTH, self.CONV4_DEPTH) = (2, 4, 8, 16)
        (self.CONV5_DEPTH, self.CONV6_DEPTH,
         self.CONV7_DEPTH, self.CONV8_DEPTH) = (32, 64, 128, 256)

        self.f_dim = 32          # base channel width of the up-sampling path
        self.channel_dim = 1     # channels of the intermediate image
        self.FC_NODE = 512       # width of the hidden dense layer
        self.IMG_HEIGHT = 403
        self.IMG_WIDTH = 640
        self.batch_size = batch_size

        f = self.f_dim
        seed_cells = s_w256 * s_h256   # grid cells fed to the first stage

        # Dense layers: the first two seed the up-sampling path, the last two
        # turn the pooled features into the flattened output image.
        self.fc1 = nn.Linear(10, f * 8)
        self.fc2 = nn.Linear(f * 8, f * 8 * seed_cells)
        self.fc3 = nn.Linear(self.CONV8_DEPTH * seed_cells, self.FC_NODE)
        self.fc4 = nn.Linear(self.FC_NODE, self.IMG_HEIGHT * self.IMG_WIDTH)
        self.avg_pool = nn.AdaptiveAvgPool2d((s_h256, s_w256))

        # Up-sampling path: halve the channels at every stage, ending in a
        # single-channel map squashed by Tanh.
        self.deconv1 = self._up_stage(f * 8, f * 4)
        self.deconv2 = self._up_stage(f * 4, f * 2)
        self.deconv3 = self._up_stage(f * 2, f)
        self.deconv4 = self._up_stage(f, f // 2)
        self.deconv5 = self._up_stage(f // 2, f // 4)
        self.deconv6 = self._up_stage(f // 4, f // 8)
        self.deconv7 = self._up_stage(f // 8, f // 16)
        self.deconv8 = self._up_stage(f // 16, self.channel_dim, nn.Tanh)

        # Down-sampling path: double the channels at every stage.
        self.conv1 = self._down_stage(self.channel_dim, self.CONV1_DEPTH)
        self.conv2 = self._down_stage(self.CONV1_DEPTH, self.CONV2_DEPTH)
        self.conv3 = self._down_stage(self.CONV2_DEPTH, self.CONV3_DEPTH)
        self.conv4 = self._down_stage(self.CONV3_DEPTH, self.CONV4_DEPTH)
        self.conv5 = self._down_stage(self.CONV4_DEPTH, self.CONV5_DEPTH)
        self.conv6 = self._down_stage(self.CONV5_DEPTH, self.CONV6_DEPTH)
        self.conv7 = self._down_stage(self.CONV6_DEPTH, self.CONV7_DEPTH)
        self.conv8 = self._down_stage(self.CONV7_DEPTH, self.CONV8_DEPTH)

        # Normalisation + activation applied to the reshaped dense output.
        self.layer = nn.Sequential(nn.BatchNorm2d(f * 8), nn.ELU())

    @staticmethod
    def _up_stage(in_ch, out_ch, activation_cls=nn.ELU):
        """Return ConvTranspose2d(k=2, s=2) -> BatchNorm2d -> activation."""
        return nn.Sequential(
            nn.ConvTranspose2d(in_ch, out_ch, kernel_size=2, stride=2),
            nn.BatchNorm2d(out_ch),
            activation_cls(),
        )

    @staticmethod
    def _down_stage(in_ch, out_ch):
        """Return Conv2d(k=2, s=2) -> BatchNorm2d -> ELU."""
        return nn.Sequential(
            nn.Conv2d(in_ch, out_ch, kernel_size=2, stride=2),
            nn.BatchNorm2d(out_ch),
            nn.ELU(),
        )

    def forward(self, x):
        """Run the network on ``x`` and return a ``(-1, H, W)`` tensor."""
        # Dense seed, reshaped to a (channels, s_h256, s_w256) feature map.
        x = self.fc2(self.fc1(x))
        x = self.layer(x.view(-1, self.f_dim * 8, s_h256, s_w256))

        up_path = (self.deconv1, self.deconv2, self.deconv3, self.deconv4,
                   self.deconv5, self.deconv6, self.deconv7, self.deconv8)
        for stage in up_path:
            x = stage(x)

        down_path = (self.conv1, self.conv2, self.conv3, self.conv4,
                     self.conv5, self.conv6, self.conv7, self.conv8)
        for stage in down_path:
            x = stage(x)

        # Pool to the fixed grid fc3 expects, then flatten per sample.
        x = self.avg_pool(x)
        x = x.view(-1, self.num_flat_features(x))
        x = F.elu(self.fc3(x))
        x = F.elu(self.fc4(x))
        return x.view(-1, self.IMG_HEIGHT, self.IMG_WIDTH)

    def num_flat_features(self, x):
        """Return the product of every dimension of ``x`` except the first."""
        count = 1
        for extent in x.shape[1:]:
            count *= extent
        return count
When I run the training code, I get the loss values shown below.
[1, 10], loss: 27806.338
[1, 20], loss: 1088497.401
[1, 30], loss: 2364.557
[1, 40], loss: 2366.722
[1, 50], loss: 2368.215
[1, 60], loss: 2370.851
[1, 70], loss: 2365.583
[1, 80], loss: 2366.041
[2, 10], loss: 2368.178
[2, 20], loss: 2363.056
[2, 30], loss: 2374.572
[2, 40], loss: 2361.862
[2, 50], loss: 2364.390
[2, 60], loss: 2366.633
[2, 70], loss: 2372.771
[2, 80], loss: 2362.416
[3, 10], loss: 2369.942
[3, 20], loss: 2367.277
Sooo confused !!!
I would appreciate any suggestions on what might be causing this problem.
Thank you very much!!!
Wish you a happy life!!