Huge Training Loss at first epoch

I am currently building a CNN Regression model.
My model architecture is like the following:

class RegressionNet(torch.nn.Module):

    def __init__(self, input_size=None, output_size=None):
        super(RegressionNet, self).__init__()

        self.conv1 = torch.nn.Conv2d(3, 16, 3)
        self.conv2 = torch.nn.Conv2d(16, 32, 3)
        
        self.pool1 = torch.nn.MaxPool2d(2, 2)
        self.pool2 = torch.nn.MaxPool2d(2, 2)

        self.input_size = input_size
        self.neurons = self.linear_input_neurons()

        self.fc1 = torch.nn.Linear(self.linear_input_neurons(), 1024)
        self.fc2 = torch.nn.Linear(1024, 1024)
        self.fc3 = torch.nn.Linear(1024, output_size)

    def forward(self, x):
        x = torch.nn.functional.relu(self.conv1(x))
        x = self.pool1(x)
        x = torch.nn.functional.relu(self.conv2(x))
        x = self.pool2(x)     
        x = x.view(-1, self.neurons)
        x = torch.nn.functional.relu(self.fc1(x))
        x = self.fc3(x)

        return x
    
    # here we apply convolution operations before linear layer, and it returns the 4-dimensional size tensor. 
    def size_after_relu(self, x):
        x = self.pool1(torch.nn.functional.relu(self.conv1(x)))
        x = self.pool2(torch.nn.functional.relu(self.conv2(x)))

        return x.size()


    # after obtaining the size in above method, we call it and multiply all elements of the returned size.
    def linear_input_neurons(self):
        size = self.size_after_relu(torch.rand(1, self.input_size[1], self.input_size[2], self.input_size[3])) 
        m = 1
        for i in size[1:]:
            m *= i

        return int(m)

My Optimizer and Loss function:

optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()

Here comes the problem. The training loss in the first epoch is always so huge compared to the valid loss. Is this considered normal as I am using MSE or is there something wrong with my model architecture?

Training for fold 1...
#epoch:0	train loss: 27131208.35759782	valid loss: 0.46788424253463745
#epoch:1	train loss: 1.5370321702212095	valid loss: 0.1842787116765976
#epoch:2	train loss: 0.778756458312273	valid loss: 0.0377110888560613
#epoch:3	train loss: 0.5970308864489198	valid loss: 0.15821123123168945
#epoch:4	train loss: 0.4514023452997208	valid loss: 0.04207026958465576
#epoch:5	train loss: 0.377790588264664	valid loss: 0.059929460287094116
#epoch:6	train loss: 0.32543586088078363	valid loss: 0.06739359242575509
#epoch:7	train loss: 0.2817680644802749	valid loss: 0.051879264414310455
#epoch:8	train loss: 0.2456693003575007	valid loss: 0.028462330500284832
#epoch:9	train loss: 0.22231043577194215	valid loss: 0.03528459072113037
#epoch:10	train loss: 0.20242950726639142	valid loss: 0.03365590355613015
#epoch:11	train loss: 0.18334306310862303	valid loss: 0.024180124203364056
#epoch:12	train loss: 0.16663667731560194	valid loss: 0.010484489110799937
#epoch:13	train loss: 0.156691970835839	valid loss: 0.020529921565737044
#epoch:14	train loss: 0.14648786981900533	valid loss: 0.021238839626312254
#epoch:15	train loss: 0.13558033714070916	valid loss: 0.011424148455262184

As long as the validation set loss is low enough for the target task you need, there is no problem. It is certainly possible for MSE loss to be explode and that usually means the network has a problem learning the underlying data distribution. However, here, your validation loss tells a different story.

First thing, initialization of parameters with torch.nn.init.kaiming_uniform_ or something other than random which is what you have. Second, check your source code; are you printing correctly (i.e. did you initialize variables correctly, etc)? If the network is performing poorly on your testset, post the train code here – it may be a small oversight/bug in the train/val loops.

1 Like

Thank you for your reply.

I have tried the initialization of parameters but still something wrong with my results. The output has a variance of close to 0.
My result:
Note: Left-hand side is the distribution of training data, while the right-hand side is the distribution of outputs from my model. Every row corresponds to one fold in cross validation.

My train function:

from sklearn.model_selection import KFold

EPOCH = 20
batch_size = 128
n_splits = 10

def train(n_splits, X_train, Y_train):
    fold = KFold(n_splits=n_splits, shuffle=False)

    # Score of folds
    valid_loss_per_fold = []
    training_loss_per_fold = []
    train_target_fold, train_output_fold = [], []
    val_target_fold, val_output_fold = [], []
    
    for fold_idx, (train_idx, val_idx) in enumerate(fold.split(X_train)):
        
        X_train_ = [X_train[i] for i in train_idx] 
        Y_train_ = [Y_train[i] for i in train_idx] 
        
        X_val_ = [X_train[i] for i in val_idx]
        Y_val_ = [Y_train[i] for i in val_idx]
        
        train_set = data_loader(X_train_, Y_train_, transform)
        train_loader = DataLoader(train_set, shuffle=True, batch_size=batch_size)
        
        val_set = data_loader(X_val_, Y_val_, transform)
        val_loader = DataLoader(val_set, shuffle=True, batch_size=batch_size)

        train_loss = []
        valid_loss = []
        
        net = RegressionNet(input_size=[batch_size, 3, n_grid, n_grid] , output_size=1)
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        net = net.to(device)

        optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
        criterion = torch.nn.MSELoss()
        
        print('Training for fold {}...'.format(fold_idx + 1))
        
        for epoch in range(EPOCH):
            net.train()
            running_train_loss = 0.0
            with torch.set_grad_enabled(True):
                for inputs, labels in train_loader:
                    inputs = inputs.view(-1, 3, 100, 100)  
                    inputs = inputs.to(device).float()
                    labels = labels.to(device).float()
                    optimizer.zero_grad()
                    outputs = net(inputs)
                    loss = criterion(outputs, labels)
                    running_train_loss += loss.item()
                    loss.backward()
                    optimizer.step()
            train_loss.append(running_train_loss / len(train_set))

            # Validation
            net.eval()
            running_valid_loss = 0.0
            with torch.set_grad_enabled(False):
                for inputs, labels in val_loader:
                    inputs = inputs.view(-1, 3, 100, 100)  
                    inputs = inputs.to(device).float()
                    labels = labels.to(device).float()
                    outputs = net(inputs)
                    running_valid_loss += loss.item()

            valid_loss.append(running_valid_loss / len(val_set))

            print('#epoch:{}\ttrain loss: {}\tvalid loss: {}'.format(epoch, running_train_loss / len(train_loss), running_valid_loss / len(valid_loss)))
        
        net.eval()
        train_target, train_output = [], []
        val_target, val_output = [], []
        with torch.set_grad_enabled(False):
            for inputs, labels in train_loader:
                inputs = inputs.view(-1, 3, 100, 100)  
                inputs = inputs.to(device).float()
                labels = labels.to(device).float()
                outputs = net(inputs)
                train_target.append(labels)
                train_output.append(outputs)
                
            for inputs, labels in train_loader:
                inputs = inputs.view(-1, 3, 100, 100)  
                inputs = inputs.to(device).float()
                labels = labels.to(device).float()
                outputs = net(inputs)
                val_target.append(labels)
                val_output.append(outputs)
        
        flat_train_target = [item.cpu().numpy() for sublist in train_target for item in sublist]
        flat_train_target = np.vstack(flat_train_target)
        flat_train_output = [item.cpu().numpy() for sublist in train_output for item in sublist]
        flat_train_output = np.vstack(flat_train_output)
        flat_val_target = [item.cpu().numpy() for sublist in val_target for item in sublist]
        flat_val_target = np.vstack(flat_val_target)
        flat_val_output = [item.cpu().numpy() for sublist in val_output for item in sublist]
        flat_val_output = np.vstack(flat_val_output)
        train_target_fold.append(flat_train_target)
        train_output_fold.append(flat_train_output)
        val_target_fold.append(flat_val_target)
        val_output_fold.append(flat_val_output)
        valid_loss_per_fold.append(np.mean(valid_loss) / fold.n_splits)
        training_loss_per_fold.append(np.mean(train_loss) / fold.n_splits)
        fold_idx += 1
    

    return training_loss_per_fold, valid_loss_per_fold, train_target_fold, train_output_fold, val_target_fold, val_output_fold

Is there something wrong with my model architecture or train function?