Loss curve changes drastically for the same training data:

I am trying to do 10-fold CV with a DNN, but the loss curve changes drastically depending on how I run it. When the 10-fold CV loop runs inside the fit method of the nn.Module subclass, the loss curve has large fluctuations. But when I run a for loop over the KFold object and fit the training data of each fold separately, the loss curve is smooth and converges quickly.
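
Schematically, the two setups look roughly like this (a minimal sketch; Model, kf, X_hc and y_hc are defined in the full code below, and the same data is used in both cases):

# Setup 1: the 10-fold loop runs inside Model.fit (tune_loss=True)
net = Model(min_loss=.9, l_rate=1e-3, max_epochs=10000)
net.fit(X_hc, y_hc, tune_loss=True)   # fit() iterates over kf.split() internally

# Setup 2: the 10-fold loop is outside, fit() is called once per fold (tune_loss=False)
for train_index, test_index in kf.split(X_hc):
    net.fit(X_hc[train_index], y_hc[train_index], tune_loss=False)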

Below is my code:

from sklearn.model_selection import KFold

#tune model in 10-fold CV:
#n fold CV on hc data:
nfold = 10
seed = 111
kf = KFold(n_splits=nfold, shuffle = True, random_state=seed)
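
# Side note (illustrative check, not part of the pipeline): with shuffle=True and a
# fixed random_state, kf.split() returns identical folds on every call, so each fold
# sees exactly the same training data across runs. The 20 samples here are made up.
for fold, (train_idx, test_idx) in enumerate(kf.split(list(range(20)))):
    print('fold %d test indices: %s' % (fold, test_idx))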

############################################# DNN (pytorch)
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.autograd import Variable
from torch.optim.lr_scheduler import ReduceLROnPlateau

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
import matplotlib.pyplot as plt

torch.manual_seed(999) # reproducible

def smooth(y, box_pts):
    # moving-average smoothing: convolve with a box filter of width box_pts
    box = np.ones(box_pts)/box_pts
    y_smooth = np.convolve(y, box, mode='same')
    return y_smooth
def weights_init(m):
    if isinstance(m, nn.Linear):
        print('initialize weight...')
        nn.init.xavier_uniform_(m.weight.data)
class Model(nn.Module):

    def __init__(self, n_hidden1=400, n_hidden2=150, n_hidden3=50,
                 min_loss=.5, l_rate=.001, max_epochs=10000):

        super(Model, self).__init__()
        self.n_hidden1 = n_hidden1
        self.n_hidden2 = n_hidden2
        self.n_hidden3 = n_hidden3
        self.min_loss = min_loss
        self.l_rate = l_rate
        self.max_epochs = max_epochs

    def build_layer(self, input_dim):

        if input_dim < 100:
            print('4 layers .................')
            # MD and FA:
            n_hidden1 = 50
            n_hidden2 = 20
            n_hidden3 = 5

            self.layer1 = nn.Linear(input_dim, n_hidden1)
            self.layer2 = nn.Linear(n_hidden1, n_hidden2)
            self.layer3 = nn.Linear(n_hidden2, n_hidden3)
            self.layer4 = nn.Linear(n_hidden3, 1)
            #self.layer5 = nn.Linear(5, 1)

        elif input_dim < 150:
            print('5 layers .................')
            # gray matter volume
            n_hidden1 = 200
            n_hidden2 = 100
            n_hidden3 = 50

            self.layer1 = nn.Linear(input_dim, n_hidden1)
            self.layer2 = nn.Linear(n_hidden1, n_hidden2)
            self.layer3 = nn.Linear(n_hidden2, n_hidden3)
            self.layer4 = nn.Linear(n_hidden3, 20)
            self.layer5 = nn.Linear(20, 1)
            #self.layer6 = nn.Linear(5, 1)

        elif input_dim < 300:
            print('3 layers .................')
            # ALFF and ReHo
            n_hidden1 = 200
            n_hidden2 = 150
            n_hidden3 = 50

            self.layer1 = nn.Linear(input_dim, n_hidden1)
            self.layer2 = nn.Linear(n_hidden1, n_hidden2)
            self.layer3 = nn.Linear(n_hidden2, 1)
            #self.layer4 = nn.Linear(n_hidden3, 20)
            #self.layer5 = nn.Linear(20, 1)
            #self.layer6 = nn.Linear(5, 1)

        else:
            print('6 layers .................')
            self.layer1 = nn.Linear(input_dim, self.n_hidden1)
            self.layer2 = nn.Linear(self.n_hidden1, self.n_hidden2)
            self.layer3 = nn.Linear(self.n_hidden2, self.n_hidden3)
            self.layer4 = nn.Linear(self.n_hidden3, 30)
            self.layer5 = nn.Linear(30, 5)
            self.layer6 = nn.Linear(5, 1)

        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x, **kwargs):

        out = F.relu(self.layer1(x))  # activation function for hidden layer
        #out1 = self.sigmoid(self.layer1(x))
        #out1 = self.dropout(out1)

        #out2 = F.relu(self.layer2(out1))
        out = self.sigmoid(self.layer2(out))
        #out2 = self.dropout(out2)

        if x.shape[1] < 100:
            out = F.relu(self.layer3(out))
            #out = self.dropout(out)

            y_pred = self.layer4(out)

        elif x.shape[1] < 300:
            out = self.sigmoid(self.layer3(out))
            out = F.relu(self.layer4(out))
            y_pred = self.layer5(out)  # linear output

        else:
            out = self.sigmoid(self.layer3(out))
            out = self.sigmoid(self.layer4(out))
            out = F.relu(self.layer5(out))
            y_pred = self.layer6(out)  # linear output

        return y_pred

    def update_plot(self, ax, fig, epoch, loss_list, corr_list, loss_info):
        # ax, fig and epoch are created in fit() and passed in explicitly
        corr_list_s = smooth(corr_list, box_pts=100)
        ax.clear()
        ax.plot(range(epoch), loss_list, 'r--', range(epoch), corr_list*10, 'g--',
                range(epoch), corr_list_s*10, 'y--')
        plt.ylim(0, 30)
        plt.text(10, 30, loss_info, fontsize=10)
        fig.canvas.draw()

    def fit(self, X, y, max_loss=10, tune_loss=False):
        """
        If tune_loss:
        1. use 80% of the data as training set and 20% as test set to get the optimized number of epochs
        2. reset the model and use all the data to train the model.

        Note: the tuned loss may not work better for independent validation data, because in K-fold CV the training
        and validation sets are not independent (e.g. a larger mean value of the training set implies a smaller mean
        for the validation set). It is recommended to first use tune_loss to explore the best min_loss, and then run
        the model again with tune_loss=False.
        """
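
        # Intended usage per the note above (illustrative only):
        #   net.fit(X_hc, y_hc, tune_loss=True)    # explore a good min_loss via 10-fold CV
        #   net = Model(min_loss=<best value found>, l_rate=1e-3, max_epochs=10000)
        #   net.fit(X_hc, y_hc, tune_loss=False)   # retrain on all the data with that min_loss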

        min_loss = self.min_loss
        max_epochs = self.max_epochs

        #torch.manual_seed(999)    # reproducible
        self.build_layer(input_dim=X.shape[1])

        criterion = nn.MSELoss()  # Mean Squared Loss

        optimizer = torch.optim.SGD(self.parameters(),
                                    lr=self.l_rate,
                                    weight_decay=1e-3,
                                    momentum=0.9,
                                    dampening=0,
                                    nesterov=True)  # Stochastic Gradient Descent

        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=.5)

        if tune_loss == True:
            print('optimizing loss value ...')
            opt_loss_list = []

            for train_index, test_index in kf.split(X, y):
                #print("TRAIN:", train_index, "TEST:", test_index)
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                X_train_torch = Variable(torch.from_numpy(X_train))
                y_train_torch = Variable(torch.from_numpy(y_train))
                X_train_torch = X_train_torch.float()
                y_train_torch = y_train_torch.float()
                labels = y_train_torch.view(y_train.shape[0], 1)

                X_test_torch = Variable(torch.from_numpy(X_test))
                X_test_torch = X_test_torch.float()

                # Collect errors to evaluate performance
                loss_list = []
                corr_list = []

                torch.manual_seed(999)
                self.apply(weights_init)

                epoch = 0
                fig = plt.figure(figsize=(5, 5))
                ax = fig.add_subplot(111)
                plt.ion()
                fig.show()
                fig.canvas.draw()

                while True:

                    #increase the number of epochs by 1 every time
                    epoch += 1
                    #scheduler.step()
                    #clear grads
                    optimizer.zero_grad()
                    #forward to get predicted values
                    outputs = self(X_train_torch)  # model predict, outputs = net.forward(inputs)

                    loss = criterion(outputs, labels)
                    loss.backward()  # back prop
                    optimizer.step()  # update the parameters

                    y_prediction = self(X_test_torch)
                    y_prediction = y_prediction.detach().numpy().flatten()

                    corr = np.corrcoef(y_prediction, y_test)[0, 1]
                    corr_list = np.append(corr_list, corr)
                    loss_list = np.append(loss_list, loss.item())

                    max_corr = np.amax(corr_list)

                    if epoch % 100 == 0:
                        loss_info = 'epoch %d, loss %.4f, test cor %.4f' % (epoch, loss.item(), corr)
                        self.update_plot(ax, fig, epoch, loss_list, corr_list, loss_info)

                    if epoch > max_epochs or loss.item() < min_loss:
                        break

                # smoothed correlation curve, used to pick the epoch with the best test correlation
                corr_list_s = smooth(corr_list, box_pts=100)
                opt_epochs = corr_list_s.argmax()
                opt_loss = loss_list[opt_epochs]

                if opt_loss > max_loss:
                    opt_loss = np.nan

                opt_loss_list = np.append(opt_loss_list, opt_loss)

                print('stop at epochs: %d, loss %.4f, with test cor: %.4f' % \
                      (opt_epochs, opt_loss, corr_list[opt_epochs]))

                loss_info = 'epoch %d, loss %.4f, test cor %.4f' % (epoch, loss.item(), corr)
                opt_loss_info = 'opt_epoch %d, opt_loss %.4f, test cor %.4f' % (opt_epochs, opt_loss, corr_list[opt_epochs])
                ax.clear()
                ax.plot(range(epoch), loss_list, 'r--', range(epoch), corr_list*10, 'g--',
                        range(epoch), corr_list_s*10, 'y--')
                plt.ylim(0, 30)
                plt.text(10, 30, loss_info, fontsize=10)
                plt.text(10, 20, opt_loss_info, fontsize=10)
                ax.axvline(opt_epochs)
                fig.canvas.draw()

            opt_loss_mean = np.nanmean(opt_loss_list)
            print('boots finished with optimized loss: %.4f' % opt_loss_mean)

        else:
            torch.manual_seed(999)
            self.apply(weights_init)
            print('training with all training data:')
            opt_loss_mean = -9999

            X_train_torch = Variable(torch.from_numpy(X))
            y_train_torch = Variable(torch.from_numpy(y))
            X_train_torch = X_train_torch.float()
            y_train_torch = y_train_torch.float()

            labels = y_train_torch.view(y.shape[0], 1)
            loss_list = []
            epoch = 0

            fig = plt.figure(figsize=(8, 8))
            ax = fig.add_subplot(111)
            plt.ion()

            fig.show()
            fig.canvas.draw()

            while True:
                #increase the number of epochs by 1 every time
                epoch += 1

                #clear grads
                #scheduler.step()
                optimizer.zero_grad()
                #forward to get predicted values
                outputs = self(X_train_torch)  # model predict, outputs = net.forward(inputs)

                loss = criterion(outputs, labels)
                loss.backward()  # back prop
                optimizer.step()  # update the parameters

                loss_list = np.append(loss_list, loss.item())

                if epoch % 100 == 0:
                    loss_info = 'epoch %d, loss %.4f' % (epoch, loss.item())
                    ax.clear()
                    ax.plot(range(epoch), loss_list, 'r--')
                    plt.ylim(0, 30)
                    plt.text(10, 30, loss_info, fontsize=20)
                    fig.canvas.draw()

                # define the mean loss to prevent overfitting with bad data.
                if loss.item() < opt_loss_mean or loss.item() < min_loss or epoch > max_epochs:
                    print('stop with loss', loss.item())
                    break

    def predict(self, X_test):

        X_test_torch = Variable(torch.from_numpy(X_test))
        X_test_torch = X_test_torch.float()

        y_prediction = self(X_test_torch)
        y_prediction = y_prediction.detach().numpy().flatten()

        return y_prediction

If I run with tune_loss=True (the CV loop runs inside the fit method, with weights_init applied before each fold):

net = Model(min_loss = .9, l_rate = 1e-3, max_epochs = 10000)
%matplotlib notebook
net.fit(X_hc, y_hc, tune_loss = True)

I get a curve like this:
loss_curve

When I instead run fit inside the for loop over the 10-fold CV splits:

for i, (train_index, test_index) in enumerate(kf.split(X)):
    print('run_model on CV: %d' % i)

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    net.fit(X_train, y_train, tune_loss=False)

I get a curve like this:
curve_loss2

Could anyone explain why this happens?