ResNet152 stops after a certain time of training

Suryanshg · December 13, 2021, 4:39am

Hello,

I am currently trying to apply ResNet152 on my dataset. It is a regression problem I am trying to solve and thus, I have modified the fc layer to have only 1 output. I have also disabled training on all layers (except the last one).

For my project, I am applying ResNet152 with Leave-one-subject-out Cross Validation. After some folds, it stops doing anything.

This is some part of my code:

# Function for Training
def train(model, dataloader, epochs=10):
    """Fine-tune only the final ``fc`` layer of ``model`` using MSE loss.

    Every parameter is frozen except those of ``model.fc``, which are
    optimised with plain SGD (lr=0.0001). The per-epoch loss printed is the
    sample-weighted mean of the batch losses.

    Args:
        model: Network exposing an ``fc`` sub-module.
        dataloader: Iterable of ``(X, y)`` batches that also exposes
            ``dataset`` and supports ``len()``.
        epochs: Number of passes over ``dataloader``.

    Returns:
        The same ``model`` object, updated in place.
    """
    # Disabling the training on all layers except the last one
    for param in model.parameters():
        param.requires_grad = False

    for param in model.fc.parameters():
        param.requires_grad = True

    loss_fn = nn.MSELoss()
    # Only the unfrozen head needs to be tracked by the optimizer.
    optimizer = torch.optim.SGD(model.fc.parameters(), lr=0.0001)

    print("Training\n-----------------------")

    # len(dataloader) is the real number of batches; the previous
    # int(n / batch_size) + 1 over-counted by one whenever n was an exact
    # multiple of the batch size.
    num_batches = len(dataloader)
    num_samples = len(dataloader.dataset)

    for epoch in range(epochs):

        running_loss = 0.0

        for X, y in tqdm(dataloader, total=num_batches):
            X, y = X.to(device), y.to(device)

            # Compute prediction and its error
            pred = model(X).reshape(-1)
            loss = loss_fn(pred.float(), y.float())

            # Do Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # loss is a batch mean, so weight it by the batch size to get a
            # correct per-sample average even when the last batch is short.
            running_loss += loss.item() * X.size(0)

        epoch_loss = running_loss / num_samples
        print(f"Epoch {epoch + 1}, Loss:{epoch_loss}")

    return model

def LOSOCV():
    """Run leave-one-subject-out cross validation and print the metrics.

    Reads the module-level ``images``, ``PSPI``, ``subject_ids`` and
    ``resnet``. For every held-out subject a fresh copy of ``resnet`` is
    trained for 5 epochs on the remaining subjects and evaluated on the
    held-out one. Pearson correlation, RMSE and the standard deviation of
    the predictions are printed per fold, followed by their averages.
    """
    # Local import: only needed here to clone the base model for each fold.
    import copy

    print("Leave-one-subject-out cross validation:")
    print("----------------------------------")

    logo = LeaveOneGroupOut()

    sum_of_rmse = 0
    sum_of_pearson_corr = 0
    sum_of_std_dev = 0
    count = 0      # folds whose Pearson correlation is defined (not NaN)
    n_folds = 0    # total folds actually run

    for train_idx, test_idx in logo.split(images, PSPI, subject_ids):
        n_folds += 1

        print("Training Samples:",set(subject_ids[train_idx]))
        print("Testing Samples:",set(subject_ids[test_idx]))

        start_time = time.time()

        X_train = images[train_idx]
        y_train = PSPI[train_idx]
        X_test = images[test_idx]
        y_test = PSPI[test_idx]

        end_time = time.time()

        print("Time elapsed for fetching training and test data", end_time - start_time)

        # Initialize the Dataloaders
        training_dataset = MyDataset(X_train, y_train)
        test_dataset = MyDataset(X_test, y_test)

        training_data_loader = DataLoader(training_dataset, batch_size=64)
        test_data_loader = DataLoader(test_dataset, batch_size=64)

        # Training and predicting using the model.
        # Each fold starts from an untouched copy of the base model:
        #  * assigning to ``resnet`` here would make it a local name and the
        #    read on the same line would raise UnboundLocalError;
        #  * reusing one model across folds would leak the held-out subject
        #    of later folds into training.
        fold_model = train(copy.deepcopy(resnet), training_data_loader, 5)
        yhat_test = predict(fold_model, test_data_loader)

        # Calculating the metrics
        testing_corr, _ = pearsonr(yhat_test, y_test)
        rmse = mean_squared_error(yhat_test, y_test, squared=False)
        std_dev = np.std(yhat_test)
        print("Peason Correlation:",testing_corr)
        print("RMSE:", rmse)
        print("Standard Deviation:", std_dev)

        if not np.isnan(testing_corr):
            count += 1
            sum_of_pearson_corr += testing_corr
        sum_of_rmse += rmse
        sum_of_std_dev += std_dev

        print()

        # Release this fold's data and model before the next fold builds its
        # own, so two folds' worth of training data are never alive at once.
        # NOTE(review): assumes images[train_idx] yields a copy (e.g. NumPy
        # index-array selection) - confirm against how ``images`` is stored.
        del training_data_loader, test_data_loader
        del training_dataset, test_dataset
        del X_train, y_train, X_test, y_test
        del fold_model

    # Average over the folds that actually ran rather than a hard-coded 25.
    average_corr = sum_of_pearson_corr / count if count else float("nan")

    print()
    print("Average Pearson Correlation:", average_corr)
    print("Average RMSE:", sum_of_rmse / n_folds)
    print("Average Standard Deviation:", sum_of_std_dev / n_folds)

My code pauses in the LOSOCV function after printing out the Training and Testing samples.
Currently, fetching the training and testing sets by index takes around 200–300 seconds per fold, and after 2 folds the code just goes idle and doesn’t do anything.

Any help will be appreciated.