Torch.save(model.state_dict(), filepath) Error [Errno 13] Permission denied

Danielr13 · December 7, 2020, 9:59pm

Hello, I am attempting to save out my best model using minimum validation loss via a custom method ModelCheckpoint. I think this is a folder access issue but I have run the ModelCheckpoint algorithm many times in the past and it worked so it may be a PyTorch issue. For some reason there is permission to access the file for the first two iterations but on the third it seems to fail to write the .pt file to my logs? Below is the code:

Unique log path:

def generate_unique_logpath(logdir, raw_run_name):
    i = 0
    while(True):
        run_name = raw_run_name + "_" + str(i)
        log_path = os.path.join(logdir, run_name)
        if not os.path.isdir(log_path):
            return log_path
        i = i + 1
        
top_logdir = r"C:\Users\Daniel\OneDrive\Documents\Neural Networks Hw\Best Models Project\logs"
if not os.path.exists(top_logdir):
    os.mkdir(top_logdir)

ModelCheckpoint:

class ModelCheckpoint:

    def __init__(self, filepath, model):
        self.min_loss = None
        self.filepath = filepath
        self.model = model

    def update(self, loss):
        if (self.min_loss is None) or (loss < self.min_loss):
#             print("Saving a better model")
            torch.save(self.model.state_dict(), self.filepath)
            #torch.save(self.model, self.filepath)
            self.min_loss = loss

Usage:

if TESTING:
    #Hyper-Parameter sets
    learning_rates = np.array([1.0, 0.1, 0.01, 0.001])
    hidden_sizes = np.array([5, 10, 20])
    EPOCHS = 40000 # just one since we are using a stopping criteria
    
    #output storage lists
    posttrain_loss_storage = torch.zeros(len(learning_rates), len(hidden_sizes), dtype = torch.float32)
    train_confusion_matrix_storage = np.empty((4,3,3,3), dtype = np.int_)
    test_confusion_matrix_storage = np.empty((4,3,3,3), dtype = np.int_)
    
    lr_count = 0
    hs_cound = 0
    for i in learning_rates:
        hs_count = 0
        for j in hidden_sizes:
            string_path = "MLP_1HL" + str(j) + "_Adam" + str(i)
            logdir = generate_unique_logpath(top_logdir, string_path)
            print("Logging to {}".format(logdir))
            # -> Prints out     Logging to   ./logs/linear_0
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            
            
            num_hidden = j
            model = Network(j)
#             stopping_criteria = StopCriteria(25)

            #sending model to cuda
            model.cuda()
            #X_train.to(device)

            criterion = nn.CrossEntropyLoss() #cross-entropy loss
            optimizer = torch.optim.SGD(model.parameters(), lr = i) # implementing momentum for learning rate

            #Showing test set loss pre-training
            print("-----------------------------------------------------------------")
            print("Learning Rate: " + str(i) + "; " + "Hidden Layer PE: " + str(j))
            
            model.eval()
            y_pred = model(X_test)
            before_train = criterion(y_pred.squeeze(), torch.max(y_test, 1)[1])
            print("Test loss pre training: " + str(before_train.item()))
            print()
            
            model_checkpoint = ModelCheckpoint(logdir + "/best_model.pt", model)
            
            # Training model
            for epoch in range(EPOCHS):
                optimizer.zero_grad()
                output = model.forward(X_train)
                loss = criterion(output.squeeze(), torch.max(y_train, 1)[1]) 
                
#                 if epoch == 0:
#                     print('Epoch: {}; before training loss: {}'.format(epoch,loss.item()))

                #implementing stopping criteria
                val_output = model.forward(X_val)
                val_loss = criterion(val_output.squeeze(), torch.max(y_val, 1)[1])
                model_checkpoint.update(val_loss)
                
#                 if stopping_criteria.step(val_loss):
#                     print('Epoch: {}; after train loss: {}'.format(epoch,loss.item()))
#                     print()
#                     break

               
                #printing epoch and loss 
                if epoch % 4999 == 0:
                    print('Epoch: {} train loss: {}'.format(epoch,loss.item()))
               

                #backpropagation
                loss.backward()
                optimizer.step()
            
            # evaluating the model and storing relevant information
            posttrain_loss_storage[lr_count, hs_count] = loss
            
            model_path = logdir + "/best_model.pt"
            model = Network(j)

            model = model.cuda()

            model.load_state_dict(torch.load(model_path))

            # Switch to eval mode 
            model.eval()
            test_pred = model(X_test)
            test_loss = criterion(test_pred.squeeze(), torch.max(y_test, 1)[1])
            test_CM = ConfusionMatrix(test_pred, y_test)
            test_acc = Accuracy(test_CM.float())
            # test_loss, test_acc, confusion_M = test(model, test_loader)
            print()
            print(" Test: Loss : {:.4f}, Acc : {:.4f}".format(test_loss, test_acc))
            print()
            print("Test Confusion Matrix: \n" + str(test_CM))
            print()
            print("-----------------------------------------------------------------")
            
            #train_confusion_matrix_storage[lr_count,hs_count,:,:] = train_CM
            #test_confusion_matrix_storage[lr_count,hs_count,:,:] = test_CM
            
            hs_count += 1
        lr_count += 1
    
    print()
    print("DONE TESTING HYPERPARAMETERS")

Output and Error:

Logging to C:\Users\Daniel\OneDrive\Documents\Neural Networks Hw\Best Models Project\logs\MLP_1HL5_Adam1.0_30
-----------------------------------------------------------------
Learning Rate: 1.0; Hidden Layer PE: 5
Test loss pre training: 0.69432133436203

Epoch: 0 train loss: 0.6944330930709839
Epoch: 4999 train loss: 0.5665101408958435
Epoch: 9998 train loss: 0.543997585773468
Epoch: 14997 train loss: 0.5248210430145264
Epoch: 19996 train loss: 0.5120164752006531
Epoch: 24995 train loss: 0.5025898814201355
Epoch: 29994 train loss: 0.49639782309532166
Epoch: 34993 train loss: 0.4912642538547516
Epoch: 39992 train loss: 0.4892112910747528

 Test: Loss : 0.6203, Acc : 0.6729

Test Confusion Matrix: 
tensor([[65, 36],
        [34, 79]])

-----------------------------------------------------------------
Logging to C:\Users\Daniel\OneDrive\Documents\Neural Networks Hw\Best Models Project\logs\MLP_1HL10_Adam1.0_12
-----------------------------------------------------------------
Learning Rate: 1.0; Hidden Layer PE: 10
Test loss pre training: 0.6900061368942261

Epoch: 0 train loss: 0.6919026970863342
Epoch: 4999 train loss: 0.5735248923301697
Epoch: 9998 train loss: 0.5334906578063965
Epoch: 14997 train loss: 0.5089364647865295
Epoch: 19996 train loss: 0.48545464873313904
Epoch: 24995 train loss: 0.4619404971599579
Epoch: 29994 train loss: 0.4471399486064911
Epoch: 34993 train loss: 0.43656283617019653
Epoch: 39992 train loss: 0.4294586479663849

 Test: Loss : 0.6900, Acc : 0.5374

Test Confusion Matrix: 
tensor([[  0,   0],
        [ 99, 115]])

-----------------------------------------------------------------
Logging to C:\Users\Daniel\OneDrive\Documents\Neural Networks Hw\Best Models Project\logs\MLP_1HL20_Adam1.0_11
-----------------------------------------------------------------
Learning Rate: 1.0; Hidden Layer PE: 20
Test loss pre training: 0.6932849884033203

Epoch: 0 train loss: 0.6937627196311951
---------------------------------------------------------------------------
PermissionError                           Traceback (most recent call last)
<ipython-input-8-4d76ab467289> in <module>
     58                 val_output = model.forward(X_val)
     59                 val_loss = criterion(val_output.squeeze(), torch.max(y_val, 1)[1])
---> 60                 model_checkpoint.update(val_loss)
     61 
     62 #                 if stopping_criteria.step(val_loss):

<ipython-input-4-4ca0ec529d66> in update(self, loss)
      9         if (self.min_loss is None) or (loss < self.min_loss):
     10 #             print("Saving a better model")
---> 11             torch.save(self.model.state_dict(), self.filepath)
     12             #torch.save(self.model, self.filepath)
     13             self.min_loss = loss

~\anaconda3\envs\pytorch\lib\site-packages\torch\serialization.py in save(obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization)
    325             return
    326 
--> 327     with _open_file_like(f, 'wb') as opened_file:
    328         _legacy_save(obj, opened_file, pickle_module, pickle_protocol)
    329 

~\anaconda3\envs\pytorch\lib\site-packages\torch\serialization.py in _open_file_like(name_or_buffer, mode)
    210 def _open_file_like(name_or_buffer, mode):
    211     if _is_path(name_or_buffer):
--> 212         return _open_file(name_or_buffer, mode)
    213     else:
    214         if 'w' in mode:

~\anaconda3\envs\pytorch\lib\site-packages\torch\serialization.py in __init__(self, name, mode)
    191 class _open_file(_opener):
    192     def __init__(self, name, mode):
--> 193         super(_open_file, self).__init__(open(name, mode))
    194 
    195     def __exit__(self, *args):

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\Daniel\\OneDrive\\Documents\\Neural Networks Hw\\Best Models Project\\logs\\MLP_1HL20_Adam1.0_11/best_model.pt'

Danielr13 · December 7, 2020, 10:40pm

Solved: It seemed to be an issue with having my logs folder in OneDrive. I made the logs file local to my PC and it ran fine.