Hello, I am attempting to save out my best model using minimum validation loss via a custom method ModelCheckpoint. I think this is a folder access issue but I have run the ModelCheckpoint algorithm many times in the past and it worked so it may be a PyTorch issue. For some reason there is permission to access the file for the first two iterations but on the third it seems to fail to write the .pt file to my logs? Below is the code:
Unique log path:
def generate_unique_logpath(logdir, raw_run_name):
i = 0
while(True):
run_name = raw_run_name + "_" + str(i)
log_path = os.path.join(logdir, run_name)
if not os.path.isdir(log_path):
return log_path
i = i + 1
top_logdir = r"C:\Users\Daniel\OneDrive\Documents\Neural Networks Hw\Best Models Project\logs"
if not os.path.exists(top_logdir):
os.mkdir(top_logdir)
ModelCheckpoint:
class ModelCheckpoint:
def __init__(self, filepath, model):
self.min_loss = None
self.filepath = filepath
self.model = model
def update(self, loss):
if (self.min_loss is None) or (loss < self.min_loss):
# print("Saving a better model")
torch.save(self.model.state_dict(), self.filepath)
#torch.save(self.model, self.filepath)
self.min_loss = loss
Usage:
if TESTING:
#Hyper-Parameter sets
learning_rates = np.array([1.0, 0.1, 0.01, 0.001])
hidden_sizes = np.array([5, 10, 20])
EPOCHS = 40000 # just one since we are using a stopping criteria
#output storage lists
posttrain_loss_storage = torch.zeros(len(learning_rates), len(hidden_sizes), dtype = torch.float32)
train_confusion_matrix_storage = np.empty((4,3,3,3), dtype = np.int_)
test_confusion_matrix_storage = np.empty((4,3,3,3), dtype = np.int_)
lr_count = 0
hs_cound = 0
for i in learning_rates:
hs_count = 0
for j in hidden_sizes:
string_path = "MLP_1HL" + str(j) + "_Adam" + str(i)
logdir = generate_unique_logpath(top_logdir, string_path)
print("Logging to {}".format(logdir))
# -> Prints out Logging to ./logs/linear_0
if not os.path.exists(logdir):
os.mkdir(logdir)
num_hidden = j
model = Network(j)
# stopping_criteria = StopCriteria(25)
#sending model to cuda
model.cuda()
#X_train.to(device)
criterion = nn.CrossEntropyLoss() #cross-entropy loss
optimizer = torch.optim.SGD(model.parameters(), lr = i) # implementing momentum for learning rate
#Showing test set loss pre-training
print("-----------------------------------------------------------------")
print("Learning Rate: " + str(i) + "; " + "Hidden Layer PE: " + str(j))
model.eval()
y_pred = model(X_test)
before_train = criterion(y_pred.squeeze(), torch.max(y_test, 1)[1])
print("Test loss pre training: " + str(before_train.item()))
print()
model_checkpoint = ModelCheckpoint(logdir + "/best_model.pt", model)
# Training model
for epoch in range(EPOCHS):
optimizer.zero_grad()
output = model.forward(X_train)
loss = criterion(output.squeeze(), torch.max(y_train, 1)[1])
# if epoch == 0:
# print('Epoch: {}; before training loss: {}'.format(epoch,loss.item()))
#implementing stopping criteria
val_output = model.forward(X_val)
val_loss = criterion(val_output.squeeze(), torch.max(y_val, 1)[1])
model_checkpoint.update(val_loss)
# if stopping_criteria.step(val_loss):
# print('Epoch: {}; after train loss: {}'.format(epoch,loss.item()))
# print()
# break
#printing epoch and loss
if epoch % 4999 == 0:
print('Epoch: {} train loss: {}'.format(epoch,loss.item()))
#backpropagation
loss.backward()
optimizer.step()
# evaluating the model and storing relevant information
posttrain_loss_storage[lr_count, hs_count] = loss
model_path = logdir + "/best_model.pt"
model = Network(j)
model = model.cuda()
model.load_state_dict(torch.load(model_path))
# Switch to eval mode
model.eval()
test_pred = model(X_test)
test_loss = criterion(test_pred.squeeze(), torch.max(y_test, 1)[1])
test_CM = ConfusionMatrix(test_pred, y_test)
test_acc = Accuracy(test_CM.float())
# test_loss, test_acc, confusion_M = test(model, test_loader)
print()
print(" Test: Loss : {:.4f}, Acc : {:.4f}".format(test_loss, test_acc))
print()
print("Test Confusion Matrix: \n" + str(test_CM))
print()
print("-----------------------------------------------------------------")
#train_confusion_matrix_storage[lr_count,hs_count,:,:] = train_CM
#test_confusion_matrix_storage[lr_count,hs_count,:,:] = test_CM
hs_count += 1
lr_count += 1
print()
print("DONE TESTING HYPERPARAMETERS")
Output and Error:
Logging to C:\Users\Daniel\OneDrive\Documents\Neural Networks Hw\Best Models Project\logs\MLP_1HL5_Adam1.0_30
-----------------------------------------------------------------
Learning Rate: 1.0; Hidden Layer PE: 5
Test loss pre training: 0.69432133436203
Epoch: 0 train loss: 0.6944330930709839
Epoch: 4999 train loss: 0.5665101408958435
Epoch: 9998 train loss: 0.543997585773468
Epoch: 14997 train loss: 0.5248210430145264
Epoch: 19996 train loss: 0.5120164752006531
Epoch: 24995 train loss: 0.5025898814201355
Epoch: 29994 train loss: 0.49639782309532166
Epoch: 34993 train loss: 0.4912642538547516
Epoch: 39992 train loss: 0.4892112910747528
Test: Loss : 0.6203, Acc : 0.6729
Test Confusion Matrix:
tensor([[65, 36],
[34, 79]])
-----------------------------------------------------------------
Logging to C:\Users\Daniel\OneDrive\Documents\Neural Networks Hw\Best Models Project\logs\MLP_1HL10_Adam1.0_12
-----------------------------------------------------------------
Learning Rate: 1.0; Hidden Layer PE: 10
Test loss pre training: 0.6900061368942261
Epoch: 0 train loss: 0.6919026970863342
Epoch: 4999 train loss: 0.5735248923301697
Epoch: 9998 train loss: 0.5334906578063965
Epoch: 14997 train loss: 0.5089364647865295
Epoch: 19996 train loss: 0.48545464873313904
Epoch: 24995 train loss: 0.4619404971599579
Epoch: 29994 train loss: 0.4471399486064911
Epoch: 34993 train loss: 0.43656283617019653
Epoch: 39992 train loss: 0.4294586479663849
Test: Loss : 0.6900, Acc : 0.5374
Test Confusion Matrix:
tensor([[ 0, 0],
[ 99, 115]])
-----------------------------------------------------------------
Logging to C:\Users\Daniel\OneDrive\Documents\Neural Networks Hw\Best Models Project\logs\MLP_1HL20_Adam1.0_11
-----------------------------------------------------------------
Learning Rate: 1.0; Hidden Layer PE: 20
Test loss pre training: 0.6932849884033203
Epoch: 0 train loss: 0.6937627196311951
---------------------------------------------------------------------------
PermissionError Traceback (most recent call last)
<ipython-input-8-4d76ab467289> in <module>
58 val_output = model.forward(X_val)
59 val_loss = criterion(val_output.squeeze(), torch.max(y_val, 1)[1])
---> 60 model_checkpoint.update(val_loss)
61
62 # if stopping_criteria.step(val_loss):
<ipython-input-4-4ca0ec529d66> in update(self, loss)
9 if (self.min_loss is None) or (loss < self.min_loss):
10 # print("Saving a better model")
---> 11 torch.save(self.model.state_dict(), self.filepath)
12 #torch.save(self.model, self.filepath)
13 self.min_loss = loss
~\anaconda3\envs\pytorch\lib\site-packages\torch\serialization.py in save(obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization)
325 return
326
--> 327 with _open_file_like(f, 'wb') as opened_file:
328 _legacy_save(obj, opened_file, pickle_module, pickle_protocol)
329
~\anaconda3\envs\pytorch\lib\site-packages\torch\serialization.py in _open_file_like(name_or_buffer, mode)
210 def _open_file_like(name_or_buffer, mode):
211 if _is_path(name_or_buffer):
--> 212 return _open_file(name_or_buffer, mode)
213 else:
214 if 'w' in mode:
~\anaconda3\envs\pytorch\lib\site-packages\torch\serialization.py in __init__(self, name, mode)
191 class _open_file(_opener):
192 def __init__(self, name, mode):
--> 193 super(_open_file, self).__init__(open(name, mode))
194
195 def __exit__(self, *args):
PermissionError: [Errno 13] Permission denied: 'C:\\Users\\Daniel\\OneDrive\\Documents\\Neural Networks Hw\\Best Models Project\\logs\\MLP_1HL20_Adam1.0_11/best_model.pt'