Hello! I keep getting an error when attempting to save my CNN model. I do not have this problem for smaller images like the 32x32 cifar dataset; however, my images are 448x672 (note: a multiple of 224). I am using the model for a regression task. Any help would be much appreciated!!
python 3.7.5
pytorch 1.6.0
Anaconda
Here is my model:
class Network_CNN_batchNorm(nn.Module):
def __init__(self):
super(Network_CNN_batchNorm, self).__init__()
# 3x448x672 input image (RGB)
self.layer1 = nn.Sequential(
# input is 3 channels (RGB) - first parameter
# 64 filters of kernel size 5x5; padding = kernel_size/2 - 1;
nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
# max pooling with stride=2 makes output image 224x336
nn.MaxPool2d(kernel_size=2, stride=2),
nn.BatchNorm2d(64))
self.layer2 = nn.Sequential(
# 2nd layer uses 128 channels (filters) of 3x3
nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
nn.ReLU(),
nn.BatchNorm2d(128))
# 3rd layer uses 128 channels (filters) of 3x3
# output feature map is still 224x336
self.layer3 = nn.Sequential(
nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
nn.ReLU(),
nn.BatchNorm2d(128))
# Average Pooling Layer, 112x168 output
self.avgP1 = nn.AvgPool2d(kernel_size=3, stride=2, padding=1)
# Fully connected layers
self.fc1 = nn.Linear(112 * 168 * 128, 1000)
self.fc2 = nn.Linear(1000, 10) # 10 outputs
def forward(self, x):
out = self.layer1(x)
out = self.layer2(out)
out = self.layer3(out)
out = self.avgP1(out)
out = out.reshape(out.size(0), -1) # flatten
out = self.fc1(out)
out = self.fc2(out)
return out
Note that I can see my training and validation loss decrease over multiple epochs so training the model appears to be fine. However, I do notice that when training on my CPU the memory usage is around 30-40Gb which seems excessive.
The code for saving the model is shown, and I can confirm that the path is OK since it works with smaller image sizes.
torch.save(model.state_dict(), os.path.join(Model_Path, 'epoch-{}.pth'.format(epoch)))
The error I am getting is as follows:
File "C:\my.py", line 526, in <module>
model_trained, t_loss, v_loss = train_model(model, criterion, optimizer, trainloader, testloader, num_epochs)
File "C:\my.py", line 356, in train_model
torch.save(model.state_dict(), os.path.join(Model_Path, 'epoch-{}.pth'.format(epoch)))
File "C:\Users\...\anaconda3\envs\TF2.0\lib\site-packages\torch\serialization.py", line 364, in save
_save(obj, opened_zipfile, pickle_module, pickle_protocol)
File "C:\Users\...\anaconda3\envs\TF2.0\lib\site-packages\torch\serialization.py", line 477, in _save
zip_file.write_record(name, storage.data_ptr(), num_bytes)
TypeError: write_record(): incompatible function arguments. The following argument types are supported:
1. (self: torch._C.PyTorchFileWriter, arg0: str, arg1: str, arg2: int) -> None
2. (self: torch._C.PyTorchFileWriter, arg0: str, arg1: int, arg2: int) -> None
Invoked with: <torch._C.PyTorchFileWriter object at 0x0000026AD2154D30>, 'data/2657683100064', 2657910136960, -7546077184