hi,
I am trying to train a ResNet on my own dataset. The images are not a fixed size and I don't want to resize them, so I changed BatchNorm to GroupNorm in the ResNet code, like this:
class Bottleneck(nn.Module):
    """ResNet bottleneck block that normalises with ``nn.GroupNorm``.

    The constructor wires up three convolutions (built by the ``conv1x1`` /
    ``conv3x3`` helpers defined elsewhere in this file), each paired with a
    GroupNorm layer, plus a shared in-place ReLU.

    Args:
        inplanes: channel count passed to ``conv1`` as its first argument.
        planes: base channel count of the block; ``conv3`` is built with
            ``planes * expansion`` as its second argument.
        stride: forwarded to ``conv2`` and stored on ``self.stride``.
        downsample: optional module stored on ``self.downsample``.
    """

    # Multiplier applied to `planes` for conv3 / bn3.
    expansion = 4
    # Number of groups used by every GroupNorm layer in the block.
    # nn.GroupNorm requires the channel count to be divisible by this value.
    num_groups = 32

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        out_planes = planes * self.expansion

        # The `bn*` attribute names are kept (even though the layers are
        # GroupNorm) so existing code and checkpoints that reference them
        # by name keep working.
        self.conv1 = conv1x1(inplanes, planes)
        self.bn1 = nn.GroupNorm(self.num_groups, planes)
        self.conv2 = conv3x3(planes, planes, stride)
        self.bn2 = nn.GroupNorm(self.num_groups, planes)
        self.conv3 = conv1x1(planes, out_planes)
        # Use the same channel expression as conv3 instead of a literal
        # `planes * 4`, so the two cannot drift apart if `expansion` changes.
        self.bn3 = nn.GroupNorm(self.num_groups, out_planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride
And here is the training and validation code:
def train_model(model, dataloaders, criterion, optimizer, scheduler, num_epochs=25):
    """Train and validate ``model`` for ``num_epochs`` epochs and return it.

    Every epoch runs a ``'train'`` phase followed by a ``'val'`` phase over
    ``dataloaders[phase]`` and prints the phase's average loss and accuracy.
    After the validation phase of every fifth epoch the model's
    ``state_dict`` is written to ``./checkpoint/``.

    Args:
        model: network being optimised; invoked as ``model(inputs)``.
        dataloaders: mapping with ``'train'`` and ``'val'`` entries. Each
            entry yields ``(inputs, labels, path)`` batches and exposes a
            ``.dataset`` attribute supporting ``len()``.
        criterion: loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: optimizer that is zeroed and stepped once per batch.
        scheduler: learning-rate scheduler, stepped once per epoch.
        num_epochs: number of epochs to run.

    Returns:
        The same ``model`` object that was passed in.
    """
    import os  # local import: only needed to create the checkpoint directory

    checkpoint_dir = './checkpoint'
    since = time.time()

    for epoch in range(num_epochs):
        print('\nEpoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            running_loss = 0.0
            # Plain Python int, so the accuracy computation below also works
            # when the loader yields no batches.
            running_corrects = 0

            # Iterate over data; `count` is the 1-based batch index that is
            # printed as a progress marker.
            for count, (inputs, labels, path) in enumerate(dataloaders[phase], start=1):
                print(count)
                # `device` is a module-level name defined elsewhere in the file.
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward; track history only in the training phase
                # NOTE(review): inputs are presumably variable-sized images, so
                # activation memory would differ per batch -- confirm against
                # the dataset when diagnosing out-of-memory errors.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)

                    # backward + optimize only if in training phase
                    if is_train:
                        loss.backward()
                        optimizer.step()

                # statistics (converted to Python numbers so no tensors are
                # kept alive across iterations)
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels).item()

            if is_train:
                # Step the LR schedule after the epoch's optimizer updates;
                # stepping it before optimizer.step() skips the first value
                # of the schedule on PyTorch >= 1.1.
                scheduler.step()

            dataset_size = len(dataloaders[phase].dataset)
            epoch_loss = running_loss / dataset_size
            epoch_acc = running_corrects / dataset_size
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            if phase == 'val' and (epoch + 1) % 5 == 0:
                print("Saving model.....")
                # torch.save does not create missing directories.
                os.makedirs(checkpoint_dir, exist_ok=True)
                torch.save(model.state_dict(), './checkpoint/resnet101-GN-' + str(epoch + 1) + '.pth')
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    return model
The problem is that when I train on the whole dataset (80 classes, about 40,000 images), I get an OOM error somewhere during training, like this (note that the batch size is always 1, and each number below is the index of the image being trained on):
Epoch 0/44
----------
1
2
.
.
.
58
59
Traceback (most recent call last):
File "/home/xsl/code/ResNet50/ModifyBatchLayer/ResnetWithGNTrainVal.py", line 249, in <module>
model_ft = train_model(model_ft, dataloaders, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=num_epochs)
File "/home/xsl/code/ResNet50/ModifyBatchLayer/ResnetWithGNTrainVal.py", line 159, in train_model
outputs = model(inputs)
File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
result = self.forward(*input, **kwargs)
File "/home/xsl/code/ResNet50/ModifyBatchLayer/ResnetWithGN.py", line 166, in forward
x = self.layer2(x)
File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
result = self.forward(*input, **kwargs)
File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/container.py", line 92, in forward
input = module(input)
File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
result = self.forward(*input, **kwargs)
File "/home/xsl/code/ResNet50/ModifyBatchLayer/ResnetWithGN.py", line 85, in forward
out = self.bn1(out)
File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
result = self.forward(*input, **kwargs)
File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/normalization.py", line 233, in forward
input, self.num_groups, self.weight, self.bias, self.eps)
File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/functional.py", line 1662, in group_norm
torch.backends.cudnn.enabled)
RuntimeError: CUDA out of memory. Tried to allocate 46.12 MiB (GPU 1; 10.92 GiB total capacity; 9.94 GiB already allocated; 23.50 MiB free; 319.47 MiB cached)
And sometimes it crashes like this:
Epoch 0/44
----------
1
2
3
4
5
Traceback (most recent call last):
File "/home/xsl/code/ResNet50/ModifyBatchLayer/ResnetWithGNTrainVal.py", line 249, in <module>
model_ft = train_model(model_ft, dataloaders, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=num_epochs)
File "/home/xsl/code/ResNet50/ModifyBatchLayer/ResnetWithGNTrainVal.py", line 159, in train_model
outputs = model(inputs)
File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
result = self.forward(*input, **kwargs)
File "/home/xsl/code/ResNet50/ModifyBatchLayer/ResnetWithGN.py", line 167, in forward
x = self.layer3(x)
File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
result = self.forward(*input, **kwargs)
File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/container.py", line 92, in forward
input = module(input)
File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
result = self.forward(*input, **kwargs)
File "/home/xsl/code/ResNet50/ModifyBatchLayer/ResnetWithGN.py", line 93, in forward
out = self.bn3(out)
File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
result = self.forward(*input, **kwargs)
File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/normalization.py", line 233, in forward
input, self.num_groups, self.weight, self.bias, self.eps)
File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/functional.py", line 1662, in group_norm
torch.backends.cudnn.enabled)
RuntimeError: CUDA out of memory. Tried to allocate 53.62 MiB (GPU 1; 10.92 GiB total capacity; 10.07 GiB already allocated; 15.50 MiB free; 172.92 MiB cached)
BUT when I reduce the dataset to a small subset (3 classes, about 1.5k images), training and validation complete without any problem.
Could anyone tell me where the problem is?
Thanks a lot!