Thanks a lot for your help. I have changed my forward function to accept one argument, and I calculate the loss outside of it. A part of my training script looks like that now:
def train(train_data_loader, model):
print('Training')
global train_itr
global train_loss_list
# initialize tqdm progress bar
prog_bar = tqdm(train_data_loader, total=len(train_data_loader))
for i, data in enumerate(prog_bar):
optimizer.zero_grad()
images, targets = data
#images = list(image.to(DEVICE) for image in images)
images = torch.stack(images).to(DEVICE)
targets = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]
print('Images shape ' + str(images[0].shape))
print('Targets ' + str(targets[0]))
#loss_dict = model(images, targets)
predictions = model(images)
print('Predictions shape ' + str(predictions[0].shape))
loss = model.compute_loss(predictions, targets)
losses = loss
loss_value = losses.item()
train_loss_list.append(loss_value)
train_loss_hist.send(loss_value)
losses.backward()
optimizer.step()
train_itr += 1
# update the loss value beside the progress bar for each iteration
prog_bar.set_description(desc=f"Loss: {loss_value:.4f}")
return train_loss_list
and a part of my model script is:
class ResNet(nn.Module):
def __init__(self, ResBlock, layer_list, num_classes, num_channels=3):
super(ResNet, self).__init__()
self.in_channels = 64
self.conv1 = nn.Conv2d(num_channels, 64, kernel_size=7, stride=2, padding=3, bias=False)
self.batch_norm1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU()
self.max_pool = nn.MaxPool2d(kernel_size = 3, stride=2, padding=1)
self.layer1 = self._make_layer(ResBlock, layer_list[0], planes=64)
self.layer2 = self._make_layer(ResBlock, layer_list[1], planes=128, stride=2)
self.layer3 = self._make_layer(ResBlock, layer_list[2], planes=256, stride=2)
self.layer4 = self._make_layer(ResBlock, layer_list[3], planes=512, stride=2)
self.avgpool = nn.AdaptiveAvgPool2d((1,1))
self.fc = nn.Linear(512*ResBlock.expansion, num_classes)
def forward(self, x):
x = self.relu(self.batch_norm1(self.conv1(x)))
print('FIRST: ' + str(x.shape))
x = self.max_pool(x)
print('MAXPOOL: ' + str(x.shape))
x = self.layer1(x)
print('LAYER1: ' + str(x.shape))
x = self.layer2(x)
print('LAYER2: ' + str(x.shape))
x = self.layer3(x)
print('LAYER3: ' + str(x.shape))
x = self.layer4(x)
print('LAYER4: ' + str(x.shape))
x = self.avgpool(x)
print('AVG: ' + str(x.shape))
x = x.view(x.size(0), -1)
print('FC: ' + str(x.shape))
x = self.fc(x)
return x
def _make_layer(self, ResBlock, blocks, planes, stride=1):
ii_downsample = None
layers = []
if stride != 1 or self.in_channels != planes*ResBlock.expansion: #tto en commented out
ii_downsample = nn.Sequential(
nn.Conv2d(self.in_channels, planes*ResBlock.expansion, kernel_size=1, stride=stride),
nn.BatchNorm2d(planes*ResBlock.expansion)
)
layers.append(ResBlock(self.in_channels, planes, i_downsample=ii_downsample, stride=stride))
self.in_channels = planes*ResBlock.expansion
for i in range(blocks-1):
layers.append(ResBlock(self.in_channels, planes))
return nn.Sequential(*layers)
def compute_loss(self, predictions, targets):
target_labels = torch.cat([t['labels'] for t in targets])
loss = F.cross_entropy(predictions, target_labels)
return loss
However, I am getting this error now:
EPOCH 1 of 10
Training
0%| | 0/48 [00:00<?, ?it/s]Images shape torch.Size([3, 416, 416])
Targets {'boxes': tensor([[210.0800, 188.7600, 314.0800, 252.2000]]), 'labels': tensor([1]), 'area': tensor([6597.7603]), 'iscrowd': tensor([0]), 'image_id': tensor([64])}
FIRST: torch.Size([3, 64, 208, 208])
MAXPOOL: torch.Size([3, 64, 104, 104])
LAYER1: torch.Size([3, 256, 104, 104])
LAYER2: torch.Size([3, 512, 52, 52])
LAYER3: torch.Size([3, 1024, 26, 26])
LAYER4: torch.Size([3, 2048, 13, 13])
AVG: torch.Size([3, 2048, 1, 1])
FC: torch.Size([3, 2048])
Predictions shape torch.Size([5])
Loss: 1.3513: 2%|█▎ | 1/48 [00:01<01:04, 1.38s/it]Images shape torch.Size([3, 416, 416])
Targets {'boxes': tensor([[165.8800, 133.1200, 323.4400, 278.0267]]), 'labels': tensor([4]), 'area': tensor([22831.4961]), 'iscrowd': tensor([0]), 'image_id': tensor([100])}
FIRST: torch.Size([3, 64, 208, 208])
MAXPOOL: torch.Size([3, 64, 104, 104])
LAYER1: torch.Size([3, 256, 104, 104])
LAYER2: torch.Size([3, 512, 52, 52])
LAYER3: torch.Size([3, 1024, 26, 26])
LAYER4: torch.Size([3, 2048, 13, 13])
AVG: torch.Size([3, 2048, 1, 1])
FC: torch.Size([3, 2048])
Predictions shape torch.Size([5])
Loss: 2.5029: 4%|██▌ | 2/48 [00:02<01:01, 1.33s/it]Images shape torch.Size([3, 416, 416])
Targets {'boxes': tensor([[232.9600, 199.1600, 307.8400, 307.3200]]), 'labels': tensor([1]), 'area': tensor([8099.0200]), 'iscrowd': tensor([0]), 'image_id': tensor([4])}
FIRST: torch.Size([3, 64, 208, 208])
MAXPOOL: torch.Size([3, 64, 104, 104])
LAYER1: torch.Size([3, 256, 104, 104])
LAYER2: torch.Size([3, 512, 52, 52])
LAYER3: torch.Size([3, 1024, 26, 26])
LAYER4: torch.Size([3, 2048, 13, 13])
AVG: torch.Size([3, 2048, 1, 1])
FC: torch.Size([3, 2048])
Predictions shape torch.Size([5])
Loss: 0.8967: 6%|███▊ | 3/48 [00:03<00:57, 1.27s/it]Images shape torch.Size([3, 416, 416])
Targets {'boxes': tensor([[136.5867, 164.8400, 288.4267, 328.1200]]), 'labels': tensor([2]), 'area': tensor([24792.4336]), 'iscrowd': tensor([0]), 'image_id': tensor([122])}
FIRST: torch.Size([3, 64, 208, 208])
MAXPOOL: torch.Size([3, 64, 104, 104])
LAYER1: torch.Size([3, 256, 104, 104])
LAYER2: torch.Size([3, 512, 52, 52])
LAYER3: torch.Size([3, 1024, 26, 26])
LAYER4: torch.Size([3, 2048, 13, 13])
AVG: torch.Size([3, 2048, 1, 1])
FC: torch.Size([3, 2048])
Predictions shape torch.Size([5])
Loss: 0.8967: 6%|███▊ | 3/48 [00:04<01:03, 1.42s/it]
Traceback (most recent call last):
File "/lustre/alice3/scratch/aiadapt/va95/od/2/20211025_Custom_Object_Detection_using_PyTorch_Faster_RCNN/src/engine.py", line 134, in <module>
train_loss = train(train_loader, model)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/lustre/alice3/scratch/aiadapt/va95/od/2/20211025_Custom_Object_Detection_using_PyTorch_Faster_RCNN/src/engine.py", line 43, in train
loss = model.compute_loss(predictions, targets)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/lustre/alice3/scratch/aiadapt/va95/od/2/20211025_Custom_Object_Detection_using_PyTorch_Faster_RCNN/src/model.py", line 139, in compute_loss
loss = F.cross_entropy(predictions, target_labels)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/v/va95/miniconda3/envs/odenv/lib/python3.11/site-packages/torch/nn/functional.py", line 3053, in cross_entropy
return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: Expected input batch_size (3) to match target batch_size (5).
I have check another thread that covers this error (ValueError: Expected input batch_size (324) to match target batch_size (4) - #26 by william_hero), but unfortunately, I couldn’t figure out the problem, yet. Could you please help me with that?