Only batches of spatial targets supported (3D tensors) but got targets of size: : [10]

I’m trying to implement my personal version of SqueezeNet using Pytorch. I have some base code that I can not modify but I can modify all the code inside the class APNet and also all the other methods in the same script. When I try to start training I obtain always the error in the description.

import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
from torchvision.models.resnet import BasicBlock
import math
import torch

def _weights_init(m):
    if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):

class fire(nn.Module):
    def __init__(self, inplanes, squeeze_planes, expand_planes):
        super(fire, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, squeeze_planes, kernel_size=1, stride=1)
        self.bn1 = nn.BatchNorm2d(squeeze_planes)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(squeeze_planes, expand_planes, kernel_size=1, stride=1)
        self.bn2 = nn.BatchNorm2d(expand_planes)
        self.conv3 = nn.Conv2d(squeeze_planes, expand_planes, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(expand_planes)
        self.relu2 = nn.ReLU(inplace=True)

        # using MSR initilization
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.in_channels
      , math.sqrt(2./n))

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        out1 = self.conv2(x)
        out1 = self.bn2(out1)
        out2 = self.conv3(x)
        out2 = self.bn3(out2)
        out =[out1, out2], 1)
        out = self.relu2(out)
        return out

class APNet(nn.Module):

    block : callable
        Factory/constructor creating the block to be used.
    layers : list of int
        Number of blocks in each layer.
    num_classes : int
        Number of output neurons.
    input_channels : int
        Number of input channels.
    shortcut_downsampling : {'pad', 'conv'}
        Downsampling mode for the shortcut.
        'pad' will subsample the input using strided slicing and pad the channels with zeros.
        'conv' will use a strided convolution instead.

    def __init__(self, block, layers, num_classes=10, input_channels=3, shortcut_downsampling='pad', groups=1):
      super(APNet, self).__init__()
      self.conv1 = nn.Conv2d(3, 96, kernel_size=3, stride=1, padding=1) # 32
      self.bn1 = nn.BatchNorm2d(96)
      self.relu = nn.ReLU(inplace=True)
      self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2) # 16
      self.fire2 = fire(96, 16, 64)
      self.fire3 = fire(128, 16, 64)
      self.fire4 = fire(128, 32, 128)
      self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2) # 8
      self.fire5 = fire(256, 32, 128)
      self.fire6 = fire(256, 48, 192)
      self.fire7 = fire(384, 48, 192)
      self.fire8 = fire(384, 64, 256)
      self.maxpool3 = nn.MaxPool2d(kernel_size=2, stride=2) # 4
      self.fire9 = fire(512, 64, 256)
      self.conv2 = nn.Conv2d(512, 10, kernel_size=1, stride=1)
      self.avg_pool = nn.AvgPool2d(kernel_size=4, stride=4)
      #self.softmax = nn.LogSoftmax(dim=1)
      for m in self.modules():
          if isinstance(m, nn.Conv2d):
              n = m.kernel_size[0] * m.kernel_size[1] * m.in_channels
    , math.sqrt(2. / n))
          elif isinstance(m, nn.BatchNorm2d):

    def get_classifiers():
        return ['apnet0']
    def build_classifier(cls, arch: str, num_classes: int, input_channels: int):
        depth = 3
        cls_instance = cls(int(depth), num_classes, input_channels=input_channels)
        return cls_instance
    def forward(self, x):
      x = self.conv1(x)
      x = self.bn1(x)
      x = self.relu(x)
      x = self.maxpool1(x)
      x = self.fire2(x)
      x = self.fire3(x)
      x = self.fire4(x)
      x = self.maxpool2(x)
      x = self.fire5(x)
      x = self.fire6(x)
      x = self.fire7(x)
      x = self.fire8(x)
      x = self.maxpool3(x)
      x = self.fire9(x)
      x = self.conv2(x)
      x = self.avg_pool(x)
      #x = self.fc(x)
      #x = self.softmax(x)
      return x

def fire_layer(inp, s, e):
    f = fire(inp, s, e)
    return f

I tried also to add a linear layer at the end but the result does not change. All the above code is modifiable and all the code below is unmodifiable.

    model : torch.nn.Module
        The model to be trained.
        Might yield multiple outputs, but the first ones will be considered to
        be class scores for accuracy computation.
    loader : iterable
        The data loader, yielding batches of samples and labels.
    optimizer : torch.optim.Optimizer
        The optimizer to be used for the backward pass and model update.
    criterion : callable
        The loss function.
        All outputs of the model will be passed as argument, followed by
        the class labels.
    scheduler : torch.optim.lr_scheduler._LRScheduler, optional
        A learning rate scheduler to be called after every iteration.
    regularizer : callable(torch.nn.Module), optional
        A function taking the model as argument and returning a regularization
        loss as scalar tensor that will be added to the total loss function.
    show_progress : bool, default: True
        Whether to show a tqdm progress bar updated after every iteration.
    loss : float
    accuracy : float
    total_loss = total_acc = num_samples = 0
    for X, y in tqdm(loader, leave=False, disable=not show_progress):
        X, y = X.cuda(), y.cuda()
        output = model(X)
        if not isinstance(output, tuple):
            output = (output,)
        loss = criterion(*output, y)
        total_loss += loss.item() * len(X)
        total_acc += (output[0].argmax(dim=-1) == y).sum().item()
        num_samples += len(X)

        if regularizer is not None:
            loss = loss + regularizer(model)
        if scheduler is not None:

Based on the error message I guess your model output is a 4D tensor in the shape [batch_size, nb_classes, height, width] while the target has the shape [batch_size] and would thus indicate a multi-class classification use case.
In case that’s indeed the use case, make sure your model returns an output in the shape [batch_size, nb_classes] e.g. by using an nn.Linear layer as the last layer in the model.