Expected target size [100, 24, 40, 40], got [100]

Hi,

I’m a new PyTorch user who would like to classify single-channel 24×40×40 3D images into 2 classes.
To do this, I’d like to use a U-Net. Unfortunately, I get a size error when evaluating the cross-entropy loss.
Here’s my code for the model:

class Conv3DBlock(nn.Module):
    """
    The basic block for double 3x3x3 convolutions in the analysis path
    __init__()
    :param in_channels → number of input channels
    :param out_channels → desired number of output channels
    :param bottleneck → specifies the bottleneck block
    -- forward()
    :param input → input Tensor to be convolved
    :return → Tensor
    """

    def __init__(self, in_channels, out_channels, bottleneck=False) -> None:
        super(Conv3DBlock, self).__init__()
        self.conv1 = nn.Conv3d(in_channels=in_channels, out_channels=out_channels//2, kernel_size=(3,3,3), padding=1)
        self.bn1 = nn.BatchNorm3d(num_features=out_channels//2)
        self.conv2 = nn.Conv3d(in_channels=out_channels//2, out_channels=out_channels, kernel_size=(3,3,3), padding=1)
        self.bn2 = nn.BatchNorm3d(num_features=out_channels)
        self.relu = nn.ReLU()
        self.bottleneck = bottleneck
        if not bottleneck:
            self.pooling = nn.MaxPool3d(kernel_size=(2,2,2), stride=2)

    def forward(self, input):
        res = self.relu(self.bn1(self.conv1(input)))
        res = self.relu(self.bn2(self.conv2(res)))
        out = None
        if not self.bottleneck:
            out = self.pooling(res)
        else:
            out = res
        return out, res

class UpConv3DBlock(nn.Module):
    """
    The basic block for upsampling followed by double 3x3x3 convolutions in the synthesis path
    __init__()
    :param in_channels → number of input channels
    :param res_channels → number of residual connections’ channels to be concatenated
    :param last_layer → specifies the last output layer
    :param num_classes → specifies the number of output channels for disparate classes
    -- forward()
    :param input → input Tensor
    :param residual → residual connection to be concatenated with input
    :return → Tensor
    """

    def __init__(self, in_channels, res_channels=0, last_layer=False, num_classes=None) -> None:
        super(UpConv3DBlock, self).__init__()
        assert (last_layer==False and num_classes==None) or (last_layer==True and num_classes!=None), 'Invalid arguments'
        self.upconv1 = nn.ConvTranspose3d(in_channels=in_channels, out_channels=in_channels, kernel_size=(2, 2, 2), stride=2)
        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm3d(num_features=in_channels//2)
        self.conv1 = nn.Conv3d(in_channels=in_channels+res_channels, out_channels=in_channels//2, kernel_size=(3,3,3), padding=(1,1,1))
        self.conv2 = nn.Conv3d(in_channels=in_channels//2, out_channels=in_channels//2, kernel_size=(3,3,3), padding=(1,1,1))
        self.last_layer = last_layer
        if last_layer:
            self.conv3 = nn.Conv3d(in_channels=in_channels//2, out_channels=num_classes, kernel_size=(1,1,1))

    def forward(self, input, residual=None):
        out = self.upconv1(input)
        if residual is not None: out = torch.cat((out, residual), 1)
        out = self.relu(self.bn(self.conv1(out)))
        out = self.relu(self.bn(self.conv2(out)))
        if self.last_layer: out = self.conv3(out)
        return out

class UNet3D(nn.Module):
    """
    The 3D UNet model
    __init__()
    :param in_channels → number of input channels
    :param num_classes → specifies the number of output channels or masks for different classes
    :param level_channels → the number of channels at each level (count top-down)
    :param bottleneck_channel → the number of bottleneck channels
    -- forward()
    :param input → input Tensor
    :return → Tensor
    """

    def __init__(self, in_channels, num_classes, level_channels=[64, 128, 256], bottleneck_channel=512) -> None:
        super(UNet3D, self).__init__()
        level_1_chnls, level_2_chnls, level_3_chnls = level_channels[0], level_channels[1], level_channels[2]
        self.a_block1 = Conv3DBlock(in_channels=in_channels, out_channels=level_1_chnls)
        self.a_block2 = Conv3DBlock(in_channels=level_1_chnls, out_channels=level_2_chnls)
        self.a_block3 = Conv3DBlock(in_channels=level_2_chnls, out_channels=level_3_chnls)
        self.bottleNeck = Conv3DBlock(in_channels=level_3_chnls, out_channels=bottleneck_channel, bottleneck=True)
        self.s_block3 = UpConv3DBlock(in_channels=bottleneck_channel, res_channels=level_3_chnls)
        self.s_block2 = UpConv3DBlock(in_channels=level_3_chnls, res_channels=level_2_chnls)
        self.s_block1 = UpConv3DBlock(in_channels=level_2_chnls, res_channels=level_1_chnls, num_classes=num_classes, last_layer=True)

    def forward(self, input):
        # Analysis path forward feed
        out, residual_level1 = self.a_block1(input)
        out, residual_level2 = self.a_block2(out)
        out, residual_level3 = self.a_block3(out)
        out, _ = self.bottleNeck(out)

        # Synthesis path forward feed
        out = self.s_block3(out, residual_level3)
        out = self.s_block2(out, residual_level2)
        out = self.s_block1(out, residual_level1)
        return out

if __name__ == '__main__':
    # Configurations according to the Xenopus kidney dataset
    model = UNet3D(in_channels=1, num_classes=2)
    start_time = time.time()
    summary(model=model, input_size=(1, 24, 40, 40), batch_size=-1, device="cpu")
    print("--- %s seconds ---" % (time.time() - start_time))
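
As a quick sanity check (a minimal sketch, not part of my training script), running a random batch through the model shows that the output keeps the spatial dimensions:

import torch

x = torch.randn(2, 1, 24, 40, 40)  # hypothetical batch of two single-channel volumes
net = UNet3D(in_channels=1, num_classes=2)
print(net(x).shape)  # torch.Size([2, 2, 24, 40, 40]): one logit map per class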

And here’s the training code, with a batch size of 100:

# Definition of hyperparameters
n_iters = 3000
num_epochs = n_iters / (len(train_x) / batch_size)  # total iterations / batches per epoch
num_epochs = int(num_epochs)

# Create CNN

BCE_WEIGHTS = [0.004, 0.996]
TRAINING_EPOCH = 1  # 100
model = UNet3D(in_channels=1, num_classes=2)

criterion = CrossEntropyLoss(weight=torch.Tensor(BCE_WEIGHTS))
optimizer = Adam(params=model.parameters())
#learning_rate = 0.001

min_valid_loss = math.inf

for epoch in range(TRAINING_EPOCH):

    train_loss = 0.0
    model.train()
    for i, (image, ground_truth) in enumerate(train_loader):
        image = image.view(100, 1, 24, 40, 40)
        optimizer.zero_grad()
        outputs = model(image)
        loss = criterion(outputs, ground_truth)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    valid_loss = 0.0
    model.eval()
    for image, ground_truth in test_loader:
        image = image.view(100, 1, 24, 40, 40)
        outputs = model(image)
        loss = criterion(outputs, ground_truth)
        valid_loss += loss.item()

    print("Loss/Train " + str(train_loss / len(train_loader)) + " at epoch " + str(epoch))
    print("Loss/Validation " + str(valid_loss / len(test_loader)) + " at epoch " + str(epoch))

    print(f'Epoch {epoch+1} \t\t Training Loss: {train_loss / len(train_loader)} \t\t Validation Loss: {valid_loss / len(test_loader)}')

Which gives me this error: RuntimeError: Expected target size [100, 24, 40, 40], got [100].

I understood that, for CrossEntropyLoss, the outputs should have the shape [batch_size, nb_classes], i.e. [100, 2]. Yet the shape of my outputs is torch.Size([100, 2, 24, 40, 40]).

Could someone give me some advice to solve my issue?

Thanks in advance.

Alex

Not necessarily, as it depends on the use case.
In a multi-class segmentation use case your model should output logits in the shape [batch_size, nb_classes, height, width] while the target should have the shape [batch_size, height, width] containing class indices in the range [0, nb_classes-1].
The error and your comments explain that your model outputs a tensor in the shape [100, 2, 24, 40, 40], so could you explain what these dimensions refer to?
Are you working with volumes and want to classify each voxel?
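
To illustrate, here is a minimal sketch of both conventions with random tensors (assuming 2 classes):

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()

# Whole-image classification: logits [batch, nb_classes], target [batch]
loss = criterion(torch.randn(100, 2), torch.randint(0, 2, (100,)))

# Voxel-wise segmentation: logits [batch, nb_classes, D, H, W], target [batch, D, H, W]
loss = criterion(torch.randn(100, 2, 24, 40, 40), torch.randint(0, 2, (100, 24, 40, 40)))

# Mixing the two (5D logits with a [100] target) raises the RuntimeError you are seeing.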

Thanks a lot for your answer! The output of my network was indeed wrong, as I did not want to classify each voxel but the whole image (my images are of size [24, 40, 40]). To fix this I changed my model to a ResNet3D.
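
In essence, the change replaces the voxel-wise 1×1×1 convolution head with global pooling followed by a linear layer. A minimal sketch of the idea (hypothetical names, not my actual model):

import torch
import torch.nn as nn

class ClassifierHead(nn.Module):
    # Collapses the spatial dimensions, then maps the pooled features
    # to one logit per class, so [N, C, D, H, W] becomes [N, num_classes].
    def __init__(self, in_channels, num_classes):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool3d((1, 1, 1))
        self.fc = nn.Linear(in_channels, num_classes)

    def forward(self, x):
        x = self.pool(x)         # [N, C, 1, 1, 1]
        x = torch.flatten(x, 1)  # [N, C]
        return self.fc(x)        # [N, num_classes]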

Of course now I have some new issues and I am really stuck: any advice would be appreciated.

As a reminder, I’m trying to binary-classify 24×40×40 images coming from MRI. My dataset has 10,000 images: 8,000 for training, 1,000 for testing and 1,000 for validation. My batch size is 100. For the model I chose a 3D ResNet. I took the code from a GitHub repo and at first glance the architecture seems good (torchsummary output not reproduced here).


The parameters of my standard model are: Adam optimizer with learning rate 0.001, CrossEntropyLoss for the criterion, and 100 epochs.

Yet the training loss is quite high and not decreasing; the network doesn’t learn anything. Investigating further, the network produces the same output for every image within an epoch. For example, at epoch 1 the network predicts only class 1 for all images, and at epoch 2 it predicts only class 0 for all images.

Hence I tried the following to solve the problem:

  • work on a small sample of only 5 images to try to overfit the training set: the behavior of the loss is the same. On the 5 samples I get a loss of 0.69, the cross-entropy of a binary classifier with no preference between the two classes.
  • balance the data with WeightedRandomSampler.
  • vary the learning rate from 0.00001 to 1.
  • change the optimizer from Adam to SGD.
  • normalize the data between 0 and 1.
  • use no final activation layer, because the softmax is already inside CrossEntropyLoss (see the check below).
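
A minimal sketch (random tensors) confirming that CrossEntropyLoss applies log-softmax internally, so a final softmax layer would be redundant:

import torch
import torch.nn.functional as F

logits = torch.randn(5, 2)
target = torch.randint(0, 2, (5,))

a = F.cross_entropy(logits, target)
b = F.nll_loss(F.log_softmax(logits, dim=1), target)
print(torch.allclose(a, b))  # True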

The network is quite deep and should overfit on the 5-sample data. What am I missing here?

The code for BasicBlock3d is:

class BasicBlock3d(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None, act_layer=None):
        super(BasicBlock3d, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm3d
        if act_layer is None:
            act_layer = partial(nn.ReLU, inplace=True)
        if groups != 1 or base_width != 64:
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = act_layer()
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out

All your debugging steps sound reasonable and I would expect your model could overfit the tiny dataset. Could you post the training loop as well as the full model definition?

Thanks a lot for your quick answer and your patience. Honestly, you’re carrying PyTorch users on your shoulders.

I tried to structure the code below with some comments; I hope that’s OK for you.

First, loading the data as NumPy arrays:

data_finale = np.load('data_finale.npy')  # I load my data as a NumPy array
labels = np.load('labels.npy')
labels[labels < 1] = 0  # I change the labels for binary classification
labels[labels >= 1] = 1
# I split the data between training and testing, here for a small sample of 5 images
X_train = data_finale[0:5, :, :, :]
labels_train = labels[0:5]
X_test = data_finale[100:105, :, :, :]
labels_test = labels[100:105]  # the test labels in this 5-sample set are 0 0 1 0 1
train_x = torch.from_numpy(X_train).float()
test_x = torch.from_numpy(X_test).float()
test_y = torch.from_numpy(labels_test).long()
batch_size = 5  # we pick beforehand the batch size that we will use for the training

Now my data class

class MRIdata(torch.utils.data.Dataset):
    def __init__(self, X, labels, transform=None):
        self.data = X
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        image = self.data[index]
        if self.transform:
            image = self.transform(image)
        labels = torch.tensor(self.labels[index], dtype=torch.long)
        return image, labels

transform = transforms.Compose([
    transforms.Normalize([0.485],
                         [0.229])
])  # the data is already a torch tensor, so no conversion is needed in the transform

The data sampler

# We create the sampler to balance the data
print('target train 0/1: {}/{}'.format(len(np.where(labels_train == 0)[0]), len(np.where(labels_train == 1)[0])))
class_sample_count = np.array(
    [len(np.where(labels_train == t)[0]) for t in np.unique(labels_train)])
weight = 1. / class_sample_count
samples_weight = np.array([weight[t] for t in labels_train])
samples_weight = torch.from_numpy(samples_weight)
samples_weight = samples_weight.double()
sampler = torch.utils.data.sampler.WeightedRandomSampler(samples_weight, len(samples_weight))
# Loaders for the train and test datasets
labels_train = torch.from_numpy(labels_train).long()
train_dataset = MRIdata(train_x, labels_train, transform)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
test_y = torch.from_numpy(labels_test).long()
test = MRIdata(test_x, test_y, transform)
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size)
#Checking for the sampling
sum_0=0
sum_1=0
for i, (data, target) in enumerate(train_loader):
    sum_0=sum_0+len(np.where(target.numpy() == 0)[0])
    sum_1=sum_1+len(np.where(target.numpy() == 1)[0])
print('target train after sampling 0/1 :'+str(sum_0)+'/'+str(sum_1))

The check for sampling gives me:
target train 0/1: 3/2
target train after sampling 0/1 :3/2

The model:

Taken from https://github.com/kbressem/faimed3d/blob/main/faimed3d/models/resnet.py:

from fastai.basics import *
from fastai.layers import *
from warnings import warn
from torch.hub import load_state_dict_from_url
from torchvision.models.resnet import Bottleneck, BasicBlock

def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding"""
    return nn.Conv3d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=dilation, groups=groups, bias=False, dilation=dilation)

def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv3d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)

from torch import nn  # prevent error in nbdev when re-importing nn (already imported with fastai)

class BasicBlock3d(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None, act_layer=None):
        super(BasicBlock3d, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm3d
        if act_layer is None:
            act_layer = partial(nn.ReLU, inplace=True)
        if groups != 1 or base_width != 64:
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = act_layer()
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride
    def forward(self, x):
        identity = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample is not None:
            identity = self.downsample(x)
        out += identity
        out = self.relu(out)
        return out

class Bottleneck3d(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None, act_layer=None):
        super(Bottleneck3d, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm3d
        if act_layer is None:
            act_layer = partial(nn.ReLU, inplace=True)
        width = int(planes * (base_width / 64.)) * groups
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = act_layer()
        self.downsample = downsample
        self.stride = stride
    def forward(self, x):
        identity = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.conv3(out)
        out = self.bn3(out)
        if self.downsample is not None:
            identity = self.downsample(x)
        out += identity
        out = self.relu(out)
        return out


class ResNet3D(nn.Module):

    def __init__(self, block, layers, n_channels=3, num_classes=101, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None, act_layer=None, final_softmax=False, ps = 0.5):
        super(ResNet3D, self).__init__()
        if norm_layer is None: norm_layer = nn.BatchNorm3d
        if act_layer is None: act_layer = partial(nn.ReLU, inplace=True)
        self._norm_layer = norm_layer
        self.inplanes = 128 if isinstance(block(1,1), Bottleneck3d) else 32

        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group

        self.stem = nn.Sequential(nn.Conv3d(n_channels, self.inplanes, kernel_size=(2, 5, 5), stride=(1, 3, 3), padding=1, bias=False),
                                  norm_layer(self.inplanes),
                                  act_layer(inplace=True))

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       dilate=replace_stride_with_dilation[2])
        self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
        self.fc = nn.Sequential(
            nn.BatchNorm1d(512 * block.expansion, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
            nn.Dropout(p=ps/2, inplace=False),
            nn.Linear(512 * block.expansion, 256),
            act_layer(inplace=True),
            nn.BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
            nn.Dropout(p=ps, inplace=False),
            nn.Linear(256, num_classes,bias = False))

        if final_softmax:
            self.fc = nn.Sequential(self.fc,
                                    nn.Softmax(1))

        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm3d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck3d):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock3d):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def _encoder(self, x1):
        x2 = self.layer1(x1)
        x3 = self.layer2(x2)
        x4 = self.layer3(x3)
        x5 = self.layer4(x4)
        return x2, x3, x4, x5

    def _head(self, x5):
        x = self.avgpool(x5)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

    def _forward_impl(self, x):
        # See note [TorchScript super()]
        x = self.stem(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

    def forward(self, x):
        return self._forward_impl(x)
if __name__ == '__main__':
    model = ResNet3D(BasicBlock3d, [2,2,2,2], n_channels=1, num_classes=2, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None, act_layer=None, final_softmax=False, ps = 0.5)
    start_time = time.time()
    summary(model=model, input_size=(1, 24, 40, 40), batch_size=-1, device="cpu")
    print("--- %s seconds ---" % (time.time() - start_time))


Which gives the summary in my previous message.

Now the training:

TRAINING_EPOCH= 100
model = ResNet3D(BasicBlock3d, [2,2,2,2], n_channels=1, num_classes=2, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None, act_layer=None, final_softmax=False, ps = 0.5)
learning_rate = 0.001
criterion = CrossEntropyLoss()
optimizer = SGD(model.parameters(),lr=learning_rate) # SGD Optimizer

min_valid_loss = math.inf
tr = []  # training-loss history per epoch

t1 = time.time()
for epoch in range(TRAINING_EPOCH):

    train_loss = 0.0
    model.train()
    for i, (image, ground_truth) in enumerate(train_loader):
        image = image.view(batch_size,1,24,40,40)
        optimizer.zero_grad()
        output = model(image)
        loss = criterion(output, ground_truth)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        
    if epoch%10==0:
        print('Epoch '+str(epoch)+': train loss='+str(train_loss))
    tr.append(train_loss)    
        
    valid_loss = 0.0
    model.eval()
    correct = 0
    total = 0
    for image, ground_truth in test_loader:
        image = image.view(batch_size,1,24,40,40)
        output = model(image)
        loss = criterion(output, ground_truth)
        valid_loss = loss.item()
        # Get predictions from the maximum value
        predicted = torch.max(output.data, 1)[1]

        # Total number of labels
        total += len(ground_truth)
        correct += (predicted == ground_truth).sum()
    if epoch%10==0:
        print('Epoch '+str(epoch)+': valid loss='+str(valid_loss))

    accuracy = 100 * correct / float(total)

Which gives me:

Epoch 0: train loss=0.6931471824645996
Epoch 0: valid loss=0.6758744716644287
Epoch 10: train loss=0.6774154901504517
Epoch 10: valid loss=0.7009776830673218
Epoch 20: train loss=0.7139725685119629
Epoch 20: valid loss=0.7262313961982727
Epoch 30: train loss=0.5223163962364197
Epoch 30: valid loss=0.8072291612625122
Epoch 40: train loss=0.7081986665725708
Epoch 40: valid loss=1.800889015197754
Epoch 50: train loss=0.8400921821594238
Epoch 50: valid loss=13.688627243041992
Epoch 60: train loss=0.6127006411552429
Epoch 60: valid loss=228.80892944335938
Epoch 70: train loss=0.7464629411697388
Epoch 70: valid loss=63.092506408691406
Epoch 80: train loss=0.6829321980476379
Epoch 80: valid loss=182.90078735351562
Epoch 90: train loss=0.6990883946418762
Epoch 90: valid loss=973.0631103515625

Hence we see that the train loss hovers around 0.69 and never decreases in a sustained way.
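
For reference, 0.69 is just ln 2 ≈ 0.693, the cross-entropy of a two-class model that outputs equal logits; a quick check with made-up tensors:

import torch
import torch.nn.functional as F

logits = torch.zeros(5, 2)              # "no preference" between the two classes
target = torch.tensor([0, 0, 1, 0, 1])  # the labels do not matter here
print(F.cross_entropy(logits, target).item())  # 0.6931... = ln 2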

Thanks a lot for your help and your time.