Implementing multi-input neural network in Pytorch

I am trying to reproduce the multi-input neural network from this tutorial. The article uses PyTorch Lightning, but I want to use plain PyTorch, so I am adapting it to my case. I created my dataloaders and my network as follows:

# Define loaders
from torch.utils.data import DataLoader

# Only the training loader shuffles and drops the last incomplete batch;
# the validation and test loaders keep every sample in a fixed order.
train_loader = DataLoader(train_set, batch_size=64, num_workers=2, drop_last=True, shuffle=True)
val_loader   = DataLoader(val_set,   batch_size=64, num_workers=2, drop_last=False, shuffle=False)
test_loader  = DataLoader(test_set,  batch_size=64, num_workers=2, drop_last=False, shuffle=False)

def conv_block(input_size, output_size):
    """Build one convolutional stage: Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d.

    Args:
        input_size: Number of input channels of the convolution.
        output_size: Number of output channels of the convolution (also the
            number of features normalized by the batch-norm layer).

    Returns:
        An ``nn.Sequential`` holding the four layers in order. The
        convolution uses a 3x3 kernel with default stride/padding and the
        pooling uses a 2x2 window.
    """
    block = nn.Sequential(
        nn.Conv2d(input_size, output_size, (3, 3)),
        nn.BatchNorm2d(output_size),
        nn.ReLU(),
        nn.MaxPool2d((2, 2)),
    )

    return block

class SimpleCNN(nn.Module):
    """Two-branch network that fuses an image and a tabular feature vector.

    The image branch runs three ``conv_block`` stages followed by two linear
    layers and ends in 5 features. The tabular branch is a small MLP that maps
    5 input features to 5 output features. Both outputs are concatenated
    (10 features) and mapped to a single value by the final linear layer.
    """

    def __init__(self):
        # Call parent constructor; nn.Module requires this before any
        # submodule can be assigned as an attribute.
        super().__init__()

        # Image branch
        self.conv1 = conv_block(3, 16)
        self.conv2 = conv_block(16, 32)
        self.conv3 = conv_block(32, 64)

        # 64 * 26 * 26 is the flattened size after conv3.
        # NOTE(review): this presumably corresponds to 224x224 inputs if
        # conv_block is an unpadded 3x3 conv followed by 2x2 pooling - confirm.
        self.ln1 = nn.Linear(64 * 26 * 26, 16)
        self.relu = nn.ReLU()
        self.batchnorm = nn.BatchNorm1d(16)
        # Element-wise dropout: the tensor is already flattened to
        # (batch, features) here, so channel-wise Dropout2d does not apply.
        self.dropout = nn.Dropout(0.5)
        self.ln2 = nn.Linear(16, 5)

        # Tabular branch (expects 5 input features)
        self.ln4 = nn.Linear(5, 10)
        self.ln5 = nn.Linear(10, 10)
        self.ln6 = nn.Linear(10, 5)

        # Head: 5 image features + 5 tabular features -> 1 output
        self.ln7 = nn.Linear(10, 1)

    # Forward
    def forward(self, img, tab):
        """Compute the prediction for a batch of (image, tabular) pairs.

        Args:
            img: Image batch with 3 channels; its spatial size must flatten
                to 64 * 26 * 26 features after the three conv stages.
            tab: Tabular batch whose last dimension is 5.

        Returns:
            Tensor of shape (batch, 1). Callers comparing it to a 1-D target
            should flatten/squeeze it first to avoid broadcasting.
        """
        # Image branch
        img = self.conv1(img)
        img = self.conv2(img)
        img = self.conv3(img)
        img = img.reshape(img.shape[0], -1)
        img = self.ln1(img)
        img = self.relu(img)
        img = self.batchnorm(img)
        img = self.dropout(img)
        img = self.ln2(img)
        img = self.relu(img)

        # Tabular branch
        tab = self.ln4(tab)
        tab = self.relu(tab)
        tab = self.ln5(tab)
        tab = self.relu(tab)
        tab = self.ln6(tab)
        tab = self.relu(tab)

        # Fuse both branches along the feature dimension
        x = torch.cat((img, tab), dim=1)
        x = self.relu(x)

        return self.ln7(x)

Then I defined my optimizer and criterion (the same as in the tutorial):

# L1 criterion and plain SGD over all model parameters
criterion = nn.L1Loss()
optimizer = optim.SGD(
    model.parameters(),
    lr=0.01,
)

Now, this below is my training function:

def train(net, loaders, optimizer, criterion, epochs=100, dev=torch.device('cpu')):
    """Train ``net`` and evaluate it on every split after each epoch.

    Args:
        net: Model called as ``net(image, tabular)``.
        loaders: Dict with keys "train", "val" and "test"; each loader yields
            ``(image, tabular, labels)`` batches.
        optimizer: Optimizer stepped on the "train" split only.
        criterion: Loss called as ``criterion(pred, labels)``.
        epochs: Number of passes over all three splits.
        dev: Device the model and every batch are moved to.

    Per-epoch mean loss and accuracy are printed for each split and plotted
    when training ends or is interrupted with Ctrl+C.
    """
    splits = ["train", "val", "test"]
    # Initialize history (outside the try so the plots work after an interrupt)
    history_loss = {split: [] for split in splits}
    history_accuracy = {split: [] for split in splits}
    try:
        net = net.to(dev)
        # Process each epoch
        for epoch in range(epochs):
            # Initialize epoch variables
            sum_loss = {split: 0 for split in splits}
            sum_accuracy = {split: 0 for split in splits}
            # Process each split
            for split in splits:
                is_train = split == "train"
                # BatchNorm/Dropout must be in eval mode on val/test
                net.train(is_train)
                # No autograd graph is needed when only evaluating
                with torch.set_grad_enabled(is_train):
                    # Process each batch
                    for (image, tabular, labels) in loaders[split]:
                        # Move to the target device
                        image = image.to(dev)
                        tabular = tabular.to(dev)
                        labels = labels.to(dev)
                        # Reset gradients
                        if is_train:
                            optimizer.zero_grad()
                        # Compute output
                        pred = net(image, tabular)
                        # A (batch, 1) output against (batch,) labels would
                        # broadcast to (batch, batch) inside the loss, so
                        # drop the trailing singleton dimension first.
                        if pred.dim() == labels.dim() + 1 and pred.size(-1) == 1:
                            loss = criterion(pred.squeeze(-1), labels)
                        else:
                            loss = criterion(pred, labels)
                        # Update loss
                        sum_loss[split] += loss.item()
                        # Check parameter update
                        if is_train:
                            # Compute gradients
                            loss.backward()
                            # Optimize
                            optimizer.step()
                        # Compute accuracy
                        # NOTE: argmax over dim 1 is only meaningful for
                        # multi-class outputs. With a single-output regression
                        # model it is always 0, so use the loss as the metric.
                        _, pred_labels = pred.max(1)
                        batch_accuracy = (pred_labels == labels).sum().item() / image.size(0)
                        # Update accuracy
                        sum_accuracy[split] += batch_accuracy
            # Compute epoch loss/accuracy (guard against an empty loader)
            epoch_loss = {split: sum_loss[split] / max(len(loaders[split]), 1) for split in splits}
            epoch_accuracy = {split: sum_accuracy[split] / max(len(loaders[split]), 1) for split in splits}
            # Update history
            for split in splits:
                history_loss[split].append(epoch_loss[split])
                history_accuracy[split].append(epoch_accuracy[split])
            # Print info
            print(f"Epoch {epoch+1}:",
                  f"TrL={epoch_loss['train']:.4f},",
                  f"TrA={epoch_accuracy['train']:.4f},",
                  f"VL={epoch_loss['val']:.4f},",
                  f"VA={epoch_accuracy['val']:.4f},",
                  f"TeL={epoch_loss['test']:.4f},",
                  f"TeA={epoch_accuracy['test']:.4f},")
    except KeyboardInterrupt:
        # Allow stopping early with Ctrl+C while still showing the plots
        print("Interrupted")
    finally:
        # Plot loss
        plt.title("Loss")
        for split in splits:
            plt.plot(history_loss[split], label=split)
        plt.legend()
        plt.show()
        # Plot accuracy
        plt.title("Accuracy")
        for split in splits:
            plt.plot(history_accuracy[split], label=split)
        plt.legend()
        plt.show()

that I call in this way:

# Map each split name to its loader
loaders = dict(train=train_loader, val=val_loader, test=test_loader)
# Run training for 10 epochs on the selected device
train(
    model,
    loaders,
    optimizer,
    criterion,
    epochs=10,
    dev=dev,
)

Training starts and completes all 10 epochs. However, the results are really bad, so I must be doing something wrong. These are the results:

Epoch 1: TrL=756382.4643, TrA=0.0000, VL=724350.7875, VA=0.0000, TeL=810417.3250, TeA=0.0000,
Epoch 2: TrL=767425.5143, TrA=0.0000, VL=724348.9250, VA=0.0000, TeL=810415.4375, TeA=0.0000,
Epoch 3: TrL=769819.8732, TrA=0.0000, VL=724341.4625, VA=0.0000, TeL=810408.1375, TeA=0.0000,
Epoch 4: TrL=769039.4804, TrA=0.0000, VL=724228.2875, VA=0.0000, TeL=810297.6250, TeA=0.0000,
Epoch 5: TrL=687138.2839, TrA=0.0000, VL=720732.6250, VA=0.0000, TeL=807107.3750, TeA=0.0000,
Epoch 6: TrL=637015.2786, TrA=0.0000, VL=723909.0375, VA=0.0000, TeL=809951.0625, TeA=0.0000,
Epoch 7: TrL=601827.3125, TrA=0.0000, VL=575946.9625, VA=0.0000, TeL=565301.9250, TeA=0.0000,
Epoch 8: TrL=600566.4304, TrA=0.0000, VL=646973.8250, VA=0.0000, TeL=729645.0250, TeA=0.0000,
Epoch 9: TrL=574847.1312, TrA=0.0000, VL=326207.9562, VA=0.0000, TeL=369593.9562, TeA=0.0000,
Epoch 10: TrL=630909.6888, TrA=0.0000, VL=723533.1000, VA=0.0000, TeL=809632.1750, TeA=0.0000,

Every accuracy is always 0. I think the problem is in how I handle the labels in my training function. Indeed, the tutorial's multi-input network has these lines of code in its training, validation and test steps:

def training_step(self, batch, batch_idx):
    # Excerpt of the tutorial's training step; the rest of the method is not shown here.
    # Each batch unpacks into an image tensor, a tabular tensor and the target.
    image, tabular, y = batch

    # L1 loss between predictions and targets.
    criterion = torch.nn.L1Loss()
    # Collapse the model output to 1-D - presumably so its shape matches y
    # (assumes y is 1-D with one value per sample; TODO confirm).
    y_pred = torch.flatten(self(image, tabular))
    # Cast predictions to float64 - presumably y is stored as float64; TODO confirm.
    y_pred = y_pred.double()

    loss = criterion(y_pred, y)

However, I am not flattening anything, neither in my training function nor in my neural network.