Please help! CNN training on a custom dataset yields a weird result…

Dear experts,

I’m trying to train a simple CNN in PyTorch that maps data (velocity spectrum panels; .sgy files) to coordinates (velocity structure profiles; .csv file). To check whether the training pipeline works at all, I tested it with 10 samples and their labels.
But it yields a weird result:

Epoch 1

Test Error:
Accuracy: 0.0%, Avg loss: nan

Epoch 2

loss: nan [ 1/ 10]
Test Error:
Accuracy: 0.0%, Avg loss: nan

Epoch 10

loss: nan [ 1/ 10]
Test Error:
Accuracy: 0.0%, Avg loss: nan

Done!

And if I change "batch_size=1" to "batch_size=3", it shows the following error:

Epoch 1

loss: nan [ 3/ 10]


RuntimeError Traceback (most recent call last)
in <cell line: 144>()
145 print(f"Epoch {t+1}\n-------------------------------")
146 train_loop(train_dataloader, model, loss_fn, optimizer)
→ 147 test_loop(test_dataloader, model, loss_fn)
148 print("Done!")

in test_loop(test_dataloader, model, loss_fn)
135 pred = model(data)
136 test_loss += loss_fn(pred, label).item()
→ 137 correct += (pred.argmax(1) == label).type(torch.float).sum().item()
138
139 test_loss /= num_batches

RuntimeError: The size of tensor a (3) must match the size of tensor b (40) at non-singleton dimension 1

How can I change my code to make the training work? And how can I make the code work no matter what value I set for "batch_size="?

The data and label look like this:

print(data)
print(data.shape)

tensor([[[0.0000, 0.0000, 1.0000, …, 0.0184, 0.0348, 0.0492],
[0.0000, 0.0000, 1.0000, …, 0.0442, 0.0363, 0.0250],
[0.0000, 0.0000, 1.0000, …, 0.0564, 0.0388, 0.0295],
…,
[1.0000, 0.9606, 0.8394, …, 0.0093, 0.0152, 0.0153],
[1.0000, 0.9524, 0.8419, …, 0.0091, 0.0151, 0.0160],
[1.0000, 0.9305, 0.8363, …, 0.0093, 0.0146, 0.0157]]])
torch.Size([1, 50, 200])

print(label)
print(label.shape)

tensor([ 178., 1878., 822., 1814., 1375., 2162., 1669., 2304., 2065., 2736.,
2528., 2780., 2836., 3008., 3396., 3490., 4013., 3518., nan, nan,
nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
dtype=torch.float64)
torch.Size([40])

The full code is:

# custom dataset  <- revised from "https://pytorch.org/tutorials/beginner/data_loading_tutorial.html"

import torch
from torch.utils.data import Dataset
import pandas as pd
import os
import segyio
import numpy as np

class cvspanel_dataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.dv_label = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.dv_label)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        data_name = os.path.join(self.root_dir,
                                 self.dv_label.iloc[idx, 0])
        gth = segyio.open(data_name, ignore_geometry=True)
        data = gth.trace.raw[:]              # all traces as a 2D array (traces x samples)
        data = torch.tensor(data[:, :200])   # keep the first 200 samples per trace -> [50, 200]
        data = data.unsqueeze(0)             # add a channel dim -> [1, 50, 200]
        arr = self.dv_label.iloc[idx, 1:]
        arr = np.asarray(arr)
        label = arr.astype('float').reshape(-1, 2)
        label = torch.tensor(label)
        label = label.view([-1, 1])
        label = label.squeeze()              # flatten to a 1D label vector of length 40

        if self.transform:
            data = self.transform(data)
        if self.transform:
            label = self.transform(label)

        return data, label

train_dataset = cvspanel_dataset(csv_file='/content/drive/MyDrive/Colab Notebooks/research_data/synthetic_1D/d-v_label.csv',
                                 root_dir='/content/drive/MyDrive/Colab Notebooks/research_data/synthetic_1D/sgy_cvs_panel',
                                 transform=None)

test_dataset = cvspanel_dataset(csv_file='/content/drive/MyDrive/Colab Notebooks/research_data/synthetic_1D/d-v_label.csv',
                                 root_dir='/content/drive/MyDrive/Colab Notebooks/research_data/synthetic_1D/sgy_cvs_panel',
                                 transform=None)

# dataloader
batch_size = 1

train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size)
test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size)

# model building

import torch
import torch.nn as nn

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=17, stride=1, padding=3),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=10, stride=5, padding=0)
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=6, stride=1, padding=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=3, padding=1)
        )
        self.fc1 = nn.Linear(in_features=64*3*13, out_features=512)
        self.drop = nn.Dropout(0.25)
        self.fc2 = nn.Linear(in_features=512, out_features=128)
        self.fc3 = nn.Linear(in_features=128, out_features=40)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out1 = out.view(out.size(0), -1)
        out = self.fc1(out1)
        out = self.drop(out)
        out = self.fc2(out)
        out = self.fc3(out)
        return out

# hyperparameter

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = CNN()
model.to(device)

learning_rate = 0.001

loss_fn = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training  <- revised from "https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html"

def train_loop(train_dataloader, model, loss_fn, optimizer):
    size = len(train_dataloader.dataset)
    model.train()
    for batch, (data, label) in enumerate(train_dataloader):
        # Compute prediction and loss
        pred = model(data)
        loss = loss_fn(pred, label)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(data)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(test_dataloader, model, loss_fn):
    model.eval()
    size = len(test_dataloader.dataset)
    num_batches = len(test_dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for data, label in test_dataloader:
            pred = model(data)
            test_loss += loss_fn(pred, label).item()
            correct += (pred.argmax(1) == label).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")

I’m looking forward to your help.
Thank you in advance.

Check the shapes of all tensors used in the training and testing loops, in particular data, label, and pred, and make sure the batch size is not changing and that the shapes are compatible for the loss and accuracy calculations.
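For example, a quick check on a single batch, as a sketch that reuses the variable names from the posted code, could look like this:

import torch

# Inspect one batch to verify the shapes before debugging further.
data, label = next(iter(train_dataloader))
print("data: ", data.shape)    # should be [batch_size, channels, height, width] for Conv2d
print("label:", label.shape)   # should match what loss_fn expects for this batch size
pred = model(data)
print("pred: ", pred.shape)    # should be [batch_size, 40] for this model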

Thank you for your reply.
I checked the tensors used in "test_loop" and here are the results.

#model(data) # <- ValueError: expected 4D input (got 3D input)
pred = model(data.unsqueeze(0))   # torch.Size([1, 1, 50, 200]) <- torch.Size([1, 50, 200])
print(model(data.unsqueeze(0)))
print(model(data.unsqueeze(0)).shape)

tensor([[-0.6811, 0.4903, 0.1965, 0.8932, -0.0934, 0.6535, 0.5393, 1.0231,
0.0896, 1.3273, 0.2671, 0.6641, 0.7759, 0.6466, 0.7556, 1.0952,
1.1368, 1.7202, 2.7104, 1.9669, 1.9330, 2.6915, 1.0585, 1.2629,
-1.3037, -0.7101, -0.3221, -1.3404, -1.9542, -1.5863, -3.7850, -3.4187,
-3.5173, -4.1656, -4.7128, -4.0686, -4.1666, -4.7581, -3.3586, -3.9824]],
grad_fn=)
torch.Size([1, 40])

#loss_fn(pred, label).item() # <- ValueError: Expected input batch_size (1) to match target batch_size (40).
loss_fn(pred, label.unsqueeze(0)).item() # torch.Size([1, 40]) <- torch.Size([40])
print(loss_fn(pred, label.unsqueeze(0)).item())
print(loss_fn(pred, label.unsqueeze(0)).shape)

157758.68227863312
torch.Size([])

print(pred.argmax(1) == label)
print((pred.argmax(1) == label).shape)

tensor([False, False, False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False, False, False])
torch.Size([40])

print(pred.argmax(1))
print(pred.argmax(1).shape)

tensor([7])
torch.Size([1])

(For "data" and "label", please see the original post.)

How can I know the batch size?
As I understand it (please correct me if I'm wrong), the first number in "torch.Size([...])" is the batch size. If that is true, why does batch_size=1 work but not any other number?

Also, could you tell me how to modify the code so that the calculation works correctly?

Normally, image data should be of shape [batch_size, channels, height, width].
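For reference, here is a rough standalone check of what Conv2d expects (layer settings copied from the posted model; the batch size is chosen arbitrarily):

import torch
import torch.nn as nn

conv = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=17, stride=1, padding=3)
x = torch.randn(4, 1, 50, 200)   # [batch_size, channels, height, width]
print(conv(x).shape)             # works because dim 1 matches in_channels=1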

How many channels should your images have?

What should be the height and width?

Thank you for your reply, @J_Johnson!

I see. In my case, channels = 1, height = 200, width = 50.

Things I would try:

  1. Set your learning rate much lower.
  2. If the class distribution is uneven, set the weight argument of CrossEntropyLoss (see the CrossEntropyLoss — PyTorch 2.1 documentation).
  3. Include ReLU activations between each Linear layer (see the sketch after this list).
  4. Apply dropout on more layers while lowering the p value.
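A rough sketch of points 1, 3, and 4 (layer sizes copied from the posted model; the dropout p and learning rate are only illustrative):

import torch
import torch.nn as nn

# Illustration only: ReLU after each hidden Linear layer, a smaller dropout p
# applied to more of them, and a lower learning rate for the optimizer.
classifier = nn.Sequential(
    nn.Linear(64 * 3 * 13, 512),
    nn.ReLU(),
    nn.Dropout(p=0.1),
    nn.Linear(512, 128),
    nn.ReLU(),
    nn.Dropout(p=0.1),
    nn.Linear(128, 40),
)

optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-4)  # much lower than 1e-3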

That's because you are using data.unsqueeze(0) instead of data.unsqueeze(1). That means your batch dim gets pushed into your channels dim when the batch size is > 1.
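A small standalone demonstration of the difference (shapes only; the sizes match the posted panels):

import torch

batch = torch.randn(3, 50, 200)   # a batch of 3 panels without an explicit channel dim
print(batch.unsqueeze(0).shape)   # torch.Size([1, 3, 50, 200]) -> batch dim lands in the channel slot
print(batch.unsqueeze(1).shape)   # torch.Size([3, 1, 50, 200]) -> [batch, channels, H, W]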

Thank you for your comment, @J_Johnson!

I'll try those four suggestions and see whether they make the model train well enough.

Regarding data.unsqueeze(0), I cannot change it to data.unsqueeze(1) because that causes a dimension error: torch.Size([1, 50, 200]) → torch.Size([50, 1, 200])

RuntimeError Traceback (most recent call last)
in <cell line: 145>()
145 for t in range(epochs):
146 print(f"Epoch {t+1}\n-------------------------------")
→ 147 train_loop(train_dataloader, model, loss_fn, optimizer)
148 test_loop(test_dataloader, model, loss_fn)
149 print("Done!")

10 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/conv.py in _conv_forward(self, input, weight, bias)
454 weight, bias, self.stride,
455 _pair(0), self.dilation, self.groups)
→ 456 return F.conv2d(input, weight, bias, self.stride,
457 self.padding, self.dilation, self.groups)
458

RuntimeError: Given groups=1, weight of size [32, 1, 17, 17], expected input[2, 50, 1, 200] to have 1 channels, but got 50 channels instead

data_name = '/content/drive/MyDrive/Colab Notebooks/research_data/synthetic_1D/sgy_cvs_panel/cvspanel0002.sgy'
gth = segyio.open(data_name, ignore_geometry=True)
data = gth.trace.raw[:]
data = torch.tensor(data[:, :200])
data = data.unsqueeze(0)

print(data)
print(data.shape)

tensor([[[0.0000, 0.0000, 1.0000, …, 0.0184, 0.0348, 0.0492],
[0.0000, 0.0000, 1.0000, …, 0.0442, 0.0363, 0.0250],
[0.0000, 0.0000, 1.0000, …, 0.0564, 0.0388, 0.0295],
…,
[1.0000, 0.9606, 0.8394, …, 0.0093, 0.0152, 0.0153],
[1.0000, 0.9524, 0.8419, …, 0.0091, 0.0151, 0.0160],
[1.0000, 0.9305, 0.8363, …, 0.0093, 0.0146, 0.0157]]])
torch.Size([1, 50, 200])

data_name = '/content/drive/MyDrive/Colab Notebooks/research_data/synthetic_1D/sgy_cvs_panel/cvspanel0002.sgy'
gth = segyio.open(data_name, ignore_geometry=True)
data = gth.trace.raw[:]
data = torch.tensor(data[:, :200])
data = data.unsqueeze(1)

print(data)
print(data.shape)

tensor([[[0.0000, 0.0000, 1.0000, …, 0.0184, 0.0348, 0.0492]],

    [[0.0000, 0.0000, 1.0000,  ..., 0.0442, 0.0363, 0.0250]],

    [[0.0000, 0.0000, 1.0000,  ..., 0.0564, 0.0388, 0.0295]],

    ...,

    [[1.0000, 0.9606, 0.8394,  ..., 0.0093, 0.0152, 0.0153]],

    [[1.0000, 0.9524, 0.8419,  ..., 0.0091, 0.0151, 0.0160]],

    [[1.0000, 0.9305, 0.8363,  ..., 0.0093, 0.0146, 0.0157]]])

torch.Size([50, 1, 200])

How can I solve the batch size problem without causing this error?

I was referring to this line of code. While it may give the same size for a batch size of 1, it will make a batch of size n have shape ([n, 1, 50, 200]).
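A standalone sketch of how that shape comes about when the channel dim is added once per sample in __getitem__ (the stacking here mimics what the DataLoader's default collate does):

import torch

sample = torch.randn(50, 200)        # one panel as loaded from a .sgy file
sample = sample.unsqueeze(0)         # [1, 50, 200] = [channels, H, W]

batch = torch.stack([sample] * 3)    # roughly what the DataLoader does for batch_size=3
print(batch.shape)                   # torch.Size([3, 1, 50, 200]) -> ready for Conv2d, no extra unsqueeze needed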

Thank you for telling me where it went wrong, @J_Johnson!

I appreciate your valuable advice; now I know what went wrong. Although directly changing it to "pred = model(data.unsqueeze(0))" in "train_loop" or "test_loop" causes the following error,

RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [1, 1, 1, 50, 200]

I learned how to approach the problem. Let me try it, and I'll ask for help again if needed.
Thank you so much again!

Thanks a lot! I solved the problem of changing the batch size. As you said, I needed to make the tensor sizes adapt to my batch size. Here is how I solved it, by adjusting .unsqueeze() and .repeat() in "CNN(nn.Module)", "train_loop", and "test_loop".

class cvspanel_dataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.dv_label = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.dv_label)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        data_name = os.path.join(self.root_dir,
                                 self.dv_label.iloc[idx, 0])
        gth = segyio.open(data_name, ignore_geometry=True)
        data = gth.trace.raw[:]
        data = torch.tensor(data[:, :200])
        #data = data.unsqueeze(0)
        arr = self.dv_label.iloc[idx, 1:]
        arr = np.asarray(arr)
        label = arr.astype('float').reshape(-1, 2)
        label = np.nan_to_num(label)
        label = torch.tensor(label)
        label = label.view([-1, 1])
        #label = label.squeeze()

        if self.transform:
            data = self.transform(data)
        if self.transform:
            label = self.transform(label)

        return data, label
def train_loop(train_dataloader, model, loss_fn, optimizer):
    size = len(train_dataloader.dataset)
    model.train()
    for batch, (data, label) in enumerate(train_dataloader):
        # Compute prediction and loss
        #pred = model(data)
        pred = model(data.unsqueeze(1).repeat(batch_size, 1, 1, 1))
        loss = loss_fn(pred, label.squeeze().repeat(batch_size, 1))

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch_size + 1) * len(data)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(test_dataloader, model, loss_fn):
    model.eval()
    size = len(test_dataloader.dataset)
    num_batches = len(test_dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for data, label in test_dataloader:
            #pred = model(data)
            pred = model(data.unsqueeze(1).repeat(batch_size, 1, 1, 1))
            test_loss += loss_fn(pred, label.squeeze().repeat(batch_size, 1)).item()
            correct += (pred.argmax(1) == label).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")