Getting error after loop runs through the first epoch

Hi Im training a model for classifying pneumonia detection images I developed a model that goes throgh the training loop once correctly but as soon as it goes through another epoch it starts to throw this error:

#Import math packages
import numpy as np
import os
import time
import copy
from glob import glob
import random
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from PIL import Image

#import pytorch packages
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from import DataLoader, Dataset
from torch.optim import Adam
from torchvision import datasets, transforms
from torchvision.utils import make_grid
from torchvision.models import resnet18, vgg16
from tqdm import tqdm

test_N = glob('/content/drive/MyDrive/chest_xray/test/NORMAL/*')
test_P = glob('/content/drive/MyDrive/chest_xray/test/PNEUMONIA/*')

train_N = glob('/content/drive/MyDrive/chest_xray/train/NORMAL/*')
train_P = glob('/content/drive/MyDrive/chest_xray/train/PNEUMONIA/*')

train_paths = train_N + train_P
test_paths = test_N + test_P

train_labels = [0] * len(train_N) + [1] * len(train_P)
test_labels = [0] * len(test_N) + [1] * len(test_P)


from google.colab import drive

train_paths, valid_paths, train_labels, valid_labels = train_test_split(train_paths,

def imShowRandom():
    path_random_normal = random.choice(train_N)
    path_random_abnormal = random.choice(train_P)

    fig = plt.figure(figsize=(10,10))

    ax1 = plt.subplot(1,2,1)
    ax1.set_title("Normal X-Ray")

    ax2 = plt.subplot(1,2,2)
    ax2.set_title("Pneumonia X-Ray")


class ModifySet(Dataset):
    def __init__(self,paths,labels, transform=None):
        self.paths = paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, index):
        path = self.paths[index]
        image ="RGB")

        if self.transform:
            image = self.transform(image)

        label1 = self.labels[index]
        label1 = torch.tensor([label1])

        return image, label1

class PneuModel(nn.Module):
    def __init__(self, pretrained = True):
        super(PneuModel, self).__init__()
        self.x = resnet18(pretrained = pretrained)
        self.fc = nn.Linear(in_features=512, out_features = 1)

    def forward(self, x):
        x = self.x.conv1(x)
        x = self.x.bn1(x)
        x = self.x.relu(x)
        x = self.x.maxpool(x)

        x = self.x.layer1(x)
        x = self.x.layer2(x)
        x = self.x.layer3(x)
        x = self.x.layer4(x)

        x = self.x.avgpool(x)


        x = nn.Flatten(1, -1)(x)

        x = self.fc(x)

        return x

image_size = (500,500)

train_transforms = transforms.Compose([
    transforms.Resize(size = image_size),
    transforms.Normalize([0.458,0.456,0.406], [0.229,0.224,0.225]),

test_transforms = transforms.Compose([
    transforms.Resize(size = image_size),
    transforms.Normalize([0.458,0.456,0.406], [0.229,0.224,0.225]),

train_ds = ModifySet(train_paths, train_labels, train_transforms)
val_ds = ModifySet(test_paths, test_paths, test_transforms)

model = PneuModel(pretrained = True)
epochs = 5
b_size = 15

train_dl = DataLoader(train_ds, batch_size=b_size,num_workers=5,shuffle=True)
val_dl = DataLoader(val_ds, batch_size = b_size, num_workers=5, shuffle = False)

dataloaders = {
    'train': train_dl,
    'val': val_dl

logging = {
    'train': len(dataloaders["train"])//10,
    'val': len(dataloaders["val"])//10

size = {
    'train': len(train_ds),
    'val': len(val_ds)

criterion = nn.BCEWithLogitsLoss()
optimizer = Adam(model.parameters(),lr = 3e-3)

def train_model(model, criterion, optimizer, epochs, device="cpu"):

    start = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0

    for epoch in tqdm(range(epochs),leave=False):
        for phase in ["train","val"]:
            if phase == "train":

            running_loss = 0.0
            running_corrects = 0

            for i, (inputs, labels) in tqdm(enumerate(dataloaders[phase]), leave=False, total = len(dataloaders[phase])):
                device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

                inputs =
                labels =


                with torch.set_grad_enabled(phase == "train"):
                    outputs = model(inputs)
                    preds = outputs.sigmoid() > 0.5
                    loss = criterion(outputs, labels.float())

                    if phase == "train":

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds ==

                if(i % logging[phase] ==0) & (i > 0):
                    avg_loss = running_loss/ ((i+1)*16)
                    avg_corrects = running_corrects/ ((i+1)*16)

                    print(f"[{phase}]: {epoch+1}/{epochs} | loss: {avg_loss} | acc: {avg_corrects}")

            epoch_loss = running_loss/size[phase]
            epoch_acc = running_corrects.double()/size[phase]

            print(f'Loss: {epoch_loss} Acc: {epoch_acc}')

            if phase == "val" and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

    time_pass = time.time()-start
    print("Training took {} seconds".format(time_pass))
    return model

model = train_model(model, criterion, optimizer, epochs)

I sent your post to GPT-4, and mind-blowingly, it found multiple issues with your code, as well as offered accurate suggestions. Here’s the response from GPT-4 below.

The error message indicates that a ValueError occurred because of a “too many dimensions ‘str’” issue. This usually happens when you are trying to convert a string to a PyTorch tensor, and it’s likely that the code is trying to interpret a string as a tensor with more dimensions than expected.

From the context of your code, it seems the problem occurs in the __getitem__ method of the ModifySet dataset class, specifically at the line where you are creating a tensor out of labels:

label1 = torch.tensor([label1])

If label1 is somehow a string or an unexpected type instead of an integer or a float, this error could occur. Here are a few potential reasons and solutions for this error:

  1. Data Leakage Between Epochs: Make sure that the data is not being modified in any way during the epoch that could carry over to the next epoch. In particular, ensure that the labels are not being changed to strings during the first pass through the data.

  2. Incorrect Label Handling: Double-check that train_labels and valid_labels lists contain integers and not strings. If the labels are loaded from a file and not cast to integers explicitly, they might be strings.

  3. DataLoader Worker Issue: When using multiple workers in PyTorch DataLoader, sometimes the state of the workers can cause unexpected behaviors. You can try setting num_workers=0 to see if the problem persists, which can help determine if the issue is related to multiprocessing.

  4. Data Transformation Logic: Ensure that your transformations in the dataset class are not converting labels into strings. All transformations should be applied to the images only, not the labels.

  5. Validation Dataset Loader Issue: You have an error in your validation dataset loader where you’re passing test_paths as both the images and the labels. This could be the root of the problem since the second item should be valid_labels.

To fix the validation dataset loader issue, change this line:

val_ds = ModifySet(test_paths, test_paths, test_transforms)


val_ds = ModifySet(valid_paths, valid_labels, test_transforms)

If you continue to face issues, it would be helpful to print out the type and value of label1 right before the tensor conversion to debug what’s happening at that point in the code.

Remember to restart your Python kernel or runtime if you’re using an interactive environment like Google Colab to clear any residual states before running the modified code.

what you said worked for the value error now its getting this error

you can debug it together with GPT-4 :slight_smile:

i dont have gpt 4 i only have the free version