Kfold.split() got "too many values to unpack" error?

I am trying to implement k-fold cross-validation on the MNIST dataset. My idea is to split the data into k_folds=5 folds and train for max_epoch=10 epochs, so that over the 10 epochs each 4+1 split of the folds is used as training and validation data twice. Here is my implementation:

import torch
import torchvision
from visdom import Visdom
from sklearn.model_selection import KFold

# Hyper-parameters. One fold is consumed per epoch and the folds are reused
# cyclically, so with k_folds = maxepoch / 2 every fold is visited twice.
learning_rate = 1e-2
maxepoch = 10
k_folds = maxepoch // 2
batch_size = 200
device = torch.device('cuda:0')


def _mnist_transform():
    """Build a fresh ToTensor + Normalize pipeline for one MNIST split."""
    # NOTE(review): 0.1307 / 0.3081 are presumably the MNIST pixel mean and
    # standard deviation -- confirm before reusing them for other data.
    steps = [
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.1307,), (0.3081,)),
    ]
    return torchvision.transforms.Compose(steps)


# Training split (downloaded into ../data when missing) and test split.
train_data = torchvision.datasets.MNIST(
    '../data', train=True, download=True, transform=_mnist_transform())

test_data = torchvision.datasets.MNIST(
    '../data', train=False, transform=_mnist_transform())

class myMLP(torch.nn.Module):
    """Fully connected classifier: 784 -> 200 -> 200 -> 10.

    The two hidden layers are each followed by an in-place LeakyReLU; the
    final linear layer returns raw, un-normalised scores (logits).
    """

    def __init__(self):
        super().__init__()

        hidden_units = 200
        # Layers are created in forward order so the sub-module indices
        # (model.0, model.2, model.4) stay the same.
        stack = [
            torch.nn.Linear(784, hidden_units),
            torch.nn.LeakyReLU(inplace=True),
            torch.nn.Linear(hidden_units, hidden_units),
            torch.nn.LeakyReLU(inplace=True),
            torch.nn.Linear(hidden_units, 10),
        ]
        self.model = torch.nn.Sequential(*stack)

    def forward(self, x):
        """Return the 10 class scores for a batch of flattened inputs."""
        return self.model(x)

# Instantiate the network and move its parameters to the selected device.
myNet=myMLP().to(device)

# Plain SGD over all network parameters; cross-entropy computed on the logits.
optimizer=torch.optim.SGD(myNet.parameters(), lr=learning_rate)
loss_function=torch.nn.CrossEntropyLoss().to(device)

# Visdom client and two line plots, each created with a single (0, 0) point
# so that later calls can append to the windows 'train_loss' and 'val'.
viz=Visdom()
viz.line([0.], [0.], win='train_loss', opts=dict(title='Train Loss'))
viz.line([0.], [0.], win='val', opts=dict(title='Validation Accuracy'))
global_step=0  # count of optimisation steps; x-coordinate of the loss plot

kfold = KFold(n_splits=k_folds)
# NOTE(review): the next line is the one that raises "too many values to
# unpack". split() yields one (train_indices, val_indices) pair per fold
# (k_folds = 5 pairs here), so its result cannot be unpacked into two names;
# iterate over it and collect the pairs instead.
train_ids_set, val_ids_set=kfold.split(train_data)

# One pass per epoch: train on the training indices of one fold, then measure
# loss and accuracy on the held-out indices of that same fold.
for epoch in range(maxepoch):

    # Folds are reused cyclically, so each fold is visited maxepoch/k_folds times.
    fold = epoch % k_folds
    train_ids = train_ids_set[fold]
    val_ids = val_ids_set[fold]

    # The samplers restrict each loader to its index subset; loader.dataset is
    # still the full train_data, so sizes must come from the index arrays.
    num_train = len(train_ids)
    num_val = len(val_ids)

    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    val_subsampler = torch.utils.data.SubsetRandomSampler(val_ids)

    train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, sampler=train_subsampler)
    val_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, sampler=val_subsampler)

    for batch_idx, (data, target) in enumerate(train_loader):

        # Flatten each 28x28 image into a 784-vector for the MLP.
        data = data.view(-1, 28*28)
        data, target = data.to(device), target.to(device)
        logits = myNet(data)
        loss = loss_function(logits, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            # Progress is reported against the fold's training size, not the
            # size of the whole dataset.
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch, batch_idx*len(data), num_train, 100*batch_idx/len(train_loader), loss.item()))

        global_step += 1
        viz.line([loss.item()], [global_step], win='train_loss', update='append')

    val_loss = 0.0
    correct = 0
    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for data, target in val_loader:
            data = data.view(-1, 28*28)
            data, target = data.to(device), target.to(device)
            logits = myNet(data)
            # CrossEntropyLoss returns the batch mean; weight it by the batch
            # size so that dividing by num_val gives a per-sample average.
            val_loss += loss_function(logits, target).item() * data.size(0)

            pred = logits.argmax(dim=1)
            # .item() keeps the counter a plain int (no device tensor maths).
            correct += pred.eq(target).sum().item()

            viz.images(data.view(-1, 1, 28, 28).clamp(0, 1), win='pics', opts=dict(title='Handwirtting'))
            viz.text(str(pred), win='pred', opts=dict(title='Predicted'))

    # Normalise by the number of validated samples, not by the full dataset.
    val_loss /= num_val
    val_accuracy = correct / num_val
    print('\nValidation set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(val_loss, correct, num_val, 100*val_accuracy))

    viz.line([val_accuracy], [epoch], win='val', update='append')


# Final evaluation on the held-out MNIST test split. No sampler is used here,
# so len(test_loader.dataset) is the true number of evaluated samples.
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=True)
num_test = len(test_loader.dataset)
test_loss = 0.0
correct = 0
# Evaluation only: disable autograd bookkeeping.
with torch.no_grad():
    for data, target in test_loader:
        data = data.view(-1, 28*28)
        data, target = data.to(device), target.to(device)
        logits = myNet(data)
        # The loss is a batch mean; weight by batch size so the final division
        # by num_test yields a per-sample average.
        test_loss += loss_function(logits, target).item() * data.size(0)

        pred = logits.argmax(dim=1)
        # Accumulate as a plain int so formatting and division behave normally.
        correct += pred.eq(target).sum().item()

        viz.images(data.view(-1, 1, 28, 28).clamp(0, 1), win='pics', opts=dict(title='Handwirtting'))
        viz.text(str(pred), win='pred', opts=dict(title='Predicted'))

test_loss /= num_test
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(test_loss, correct, num_test, 100*correct/num_test))

I got an error message, ‘too many values to unpack’. What should I do? Thanks.

I think .split returns a generator and is thus supposed to be used in a loop as:

# split() is consumed in a loop: each iteration yields the training indices
# and the validation indices of one fold.
for train_ids_set, val_ids_set in kfold.split(train_data):
    print(train_ids_set)
    print(val_ids_set)

Yeah, thanks, now I changed the code to:

import torch
import torchvision
from visdom import Visdom
from sklearn.model_selection import KFold

# Hyper-parameters. One fold is consumed per epoch and the folds are reused
# cyclically, so with k_folds = maxepoch / 2 every fold is visited twice.
maxepoch = 10
k_folds = maxepoch // 2
batch_size = 200
learning_rate = 1e-2
device = torch.device('cuda:0')


def _mnist_transform():
    """Build a fresh ToTensor + Normalize pipeline for one MNIST split."""
    # NOTE(review): 0.1307 / 0.3081 are presumably the MNIST pixel mean and
    # standard deviation -- confirm before reusing them for other data.
    steps = [
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.1307,), (0.3081,)),
    ]
    return torchvision.transforms.Compose(steps)


# Training split (downloaded into ../data when missing) and test split.
train_data = torchvision.datasets.MNIST(
    '../data', train=True, download=True, transform=_mnist_transform())

test_data = torchvision.datasets.MNIST(
    '../data', train=False, transform=_mnist_transform())

class myMLP(torch.nn.Module):
    """Fully connected classifier: 784 -> 200 -> 200 -> 10.

    The two hidden layers are each followed by an in-place LeakyReLU; the
    final linear layer returns raw, un-normalised scores (logits).
    """

    def __init__(self):
        super().__init__()

        hidden_units = 200
        # Layers are created in forward order so the sub-module indices
        # (model.0, model.2, model.4) stay the same.
        stack = [
            torch.nn.Linear(784, hidden_units),
            torch.nn.LeakyReLU(inplace=True),
            torch.nn.Linear(hidden_units, hidden_units),
            torch.nn.LeakyReLU(inplace=True),
            torch.nn.Linear(hidden_units, 10),
        ]
        self.model = torch.nn.Sequential(*stack)

    def forward(self, x):
        """Return the 10 class scores for a batch of flattened inputs."""
        return self.model(x)

# Network on the selected device, trained with plain SGD against a
# cross-entropy loss on the logits.
myNet = myMLP().to(device)

optimizer = torch.optim.SGD(myNet.parameters(), lr=learning_rate)
loss_function = torch.nn.CrossEntropyLoss().to(device)

# Visdom client and two line plots, each created with a single (0, 0) point
# so that later calls can append to the windows 'train_loss' and 'val'.
viz = Visdom()
viz.line([0.], [0.], win='train_loss', opts={'title': 'Train Loss'})
viz.line([0.], [0.], win='val', opts={'title': 'Validation Accuracy'})
global_step = 0

# Materialise every (train_indices, val_indices) pair once, then separate the
# pairs into two parallel lists that can be indexed by fold number.
kfold = KFold(n_splits=k_folds, shuffle=True)
_fold_pairs = list(kfold.split(train_data))
train_ids_set = [train_part for train_part, _ in _fold_pairs]
val_ids_set = [val_part for _, val_part in _fold_pairs]

# One pass per epoch: train on the training indices of one fold, then measure
# loss and accuracy on the held-out indices of that same fold.
for epoch in range(maxepoch):

    # Folds are reused cyclically, so each fold is visited maxepoch/k_folds times.
    fold = epoch % k_folds
    train_ids = train_ids_set[fold]
    val_ids = val_ids_set[fold]
    print(len(train_ids))
    print(len(val_ids))

    # The samplers restrict each loader to its index subset; loader.dataset is
    # still the full train_data, so sizes must come from the index arrays.
    num_train = len(train_ids)
    num_val = len(val_ids)

    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    val_subsampler = torch.utils.data.SubsetRandomSampler(val_ids)

    train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, sampler=train_subsampler)
    val_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, sampler=val_subsampler)
    # Debug output: these report the size of the full underlying dataset,
    # not the size of the sampled subset.
    print(len(train_loader.dataset))
    print(len(val_loader.dataset))

    for batch_idx, (data, target) in enumerate(train_loader):

        # Flatten each 28x28 image into a 784-vector for the MLP.
        data = data.view(-1, 28*28)
        data, target = data.to(device), target.to(device)
        logits = myNet(data)
        loss = loss_function(logits, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            # Progress is reported against the fold's training size, not the
            # size of the whole dataset.
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch, batch_idx*len(data), num_train, 100*batch_idx/len(train_loader), loss.item()))

        global_step += 1
        viz.line([loss.item()], [global_step], win='train_loss', update='append')

    val_loss = 0.0
    correct = 0
    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for data, target in val_loader:
            data = data.view(-1, 28*28)
            data, target = data.to(device), target.to(device)
            logits = myNet(data)
            # CrossEntropyLoss returns the batch mean; weight it by the batch
            # size so that dividing by num_val gives a per-sample average.
            val_loss += loss_function(logits, target).item() * data.size(0)

            pred = logits.argmax(dim=1)
            # .item() keeps the counter a plain int (no device tensor maths).
            correct += pred.eq(target).sum().item()

            viz.images(data.view(-1, 1, 28, 28).clamp(0, 1), win='pics', opts=dict(title='Handwirtting'))
            viz.text(str(pred), win='pred', opts=dict(title='Predicted'))

    # Normalise by the number of validated samples, not by the full dataset.
    val_loss /= num_val
    val_accuracy = correct / num_val
    print('\nValidation set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(val_loss, correct, num_val, 100*val_accuracy))

    viz.line([val_accuracy], [epoch], win='val', update='append')


# Final evaluation on the held-out MNIST test split. No sampler is used here,
# so len(test_loader.dataset) is the true number of evaluated samples.
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=True)
num_test = len(test_loader.dataset)
test_loss = 0.0
correct = 0
# Evaluation only: disable autograd bookkeeping.
with torch.no_grad():
    for data, target in test_loader:
        data = data.view(-1, 28*28)
        data, target = data.to(device), target.to(device)
        logits = myNet(data)
        # The loss is a batch mean; weight by batch size so the final division
        # by num_test yields a per-sample average.
        test_loss += loss_function(logits, target).item() * data.size(0)

        pred = logits.argmax(dim=1)
        # Accumulate as a plain int so formatting and division behave normally.
        correct += pred.eq(target).sum().item()

        viz.images(data.view(-1, 1, 28, 28).clamp(0, 1), win='pics', opts=dict(title='Handwirtting'))
        viz.text(str(pred), win='pred', opts=dict(title='Predicted'))

test_loss /= num_test
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(test_loss, correct, num_test, 100*correct/num_test))

It looks like train_ids and val_ids get 48000 and 12000 items respectively, but the sampler doesn’t seem to select the corresponding data, because both

# .dataset is the Dataset object that was passed to the DataLoader; the
# sampler does not change it, so both lines print the full dataset size.
print(len(train_loader.dataset))
print(len(val_loader.dataset))

gave 60000 results.
The code runs fine apart from the data selection.
Is anything wrong with these lines?

    # Each sampler is built from the index array of one fold.
    train_subsampler=torch.utils.data.SubsetRandomSampler(train_ids)
    val_subsampler  =torch.utils.data.SubsetRandomSampler(val_ids)

    # Both loaders wrap the same full train_data; the sampler argument is what
    # selects which samples each loader serves when it is iterated.
    train_loader=torch.utils.data.DataLoader(train_data, batch_size=batch_size, sampler=train_subsampler)
    val_loader  =torch.utils.data.DataLoader(train_data, batch_size=batch_size, sampler=  val_subsampler)

I just want train_loader and val_loader to get the right data, corresponding to train_ids and val_ids, in each epoch.
Thanks!

The sampler will be applied through the DataLoader while you are checking the length of the original internal .dataset attribute (which will not be changed).
The len of the DataLoaders should return the expected number of batches.

Thanks, but

# len() of a DataLoader is its number of batches, not its number of samples.
len(train_loader)
len(val_loader)

will give results of 240 and 60 respectively, which is 48000/200 and 12000/200, 200 is the batch_size.

Is there any syntax that gives the length of the sampled dataset directly?

So, you mean my code really used the 48000 in each train_ids to train, and use the other 12000 in val_ids to validate? No problem? but only:

# Length of the Dataset object passed to the loader, unaffected by the sampler.
len(train_loader.dataset)
len(val_loader.dataset)

will give the original dataset length. Why is the syntax designed like this? Thanks.

Yes, the DataLoaders will return the expected number of batches using the passed samplers.

You are accessing an internal attribute of the DataLoader class and are expecting the sampler to have manipulated the data, which is a wrong expectation.
The loader.dataset attribute will return the passed Dataset without any manipulations from the sampler, as the sampling process will only be triggered during the actual data loading by iterating the DataLoader.
Often the data loading is implemented lazily (i.e. each sample is only loaded and transformed in Dataset.__getitem__), so manipulating the samples before the real data loading starts wouldn’t even work.

I see, thanks! :grinning: :grinning: :grinning: :grinning: :grinning: :grinning: :grinning: