Dataloader giving incorrect result

I am trying to return a variable-length array from a Dataset, but the DataLoader is returning incorrect results.

class TS(Dataset):
    """Toy dataset of 10 samples whose 'token' list length equals the index."""

    def __len__(self):
        """Return the fixed number of samples."""
        return 10

    def __getitem__(self, idx):
        """Build the sample for ``idx``.

        The returned dict holds a list of ``idx`` ones under 'token', the
        index itself under 'in', and the constant string "hello" under 'text'.
        """
        sample = {}
        sample['token'] = [1] * idx
        sample['in'] = idx
        sample['text'] = "hello"
        return sample

# Instantiate the toy dataset.
ds = TS()
# Batches of 3 in index order; no collate_fn is passed, so DataLoader's
# default collation is applied to the variable-length 'token' lists.
dl = DataLoader(ds,batch_size=3,shuffle=False)

# Print every batch exactly as the loader yields it.
for dic in dl:
    print(dic)

Output

{'token': [], 'in': tensor([0, 1, 2]), 'text': ['hello', 'hello', 'hello']}
{'token': [tensor([1, 1, 1]), tensor([1, 1, 1]), tensor([1, 1, 1])], 'in': tensor([3, 4, 5]), 'text': ['hello', 'hello', 'hello']}
{'token': [tensor([1, 1, 1]), tensor([1, 1, 1]), tensor([1, 1, 1]), tensor([1, 1, 1]), tensor([1, 1, 1]), tensor([1, 1, 1])], 'in': tensor([6, 7, 8]), 'text': ['hello', 'hello', 'hello']}
{'token': [tensor([1]), tensor([1]), tensor([1]), tensor([1]), tensor([1]), tensor([1]), tensor([1]), tensor([1]), tensor([1])], 'in': tensor([9]), 'text': ['hello']}

If I change the Dataset to:

class TS(Dataset):
    """Toy dataset of 10 samples, each a (list of ones, index) pair."""

    def __len__(self):
        """Return the fixed number of samples."""
        return 10

    def __getitem__(self, idx):
        """Return a tuple of a list holding ``idx`` ones and ``idx`` itself."""
        tokens = [1] * idx
        return tokens, idx

Output:

[[], tensor([0, 1, 2])]
[[tensor([1, 1, 1]), tensor([1, 1, 1]), tensor([1, 1, 1])], tensor([3, 4, 5])]
[[tensor([1, 1, 1]), tensor([1, 1, 1]), tensor([1, 1, 1]), tensor([1, 1, 1]), tensor([1, 1, 1]), tensor([1, 1, 1])], tensor([6, 7, 8])]
[[tensor([1]), tensor([1]), tensor([1]), tensor([1]), tensor([1]), tensor([1]), tensor([1]), tensor([1]), tensor([1])], tensor([9])]

You would need a custom collate function to use variable input shapes:

def my_collate(batch):
    """Collate a batch by returning its samples as a plain list.

    The samples are neither stacked nor merged, so items whose fields have
    differing lengths pass through unchanged.

    Args:
        batch: Iterable of samples, as passed to a DataLoader ``collate_fn``.

    Returns:
        A new list containing the samples in their original order.
    """
    # list() makes the same shallow copy as an identity comprehension.
    return list(batch)

class TS(Dataset):
    """Toy dataset of 10 samples whose 'token' list length equals the index."""

    def __len__(self):
        """Return the fixed number of samples."""
        return 10

    def __getitem__(self, idx):
        """Build the sample for ``idx``.

        The returned dict holds a list of ``idx`` ones under 'token', the
        index itself under 'in', and the constant string "hello" under 'text'.
        """
        sample = {}
        sample['token'] = [1] * idx
        sample['in'] = idx
        sample['text'] = "hello"
        return sample

# Instantiate the toy dataset.
ds = TS()
# Batches of 3 in index order; my_collate replaces the default collation, so
# each batch is the list of sample dicts with their 'token' lists left as-is.
dl = DataLoader(ds,batch_size=3,shuffle=False, collate_fn=my_collate)

# Print every batch exactly as the loader yields it.
for dic in dl:
    print(dic)
1 Like