Train simultaneously on two datasets

Hi there, I have managed to use two datasets by creating a custom dataset that takes in two root directories:

import glob
import os

import numpy as np
import PIL.Image
from skimage import io
from torch.utils.data import Dataset, DataLoader

class dataset_maker(Dataset):
    def __init__(self, root_dir1, root_dir2, transform=None):
        self.root_dir1 = root_dir1
        self.root_dir2 = root_dir2
        # sort so the two file lists stay aligned pair-wise
        self.filelist1 = sorted(glob.glob(os.path.join(root_dir1, '*.png')))
        self.filelist2 = sorted(glob.glob(os.path.join(root_dir2, '*.png')))
        self.transform = transform

    def __len__(self):
        # the usable length is bounded by the smaller dataset
        return min(len(self.filelist1), len(self.filelist2))

    def __getitem__(self, idx):
        # scale 16-bit images down to the 8-bit range
        sample1 = io.imread(self.filelist1[idx]) / 65535 * 255
        sample2 = io.imread(self.filelist2[idx]) / 65535 * 255
        sample1 = PIL.Image.fromarray(np.uint8(sample1))
        sample2 = PIL.Image.fromarray(np.uint8(sample2))
        if self.transform:
            sample1 = self.transform(sample1)
            sample2 = self.transform(sample2)
        return sample1, sample2

Then make a DataLoader from an instance of that dataset:

combined_dataset = dataset_maker(root_dir1, root_dir2, transform=transform)
dataloader = DataLoader(combined_dataset, batch_size=3, shuffle=True, num_workers=4)

Finally, I fetch the data in the training loop with a single call in the for loop:

for epoch in range(10):
    running_loss = 0.0

    # get one batch from each dataset in a single step
    for batch_num, (hq_batch, lq_batch) in enumerate(dataloader):
        print(batch_num, hq_batch.shape, lq_batch.shape)

The output is shown below:

0 torch.Size([3, 3, 256, 256]) torch.Size([3, 3, 256, 256])
1 torch.Size([3, 3, 256, 256]) torch.Size([3, 3, 256, 256])
2 torch.Size([3, 3, 256, 256]) torch.Size([3, 3, 256, 256])
3 torch.Size([3, 3, 256, 256]) torch.Size([3, 3, 256, 256])
4 torch.Size([3, 3, 256, 256]) torch.Size([3, 3, 256, 256])
5 torch.Size([3, 3, 256, 256]) torch.Size([3, 3, 256, 256])
6 torch.Size([3, 3, 256, 256]) torch.Size([3, 3, 256, 256])
7 torch.Size([3, 3, 256, 256]) torch.Size([3, 3, 256, 256])
8 torch.Size([3, 3, 256, 256]) torch.Size([3, 3, 256, 256])
9 torch.Size([3, 3, 256, 256]) torch.Size([3, 3, 256, 256])
10 torch.Size([3, 3, 256, 256]) torch.Size([3, 3, 256, 256])
11 torch.Size([3, 3, 256, 256]) torch.Size([3, 3, 256, 256])
12 torch.Size([3, 3, 256, 256]) torch.Size([3, 3, 256, 256])
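
For the actual optimization step, a minimal sketch of how the two batches could be used, assuming a hypothetical paired image-to-image model net with an L1 loss and an Adam optimizer (none of these come from the original post):

import torch

criterion = torch.nn.L1Loss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)  # net is a placeholder model

for epoch in range(10):
    running_loss = 0.0
    for batch_num, (hq_batch, lq_batch) in enumerate(dataloader):
        optimizer.zero_grad()
        prediction = net(lq_batch)             # predict the HQ image from the LQ input
        loss = criterion(prediction, hq_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()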

Hope this solves the problem!


How is this combined_dataset to be designed? Is it meant to be an instance of ConcatDataset? I got this error while trying to form the combined dataset using dset.ImageFolder: “TypeError: expected str, bytes or os.PathLike object, not ConcatDataset”
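
For context, ImageFolder expects a directory path, so passing a ConcatDataset into it raises exactly this TypeError. A minimal sketch of the two usual ways to build combined_dataset (the directory names are hypothetical):

import torchvision.datasets as dset
import torchvision.transforms as transforms
from torch.utils.data import ConcatDataset

# option 1: an instance of the custom two-directory dataset from above
combined_dataset = dataset_maker(root_dir1, root_dir2, transform=transforms.ToTensor())

# option 2: concatenate two ImageFolder datasets end to end;
# each ImageFolder receives a directory path, never another dataset
folder_A = dset.ImageFolder(traindir_A, transform=transforms.ToTensor())
folder_B = dset.ImageFolder(traindir_B, transform=transforms.ToTensor())
combined_dataset = ConcatDataset([folder_A, folder_B])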

I’m not sure the for loop is set up correctly, since __getitem__ returns ((data points for A, labels for A), (data points for B, labels for B)).


How do you train your model?
You passed batch_size=3 to the dataloader, but you got two batches of 3 samples each. How do you use them to train the model?

Did you find a solution for this, i.e. when the datasets are not of the same size?
Another question: can we use ConcatDataset for more than two datasets?
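
(For what it’s worth, ConcatDataset accepts a list of any number of datasets; a minimal sketch with three hypothetical datasets:)

from torch.utils.data import ConcatDataset

# ConcatDataset takes a list of datasets of any length
combined = ConcatDataset([dataset_a, dataset_b, dataset_c])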

Hi there,

Suppose I have two training datasets of different sizes and I am trying to train a network on both simultaneously. Can I do it? Also, I need to keep track of which dataset each image comes from, so that I can compute the loss after each iteration with the equation:

where L0 and L1 are the lengths of the datasets and lambda is a balancing constant.

Thank you.

Hi,

I am trying to concatenate datasets in such a way that the combined dataset can also return each sample’s file path.

Hi,

I wrote a simple demo for you. It just uses tensor data; you can modify it to meet your needs.

import torch

class custom_dataset1(torch.utils.data.Dataset):
    def __init__(self):
        super(custom_dataset1, self).__init__()
        self.tensor_data = torch.tensor([1., 2., 3., 4., 5.])

    def __getitem__(self, index):
        # return the sample together with its index
        return self.tensor_data[index], index

    def __len__(self):
        return len(self.tensor_data)

class custom_dataset2(torch.utils.data.Dataset):
    def __init__(self):
        super(custom_dataset2, self).__init__()
        self.tensor_data = torch.tensor([6., 7., 8., 9., 10.])

    def __getitem__(self, index):
        return self.tensor_data[index], index

    def __len__(self):
        return len(self.tensor_data)

dataset1 = custom_dataset1()
dataset2 = custom_dataset2()
concat_dataset = torch.utils.data.ConcatDataset([dataset1, dataset2])
value, index = next(iter(concat_dataset))
print(value, index)  # tensor(1.) 0

You can change the index into a file path and then use the corresponding loss function.
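
For example, a minimal sketch of a dataset that returns the file path instead of the index (root_dir and the *.png pattern are assumptions); with a ConcatDataset of two such datasets, the returned path tells you which source each sample came from, which is enough to pick the corresponding loss:

import glob
import os

import torch
from skimage import io

class path_dataset(torch.utils.data.Dataset):
    def __init__(self, root_dir):
        self.paths = sorted(glob.glob(os.path.join(root_dir, '*.png')))

    def __getitem__(self, index):
        path = self.paths[index]
        image = torch.from_numpy(io.imread(path))
        # the default collate function batches strings into lists,
        # so the path survives the DataLoader unchanged
        return image, path

    def __len__(self):
        return len(self.paths)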

If we want to combine two imbalanced datasets and still draw balanced samples, I think we could use ConcatDataset and pass a WeightedRandomSampler to the DataLoader:

dataset1 = custom_dataset1()
dataset2 = custom_dataset2()
concat_dataset = torch.utils.data.ConcatDataset([dataset1, dataset2])
dataloader = torch.utils.data.DataLoader(concat_dataset, batch_size=bs, sampler=weighted_sampler)
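
The snippet above leaves weighted_sampler undefined; a minimal sketch of one way to build it, weighting every sample inversely to the size of its source dataset (one assumption about what “balanced” should mean here):

# one weight per sample in the concatenated dataset
weights = [1.0 / len(dataset1)] * len(dataset1) + [1.0 / len(dataset2)] * len(dataset2)
weighted_sampler = torch.utils.data.WeightedRandomSampler(
    weights, num_samples=len(concat_dataset), replacement=True)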

I am looking for an answer to this; do you have any idea about it? Thank you for your help.

Thanks a lot. Really helped me with training my CycleGAN network. 🙂

Maybe we can solve this by:

import torch
import torchvision.datasets as datasets

class ConcatDataset(torch.utils.data.Dataset):
    def __init__(self, *datasets):
        self.datasets = datasets

    def __getitem__(self, i):
        # cycle the shorter datasets so every index is valid for all of them
        return tuple(d[i % len(d)] for d in self.datasets)

    def __len__(self):
        return max(len(d) for d in self.datasets)

train_loader = torch.utils.data.DataLoader(
             ConcatDataset(
                 # in practice each ImageFolder needs a transform such as
                 # transforms.ToTensor() so the batches can be collated
                 datasets.ImageFolder(traindir_A),
                 datasets.ImageFolder(traindir_B)
             ),
             batch_size=args.batch_size, shuffle=True,
             num_workers=args.workers, pin_memory=True)

# each element yielded is a tuple with one (input, target) sample-batch per dataset
for i, (input, target) in enumerate(train_loader):
    ...

@GloryDream

Question #1: When I try this, it loops through the shorter dataset in the group. So if dataset A has 100 images and dataset B has 1000, calling ConcatDataset(dataset_A, dataset_B)[100] gives me a tuple filled with (dataset_A[0], dataset_B[100]). Does this make sense when putting this into a loader for training? Won’t I overfit on the smaller dataset?

Question #2: Now we don’t just have (input, target), we have ((input_1, target_1), (input_2, target_2)).

How do I train when the loader gives me a list of lists like this? Do I select randomly from the first list for my input? Or is this where weighted sampling comes in?
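
A minimal sketch of unpacking that nested batch, assuming each inner dataset yields (input, target) tensor pairs and a hypothetical model and criterion:

for i, ((input_1, target_1), (input_2, target_2)) in enumerate(train_loader):
    # each dataset contributes its own batch; one loss per dataset,
    # summed before the optimizer step
    loss = criterion(model(input_1), target_1) + criterion(model(input_2), target_2)
    loss.backward()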


I also have the same question. Please let me know the best way to solve this problem. I don’t think we can use weighted random sampling here; if we can, please let me know how.


Hello, I’m facing a similar problem and none of the solutions above quite fit. I’m running semi-supervised experiments and I’d like each batch to contain, say, n observations from the labelled dataset and, say, m observations from the unlabelled dataset. Each of these goes through a different objective function, but the losses are added together before making an optimization step. So I would really need a loader set up to sample from two different datasets at a time. Does anyone know an ingenious way to do so?
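
One possible approach, sketched under the assumption that the labelled set is the smaller one (labelled_dataset, unlabelled_dataset, model, supervised_loss and consistency_loss are all placeholders): use two DataLoaders with batch sizes n and m and zip them, cycling the labelled loader so the unlabelled data is consumed fully each epoch.

import itertools
from torch.utils.data import DataLoader

labelled_loader = DataLoader(labelled_dataset, batch_size=n, shuffle=True)
unlabelled_loader = DataLoader(unlabelled_dataset, batch_size=m, shuffle=True)

# zip stops at the shorter iterator, so cycle the labelled loader;
# note that itertools.cycle replays the first pass's batch order,
# so recreate the loaders each epoch if fresh shuffling matters
for (x_lab, y_lab), x_unlab in zip(itertools.cycle(labelled_loader), unlabelled_loader):
    loss = supervised_loss(model(x_lab), y_lab) + consistency_loss(model, x_unlab)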

class BalancedConcatDataset(torch.utils.data.Dataset):
    def __init__(self, *datasets):
        self.datasets = datasets
        self.max_len = max(len(d) for d in self.datasets)
        self.min_len = min(len(d) for d in self.datasets)

    def __getitem__(self, i):
        # cycle the shorter datasets via modulo indexing
        return tuple(d[i % len(d)] for d in self.datasets)

    def masks_collate(self, batch):
        # flatten the (image, mask) pairs from every dataset into single batches
        images, masks = [], []
        for item in batch:
            for image, mask in item:
                images.append(image)
                masks.append(mask)
        images = torch.stack(images)
        masks = torch.stack(masks)
        return images, masks

    def __len__(self):
        return self.max_len
The second element of each sample here would be masks or labels.

Hi @apaszke, when I use this function it turns my dataset, which is composed of tensors, into lists. Is there a solution for this?

Any luck on a solution, @MarkovChain? Currently I pass multiple datasets to CycleConcatDataset and then define a dataloader on it with a single batch size. This essentially batches all the datasets together and cycles through the shorter ones until the longest dataset finishes.

In my use case (semi-supervised learning and domain adaptation) I would like to keep the parameter updates as balanced as possible. This cycling method is a bit unfair, as the shorter datasets update the parameters more often.

I think one way to help my particular use case is to somehow use a different batch size for each dataset.

from torch.utils import data

class CycleConcatDataset(data.Dataset):
    '''Dataset wrapping multiple train datasets
    Parameters
    ----------
    *datasets : sequence of torch.utils.data.Dataset
        Datasets to be concatenated and cycled
    '''
    def __init__(self, *datasets):
        self.datasets = datasets

    def __getitem__(self, i):
        result = []
        for dataset in self.datasets:
            cycled_i = i % len(dataset)
            result.append(dataset[cycled_i])

        return tuple(result)

    def __len__(self):
        return max(len(d) for d in self.datasets)
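
A usage sketch with hypothetical dataset names; a single DataLoader over the wrapper then yields one batch per dataset at every step:

concat_dataset = CycleConcatDataset(dataset_a, dataset_b)
loader = data.DataLoader(concat_dataset, batch_size=8, shuffle=True)

for batch_a, batch_b in loader:
    # both batches share the single batch_size; truly different per-dataset
    # batch sizes would need separate loaders instead
    ...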

If you are looking to use multiple dataloaders at the same time, this should work:


class cat_dataloaders():
    """Class to concatenate multiple dataloaders"""

    def __init__(self, dataloaders):
        self.dataloaders = dataloaders

    def __iter__(self):
        # start a fresh iterator for every wrapped dataloader
        self.loader_iter = [iter(data_loader) for data_loader in self.dataloaders]
        return self

    def __next__(self):
        out = []
        for data_iter in self.loader_iter:
            out.append(next(data_iter))  # raises StopIteration when the shortest loader is exhausted
        return tuple(out)

Here is a quick example

class DEBUG_dataset(Dataset):
    def __init__(self, alpha):
        self.d = (torch.arange(20) + 1) * alpha

    def __len__(self):
        return self.d.shape[0]

    def __getitem__(self, index):
        return self.d[index]

train_dl1 = DataLoader(DEBUG_dataset(10), batch_size=4, num_workers=0, shuffle=True)
train_dl2 = DataLoader(DEBUG_dataset(1), batch_size=4, num_workers=0, shuffle=True)
tmp = cat_dataloaders([train_dl1, train_dl2])
for x in tmp:
    print(x)

The output is:

(tensor([140, 160, 130,  90]), tensor([ 5, 10,  8,  9]))
(tensor([120,  30, 170,  70]), tensor([15, 17, 18,  7]))
(tensor([180,  50, 190,  80]), tensor([ 6, 14,  3,  2]))
(tensor([ 10,  40, 150, 100]), tensor([11, 13,  4,  1]))
(tensor([ 60, 200, 110,  20]), tensor([19, 12, 20, 16]))

Bro, thanks for saving my time lol.