Train two Dataset together in Capsule Network

@ptrblck Thanks again. I really appreciate your efforts.

test_dataset = datasets.MNIST('MNIST', download=True, transform=transform, train=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=cfg.batch_size, shuffle=False)

The test loader will iterate over 10,000 images, since the MNIST test split has 10,000 images. If we want to use 2000 randomly selected images instead, how can we do that? Thanks in advance @ptrblck

You could inherit from datasets.MNIST and override the __init__() method.
We can subsample the dataset and just hold 2000 random samples in it.
After each epoch you can shuffle the dataset again by calling test_loader.dataset.shuffle_data().
Note that you could use a more sophisticated method than np.random.shuffle, e.g. scikit-learn's train_test_split for stratification (a stratified sketch follows the example below).

import os
import numpy as np
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms


class MyMNISTDataset(datasets.MNIST):
    # Note: this relies on the older torchvision MNIST attributes
    # (processed_folder, training_file/test_file, train_data/test_data).
    def __init__(self, root, train=True, transform=None, target_transform=None, download=False, nb_samples=None):
        self.root = os.path.expanduser(root)
        self.transform = transform
        self.target_transform = target_transform
        self.train = train  # training set or test set
        self.nb_samples = nb_samples

        if download:
            self.download()

        if not self._check_exists():
            raise RuntimeError('Dataset not found.' +
                               ' You can use download=True to download it')

        if self.train:
            self.train_data_all, self.train_labels_all = torch.load(
                os.path.join(self.root, self.processed_folder, self.training_file))
        else:
            self.test_data_all, self.test_labels_all = torch.load(
                os.path.join(self.root, self.processed_folder, self.test_file))

        if self.nb_samples:
            self.shuffle_data()

    def shuffle_data(self):
        # Draw a fresh random subset of nb_samples indices
        if self.train:
            data_idx = list(range(len(self.train_data_all)))
            np.random.shuffle(data_idx)
            self.train_data = self.train_data_all[data_idx[:self.nb_samples]]
            self.train_labels = self.train_labels_all[data_idx[:self.nb_samples]]
        else:
            data_idx = list(range(len(self.test_data_all)))
            np.random.shuffle(data_idx)
            self.test_data = self.test_data_all[data_idx[:self.nb_samples]]
            self.test_labels = self.test_labels_all[data_idx[:self.nb_samples]]


trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))])
test_dataset = MyMNISTDataset('MNIST', train=False, transform=trans, nb_samples=2000)
test_loader = DataLoader(test_dataset)
    
for epoch in range(2):
    for batch_idx, data in enumerate(test_loader):
        print(batch_idx)

    test_loader.dataset.shuffle_data()
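
As mentioned above, you could also subsample in a stratified way. Here is a minimal sketch (assuming scikit-learn is installed; it reuses the *_all attributes from the class above and is just one possible variant, not a drop-in replacement for shuffle_data):

import numpy as np
from sklearn.model_selection import train_test_split

# Keep 2000 test samples whose class distribution matches the full test set
labels = test_dataset.test_labels_all.numpy()
keep_idx, _ = train_test_split(
    np.arange(len(labels)),
    train_size=2000,
    stratify=labels,
    random_state=0
)
test_dataset.test_data = test_dataset.test_data_all[keep_idx]
test_dataset.test_labels = test_dataset.test_labels_all[keep_idx]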

@ptrblck Thank you so much for your continuous support.

Hey Patrick,

Sorry for digging up the grave, but I tried your solution and I'm getting a "file limit exceeded" error. I tried iterating through each of the dataloaders independently and didn't get such an error, so it seems to be a problem with the zipping. In fact, the error occurs on the very first iteration over the dataloaders, when barely any files should be open.

Would be awesome if you could take a look for me.

I'm on Python 2.7 and PyTorch 0.3.0.

This is the code:

class MatchDataset(data.Dataset):
    def __init__(self, opt, data_file, loader=default_loader):
    ...

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        ...
        return ...
    
class UnmatchDataset(MatchDataset):
    def __init__(self, opt, data_file, fake_len, loader=default_loader):

        super(UnmatchDataset, self).__init__(opt, data_file)

        self.real_len = len(self.data)
        self.fake_len = fake_len

    def __len__(self):
        return self.fake_len

    def __getitem__(self, index):

        if index >= self.real_len:
            i = randint(0, self.real_len-1)
            return super(UnmatchDataset, self).__getitem__(i)
        else:
            return super(UnmatchDataset, self).__getitem__(index)

for i, (match_batch, unmatch_batch) in enumerate(zip(self.match_loader, self.unmatch_loader)):
    match_batch = [util.to_var(x) for x in match_batch]
    unmatch_batch = [util.to_var(x) for x in unmatch_batch]
...

Below is the error log I'm getting:

0:   File "qgen.py", line 194, in train
0:     for i, (match_batch, unmatch_batch) in enumerate(zip(self.match_loader, self.unmatch_loader)):
0:   File "/ais/gobi5/shenkev/anaconda2/envs/py27/lib/python2.7/site-packages/torch/utils/data/dataloader.py", line 204, in __next__
0:     idx, batch = self.data_queue.get()
0:   File "/ais/gobi5/shenkev/anaconda2/envs/py27/lib/python2.7/multiprocessing/queues.py", line 376, in get
0:     return recv()
0:   File "/ais/gobi5/shenkev/anaconda2/envs/py27/lib/python2.7/site-packages/torch/multiprocessing/queue.py", line 22, in recv
0:     return pickle.loads(buf)
0:   File "/ais/gobi5/shenkev/anaconda2/envs/py27/lib/python2.7/pickle.py", line 1388, in loads
0:     return Unpickler(file).load()
0:   File "/ais/gobi5/shenkev/anaconda2/envs/py27/lib/python2.7/pickle.py", line 864, in load
0:     dispatch[key](self)
0:   File "/ais/gobi5/shenkev/anaconda2/envs/py27/lib/python2.7/pickle.py", line 1139, in load_reduce
0:     value = func(*args)
0:   File "/ais/gobi5/shenkev/anaconda2/envs/py27/lib/python2.7/site-packages/torch/multiprocessing/reductions.py", line 68, in rebuild_storage_fd
0:     fd = multiprocessing.reduction.rebuild_handle(df)
0:   File "/ais/gobi5/shenkev/anaconda2/envs/py27/lib/python2.7/multiprocessing/reduction.py", line 155, in rebuild_handle
0:     conn = Client(address, authkey=current_process().authkey)
0:   File "/ais/gobi5/shenkev/anaconda2/envs/py27/lib/python2.7/multiprocessing/connection.py", line 169, in Client
0:     c = SocketClient(address)
0:   File "/ais/gobi5/shenkev/anaconda2/envs/py27/lib/python2.7/multiprocessing/connection.py", line 320, in SocketClient
0:     fd = duplicate(s.fileno())
0: OSError: [Errno 24] Too many open files

Could you check your ulimit and increase it?
Are you closing the files properly after loading the data?
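
If the file limit is indeed the problem, here is a minimal sketch of two common workarounds (assuming a Linux system; this is general advice, not specific to your code): raise the soft open-file limit from Python, or switch PyTorch's tensor sharing strategy to file_system:

import resource
import torch.multiprocessing

# Raise the soft limit on open file descriptors up to the hard limit
# (equivalent to `ulimit -n` in the shell)
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))

# Alternatively, avoid passing file descriptors between worker processes
torch.multiprocessing.set_sharing_strategy('file_system')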

I have two training datasets with different sizes. I am trying to train on both datasets simultaneously and combine their losses. How can I do it?

If you want to loop over both datasets simultaneously, you could use zip.
Note that the loop will stop after all batches from the shorter dataset have been returned (a workaround is sketched after the example):

from torch.utils.data import DataLoader
from torchvision import datasets, transforms

dataset1 = datasets.FakeData(size=100, transform=transforms.ToTensor())
dataset2 = datasets.FakeData(size=50, transform=transforms.ToTensor())

loader1 = DataLoader(dataset1, batch_size=10)
loader2 = DataLoader(dataset2, batch_size=10)

for idx, (data1, data2) in enumerate(zip(loader1, loader2)):
    print(idx)
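
If you instead need to use all batches of the longer loader, one common workaround (not shown above, just a sketch) is to cycle the shorter loader with itertools.cycle:

from itertools import cycle

# loader1 is the longer loader here; loader2 is repeated as needed.
# Note that cycle replays cached batches, so loader2 will not be
# reshuffled within a single pass over loader1.
for idx, (data1, data2) in enumerate(zip(loader1, cycle(loader2))):
    print(idx)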

Hi ptrblck,

Is there any other option, so that I can feed batches from each dataset alternately (in a single pass) and then add the losses for both batches (from batch1 and batch2)?

Thank you for the help.

Inside the loop you would be able to feed both batches separately to your model and add the loss at the end.
Could you post some pseudo code, if that's not your use case?

Hey, @ptrblck I am trying to use two datasets as following, where b1 are the batches from dataset1 and b2 are the batches from dataset2.

for each epoch:
    for each batch of total (b1 + b2) batches from dataset1 and dataset2:
        forward pass;
        if current batch from dataset1:
            loss1 = criterion();
            loss1.backward()
        else:
            loss2 = criterion();
            (loss2 * some_constant).backward()
        end
    end
end
   

Hence, I want to ensure that, within each batch, all images come from the same dataset.

Thank you.

The easiest way would be to just iterate both datasets sequentially.
As this might have some shortcomings (e.g. bad shuffling), you could wrap both Datasets in another one and return a dict containing the data from each subset.
Here is a small example:

import torch
from torch.utils.data import Dataset, DataLoader


class MyDataset(Dataset):
    def __init__(self, data, target):
        self.data = data
        self.target = target
        
    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index]
        
        return x, y
    
    def __len__(self):
        return len(self.data)


class MyTotalDataset(Dataset):
    def __init__(self, dataset1, dataset2):
        self.dataset1 = dataset1
        self.dataset2 = dataset2
        
    def __getitem__(self, index):
        x1, y1 = self.dataset1[index]
        x2, y2 = self.dataset2[index]
        
        ret = {
            'data1': (x1, y1),
            'data2': (x2, y2)
        }
        return ret
    
    def __len__(self):
        return min(len(self.dataset1), len(self.dataset2))


dataset1 = MyDataset(
    data=torch.zeros(10, 3, 24, 24),
    target=torch.randint(0, 10, (10,))
)
dataset2 = MyDataset(
    data=torch.ones(10, 3, 24, 24),
    target=torch.randint(0, 10, (10,))
)

dataset = MyTotalDataset(dataset1, dataset2)
batch_size = 16
loader = DataLoader(
    dataset,
    batch_size=batch_size//2  # each batch holds batch_size//2 samples from each dataset
)

for batch in loader:
    print(batch['data1'][0].shape)
    print(batch['data2'][0].shape)
    
    # data1 should contain all zeros
    assert (batch['data1'][0]==0).all()
    # data2 should contain all ones
    assert (batch['data2'][0]==1).all()

Thank you @ptrblck.

Will it work for datasets with different sizes?

In the current approach, the length of the smaller dataset will define the overall length.
Which strategy would you like to apply if the lengths differ?

I need to be able to use all batches from each dataset in every epoch.

Here is an example of my approach, but it is giving me a memory error (each dataset contains ~150k images with batch size = 250):

loader1 = DataLoader(dataset1, batch_size=250)
loader2 = DataLoader(dataset2, batch_size=250)

x = []

# collect every batch from both loaders in memory, then shuffle their order
for data in loader1:
    x.append(data)
d1_batches = len(x)

for data in loader2:
    x.append(data)

index = np.arange(len(x))
np.random.shuffle(index)

for k in range(len(x)):
    image = x[index[k]]
    if index[k] < d1_batches:
        print("d1: ", image)
    else:
        print("d2: ", image)

In that case, this code might work better:

class MyTotalDataset(Dataset):
    def __init__(self, dataset1, dataset2):
        self.dataset1 = dataset1
        self.dataset2 = dataset2
        self.length1 = len(self.dataset1)
        self.length2 = len(self.dataset2)
        
    def __getitem__(self, index):
        # indices [0, length1) map to dataset1, the rest to dataset2;
        # origin flags which dataset the sample came from
        if index >= self.length1:
            index -= self.length1
            x, y = self.dataset2[index]
            origin = 2
        else:
            x, y = self.dataset1[index]
            origin = 1

        return x, y, origin
    
    def __len__(self):
        return self.length1 + self.length2


dataset1 = MyDataset(
    data=torch.zeros(10, 3, 24, 24),
    target=torch.randint(0, 10, (10,))
)
dataset2 = MyDataset(
    data=torch.ones(10, 3, 24, 24),
    target=torch.randint(0, 10, (10,))
)

dataset = MyTotalDataset(dataset1, dataset2)
batch_size = 16
loader = DataLoader(
    dataset,
    batch_size=batch_size
)

for x, y, origin in loader:
    x1 = x[origin==1]
    y1 = y[origin==1]
    x2 = x[origin==2]
    y2 = y[origin==2]
    
    assert (x1==0).all()
    assert (x2==1).all()

Hey, @ptrblck,

I didn't get the last 2 lines. Is this code giving the same batch size (16) in each iteration for both datasets (I mean batch_size_d1=16 and batch_size_d2=16)? I need an answer for the case where the datasets have different sizes.

Thanks!

The last two lines are just a check to make sure we are slicing the returned batch correctly, so that x1 comes from dataset1 while x2 comes from dataset2.

You will get a batch of batch_size samples, which might come from both datasets, e.g. for batch_size=16 you might get 10 samples of dataset1 and 6 from dataset2.
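
If every batch has to come entirely from a single dataset, a different option is to keep two separate DataLoaders and alternate between their iterators, drawing a full batch from one dataset at a time. This is just a sketch of one possible approach (reusing dataset1 and dataset2 from above):

import random
from torch.utils.data import DataLoader

loader1 = DataLoader(dataset1, batch_size=16, shuffle=True)
loader2 = DataLoader(dataset2, batch_size=16, shuffle=True)

# Pick batches from the loaders in random order until both are exhausted,
# so each batch contains samples from exactly one dataset
iters = [iter(loader1), iter(loader2)]
origins = [1, 2]
while iters:
    i = random.randrange(len(iters))
    try:
        x, y = next(iters[i])
    except StopIteration:
        del iters[i], origins[i]
        continue
    # origins[i] tells you which loss / weighting to apply for this batch
    print(origins[i], x.shape)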


Hi @ptrblck, I was trying to iterate 2 dataloaders together for test time augmentation (I couldn't find a better way to handle this). The code gets stuck on the zip statement and the loop is never entered in my case. I am using Python 2.7. Here is my code below. Also, is there a better way to handle test time augmentation in PyTorch? Each dataloader works fine when iterated on its own without zip.

def test_time_augmentation(model, synced_dls, metric):
    avg_metric = []
    print('Running TTA')
    print(len(synced_dls[0]), len(synced_dls[1]))
    with torch.no_grad():
        for i, (data_0, data_1) in enumerate(zip(synced_dls[0], synced_dls[1])):
            print(i)
            data_all = [data_0, data_1] 
            for data in data_all:
                out_0, out_1, _, _, _, _ = model(data.copy())
                avg_metric.append(metric(out_0, data['gt_0'].to(device), 224) * 0.5 + metric(out_1, data['gt_1'].to(device), 224) * 0.5)
            print(sum(avg_metric) / len(avg_metric))
    return avg_metric

df = pd.read_csv('valid_correct.csv').iloc[:, 1:-1]
model = nn.DataParallel(CoreModel(freeze_resnet=False)).to(device).eval()
model.load_state_dict(torch.load('4rend_if_model.pth'), strict=False)
torch.manual_seed(0)
val_orig = DataPointMatching(df, p=0)
val_orig_dl = DataLoader(val_orig, batch_size=10, shuffle=False)
torch.manual_seed(0)
val_tfm = DataPointMatching(df, p=1)
val_tfm_dl = DataLoader(val_tfm, batch_size=10, shuffle=False)
metric = iou_from_polys
avg_metrics = test_time_augmentation(model, [val_orig_dl, val_tfm_dl], iou_from_polys)
print('FINAL IOU')
print(sum(avg_metrics) / len(avg_metrics))

What are the lengths of both DataLoaders?
Could it be that one of them is empty?