@ptrblck Thanks again. I really appreciate your efforts.
# NOTE: train=True selects the training split; pass train=False to load the
# test split instead.
test_dataset = datasets.MNIST('MNIST', download=True, transform=transform, train=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=cfg.batch_size, shuffle=False)
The test loader will yield 10,000 images, since the MNIST test split has 10,000 images. If we want to take 2,000 randomly selected images instead, how can we do that? Thanks in advance @ptrblck
You could inherit from `datasets.MNIST` and override the `__init__()` method.
We can subsample the dataset and just hold 2000 random samples in it.
After each epoch you can shuffle the dataset again by calling `test_loader.dataset.shuffle_data()`.
Note that you could use a more sophisticated method than `np.random.shuffle`, e.g. scikit-learn's `train_test_split` for stratification.
class MyMNISTDataset(datasets.MNIST):
    """MNIST variant that exposes a random subsample of the selected split.

    The complete split is kept in ``*_data_all`` / ``*_labels_all`` and the
    currently active subset is written to ``train_data`` / ``train_labels``
    (or ``test_data`` / ``test_labels``). The parent ``__init__`` is not
    called; this class relies on the parent's ``download()``,
    ``_check_exists()``, ``processed_folder``, ``training_file`` and
    ``test_file``.

    Args:
        root: Dataset root directory (``~`` is expanded).
        train: Use the training file if true, otherwise the test file.
        transform: Stored on the instance as ``self.transform``.
        target_transform: Stored on the instance as ``self.target_transform``.
        download: Call ``self.download()`` before loading.
        nb_samples: Size of the random subset. ``None`` exposes the whole
            split in its stored order.

    Raises:
        RuntimeError: If the processed dataset files are not found.
    """

    def __init__(self, root, train=True, transform=None, target_transform=None,
                 download=False, nb_samples=None):
        self.root = os.path.expanduser(root)
        self.transform = transform
        self.target_transform = target_transform
        self.train = train  # training set or test set
        self.nb_samples = nb_samples

        if download:
            self.download()

        if not self._check_exists():
            raise RuntimeError('Dataset not found.' +
                               ' You can use download=True to download it')

        if self.train:
            self.train_data_all, self.train_labels_all = torch.load(
                os.path.join(self.root, self.processed_folder, self.training_file))
        else:
            self.test_data_all, self.test_labels_all = torch.load(
                os.path.join(self.root, self.processed_folder, self.test_file))

        if self.nb_samples is None:
            # Without a subset size the active data must still be set,
            # otherwise the dataset has nothing to serve.
            self._set_active(*self._all_samples())
        else:
            self.shuffle_data()

    def _all_samples(self):
        """Return ``(data, labels)`` of the complete selected split."""
        if self.train:
            return self.train_data_all, self.train_labels_all
        return self.test_data_all, self.test_labels_all

    def _set_active(self, data, labels):
        """Publish ``data`` / ``labels`` as the currently active samples."""
        if self.train:
            self.train_data, self.train_labels = data, labels
        else:
            self.test_data, self.test_labels = data, labels

    def shuffle_data(self):
        """Draw a new random subset of at most ``nb_samples`` samples.

        With ``nb_samples`` set to ``None`` the whole split is kept, in a new
        random order.
        """
        data_all, labels_all = self._all_samples()
        # np.random.permutation returns a new shuffled index array. Shuffling
        # range(...) in place only works where range() is a list (Python 2).
        picked = np.random.permutation(len(data_all))[:self.nb_samples].tolist()
        self._set_active(data_all[picked], labels_all[picked])
trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))])
test_dataset = MyMNISTDataset('MNIST', train=False, transform=trans, nb_samples=2000)
test_loader = DataLoader(test_dataset)

for epoch in range(2):
    for batch_idx, data in enumerate(test_loader):
        # print() with one argument behaves the same on Python 2 and 3.
        print(batch_idx)
    # Draw a fresh random subset before the next epoch.
    test_loader.dataset.shuffle_data()
Hey Patrick,
Sorry for digging up the grave, but I tried your solution and I'm getting a "file limit exceeded" error. I tried iterating through each of the dataloaders independently and didn't get such an error, so it seems to be a problem with the zipping. In fact, the error occurs on the very first iteration over the dataloaders, when barely any files should be open.
Would be awesome if you could take a look for me.
I'm on Python 2.7 and PyTorch 0.3.0.
This is the code:
class MatchDataset(data.Dataset):
    # Dataset skeleton as posted; each `...` stands for code the author left out.
    def __init__(self, opt, data_file, loader=default_loader):
        ...
    def __len__(self):
        # Length of self.data (presumably populated by the elided __init__ body).
        return len(self.data)
    def __getitem__(self, index):
        # Looks up the raw entry; the processing and the returned value are elided.
        sample = self.data[index]
        ...
        return ...
class UnmatchDataset(MatchDataset):
    """MatchDataset that reports an artificial length of ``fake_len``.

    Indices below the real length return that sample; indices at or above it
    return a randomly chosen real sample.

    Args:
        opt: Forwarded to ``MatchDataset.__init__``.
        data_file: Forwarded to ``MatchDataset.__init__``.
        fake_len: Value reported by ``len()``.
        loader: Forwarded to ``MatchDataset.__init__``.
    """

    def __init__(self, opt, data_file, fake_len, loader=default_loader):
        # Forward `loader`: it was previously dropped, so a custom loader was
        # silently replaced by the parent's default.
        super(UnmatchDataset, self).__init__(opt, data_file, loader=loader)
        self.real_len = len(self.data)
        self.fake_len = fake_len

    def __len__(self):
        return self.fake_len

    def __getitem__(self, index):
        if index >= self.real_len:
            # NOTE(review): the `real_len - 1` bound assumes randint has an
            # inclusive upper limit (as random.randint does) - confirm.
            index = randint(0, self.real_len - 1)
        return super(UnmatchDataset, self).__getitem__(index)
# On Python 2 the built-in zip() returns a list, so it drains both loaders
# completely before the first loop iteration and keeps every batch alive.
# That is the likely cause of "Too many open files" when the batches arrive
# from worker processes. izip pairs the batches lazily instead.
try:
    from itertools import izip as lazy_zip  # Python 2
except ImportError:
    lazy_zip = zip  # Python 3: zip is already lazy

for i, (match_batch, unmatch_batch) in enumerate(lazy_zip(self.match_loader, self.unmatch_loader)):
    match_batch = [util.to_var(x) for x in match_batch]
    unmatch_batch = [util.to_var(x) for x in unmatch_batch]
    ...
Below is the error log I'm getting:
0: File "qgen.py", line 194, in train
0: for i, (match_batch, unmatch_batch) in enumerate(zip(self.match_loader, self.unmatch_loader)):
0: File "/ais/gobi5/shenkev/anaconda2/envs/py27/lib/python2.7/site-packages/torch/utils/data/dataloader.py", line 204, in __next__
0: idx, batch = self.data_queue.get()
0: File "/ais/gobi5/shenkev/anaconda2/envs/py27/lib/python2.7/multiprocessing/queues.py", line 376, in get
0: return recv()
0: File "/ais/gobi5/shenkev/anaconda2/envs/py27/lib/python2.7/site-packages/torch/multiprocessing/queue.py", line 22, in recv
0: return pickle.loads(buf)
0: File "/ais/gobi5/shenkev/anaconda2/envs/py27/lib/python2.7/pickle.py", line 1388, in loads
0: return Unpickler(file).load()
0: File "/ais/gobi5/shenkev/anaconda2/envs/py27/lib/python2.7/pickle.py", line 864, in load
0: dispatch[key](self)
0: File "/ais/gobi5/shenkev/anaconda2/envs/py27/lib/python2.7/pickle.py", line 1139, in load_reduce
0: value = func(*args)
0: File "/ais/gobi5/shenkev/anaconda2/envs/py27/lib/python2.7/site-packages/torch/multiprocessing/reductions.py", line 68, in rebuild_storage_fd
0: fd = multiprocessing.reduction.rebuild_handle(df)
0: File "/ais/gobi5/shenkev/anaconda2/envs/py27/lib/python2.7/multiprocessing/reduction.py", line 155, in rebuild_handle
0: conn = Client(address, authkey=current_process().authkey)
0: File "/ais/gobi5/shenkev/anaconda2/envs/py27/lib/python2.7/multiprocessing/connection.py", line 169, in Client
0: c = SocketClient(address)
0: File "/ais/gobi5/shenkev/anaconda2/envs/py27/lib/python2.7/multiprocessing/connection.py", line 320, in SocketClient
0: fd = duplicate(s.fileno())
0: OSError: [Errno 24] Too many open files
Could you check your ulimit
and increase it?
Are you closing the files properly after loading the data?
I have two training datasets of different sizes. I am trying to train on both datasets simultaneously to compute the loss. How can I do it?
If you want to loop over both datasets simultaneously, you could use `zip`.
Note that the loop will stop after all batches from the shorter dataset were returned:
# Two fake datasets of different size: 100 samples vs. 50 samples.
dataset1 = datasets.FakeData(transform=transforms.ToTensor(), size=100)
dataset2 = datasets.FakeData(transform=transforms.ToTensor(), size=50)
loader1, loader2 = (
    DataLoader(ds, batch_size=10) for ds in (dataset1, dataset2)
)

# zip() ends together with the shorter loader, so only as many pairs as the
# shorter loader has batches are produced.
paired_batches = zip(loader1, loader2)
for idx, (data1, data2) in enumerate(paired_batches):
    print(idx)
Hi ptrblck,
Is there any other option, so that I can feed batches from each dataset alternately (in a single pass) and then add a loss for both batches (from batch1 and batch2)?
Thank you for the help.
Inside the loop you would be able to feed both batches separately to your model and add the loss at the end.
Could you post some pseudo code, if that's not your use case?
Hey, @ptrblck I am trying to use two datasets as following, where b1 are the batches from dataset1 and b2 are the batches from dataset2.
for each epoch:
for each batch of total (b1 + b2) batches from dataset1 and dataset2:
forward pass;
if current batch from dataset1:
loss1 = criterion();
loss1.backward()
else
loss2 = criterion()
(loss2 * some_constant).backward()
end
end
end
Hence, I want to ensure that, within each batch, images are coming from the same dataset.
Thank you.
The easiest way would be to just iterate both datasets sequentially.
As this might have some shortcomings (e.g. bad shuffling), you could wrap both `Datasets` in another one and return a dict containing the data from each subset.
Here is a small example:
class MyDataset(Dataset):
    """Pairs a data container with a target container, indexed in lockstep."""

    def __init__(self, data, target):
        self.data, self.target = data, target

    def __len__(self):
        """Report the number of entries in ``data``."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(data, target)`` pair stored at ``index``."""
        return self.data[index], self.target[index]
class MyTotalDataset(Dataset):
    """Pairs two datasets: item ``i`` bundles sample ``i`` of each of them."""

    def __init__(self, dataset1, dataset2):
        self.dataset1, self.dataset2 = dataset1, dataset2

    def __len__(self):
        # Bounded by the shorter dataset so every index is valid for both.
        sizes = (len(self.dataset1), len(self.dataset2))
        return min(sizes)

    def __getitem__(self, index):
        """Return a dict holding the ``(x, y)`` pair of each dataset."""
        x1, y1 = self.dataset1[index]
        x2, y2 = self.dataset2[index]
        return {'data1': (x1, y1), 'data2': (x2, y2)}
# dataset1 holds only zeros and dataset2 only ones, so the origin of every
# returned tensor can be verified by its values.
zero_images = torch.zeros(10, 3, 24, 24)
zero_targets = torch.randint(0, 10, (10,))
dataset1 = MyDataset(data=zero_images, target=zero_targets)

one_images = torch.ones(10, 3, 24, 24)
one_targets = torch.randint(0, 10, (10,))
dataset2 = MyDataset(data=one_images, target=one_targets)

dataset = MyTotalDataset(dataset1, dataset2)

batch_size = 16
# Every batch carries batch_size // 2 samples from each of the two datasets.
loader = DataLoader(dataset, batch_size=batch_size // 2)

for batch in loader:
    images1 = batch['data1'][0]
    images2 = batch['data2'][0]
    print(images1.shape)
    print(images2.shape)
    # data1 should contain all zeros
    assert (images1 == 0).all()
    # data2 should contain all ones
    assert (images2 == 1).all()
In the current approach, the length of the smaller dataset will define the overall length.
Which strategy would you like to apply if the lengths differ?
I should be able to apply all batches from each dataset in every epoch.
Here is an example of my approach, but it is giving me a memory error (each dataset contains ~150k images, with batch size = 250).
# The loaders were originally bound to the misspelled names `loder1` and
# `loder2`, so the loops below failed with a NameError on `loader1`.
loader1 = DataLoader(dataset1)
loader2 = DataLoader(dataset2)

# Every batch of both loaders is collected in this list, so all of them are
# held in memory at the same time.
x = list(loader1)
d1_batches = len(x)  # positions below this value came from loader1
x.extend(loader2)

index = np.arange(len(x))
np.random.shuffle(index)
for position in index:
    image = x[position]
    if position < d1_batches:
        print("d1: ", image)
    else:
        print("d2: ", image)
In that case, this code might work better:
class MyTotalDataset(Dataset):
    """Concatenates two datasets and reports which one each sample came from."""

    def __init__(self, dataset1, dataset2):
        self.dataset1, self.dataset2 = dataset1, dataset2
        self.length1, self.length2 = len(self.dataset1), len(self.dataset2)

    def __len__(self):
        return self.length1 + self.length2

    def __getitem__(self, index):
        """Return ``(x, y, origin)`` where ``origin`` is 1 or 2."""
        if index < self.length1:
            source, origin = self.dataset1, 1
        else:
            # Indices past the first dataset are shifted into the second one.
            index -= self.length1
            source, origin = self.dataset2, 2
        x, y = source[index]
        return x, y, origin
# dataset1 holds only zeros and dataset2 only ones, so the origin of every
# sample can be verified by its values.
dataset1 = MyDataset(
    data=torch.zeros(10, 3, 24, 24),
    target=torch.randint(0, 10, (10,))
)
dataset2 = MyDataset(
    data=torch.ones(10, 3, 24, 24),
    target=torch.randint(0, 10, (10,))
)
dataset = MyTotalDataset(dataset1, dataset2)

batch_size = 16
loader = DataLoader(
    dataset,
    # Use the variable defined above; it was previously ignored in favour of
    # a second hard-coded 16.
    batch_size=batch_size
)

for x, y, origin in loader:
    # Split the mixed batch by the dataset each sample came from.
    x1 = x[origin == 1]
    y1 = y[origin == 1]
    x2 = x[origin == 2]
    y2 = y[origin == 2]
    # Sanity check of the slicing: dataset1 is all zeros, dataset2 all ones.
    assert (x1 == 0).all()
    assert (x2 == 1).all()
Hey, @ptrblck,
I didn't get the last two lines. Does this code give the same batch size (16) in each iteration for both datasets (I mean batch_size_d1=16 and batch_size_d2=16)? I need an answer for this with different dataset sizes. Train two Dataset together in Capsule Network
Thanks!
The last two lines are just a check to make sure we are slicing the returned batch correctly, so that `x1` comes from `dataset1` while `x2` comes from `dataset2`.
You will get a batch of `batch_size` samples, which might come from both datasets, e.g. for `batch_size=16` you might get 10 samples of `dataset1` and 6 from `dataset2`.
Hi @ptrblck, I was trying to iterate over 2 dataloaders together for test-time augmentation (couldn't find a better way to handle this). The code gets stuck on the zip statement and the loop body is never executed in my case. I am using Python 2.7. Here is my code. Also, is there a better way to handle test-time augmentation in PyTorch? The dataloader works when used without zip.
def test_time_augmentation(model, synced_dls, metric):
    """Score a model on two data loaders that are iterated in lockstep.

    Args:
        model: Callable invoked as ``model(batch.copy())``; it must return six
            values, of which the first two are scored.
        synced_dls: Sequence whose first two entries are the loaders to pair.
            Each batch must support ``.copy()`` and provide ``'gt_0'`` and
            ``'gt_1'`` entries with a ``.to()`` method.
        metric: Callable invoked as ``metric(output, ground_truth, 224)``.

    Returns:
        List with one score per evaluated batch (two per loader pair), each
        the equally weighted mean of the two output metrics.

    Note:
        Reads the module-level name ``device``.
    """
    # On Python 2 the built-in zip() returns a list, so it would run through
    # both loaders completely before the first iteration, which makes the
    # loop look stuck. izip pairs the batches lazily instead.
    try:
        from itertools import izip as lazy_zip  # Python 2
    except ImportError:
        lazy_zip = zip  # Python 3: zip is already lazy

    avg_metric = []
    print('Running TTA')
    print(len(synced_dls[0]), len(synced_dls[1]))
    with torch.no_grad():
        for i, (data_0, data_1) in enumerate(lazy_zip(synced_dls[0], synced_dls[1])):
            print(i)
            for data in (data_0, data_1):
                out_0, out_1, _, _, _, _ = model(data.copy())
                score_0 = metric(out_0, data['gt_0'].to(device), 224)
                score_1 = metric(out_1, data['gt_1'].to(device), 224)
                avg_metric.append(score_0 * 0.5 + score_1 * 0.5)
            # NOTE(review): nesting reconstructed from an unindented paste;
            # this is assumed to be a running mean printed once per pair.
            print(sum(avg_metric) / len(avg_metric))
    return avg_metric
df = pd.read_csv('valid_correct.csv').iloc[:, 1:-1]
model = nn.DataParallel(CoreModel(freeze_resnet=False)).to(device).eval()
# NOTE(review): strict=False does not report missing or unexpected keys. If
# the checkpoint was saved without the DataParallel wrapper, its keys may not
# match and nothing would be loaded without any error - verify.
model.load_state_dict(torch.load('4rend_if_model.pth'), strict=False)

# The same seed is set before building each dataset.
torch.manual_seed(0)
val_orig = DataPointMatching(df, p=0)
val_orig_dl = DataLoader(val_orig, batch_size=10, shuffle=False)
torch.manual_seed(0)
val_tfm = DataPointMatching(df, p=1)
val_tfm_dl = DataLoader(val_tfm, batch_size=10, shuffle=False)

metric = iou_from_polys
# Pass the `metric` variable defined above instead of repeating the function name.
avg_metrics = test_time_augmentation(model, [val_orig_dl, val_tfm_dl], metric)
print('FINAL IOU')
if avg_metrics:
    print(sum(avg_metrics) / len(avg_metrics))
else:
    # An empty loader yields no scores; avoid dividing by zero.
    print('no batches were evaluated')
What are the lengths of both DataLoaders
?
Could it be, that one might be empty?