Hello everyone,
I’m getting a Bus error (core dumped) , when using the distUtils.DistributedSampler with a larger dataset.
It works fine, once I reduce the data size or don’t use the distUtils.DistributedSampler.
Any thoughts on what may be causing this or how I can fix it ? Thx
My code is following:
Trainer.py
corpus_dataset = CorpusDataset(h5py_path, self.word2Vec, self.args.maxL_input, self.args.maxL_output)
train_sampler = None
if self.args.distributed:
dist.init_process_group(backend=self.args.distBackend, init_method=self.args.distUrl,
world_size=self.args.worldSize, rank=self.args.rank)
train_sampler = distUtils.DistributedSampler(corpus_dataset, self.args.worldSize, self.args.rank)
custom_loader = Data.DataLoader(
dataset=corpus_dataset,
batch_size=self.args.batchSize,
shuffle=(train_sampler is None),
drop_last=(train_sampler is not None),
num_workers=1,
collate_fn=collate_fn,
sampler=train_sampler
)
for epoch in range(self.args.numEpochs):
for posts, p_lens, responses, r_lens, labels in custom_loader:
pass
Dataset and collate_fn
class CorpusDataset(Data.Dataset):
def __init__(self, h5_path, word2Vec, maxL_input, maxL_output):
self.h5f = h5py.File(h5_path, 'r')
self.word2Vec = word2Vec
self.pad_id = word2Vec.word2id(TAG_TOKEN_PAD)
self.input_boundary = maxL_input
self.output_boundary = maxL_output
self.datasize = self.h5f['posts'].shape[0]
# When the variable(times) is greater than 3 , it can not work in my device and getting the getting a Bus error (core dumped) .**
self.len = self.datasize + (self.datasize * times)
def __getitem__(self, index):
question_index = index if index < self.datasize else index % self.datasize
answer_index = index if index < self.datasize else get_random(question_index)
raw_post = self.h5f['posts'][question_index].split()
raw_response = self.h5f['responses'][answer_index].split()
label = 1 if index < self.datasize else label = 0
post = raw_post[:self.input_boundary]
response = raw_response[:self.output_boundary]
post = self.word2Vec.sentence2id(post, True)
response = self.word2Vec.sentence2id(response, True)
return post, response, label
def __len__(self):
return self.len
def collate_fn(batch):
pairs = sorted(batch, key=lambda p: len(p[0]), reverse=True)
inputs_batch, targets_batch, labels, pad_id = zip(*pairs)
pad_id = pad_id[0]
p_lens, posts = count_len_and_add_pad(inputs_batch, pad_id)
r_lens, responses = count_len_and_add_pad(targets_batch, pad_id)
posts = torch.LongTensor(posts)
responses = torch.LongTensor(responses)
labels = torch.FloatTensor(labels).unsqueeze(1)
return posts, p_lens, responses, r_lens, labels