Hello! I have a custom ConstantLengthDataset class:
import torch
from torch.utils.data import IterableDataset


class ConstantLengthDataset(IterableDataset):
    def __init__(
        self,
        tokenizer,
        dataset,
        infinite=False,
        seq_length=8192,
        num_of_sequences=1024,
        chars_per_token=3.6,
    ):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.infinite = infinite
        self.current_size = 0
        # Character budget for the raw-text buffer before tokenization
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        while more_examples:
            buffer, buffer_len = [], 0
            # Fill the buffer with raw text until the character budget is reached
            while True:
                if buffer_len >= self.max_buffer_size:
                    break
                try:
                    buffer.append(next(iterator)["content"])
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    if self.infinite:
                        iterator = iter(self.dataset)
                    else:
                        more_examples = False
                        break
            # Tokenize the buffered texts and join them with the EOS token
            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []
            for tokenized_input in tokenized_inputs:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            # Slice the concatenated token stream into fixed-length chunks
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    self.current_size += 1
                    yield {
                        "input_ids": torch.LongTensor(input_ids),
                        "labels": torch.LongTensor(input_ids),
                    }
I want to train on 1 node with 2 GPUs, seq_length = 8192 tokens, per_gpu_batch_size = 1, and num_workers = 1 in the trainer. But when I launch the training with torchrun, I notice that, even though the dataset yields one fixed-length sequence at a time, the trainer processes more than 1 sequence of 8192 tokens per GPU, and moreover both GPUs receive the same buffer elements from the ConstantLengthDataset. How can I fix this for distributed training? Thank you in advance.
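
For reference, the fix I was considering is to shard the raw examples by rank before they reach ConstantLengthDataset, so each GPU reads a disjoint slice of the stream. This is only my own sketch, not something I have verified: RankShardedIterable is a name I made up, and my_raw_dataset / tokenizer stand in for the objects I already pass to ConstantLengthDataset. Please correct me if this is the wrong approach:

import os

class RankShardedIterable:
    # Hypothetical wrapper: rank r keeps examples r, r + world_size,
    # r + 2 * world_size, ... so the two GPUs read disjoint examples.
    def __init__(self, dataset, rank, world_size):
        self.dataset = dataset
        self.rank = rank
        self.world_size = world_size

    def __iter__(self):
        for idx, example in enumerate(self.dataset):
            if idx % self.world_size == self.rank:
                yield example

# torchrun sets RANK and WORLD_SIZE in each process's environment
rank = int(os.environ.get("RANK", 0))
world_size = int(os.environ.get("WORLD_SIZE", 1))

sharded = RankShardedIterable(my_raw_dataset, rank=rank, world_size=world_size)
train_dataset = ConstantLengthDataset(tokenizer, sharded, seq_length=8192)

I also noticed that the datasets library has datasets.distributed.split_dataset_by_node(dataset, rank, world_size); would that be the preferred way to do this instead of a hand-written wrapper?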