Distributed training with Trainer and ConstantLengthDataset classes

Hello! I have a custom ConstantLengthDataset class:

import torch
from torch.utils.data import IterableDataset


class ConstantLengthDataset(IterableDataset):
    def __init__(
        self,
        tokenizer,
        dataset,
        infinite=False,
        seq_length=8192,
        num_of_sequences=1024,
        chars_per_token=3.6,
    ):
        self.tokenizer = tokenizer
        # Examples are concatenated with the EOS token as a separator.
        self.concat_token_id = tokenizer.eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.infinite = infinite
        self.current_size = 0
        # Rough character budget for one buffer of raw text before tokenization.
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        while more_examples:
            # Fill a text buffer up to max_buffer_size characters.
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.max_buffer_size:
                    break
                try:
                    buffer.append(next(iterator)["content"])
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    if self.infinite:
                        # Restart from the beginning when the dataset is exhausted.
                        iterator = iter(self.dataset)
                    else:
                        more_examples = False
                        break
            # Tokenize the buffer, concatenate everything with EOS separators,
            # and cut the token stream into fixed-length chunks of seq_length tokens.
            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []
            for tokenized_input in tokenized_inputs:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                # Drop the last incomplete chunk so every sample is exactly seq_length long.
                if len(input_ids) == self.seq_length:
                    self.current_size += 1
                    yield {
                        "input_ids": torch.LongTensor(input_ids),
                        "labels": torch.LongTensor(input_ids),
                    }

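This is roughly how I build the dataset and hand it to the Trainer (the model name, dataset name, and output directory below are just placeholders for my real setup):

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Placeholders for my actual model and data.
tokenizer = AutoTokenizer.from_pretrained("my-model")
model = AutoModelForCausalLM.from_pretrained("my-model")
raw_dataset = load_dataset("my-dataset", split="train")

train_dataset = ConstantLengthDataset(
    tokenizer,
    raw_dataset,
    infinite=True,
    seq_length=8192,
)

args = TrainingArguments(
    output_dir="out",
    per_device_train_batch_size=1,
    dataloader_num_workers=1,
    max_steps=1000,
)

trainer = Trainer(model=model, args=args, train_dataset=train_dataset)
trainer.train()

and I launch it with torchrun --nproc_per_node=2 train.py.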
I want to train on one node with 2 GPUs, seq_length = 8192 tokens, and per_device_train_batch_size = 1 in the Trainer. But when I launch the training with torchrun, I see two problems: despite the yield and num_workers = 1, the Trainer consumes more than one 8192-token sequence per GPU, and, worse, the different GPUs receive the same buffer elements from ConstantLengthDataset, so both ranks train on identical data. How can I fix this for distributed training? Thank you in advance.
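Would sharding the raw dataset per rank be the right direction? Below is a rough sketch of what I have in mind (RankShardedDataset is just a name I made up; it relies on the RANK and WORLD_SIZE environment variables that torchrun sets for each process, and I'm not sure how well it interacts with dataloader worker processes):

import os

class RankShardedDataset:
    """Wraps the raw dataset so each GPU process only sees its own slice of examples."""

    def __init__(self, dataset):
        self.dataset = dataset
        # torchrun sets RANK and WORLD_SIZE in the environment of every process it launches.
        self.rank = int(os.environ.get("RANK", 0))
        self.world_size = int(os.environ.get("WORLD_SIZE", 1))

    def __iter__(self):
        # Keep only every world_size-th example, offset by this process's rank,
        # so the two GPUs read disjoint examples instead of the same buffer contents.
        for idx, example in enumerate(self.dataset):
            if idx % self.world_size == self.rank:
                yield example

I would then build the training dataset as ConstantLengthDataset(tokenizer, RankShardedDataset(raw_dataset), infinite=True, seq_length=8192). Is something like this reasonable, or is there a built-in mechanism (if I understand correctly, datasets has a split_dataset_by_node helper) that the Trainer expects me to use instead?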