Batch size and steps in multiple gpus

Lets say I have 2 million datasets. If I set batch-size to 1 and use 4 gpus on my machine and train the model using accelerate (huggingface), does it mean that each epoch will have 500,000 steps (2 million / 4) or will each epoch have 2 million steps?

I have tried to train my code with the above scenario using the code below.

def main():       
    processor = Pix2StructProcessor.from_pretrained('./deplot_models/deplot_base_model/')
    model = Pix2StructForConditionalGeneration.from_pretrained('./deplot_models/deplot_base_model/').to('cuda')
    with open('data/full_vocab.txt', 'r+') as f:
        full_v = [v.strip('\n') for v in f.readlines()]
    new_t = full_v[50345:]

    def collator(batch):
        new_batch = {"flattened_patches":[], "attention_mask":[]}
        texts = [item["text"] for item in batch]

        text_inputs = processor(text=texts, padding='max_length', return_tensors="pt", add_special_tokens=True, max_length=512)

        new_batch["labels"] = text_inputs.input_ids
        new_batch["labels"][new_batch["labels"] == processor.tokenizer.pad_token_id] = -100
        for item in batch:

        new_batch["flattened_patches"] = torch.stack(new_batch["flattened_patches"])
        new_batch["attention_mask"] = torch.stack(new_batch["attention_mask"])

        return new_batch
    train_dataset = DeplotDataset('./test/images', './test/targets/', processor)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=1, collate_fn=collator)
    optimizer = Adafactor(model.parameters(), scale_parameter=False, relative_step=False, lr=0.01, weight_decay=1e-05)
    # optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=100000, num_training_steps=2_000_000*EPOCHS)
    accelerator = Accelerator()
    device = accelerator.device
    model, optimizer, training_dataloader, scheduler = accelerator.prepare(model, optimizer, train_dataloader, scheduler)
    # model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3])
    # model.cuda()


    for epoch in range(EPOCHS):
        print("Epoch:", epoch)
        for idx, batch in enumerate(train_dataloader):
#             if idx % 50 == 0:
#                 print(torch.cuda.max_memory_allocated())
            outputs = model(**batch)

            loss = outputs.loss

            if idx % 2 == 0:
                print(f"Idx: {idx}, Loss: {loss.item()}")


For some reason, I have noticed that the steps (idx) go above 500000. Is this supposed to happen?