Let's say my dataset has 2 million samples. If I set the batch size to 1, use the 4 GPUs on my machine, and train the model with Accelerate (Hugging Face), does that mean each epoch will have 500,000 steps (2 million / 4), or will each epoch have 2 million steps?
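For reference, this is the arithmetic behind my expectation (a rough sketch with the numbers from my setup; the variable names are just for illustration):

import math

num_samples = 2_000_000   # total samples in my dataset
batch_size = 1            # per-device batch size
num_gpus = 4              # number of processes launched by accelerate

# My assumption: Accelerate shards the batches across processes, so each
# process should see roughly num_samples / (batch_size * num_gpus) steps.
steps_per_epoch = math.ceil(num_samples / (batch_size * num_gpus))
print(steps_per_epoch)    # 500000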
Additionally: I have tried training with the scenario above using the code below.
import torch
from torch.utils.data import DataLoader
from transformers import (
    Pix2StructProcessor,
    Pix2StructForConditionalGeneration,
    Adafactor,
    get_cosine_schedule_with_warmup,
)
from accelerate import Accelerator


def main():
    processor = Pix2StructProcessor.from_pretrained('./deplot_models/deplot_base_model/')
    model = Pix2StructForConditionalGeneration.from_pretrained('./deplot_models/deplot_base_model/').to('cuda')

    # Extend the tokenizer with extra vocabulary and resize the embeddings to match
    with open('data/full_vocab.txt', 'r+') as f:
        full_v = [v.strip('\n') for v in f.readlines()]

    new_t = full_v[50345:]
    processor.tokenizer.add_tokens(new_t)
    model.resize_token_embeddings(len(processor.tokenizer))

    def collator(batch):
        new_batch = {"flattened_patches": [], "attention_mask": []}
        texts = [item["text"] for item in batch]

        text_inputs = processor(text=texts, padding='max_length', return_tensors="pt",
                                add_special_tokens=True, max_length=512)

        new_batch["labels"] = text_inputs.input_ids
        # Mask out padding tokens so they are ignored by the loss
        new_batch["labels"][new_batch["labels"] == processor.tokenizer.pad_token_id] = -100

        for item in batch:
            new_batch["flattened_patches"].append(item["flattened_patches"])
            new_batch["attention_mask"].append(item["attention_mask"])

        new_batch["flattened_patches"] = torch.stack(new_batch["flattened_patches"])
        new_batch["attention_mask"] = torch.stack(new_batch["attention_mask"])

        return new_batch

    # DeplotDataset and EPOCHS are defined elsewhere in my code
    train_dataset = DeplotDataset('./test/images', './test/targets/', processor)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=1, collate_fn=collator)

    optimizer = Adafactor(model.parameters(), scale_parameter=False, relative_step=False, lr=0.01, weight_decay=1e-05)
    # optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=100000,
                                                num_training_steps=2_000_000 * EPOCHS)

    accelerator = Accelerator()
    device = accelerator.device

    model, optimizer, training_dataloader, scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, scheduler
    )
    model.to(device)

    # model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3])
    # model.cuda()

    model.train()

    for epoch in range(EPOCHS):
        print("Epoch:", epoch)
        for idx, batch in enumerate(train_dataloader):
            # if idx % 50 == 0:
            #     print(torch.cuda.max_memory_allocated())
            outputs = model(**batch)
            loss = outputs.loss

            if idx % 2 == 0:
                print(f"Idx: {idx}, Loss: {loss.item()}")

            accelerator.backward(loss)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
For some reason, I have noticed that the step counter (idx) goes above 500,000. Is this supposed to happen?
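One check I could add (a sketch, assuming the dataloader returned by accelerator.prepare is the one that gets iterated; prepared_dataloader is just an illustrative name) is to compare the dataloader length before and after preparation:

accelerator = Accelerator()
print("processes:", accelerator.num_processes)    # expected: 4 when launched on 4 GPUs
print("before prepare:", len(train_dataloader))   # expected: 2,000,000 batches of size 1
prepared_dataloader = accelerator.prepare(train_dataloader)
print("after prepare:", len(prepared_dataloader)) # expected: ~500,000 per process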