I checked your example code for using torch.nn.parallel.DistributedDataParallel
to train a model on multiple GPUs on the same host. I've modified it to fine-tune (unsupervised learning) the smallest GPT-2 model, and I have 4 x 8 GB graphics cards.
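For scale: GPT-2 small has ~124M parameters, so in fp32 that is roughly 0.5 GB of weights, 0.5 GB of gradients, and 1 GB for AdamW's two moment buffers, i.e. about 2 GB per DDP replica before activations.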
So I thought 32 GB of total memory should be enough for the smallest GPT-2 model (even the medium one should more or less work?!), but I still get errors like:
...
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [129,0,0], thread: [57,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
...
RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`
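(In case it matters: since CUDA kernels launch asynchronously, I understand the reported stack trace may point at the wrong op; rerunning with

CUDA_LAUNCH_BLOCKING=1 python3 training.py --batch_size 1 1

should surface the actual failing call.)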
The code I use:
import argparse
import os
import torch
import torch.nn.functional as F
import torch.multiprocessing as mp
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config

# Define the training parameters
max_length = 128
learning_rate = 1e-5

class MyTrainDataset(Dataset):
    def __init__(self, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        #self.data = [(torch.rand(20), torch.rand(1)) for _ in range(size)]
        # Read and tokenize the text file
        with open("your_dataset.txt", "r", encoding="utf-8") as file:
            self.text = file.read().replace("\n", " ")

    def __len__(self):
        # one sample per character offset, i.e. overlapping windows over the raw text
        return len(self.text)

    def __getitem__(self, idx):
        inputs = self.tokenizer.encode_plus(
            self.text[idx : idx + self.max_length],
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        return torch.tensor(input_ids), torch.tensor(attention_mask)

def ddp_setup(rank, world_size):
    """
    Args:
        rank: Unique identifier of each process
        world_size: Total number of processes
    """
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

class Trainer:
    def __init__(
        self,
        model: torch.nn.Module,
        train_data: DataLoader,
        optimizer: torch.optim.Optimizer,
        gpu_id: int,
    ) -> None:
        self.gpu_id = gpu_id
        self.model = model.to(gpu_id)
        self.train_data = train_data
        self.optimizer = optimizer
        self.model = DDP(model, device_ids=[gpu_id])

    def _run_batch(self, source, targets):
        self.optimizer.zero_grad()
        output = self.model(source)
        loss = F.cross_entropy(output, targets)
        loss.backward()
        self.optimizer.step()

    def _run_epoch(self, epoch):
        b_sz = len(next(iter(self.train_data))[0])
        print(f"[GPU{self.gpu_id}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_data)}")
        self.train_data.sampler.set_epoch(epoch)
        for source, targets in self.train_data:
            source = source.to(self.gpu_id)
            targets = targets.to(self.gpu_id)
            self._run_batch(source, targets)

    def train(self, max_epochs: int):
        for epoch in range(max_epochs):
            self._run_epoch(epoch)

def load_train_objs(tokenizer):
    train_set = MyTrainDataset(tokenizer, max_length)  # load your dataset
    config = GPT2Config.from_pretrained("gpt2")
    model = GPT2LMHeadModel(config)
    #model = torch.nn.Linear(20, 1)  # load your model
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    return train_set, model, optimizer

def prepare_dataloader(dataset: Dataset, batch_size: int):
    return DataLoader(
        dataset,
        batch_size=batch_size,
        pin_memory=True,
        shuffle=False,
        sampler=DistributedSampler(dataset),
    )

def main(rank: int, world_size: int, total_epochs: int, batch_size: int):
    ddp_setup(rank, world_size)
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    dataset, model, optimizer = load_train_objs(tokenizer)
    train_data = prepare_dataloader(dataset, batch_size)
    trainer = Trainer(model, train_data, optimizer, rank)
    trainer.train(total_epochs)
    destroy_process_group()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='simple distributed training job')
    parser.add_argument('total_epochs', type=int, help='Total epochs to train the model')
    parser.add_argument('--batch_size', default=32, type=int, help='Input batch size on each device (default: 32)')
    args = parser.parse_args()
    world_size = torch.cuda.device_count()
    # mp.spawn passes the process rank as the first argument to main()
    mp.spawn(main, args=(world_size, args.total_epochs, args.batch_size), nprocs=world_size)
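One thing I'm unsure about: in main() I add a [PAD] token to the tokenizer but never resize the model's embeddings, so the new pad token id (50257) would be out of range for GPT-2's 50257-entry embedding table, and as far as I know an out-of-range index is exactly what the srcIndex < srcSelectDimSize assert complains about. A minimal check along these lines (hypothetical, not part of the script above):

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = GPT2LMHeadModel(GPT2Config.from_pretrained("gpt2"))
print(len(tokenizer), model.config.vocab_size)  # 50258 vs. 50257
model.resize_token_embeddings(len(tokenizer))   # would this be the right fix?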
I run it with:
python3 training.py --batch_size 1 1
…which is the smallest possible batch size and a single epoch. The dataset file your_dataset.txt
contains only 10+ lines and is 5.1 kB in total.
Is my multiprocessing setup not working, or are 4 x 8 GB GPUs definitely not enough for this task?