Intro:
Hello! I had heard about the super simple data parallelism API in PyTorch, so I decided to give it a try. After profiling, though, I found almost identical results with and without the parallelism feature (despite seeing all 4 GPUs active during training). In each case I get roughly:
duration: 56.92420029640198, loss: 2.6403932571411133
…
Code Comments:
I’m using the pretrained XLNet model ('xlnet-base-cased') from Hugging Face and a custom training loop.
I tried to keep the info as minimal as possible. Some background on my training loop: I was getting memory leaks unless I allocated a reusable mini-batch tensor. I’m concerned this could be related, since I’m not sure how (or whether) I need to distribute this tensor manually to each GPU as well…
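For context, here is my rough mental model of what nn.DataParallel does on each forward call. This is just a toy sketch I wrote to reason about the problem, not my actual code; real DataParallel also handles kwarg/dict inputs and runs the replicas in parallel:

# Toy sketch of my mental model of nn.DataParallel's forward pass
# (single input tensor only; for illustration, not my training code)
import copy
import torch

def toy_data_parallel_forward(module, batch, device_ids=('cuda:0', 'cuda:1')):
    chunks = batch.chunk(len(device_ids))                                 # scatter along dim 0
    replicas = [copy.deepcopy(module).to(d) for d in device_ids[:len(chunks)]]  # replicate
    outs = [r(c.to(d)) for r, c, d in zip(replicas, chunks, device_ids)]  # apply per device
    return torch.cat([o.to(device_ids[0]) for o in outs], dim=0)          # gather on device 0

If that picture is right, the scatter happens fresh on every forward call, which is why I’m unsure whether my reusable mini-batch tensor (which lives on cuda:0) interferes at all.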
Code:
# ...
from transformers import AutoModelWithLMHead, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased')  # used inside train_loop below
model = AutoModelWithLMHead.from_pretrained('xlnet-base-cased').to('cpu')
# ...
import pdb  # used in the except block below
import torch as pt
from torch import optim
# NOTE: this is the 'memory efficient version' for CUDA
# pad_len := max number of tokens (input sequences padded to this length)
def train_loop(model, input_output_data, epochs=5, batch_size=256, pad_len=200):
    input, output = input_output_data
    n_examples = len(input)
    n_batches = (n_examples + batch_size - 1) // batch_size  # ceiling division
    model.train()  # turn on training mode
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # helpers for tensor-dict manipulation
    slice_inputs = lambda x, a, b: {k: x[k][a:b] for k in x}
    cast_inputs = lambda x, device='cuda': {k: x[k].to(device) for k in x}
    def assign_dict(a, b):  # in-place copy into the reusable mini-batch tensors
        for k in b:
            a[k][:] = b[k]
    # sequences must be padded to the same length for batching to work...
    all_inputs = tokenizer(input, return_tensors='pt', padding='max_length',
                           truncation=True, max_length=pad_len)
    all_outputs = tokenizer(output, return_tensors='pt', padding='max_length',
                            truncation=True, max_length=pad_len)
    all_inputs = cast_inputs(all_inputs, 'cpu')
    all_outputs = cast_inputs(all_outputs, 'cpu')
    # The idea was to have a reusable mini-batch tensor to avoid memory leaks...
    inputs = slice_inputs(all_inputs, 0, batch_size)
    outputs = slice_inputs(all_outputs, 0, batch_size)
    inputs = cast_inputs(inputs, 'cuda')
    outputs = cast_inputs(outputs, 'cuda')
    last_loss = None
    for i in range(epochs):
        print(f'epoch: {i+1}/{epochs}')
        for j in range(n_batches):
            optimizer.zero_grad()
            pt.cuda.empty_cache()
            try:
                # NOTE: assumes n_examples divides evenly by batch_size; a partial
                # final batch makes assign_dict fail with a size mismatch
                a, b = j * batch_size, (j + 1) * batch_size
                assign_dict(inputs, slice_inputs(all_inputs, a, b))
                assign_dict(outputs, slice_inputs(all_outputs, a, b))
                # [0] is the LM loss; .mean() reduces the per-replica losses
                # that DataParallel gathers onto cuda:0
                loss = model(**inputs, labels=outputs['input_ids'])[0].mean()
            except Exception:
                pdb.set_trace()
                raise
            print(f'batch: {j+1}/{n_batches}, loss: {loss}')
            loss.backward()
            optimizer.step()
            last_loss = loss.item()
    return last_loss
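For what it’s worth, here is the kind of quick sanity check I know of for confirming each device is actually holding data; a sketch one could drop in after any forward pass (pt is the torch alias from above):

# Sanity check (sketch): confirm each visible GPU has memory allocated
# after a forward pass through the DataParallel-wrapped model
for d in range(pt.cuda.device_count()):
    print(f'cuda:{d} allocated: {pt.cuda.memory_allocated(d) / 2**20:.1f} MiB')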
Here is the code I use to actually perform the training (I switch model = serial_model to model = fast_model for comparison):
# Train and Profile
import time
import torch as pt
fast_model = pt.nn.DataParallel(model.to('cuda'))
serial_model = model.to('cuda:0')
model = serial_model # switch to fast_model for comparison
start = time.time()
final_loss = train_loop(model, (input, output), batch_size=10, epochs=1)
duration = time.time() - start
print(f'duration: {duration}, loss: {final_loss}')
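Since CUDA kernels launch asynchronously, I realize time.time() alone can mis-measure GPU work, so in case the measurement itself is suspect, a more careful version would look something like this (a sketch; note that loss.item() inside the loop already forces a sync each step, so I doubt the numbers change much):

# More careful timing (sketch): synchronize the current device before
# reading the clock, since CUDA kernels are launched asynchronously
pt.cuda.synchronize()
start = time.time()
final_loss = train_loop(model, (input, output), batch_size=10, epochs=1)
pt.cuda.synchronize()
duration = time.time() - start
print(f'duration: {duration}, loss: {final_loss}')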
P.S. Let me know if you want the full code; I tried to keep it minimal.
Thanks for your help!