Dear @All,
I’m trying to apply the Transformer tutorial from Harvardnlp (The Annotated Transformer). I have a server with 4 GPUs, and I get "CUDA error: out of memory" with a batch size of 512.
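For reference, this is roughly how I confirm that all four devices are visible before running anything; it is just plain torch.cuda queries, not part of the tutorial:

```python
import torch

# Sanity check (not from the tutorial): list the visible GPUs and their total memory.
print(torch.cuda.device_count())  # expecting 4
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(i, props.name, props.total_memory // (1024 ** 2), "MiB")
```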
# For data loading.
from torchtext import data, datasets
# bpemb_ar is a BPEmb Arabic subword tokenizer defined earlier in the notebook (not shown here).

if True:
    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = "<blank>"
    SRC = data.Field(tokenize=bpemb_ar.encode, pad_token=BLANK_WORD, fix_length=50)
    TRG = data.Field(tokenize=bpemb_ar.encode, init_token=BOS_WORD,
                     eos_token=EOS_WORD, pad_token=BLANK_WORD, fix_length=50)
    MAX_LEN = 100
    train, val, test = data.TabularDataset.splits(
        path='./data/', train='SCUT_trai.csv',
        validation='SCUT_vali.csv', test='test.csv', format='csv',
        fields=[('src', SRC), ('trg', TRG)], skip_header=True,
        filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and
                              len(vars(x)['trg']) <= MAX_LEN)
    MIN_FREQ = 2
    SRC.build_vocab(train.src, min_freq=MIN_FREQ)
    TRG.build_vocab(train.trg, min_freq=MIN_FREQ)
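MyIterator and batch_size_fn are taken from the article unchanged. For completeness, batch_size_fn measures the batch in tokens (the larger of the source and target side, padding included), so the article's BATCH_SIZE of 12000 means tokens rather than sentences; as far as I can tell it is this:

```python
# From the article: dynamic batching by token count.
global max_src_in_batch, max_tgt_in_batch
def batch_size_fn(new, count, sofar):
    "Keep augmenting batch and calculate total number of tokens + padding."
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch, len(new.src))
    max_tgt_in_batch = max(max_tgt_in_batch, len(new.trg) + 2)
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)
```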
# These imports appear at the top of the notebook in the article.
import torch
import torch.nn as nn
from torch.autograd import Variable

class MultiGPULossCompute:
    "A multi-gpu loss compute and train function."
    def __init__(self, generator, criterion, devices, opt=None, chunk_size=5):
        # Send out to different gpus.
        self.generator = generator
        self.criterion = nn.parallel.replicate(criterion, devices=devices)
        self.opt = opt
        self.devices = devices
        self.chunk_size = chunk_size

    def __call__(self, out, targets, normalize):
        total = 0.0
        generator = nn.parallel.replicate(self.generator, devices=self.devices)
        out_scatter = nn.parallel.scatter(out, target_gpus=self.devices)
        out_grad = [[] for _ in out_scatter]
        targets = nn.parallel.scatter(targets, target_gpus=self.devices)

        # Divide generating into chunks.
        chunk_size = self.chunk_size
        for i in range(0, out_scatter[0].size(1), chunk_size):
            # Predict distributions
            out_column = [[Variable(o[:, i:i+chunk_size].data,
                                    requires_grad=self.opt is not None)]
                          for o in out_scatter]
            gen = nn.parallel.parallel_apply(generator, out_column)

            # Compute loss.
            y = [(g.contiguous().view(-1, g.size(-1)),
                  t[:, i:i+chunk_size].contiguous().view(-1))
                 for g, t in zip(gen, targets)]
            loss = nn.parallel.parallel_apply(self.criterion, y)

            # Sum and normalize loss
            l = nn.parallel.gather(loss, target_device=self.devices[0])
            l = l.sum() / normalize
            total += l.data

            # Backprop loss to output of transformer
            if self.opt is not None:
                l.backward()
                for j, l in enumerate(loss):
                    out_grad[j].append(out_column[j][0].grad.data.clone())

        # Backprop all loss through transformer.
        if self.opt is not None:
            out_grad = [Variable(torch.cat(og, dim=1)) for og in out_grad]
            o1 = out
            o2 = nn.parallel.gather(out_grad, target_device=self.devices[0])
            o1.backward(gradient=o2)
            self.opt.step()
            self.opt.optimizer.zero_grad()
        return total * normalize
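As I understand the class above, chunk_size controls how many target positions go through the generator/softmax at once, so a smaller value should lower peak memory at the cost of speed. For example (model, criterion, devices and model_opt are set up in the cells below, so this is only a sketch):

```python
# Hypothetical: trade speed for memory by shrinking the softmax chunk.
loss_compute = MultiGPULossCompute(model.generator, criterion,
                                   devices=devices, opt=model_opt,
                                   chunk_size=2)
```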
# GPUs to use
devices = [0, 1, 2, 3]
if True:
    pad_idx = TRG.vocab.stoi["<blank>"]
    model = make_model(len(SRC.vocab), len(TRG.vocab), N=6)
    model.cuda()
    criterion = LabelSmoothing(size=len(TRG.vocab), padding_idx=pad_idx, smoothing=0.1)
    criterion.cuda()
    BATCH_SIZE = 256  # 12000
    train_iter = MyIterator(train, batch_size=BATCH_SIZE, device=0,
                            repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=True)
    valid_iter = MyIterator(val, batch_size=BATCH_SIZE, device=0,
                            repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=False)
    model_par = nn.DataParallel(model, device_ids=devices)
    # torch.device('cuda')
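The training loop is also unchanged from the article and, as far as I remember it, looks roughly like this (rebatch and run_epoch are the article's helpers, model_opt is its NoamOpt learning-rate wrapper):

```python
# Training loop as in the article (sketch from memory, not verified here).
model_opt = NoamOpt(model.src_embed[0].d_model, 1, 2000,
                    torch.optim.Adam(model.parameters(), lr=0,
                                     betas=(0.9, 0.98), eps=1e-9))
for epoch in range(10):
    model_par.train()
    run_epoch((rebatch(pad_idx, b) for b in train_iter),
              model_par,
              MultiGPULossCompute(model.generator, criterion,
                                  devices=devices, opt=model_opt))
```

I never get that far, though; the setup cell above already fails at model.cuda(). This is the full output: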
/home/akram/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:20: UserWarning: nn.init.xavier_uniform is now deprecated in favor of nn.init.xavier_uniform_.
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-40-7da75baf3635> in <module>
4 pad_idx = TRG.vocab.stoi["<blank>"]
5 model = make_model(len(SRC.vocab), len(TRG.vocab), N=6)
----> 6 model.cuda()
7 criterion = LabelSmoothing(size=len(TRG.vocab), padding_idx=pad_idx, smoothing=0.1)
8 criterion.cuda()
~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in cuda(self, device)
263 Module: self
264 """
--> 265 return self._apply(lambda t: t.cuda(device))
266
267 def cpu(self):
~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in _apply(self, fn)
191 def _apply(self, fn):
192 for module in self.children():
--> 193 module._apply(fn)
194
195 for param in self._parameters.values():
~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in _apply(self, fn)
191 def _apply(self, fn):
192 for module in self.children():
--> 193 module._apply(fn)
194
195 for param in self._parameters.values():
~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in _apply(self, fn)
191 def _apply(self, fn):
192 for module in self.children():
--> 193 module._apply(fn)
194
195 for param in self._parameters.values():
~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in _apply(self, fn)
191 def _apply(self, fn):
192 for module in self.children():
--> 193 module._apply(fn)
194
195 for param in self._parameters.values():
~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in _apply(self, fn)
191 def _apply(self, fn):
192 for module in self.children():
--> 193 module._apply(fn)
194
195 for param in self._parameters.values():
~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in _apply(self, fn)
191 def _apply(self, fn):
192 for module in self.children():
--> 193 module._apply(fn)
194
195 for param in self._parameters.values():
~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in _apply(self, fn)
197 # Tensors stored in modules are graph leaves, and we don't
198 # want to create copy nodes, so we have to unpack the data.
--> 199 param.data = fn(param.data)
200 if param._grad is not None:
201 param._grad.data = fn(param._grad.data)
~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in <lambda>(t)
263 Module: self
264 """
--> 265 return self._apply(lambda t: t.cuda(device))
266
267 def cpu(self):
RuntimeError: CUDA error: out of memory
The whole code is copied from the original article. Any suggestions on what causes the out-of-memory error here, or how to work around it?