Why do you think the following code would give CUDA memory error? Is there any accumulation happening that will hinder the variables to go out of scope?
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.nn import TransformerDecoder, TransformerDecoderLayer
class PositionalEncoding(nn.Module):
def __init__(self, d_model, dropout=0.1, max_len=5000):
super(PositionalEncoding, self).__init__()
self.dropout = nn.Dropout(p=dropout)
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0).transpose(0, 1)
self.register_buffer('pe', pe)
def forward(self, x):
x = x + self.pe[:x.size(0), :]
return self.dropout(x)
class TransformerModel(nn.Module):
def __init__(self, ntoken_source, ntoken_target, ninp, nhead, nhid, nlayers, dropout=0.5):
super(TransformerModel, self).__init__()
self.model_type = 'Transformer'
self.src_mask = None
self.trg_mask = None
self.src_key_padding_mask = None
self.trg_key_padding_mask = None
self.pos_encoder = PositionalEncoding(ninp, dropout)
encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
self.encoder = nn.Embedding(ntoken_source, ninp)
self.decoder = nn.Embedding(ntoken_target, ninp)
self.ninp = ninp
decoder_layers = TransformerDecoderLayer(ninp, nhead, nhid, dropout)
self.transformer_decoder = TransformerDecoder(decoder_layers, nlayers)
self.pred_decoder = nn.Linear(ninp, ntoken_target)
self.init_weights()
def _generate_square_subsequent_mask(self, sz):
mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
return mask
def _generate_padding_mask(self, tsr):
# padding index is 1
msk = torch.tensor((tsr.data.cpu().numpy() == 1).astype(int), dtype = torch.bool)
msk = msk.permute(1,0)
return msk
def init_weights(self):
initrange = 0.1
self.encoder.weight.data.uniform_(-initrange, initrange)
self.pred_decoder.bias.data.zero_()
self.pred_decoder.weight.data.uniform_(-initrange, initrange)
def forward(self, src, trg=None):
trg_device = trg.device
self.trg_mask = self._generate_square_subsequent_mask(len(trg)).to(trg_device)
src_device = src.device
self.src_key_padding_mask = self._generate_padding_mask(src).to(src_device)
self.trg_key_padding_mask = self._generate_padding_mask(trg).to(trg_device)
src = self.encoder(src) * math.sqrt(self.ninp)
src = self.pos_encoder(src)
enc_output = self.transformer_encoder(src, src_key_padding_mask=self.src_key_padding_mask)#, self.src_mask)
trg = self.decoder(trg) * math.sqrt(self.ninp)
trg = self.pos_encoder(trg)
dec_output = self.transformer_decoder(trg, enc_output, tgt_mask=self.trg_mask,\
tgt_key_padding_mask=self.trg_key_padding_mask)
preds = self.pred_decoder(dec_output)
return preds
Training Procedure:
model = TransformerModel(ntokens_source, ntokens_target, emsize, nhead, nhid, nlayers, dropout).to(device)
total_params = sum(p.numel() for p in model.parameters())
total_params_t = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total Parameters: {}, Trainable Parameters: {}".format(total_params, total_params_t))
iterators = BucketIterator.splits(
d, batch_size=bsz, shuffle=True)
print("Len Iterators: {}, {}".format(len(iterators[0]), len(iterators[1])))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())
epochs = 50
check_after = 20
best_val_loss = float('inf')
for epoch in range(epochs):
model.zero_grad()
model.train()
avg_loss = 0.0
for i, batch in enumerate(iterators[0]):
print(i)
src = batch.src.to(device)[1:,:]
trg = batch.trg.to(device)
print(src.shape, trg.shape)
optimizer.zero_grad()
output = model(src,trg[:-1,:])
loss = criterion(output.view(-1, ntokens_target), trg[1:,:].reshape(-1))
loss.backward()
avg_loss+=loss.detach().item()
optimizer.step()
torch.cuda.empty_cache()
del output
del src
del trg
#print('EVALUATION')
model.eval()
avg_eval_loss = 0.0
avg_bleu_score = 0.0
avg_gleu_score = 0.0
for i, batch_ in enumerate(iterators[1]):
src_eval = batch_.src.to(device)[1:,:]
trg_eval = batch_.trg.to(device)
output_eval = model(src_eval, trg_eval[:-1,:])#.view(-1, ntokens_target)
pairs, avg_bleu, avg_gleu = decode(output_eval.detach().cpu(), trg_eval[1:,:].cpu(), TRG.vocab.itos)
if i%check_after == 0:
log(i, pairs)
loss_eval = criterion(output_eval.view(-1, ntokens_target), trg_eval[1:,:].view(-1))
avg_eval_loss+=loss_eval.detach().item()
avg_bleu_score+=avg_bleu
avg_gleu_score+=avg_gleu
torch.cuda.empty_cache()
del src_eval
del trg_eval
del output_eval
It’s happening with as low as 8 batch size. The train iterator performs approx. 2500 iterations. And the CUDA memory error occurs during these iterations but if it manges to pass through one epoch, it certainly occurs at around the first or second iteration in the 2nd epoch. Please help with understanding what could be the problem here (if there is). Also, how can I use nn.DataParallel here?