Transformer doesn't work with autoregressive inputs

I trained my transformer for machine translation to a validation loss of 0.04, and the validation predictions look very good. But when I feed it autoregressive input during inference, the output is unintelligible. Note: both the encoder and the decoder can only take a fixed-length input of the same size. I have seen tutorials that start the autoregressive input as a 1x1 tensor and grow it by one token per step (roughly the sketch below), but my implementation must start with a 1xMAX_SEQ_LENGTH tensor. Can anyone tell me why my autoregression is not working? Is it possible to make it work with this input format? Thanks
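
For reference, here is roughly the pattern those tutorials use (a minimal sketch only; BOS_ID and EOS_ID are hypothetical token-id constants, and it assumes the decoder could accept a variable-length target, which mine cannot):

# Sketch of the tutorial-style greedy loop, for comparison only.
# BOS_ID / EOS_ID are hypothetical id constants; the decoder is assumed to
# accept a target of any length, which is exactly what my model cannot do.
memory = model.encoder(transcription_tensor)                    # encode the source once
ys = torch.tensor([[BOS_ID]], device=DEVICE, dtype=torch.long)  # start from a 1x1 tensor
for _ in range(MAX_LENGTH - 1):
    t = ys.size(1)
    step_mask = torch.tril(torch.ones((t, t), device=DEVICE)).long()
    out = model.decoder(memory, ys, step_mask)                  # logits for the t positions so far
    next_id = out[:, -1].argmax(dim=-1, keepdim=True)           # greedy pick for the next token
    ys = torch.cat([ys, next_id], dim=1)                        # grow the target by one token
    if next_id.item() == EOS_ID:
        break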

import numpy as np
import torch

# PREPARE SRC
transcription = [['他','昨','天','來','我','的','辦','公','室','。']]
# add special tokens and padding 
transcription[0].insert(0,'BOS')
transcription[0].append('EOS')
transcription = np.array([np.concatenate([x,['PAD']*(params['d_seq']-len(x))]) if len(x) < params['d_seq'] else x for x in transcription]) # padding
print('src sentence: ', transcription)
# convert words to ids 
transcription_ids = transformer_training.word_to_id(transcription, cn_word_dict)
# convert to tensor 
transcription_tensor = torch.Tensor(transcription_ids).to(DEVICE).long()

# NON-AUTOREGRESSIVE INPUT
print('----- NON-AUTOREGRESSIVE INPUT -----')
trg = [['he', 'came', 'to', 'my', 'office', 'yesterday','.']]
trg[0].insert(0,'BOS')
trg[0].append('EOS')
trg = np.array([np.concatenate([x,['PAD']*(MAX_LENGTH-len(x))]) if len(x) < MAX_LENGTH else x for x in trg])
print('trg sentence: ', trg)
trg_ids = transformer_training.word_to_id(trg, en_word_dict) # convert target sentence to ids 
trg_tensor = torch.Tensor(trg_ids).to(DEVICE).long() # convert target ids to tensor (same device as the src tensor)
mask = torch.tril(torch.ones((MAX_LENGTH, MAX_LENGTH))).to(DEVICE).long() # create mask 
out = model(transcription_tensor, trg_tensor, mask)
val, ind = torch.max(out,dim=-1)
def id_to_word(inds, en_index_dict):
    words = [[en_index_dict[str(ind.item())] for ind in sent] for sent in inds]
    return words
out_sentence = id_to_word(ind, en_index_dict)
print('out_sentence: ', out_sentence)

# AUTOREGRESSIVE INPUT 
print('--- AUTOREGRESSIVE INPUT-----')
mask = torch.tril(torch.ones((MAX_LENGTH, MAX_LENGTH))).to(DEVICE).long()
trg = [['BOS']] # start the target with just BOS
trg = np.array([np.concatenate([x,['BOS']*(MAX_LENGTH-len(x))]) if len(x) < MAX_LENGTH else x for x in trg]) # fill the rest of the fixed-length target with BOS
print('trg at time=0: ',trg)
trg_ids = transformer_training.word_to_id(trg, en_word_dict)
trg_tensor = torch.Tensor(trg_ids).to(DEVICE).long()
src = model.encoder(transcription_tensor) # encoder output (memory), computed once
for i in range(1, MAX_LENGTH):
    out = model.decoder(src, trg_tensor, mask) # decode the full fixed-length target
    val, ind = torch.max(out, dim=-1)          # greedy choice at every position
    trg_tensor[0][i] = ind[0][i]               # write the prediction at position i back into the target

out_sentence = id_to_word(ind, en_index_dict)
print('out_sentence: ', out_sentence)
Output:

src sentence:  [['BOS' '他' '昨' '天' '來' '我' '的' '辦' '公' '室' '。' 'EOS' 'PAD' 'PAD' 'PAD'
  'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD'
  'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD'
  'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD'
  'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD']]
----- NON-AUTOREGRESSIVE INPUT -----
trg sentence:  [['BOS' 'he' 'came' 'to' 'my' 'office' 'yesterday' '.' 'EOS' 'PAD' 'PAD'
  'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD'
  'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD'
  'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD'
  'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD' 'PAD'
  'PAD']]
out_sentence:  [['BOS', 'he', 'came', 'to', 'my', 'office', 'yesterday', '.', 'EOS', 'face', 'face', 'face', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she', 'she']]
--- AUTOREGRESSIVE INPUT-----
trg at time=0:  [['BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS'
  'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS'
  'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS'
  'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS'
  'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS' 'BOS']]
out_sentence:  [['left', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.']]