LSTM model giving repeated ending token as output when using BERT embedding

rectified_sigmoid · November 28, 2020, 5:10am

I am trying to train a sequence-to-sequence LSTM model. I am using a pre-trained BERT model to get word embeddings. BERT needs [CLS] and [SEP] tokens at the start and end, so the tokenized sequence becomes ["[CLS]", …, "[SEP]"]. My model has an encoder that takes this sequence as input and produces an intermediate output, and a decoder that takes this output and should reproduce the original sequence. The problem is that, after training, the model outputs "[CLS]" followed by repeated [SEP] tokens for every input; it is biased toward [SEP]. So, if the input is [CLS], a, b, c, [SEP], the output I get is [CLS] [SEP] [SEP] [SEP] [SEP]. How do I solve this issue? Here is my encoder code:

class Encoder(nn.Module):
  """Compress a token-id sequence into a small vector via BERT + LSTM.

  Pre-trained BERT produces per-token features without gradient tracking.
  An LSTM reads those features, and the LSTM output at the last time step
  is pushed through a stack of linear / batch-norm layers and squashed with
  a sigmoid.

  Args:
    hidden_size: Hidden size of the LSTM.
    output_size: Size of the encoded vector returned by ``forward``.
  """

  def __init__(self, hidden_size, output_size=3):
    super(Encoder, self).__init__()

    self.bert = BertModel.from_pretrained("bert-base-uncased")
    # BERT is only used as a feature extractor, so keep it in eval mode.
    self.bert.eval()

    # 768 is the feature size the LSTM expects from BERT.
    self.lstm1 = nn.LSTM(768, hidden_size)

    self.linear1 = nn.Linear(hidden_size, 128)
    self.bn1 = nn.BatchNorm1d(128)
    self.linear2 = nn.Linear(128, 64)
    self.linear3 = nn.Linear(64, 32)
    self.bn2 = nn.BatchNorm1d(32)
    self.linear4 = nn.Linear(32, output_size)

  def train(self, mode=True):
    """Set train/eval mode on the encoder while leaving BERT in eval mode.

    ``nn.Module.train`` recurses into every submodule, so without this
    override a call to ``encoder.train()`` would undo the
    ``self.bert.eval()`` done in ``__init__``.
    """
    super(Encoder, self).train(mode)
    self.bert.eval()
    return self

  def forward(self, input, mask):
    """Encode a batch of token ids.

    Args:
      input: Token ids; cast to ``long`` before being passed to BERT.
      mask: Attention mask forwarded to BERT as ``attention_mask``.

    Returns:
      Tensor of sigmoid activations, one ``output_size`` vector per
      batch element.
    """
    input = input.long()
    with torch.no_grad():
      # Index rather than tuple-unpack: BERT may return either a plain
      # tuple or a dict-like output object. Unpacking the latter yields
      # its field names, whereas [0] is the first output in both cases.
      out = self.bert(input, attention_mask=mask)[0]
    # nn.LSTM (batch_first=False) wants the sequence dimension first.
    out = out.permute(1, 0, 2)
    out, _ = self.lstm1(out)
    # NOTE(review): out[-1] is the last time step of every sequence; if the
    # batch is right-padded this may be a padding position -- confirm.
    out = self.bn1(self.linear1(out[-1]))
    out = self.linear2(out)
    out = self.linear3(out)
    out = self.bn2(out)
    out = self.linear4(out)

    return torch.sigmoid(out)

And here is my decoder code:

class Decoder(nn.Module):
  """Unroll an encoded vector into a sequence of per-step distributions.

  The encoded vector is expanded through linear / batch-norm layers to
  ``hidden_size``. It is then fed to an LSTM one step at a time, with each
  step's output used as the next step's input. Every step output is
  projected to ``output_size`` and normalised with a softmax.

  Args:
    input_size: Size of the encoded vector given to ``forward``.
    output_size: Number of classes predicted at every step.
    hidden_size: Hidden size of the LSTM.
  """

  def __init__(self, input_size, output_size, hidden_size=128):
    super(Decoder, self).__init__()

    self.hidden_size = hidden_size
    self.output_size = output_size

    # Currently not applied in forward(); kept so the attribute stays
    # available to existing code.
    self.dropout = nn.Dropout(0.2)

    self.linear5 = nn.Linear(input_size, 32)
    self.bn3 = nn.BatchNorm1d(32)
    self.linear6 = nn.Linear(32, 64)
    self.linear7 = nn.Linear(64, hidden_size)
    self.lstm2 = nn.LSTM(hidden_size, hidden_size)
    self.linear8 = nn.Linear(hidden_size, output_size)

  def forward(self, x, seq_len):
    """Decode ``seq_len`` steps from the encoded batch ``x``.

    Args:
      x: Encoded vectors, one row per batch element.
      seq_len: Number of steps to generate; must be at least 1, because
        stacking zero steps fails.

    Returns:
      Tensor of shape ``(seq_len, batch, output_size)`` whose last
      dimension sums to 1.
    """
    expanded = self.bn3(self.linear5(x))
    expanded = self.linear6(expanded)
    expanded = self.linear7(expanded)

    # Add the leading sequence dimension (length 1) expected by nn.LSTM.
    step = expanded.unsqueeze(0)
    # None makes nn.LSTM start from an all-zero (h_0, c_0) created on the
    # input's own device, so no module-level `device` global is needed.
    hidden = None
    steps = []
    for _ in range(seq_len):
      # Each step's output becomes the next step's input.
      step, hidden = self.lstm2(step, hidden)
      steps.append(step.squeeze(0))
    out = self.linear8(torch.stack(steps, dim=0))

    # NOTE(review): this returns probabilities. If the training loss expects
    # raw logits (e.g. nn.CrossEntropyLoss), softmax is applied twice and
    # gradients are flattened -- confirm against the training loop.
    return torch.softmax(out, dim=2)

I am using the pre-trained BERT model from the transformers library.