I am trying to train a sequence-to-sequence LSTM model, using a pre-trained BERT model to obtain word embeddings. BERT needs [CLS] and [SEP] tokens at the start and end, so the tokenized sequence becomes ["[CLS]", …, "[SEP]"]. My model has an encoder that takes this sequence as input and produces an intermediate output, and a decoder that takes this output and should reproduce the original sequence. The problem is that, after training, the model outputs "[CLS]" followed by only [SEP] tokens for every input; it is biased towards [SEP]. For example, if the input is [CLS], a, b, c, [SEP], the output I get is [CLS] [SEP] [SEP] [SEP] [SEP]. How do I solve this issue? Here is my encoder code:

```
class Encoder(nn.Module):
    """Compress a token-id sequence into a small vector.

    Pipeline: frozen pre-trained BERT -> LSTM -> stack of linear/batch-norm
    layers -> sigmoid.

    Args:
        hidden_size: Hidden size of the LSTM that reads the BERT outputs.
        output_size: Number of features in the encoded vector (default 3).
    """

    def __init__(self, hidden_size, output_size=3):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        # BERT is used purely as a feature extractor; see ``train`` below,
        # which keeps it in eval mode for the lifetime of the module.
        self.bert.eval()
        #self.embed = nn.Embedding(input_size, embed_size)
        self.lstm1 = nn.LSTM(768, hidden_size)
        self.linear1 = nn.Linear(hidden_size, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.linear2 = nn.Linear(128, 64)
        self.linear3 = nn.Linear(64, 32)
        self.bn2 = nn.BatchNorm1d(32)
        self.linear4 = nn.Linear(32, output_size)

    def train(self, mode=True):
        """Switch train/eval mode while keeping BERT in eval mode.

        ``nn.Module.train`` recurses into every sub-module, so without this
        override a call to ``encoder.train()`` would undo the ``eval()``
        done in ``__init__`` and re-enable BERT's dropout.
        """
        super().train(mode)
        self.bert.eval()
        return self

    def forward(self, input, mask):
        """Encode a batch of token ids.

        Args:
            input: Token ids; cast to ``long`` before being fed to BERT.
            mask: Passed to BERT as ``attention_mask``.

        Returns:
            Tensor with ``output_size`` sigmoid-activated features per
            batch element.
        """
        input = input.long()
        with torch.no_grad():
            # Index instead of tuple-unpacking: ``[0]`` works both when BERT
            # returns a plain tuple and when it returns a dict-like
            # ``ModelOutput`` (whose iteration would yield key names).
            out = self.bert(input, attention_mask=mask)[0]
        # nn.LSTM defaults to (seq, batch, feature) layout.
        out = out.permute(1, 0, 2)
        out, _ = self.lstm1(out)
        # NOTE(review): out[-1] is the LSTM output at the final time step.
        # If batches are right-padded, that position is a padding token for
        # the shorter sequences -- confirm this is intended.
        out = self.bn1(self.linear1(out[-1]))
        out = self.linear2(out)
        out = self.linear3(out)
        out = self.bn2(out)
        out = self.linear4(out)
        return torch.sigmoid(out)
```

And here is my decoder code:

```
class Decoder(nn.Module):
    """Unroll an encoded vector into a sequence of token distributions.

    The encoded vector is projected up to ``hidden_size`` and used as the
    first input of an LSTM; at every later step the LSTM is fed its own
    previous output.

    Args:
        input_size: Number of features in the encoded vector.
        output_size: Number of classes predicted at each step.
        hidden_size: Hidden (and input) size of the LSTM (default 128).
    """

    def __init__(self, input_size, output_size, hidden_size=128):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        # Not applied in ``forward``; kept so the attribute stays available.
        self.dropout = nn.Dropout(0.2)
        self.linear5 = nn.Linear(input_size, 32)
        self.bn3 = nn.BatchNorm1d(32)
        self.linear6 = nn.Linear(32, 64)
        self.linear7 = nn.Linear(64, hidden_size)
        self.lstm2 = nn.LSTM(hidden_size, hidden_size)
        self.linear8 = nn.Linear(hidden_size, output_size)

    def forward(self, x, seq_len):
        """Decode ``seq_len`` steps from the encoded batch ``x``.

        Args:
            x: Encoded batch with ``input_size`` features per element.
            seq_len: Number of decoding steps to unroll.

        Returns:
            Tensor of shape ``(seq_len, batch, output_size)`` holding a
            softmax distribution over classes along the last dimension.
        """
        projected = self.bn3(self.linear5(x))
        #x = self.dropout(self.linear6(x))
        projected = self.linear6(projected)
        projected = self.linear7(projected)

        # Add the leading time dimension expected by nn.LSTM.
        step = projected.unsqueeze(0)
        batch = x.size(0)
        # Build the zero state from the input tensor so it always lives on
        # the same device (and has the same dtype) as the data, instead of
        # relying on a module-level ``device`` global.
        hidden = (
            step.new_zeros(1, batch, self.hidden_size),
            step.new_zeros(1, batch, self.hidden_size),
        )

        steps = []
        for _ in range(seq_len):
            # The output of each step is fed back as the next step's input.
            step, hidden = self.lstm2(step, hidden)
            steps.append(step.squeeze(0))

        out = torch.stack(steps, dim=0)
        out = self.linear8(out)
        # NOTE(review): probabilities are returned here. The training loss is
        # not visible in this file; nn.CrossEntropyLoss expects raw logits,
        # so confirm the loss matches this output.
        return torch.softmax(out, dim=2)
```

I am using the pre-trained BERT model from the transformers library.