Transformer Pair Sentence Classification

I’m trying to implement text classification with a transformer, using sentence pairs as input.

import math
from typing import Optional

import torch
import torch.nn as nn


class IronyClassifier(nn.Module):
    def __init__(self, batch_size, n_tokens, d_model, n_heads, n_hid, n_layers, dropout_p=0.5):
        super(IronyClassifier, self).__init__()

        self.batch_size = batch_size
        self.d_model = d_model

        self.word_embedding = self.load_word_embedding(trainable=True)
        print('word_embedding loaded')
        self.positional_encoder = PositionalEncoding(d_model, dropout_p)
        self.segment_encoding = SegmentEncoding(d_model=d_model)

        # encoder definition
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_heads, dim_feedforward=n_hid,
                                                   dropout=dropout_p, activation='gelu')
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=n_layers)

        self.classifier = nn.Linear(in_features=d_model, out_features=1)

    def load_word_embedding(self, trainable=True):
        ...

        return word_embedding

    def generate_src_mask(self, sentence_lens: tuple, first_sentence_lens: tuple) -> torch.Tensor:
        # Build the key padding mask: True marks positions the encoder should ignore.
        max_len = max(sentence_lens)
        max_len_first = max(first_sentence_lens)

        src_mask = []

        for current_len, first_current_len in zip(sentence_lens, first_sentence_lens):
            row = (
                [True]
                + [False] * (first_current_len - 1)
                + [True] * (max_len_first - first_current_len)   # padding after the first sentence
                + [False]
                + [False] * (current_len - 3)
                + [True] * (max_len - current_len)               # padding after the second sentence
                + [False]
            )
            src_mask.append(row)

        src_mask = torch.BoolTensor(src_mask).to(next(self.parameters()).device)

        return src_mask

    def forward(self, src: torch.Tensor, sentence_lens: tuple, first: bool,
                first_sentence_embedding: Optional[torch.Tensor] = torch.zeros((10, 20, 200)).to(torch.device('cuda')),
                first_sentence_lens: Optional[tuple] = None, chain_training: Optional[bool] = True):

        src = self.word_embedding(src.long()) * math.sqrt(self.d_model)


        src_0 = src[1:]
        src = torch.cat((src[0].unsqueeze(0), first_sentence_embedding, src_0), dim=0)        # 'CLS', 'Sentence_0', 'SEP', 'Sentence_1', 'SEP'
        src = self.positional_encoder(src)
        src += self.segment_encoding(sentence_lens=sentence_lens, first_sentence_lens=first_sentence_lens).unsqueeze(1).repeat(1, self.batch_size, 1)

        src_mask = self.generate_src_mask(sentence_lens=sentence_lens, first_sentence_lens=first_sentence_lens)
        out = self.transformer_encoder(src, src_key_padding_mask=src_mask)
        out = self.classifier(out[0])    # classify from the output at the CLS position

        return out, self.word_embedding
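
For reference, with the default batch_first=False, nn.TransformerEncoder expects src of shape (seq_len, batch, d_model) and src_key_padding_mask of shape (batch, seq_len), where True marks positions to ignore. A minimal standalone sketch of just that convention (all sizes here are made up for illustration and have nothing to do with my actual model):

import torch
import torch.nn as nn

# toy encoder, only to illustrate the mask convention
layer = nn.TransformerEncoderLayer(d_model=8, nhead=2, dim_feedforward=16)
encoder = nn.TransformerEncoder(layer, num_layers=1)

src = torch.randn(5, 3, 8)                      # (seq_len, batch, d_model)
pad_mask = torch.zeros(3, 5, dtype=torch.bool)  # (batch, seq_len); True = ignore this position
pad_mask[:, 4] = True                           # pretend the last position is padding

out = encoder(src, src_key_padding_mask=pad_mask)
print(out.shape)                                # torch.Size([5, 3, 8])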

With

d_model = 300 (word-embedding dimension)
n_hid = 512 (feed-forward hidden size, i.e. dim_feedforward)
n_heads = 6
n_layers = 7
dropout_p = 0.25
…
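
For context, I instantiate the model roughly like this (the batch_size and n_tokens values below are placeholders; the real ones come from my data pipeline):

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = IronyClassifier(
    batch_size=32,       # placeholder
    n_tokens=50000,      # placeholder vocabulary size
    d_model=300,
    n_heads=6,
    n_hid=512,
    n_layers=7,
    dropout_p=0.25,
).to(device)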

I’ve implemented all the encodings myself (positional encoding and segment/sentence encoding). I use pretrained GloVe word embeddings and the SARC dataset, so the data should not be the problem. In theory I think it should work, but for a reason I haven’t been able to find in more than a week of debugging, the model won’t push the BCEWithLogitsLoss below about 0.63 before it starts to overfit badly.
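
With the model instantiated as above, the core of a training step looks roughly like this (a simplified sketch: the optimizer, the learning rate, and the train_step helper are placeholders, and the chaining of first-sentence embeddings between calls is left out):

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)   # placeholder learning rate

def train_step(src, sentence_lens, first_sentence_lens, first_sentence_embedding, targets):
    # src: (seq_len, batch) token ids, targets: (batch,) floats in {0., 1.}
    optimizer.zero_grad()
    logits, _ = model(src, sentence_lens, first=False,
                      first_sentence_embedding=first_sentence_embedding,
                      first_sentence_lens=first_sentence_lens)
    loss = criterion(logits.squeeze(-1), targets)   # logits: (batch, 1) -> (batch,)
    loss.backward()
    optimizer.step()
    return loss.item()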

I really appreciate any help.

Regards,
Unity :slight_smile: