Hello,
I have built a Transformer model that takes input data of the form (Batch_size, Sequence_len, Num_Features).
An example input (with a batch size of 1):
```
[[70, 33,  4, 62, 12,  0,  3,  4, 62,  7,  8, 18, 62, 12,  0, 17, 10, 62,
   1, 24, 62,  2,  0, 12, 15,  0,  8,  6, 13,  8, 13,  6, 62,  0,  6,  0,
   8, 13, 18, 19, 62,  7, 14, 18, 15,  8, 19,  0, 11, 62,  5, 17,  0, 20,
   3, 63, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
  69, 69, 69, 69, 69, 69]]
```
“70” is my start token and “69” my padding.
The corresponding output would be:
```
[[[ 0.    0.   ]
  [ 0.    0.092]
  [ 0.182 0.238]
  [ 0.058 0.005]
  [ 0.204 0.214]
  [ 0.032 0.136]
  [ 0.153 0.184]
  [ 0.133 0.061]
  [ 0.162 0.084]
  [ 0.22  0.203]
  [ 0.228 0.266]
  [ 0.158 0.142]
  [ 0.525 0.522]
  [ 1.871 1.903]
  [ 0.117 0.168]
  [ 0.122 0.11 ]
  [ 0.233 0.165]
  [ 0.273 0.265]
  [ 0.322 0.339]
  [ 0.209 0.174]
  [ 0.196 0.205]
  [ 0.158 0.244]
  [ 0.08  0.084]
  [ 0.144 0.07 ]
  [ 0.225 0.219]
  [ 0.184 0.22 ]
  [ 0.06  0.05 ]
  [ 0.357 0.323]
  [ 0.363 0.496]
  [ 0.138 0.168]
  [ 0.213 0.062]
  [ 0.726 0.784]
  [ 0.873 0.801]
  [ 0.682 0.744]
  [ 0.265 0.196]
  [ 0.042 0.105]
  [ 0.199 0.229]
  [ 0.096 0.075]
  [ 0.148 0.145]
  [ 0.17  0.122]
  [ 0.371 0.356]
  [ 0.294 0.306]
  [ 0.199 0.197]
  [ 0.038 0.08 ]
  [ 0.184 0.134]
  [ 0.214 0.206]
  [ 0.209 0.217]
  [ 0.109 0.13 ]
  [ 0.185 0.168]
  [ 0.267 0.266]
  [ 0.596 0.641]
  [ 0.239 0.293]
  [ 0.095 0.099]
  [ 0.176 0.064]
  [ 0.133 0.177]
  [ 0.452 0.463]
  [-1.   -1.   ]
  [-1.   -1.   ]
  [-1.   -1.   ]
  [-1.   -1.   ]
  [-1.   -1.   ]
  [-1.   -1.   ]
  [-1.   -1.   ]
  [-1.   -1.   ]
  [-1.   -1.   ]
  [-1.   -1.   ]
  [-1.   -1.   ]
  [-1.   -1.   ]
  [-1.   -1.   ]
  [-1.   -1.   ]
  [-1.   -1.   ]
  [-1.   -1.   ]
  [-1.   -1.   ]
  [-1.   -1.   ]
  [-1.   -1.   ]
  [-1.   -1.   ]
  [-1.   -1.   ]
  [-1.   -1.   ]]]
```
With "[ 0. 0. ]" as start token and "[-1,-1]" as padding.
So it is a regression problem.
The relevant part of my model looks like this:
```python
def forward(self, src, tgt):
    src_seq_length = src.size(1)
    tgt_seq_length = tgt.size(1)
    N = src.size(0)
    pos_enc_src = (
        torch.arange(0, src.size(1))
        .unsqueeze(0)
        .expand(N, src_seq_length)
        .to(tgt.device)
    )
    pos_enc_tgt = (
        torch.arange(0, tgt.size(1))
        .unsqueeze(0)
        .expand(N, tgt_seq_length)
        .to(tgt.device)
    )
    src_key_padding_mask = self.make_src_padding_mask(src)
    src_embedded = self.embedding(src.long()) + self.src_pos_encoding(pos_enc_src)
    src_embedded = src_embedded.to(tgt.device)
    tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(
        tgt.device
    )
    tgt_key_padding_mask = self.make_tgt_padding_mask(tgt)
    tgt_embedded = self.target_embedding(tgt.float()).to(tgt.device)
    tgt_embedded = self.ELU(tgt_embedded).to(tgt.device)
    tgt_embedded = tgt_embedded + self.tgt_pos_encoding(pos_enc_tgt)
    print("SRC", src[0])
    print("TGT", tgt[0])
    output = self.transformer(
        src_embedded,
        tgt_embedded,
        src_key_padding_mask=src_key_padding_mask,
        tgt_mask=tgt_mask,
        tgt_key_padding_mask=tgt_key_padding_mask,
    ).to(tgt.device)
    for layer in self.linear_layers:
        output = layer(output).to(tgt.device)
        output = self.RELU(output).to(tgt.device)
    return output
```
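As a sanity check on the mask conventions (not part of the model, just a sketch): with `batch_first=True`, the key-padding masks should be `(N, S)` boolean tensors, while `generate_square_subsequent_mask` returns an additive `(T, T)` float mask with `-inf` above the diagonal:

```python
import torch
import torch.nn as nn

# causal mask for a target length of 4: 0.0 on/below the diagonal,
# -inf above it (future positions are blocked)
causal = nn.Transformer.generate_square_subsequent_mask(4)
print(causal.shape)         # torch.Size([4, 4])
print(causal[0, 1].item())  # -inf (position 0 may not attend to position 1)
print(causal[1, 0].item())  # 0.0  (position 1 may attend to position 0)
```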
With the following layer specifications:
```python
self.target_embedding = nn.Linear(2, d_model)
self.linear_layers = nn.ModuleList(
    [
        nn.Linear(2 ** (start_pow - i), 2 ** (start_pow - i - 1))
        for i in range(start_pow - 1)
    ]
)
self.embedding = nn.Embedding(num_features, d_model, padding_idx=69)
self.src_pos_encoding = nn.Embedding(max_seq_length, d_model, device="cuda")
self.tgt_pos_encoding = nn.Embedding(max_seq_length, d_model, device="cuda")
self.transformer = nn.Transformer(
    d_model,
    nhead,
    num_layers,
    dim_feedforward,
    dropout=0.1,
    activation="relu",
    batch_first=True,
    device="cuda",
)
```
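For illustration, with an assumed `start_pow = 5` (i.e. `d_model = 32`; the actual value is not in the post), the `linear_layers` list halves the width at each step down to the 2-dimensional output:

```python
start_pow = 5  # assumed value, just for illustration
dims = [
    (2 ** (start_pow - i), 2 ** (start_pow - i - 1))
    for i in range(start_pow - 1)
]
print(dims)  # [(32, 16), (16, 8), (8, 4), (4, 2)]
```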
And these are the functions I use to create the padding masks:
```python
def make_src_padding_mask(self, src):
    src_mask = src == 69
    return src_mask

def make_tgt_padding_mask(self, tgt):
    tgt_mask = tgt == -1
    tgt_mask = tgt_mask[:, :, [0]].squeeze(2)
    return tgt_mask
```
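On a toy batch (values made up for illustration), the same logic outside the class reduces to `(N, S)` and `(N, T)` boolean tensors with `True` at the padded positions:

```python
import torch

src = torch.tensor([[70, 33, 4, 69, 69]])                 # (N, S) token ids, 69 = pad
tgt = torch.tensor([[[0., 0.], [0.1, 0.2], [-1., -1.]]])  # (N, T, 2), [-1, -1] = pad

src_mask = src == 69                           # (N, S) bool
tgt_mask = (tgt == -1)[:, :, [0]].squeeze(2)   # (N, T) bool
print(src_mask.tolist())  # [[False, False, False, True, True]]
print(tgt_mask.tolist())  # [[False, False, True]]
```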
A training step looks like this:
```python
def training_step(self, batch, batch_idx):
    src, tgt = batch["input"].to(self.device), batch["target"].to(self.device)
    # find the index of the first pad token ([-1, -1])
    last_non_pad_idxs = self.get_last_non_padding_idx(tgt)
    # remove the element at that index from the target sequence
    tgt_minus_1 = torch.stack(
        [
            torch.cat((tgt[i, :idx], tgt[i, idx + 1:]))
            for i, idx in enumerate(last_non_pad_idxs)
        ]
    ).to(self.device)
    output = self.model(src, tgt_minus_1)
    loss = self.criterion(output, tgt[:, 1:])
    self.log(
        "train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True
    )
    return loss
```
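On a toy target (assuming, per the comment above, that `get_last_non_padding_idx` returns the index of the first `[-1, -1]` pad per sequence), the shift keeps `tgt_minus_1` the same length as `tgt[:, 1:]`:

```python
import torch

# toy target: start token, two real steps, then [-1, -1] padding
tgt = torch.tensor([[[0., 0.], [0.1, 0.2], [0.3, 0.4], [-1., -1.], [-1., -1.]]])

# index of the first pad token per sequence (= number of non-pad elements)
first_pad_idxs = (tgt[:, :, 0] != -1).sum(dim=1)  # tensor([3])

# drop the element at that index, so one pad row is removed
tgt_minus_1 = torch.stack(
    [
        torch.cat((tgt[i, :idx], tgt[i, idx + 1:]))
        for i, idx in enumerate(first_pad_idxs)
    ]
)
print(tgt_minus_1.shape)  # torch.Size([1, 4, 2]) -- same as tgt[:, 1:]
```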
Problem:
My model runs without errors, but it does not learn. A prediction contains approximately the same values at every position in the sequence. The output depends on the input, but its form is always the same: every position holds nearly identical values.
Example:
```
[[0.1459, 0.4486],
 [0.1470, 0.4488],
 [0.1479, 0.4490],
 [0.1479, 0.4490],
 [0.1486, 0.4491],
 [0.1489, 0.4492],
 [0.1490, 0.4492],
 [0.1494, 0.4493],
 [0.1496, 0.4493],
 [0.1486, 0.4491],
 [0.1490, 0.4492],
 [0.1495, 0.4493],
 [0.1495, 0.4493],
 [0.1490, 0.4492],
 [0.1494, 0.4493],
 [0.1497, 0.4494],
 [0.1503, 0.4495],
 [0.1502, 0.4495],
 [0.1503, 0.4495],
 [0.1500, 0.4494],
 [0.1490, 0.4492],
 [0.1505, 0.4495],
 [0.1500, 0.4494],
 [0.1501, 0.4494],
 [0.1501, 0.4495],
 [0.1500, 0.4494],
 [0.1503, 0.4495],
 [0.1501, 0.4494],
 [0.1508, 0.4496],
 [0.1505, 0.4495],
 [0.1504, 0.4495],
 [0.1497, 0.4494],
 [0.1504, 0.4495],
 [0.1505, 0.4495],
 [0.1503, 0.4495],
 [0.1500, 0.4494],
 [0.1509, 0.4496],
 [0.1507, 0.4496],
 [0.1504, 0.4495],
 [0.1506, 0.4496],
 [0.1512, 0.4497],
 [0.1511, 0.4497],
 [0.1510, 0.4496],
 [0.1508, 0.4496],
 [0.1512, 0.4497],
 [0.1508, 0.4496],
 [0.1510, 0.4496],
 [0.1514, 0.4497],
 [0.1509, 0.4496],
 [0.1508, 0.4496],
 [0.1504, 0.4495],
 [0.1509, 0.4496],
 [0.1506, 0.4495],
 [0.1511, 0.4497],
 [0.1512, 0.4497],
 [0.1511, 0.4497],
 [0.1511, 0.4497],
 [0.1511, 0.4497],
 [0.1512, 0.4497],
 [0.1512, 0.4497],
 [0.1513, 0.4497],
 [0.1515, 0.4497],
 [0.1510, 0.4496],
 [0.1509, 0.4496],
 [0.1504, 0.4495],
 [0.1505, 0.4495],
 [0.1515, 0.4498],
 [0.1507, 0.4496],
 [0.1509, 0.4496],
 [0.1512, 0.4497],
 [0.1512, 0.4497],
 [0.1512, 0.4497],
 [0.1506, 0.4496],
 [0.1505, 0.4495],
 [0.1515, 0.4497],
 [0.1511, 0.4497],
 [0.1505, 0.4495]]
```