NaN values after transformer layer

Hi, this is a follow-up to my other question. I now have the architecture, but I am getting NaN values in the output of the transformer layer after the first gradient update.

class SimpleTransformer(torch.nn.Module):
    """Encoder-decoder transformer producing one output value per target step.

    Both the source and the target series are projected from
    ``n_time_series`` features to ``d_model`` features by the same linear
    layer, given a positional encoding, and passed through ``Transformer``
    (presumably ``torch.nn.Transformer`` -- confirm against the imports).
    A final linear layer reduces ``d_model`` to a single value.

    Args:
        n_time_series: Size of the last (feature) axis of the inputs.
        d_model: Width of the transformer; forwarded to ``Transformer``
            together with ``nhead=8``.
    """

    def __init__(self, n_time_series, d_model=128):
        super().__init__()
        # One projection layer is shared by the source and target inputs.
        self.dense_shape = torch.nn.Linear(n_time_series, d_model)
        self.pe = SimplePositionalEncoding(d_model)
        self.transformer = Transformer(d_model, nhead=8)
        self.final_layer = torch.nn.Linear(d_model, 1)

    def forward(self, x, t, tgt_mask):
        """Run the source ``x`` and the target ``t`` through the network.

        Args:
            x: Source tensor; assumed batch-first, i.e.
                (batch, src_len, n_time_series) -- TODO confirm with caller.
            t: Target tensor, assumed (batch, tgt_len, n_time_series).
            tgt_mask: Attention mask, passed to the transformer as both
                ``src_mask`` and ``tgt_mask``.

        Returns:
            The output of ``final_layer`` in the transformer's layout (axes 0
            and 1 swapped relative to the inputs), with a last axis of size 1.
        """
        # Swap axes 0 and 1 *before* the positional encoding: the encoding
        # indexes its table with size(0), so it must see the time axis first.
        # Encoding first (as before) tied the encoding to the input's first
        # axis instead of to the time step.
        x = self.dense_shape(x).permute(1, 0, 2)
        t = self.dense_shape(t).permute(1, 0, 2)
        x = self.pe(x)
        t = self.pe(t)
        # NOTE(review): the same mask is used for source and target, which
        # presumably requires both sequences to have equal length -- confirm.
        x = self.transformer(x, t, src_mask=tgt_mask, tgt_mask=tgt_mask)
        print(torch.isnan(x)) # This returns true after the first gradient update
        x = self.final_layer(x)
        return x
        
class SimplePositionalEncoding(torch.nn.Module):
    """Add a fixed sinusoidal positional encoding to a tensor, then apply dropout.

    The encoding table is stored as the buffer ``pe`` with shape
    ``(max_len, 1, d_model)``: even feature columns hold sines and odd
    feature columns hold cosines of ``position * div_term``.

    Args:
        d_model: Size of the feature axis. Odd values are supported; the last
            sine column then has no matching cosine column.
        dropout: Dropout probability applied after the encoding is added.
        max_len: Number of positions in the precomputed table.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = torch.nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the angle table; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so that the table
        # broadcasts over axis 1 of the input.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``.

        Args:
            x: Tensor whose axis 0 is the position axis and whose last axis
                has size ``d_model``; ``x.size(0)`` selects how many table
                rows are added.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
def generate_square_subsequent_mask(sz):
    r"""Build an ``sz`` x ``sz`` float mask for causal attention.

    Entries on and below the main diagonal are ``0.0`` (unmasked); entries
    strictly above the diagonal are ``float('-inf')`` (masked).
    """
    # True on and below the diagonal: the entries that stay unmasked.
    visible = torch.tril(torch.ones(sz, sz)).bool()
    causal = torch.zeros(sz, sz)
    causal[~visible] = float('-inf')
    return causal

Training loop

# Run max_epochs passes over data_loader, doing one optimizer step per batch
# and summing the per-batch loss into running_loss.
for epoch in range(max_epochs):
    running_loss = 0.0  # reset at the start of every epoch
    for src, trg in data_loader:
        # The mask is rebuilt for every batch with a hard-coded size of 10
        # (presumably the sequence length -- confirm against the data).
        mask = generate_square_subsequent_mask(10)
        optimizer.zero_grad()
        # NOTE(review): `a` is defined outside this snippet and is called with
        # (src, mask) only; SimpleTransformer.forward takes (x, t, tgt_mask),
        # which matches the commented-out call below -- confirm which model
        # is meant.
        output = a(src.float(), mask)
        #output = s(src.float(), trg.float(), mask)
        labels = trg[:, :, 0]  # index 0 of trg's last axis is the target
        # NOTE(review): view(-1, 10) assumes the output's elements are laid
        # out with the 10 time steps contiguous -- confirm the model's layout.
        loss = criterion(output.view(-1, 10), labels.float())
        loss.backward()
        # Gradient clipping is currently disabled.
        #torch.nn.utils.clip_grad_norm_(s.parameters(), 0.5)
        optimizer.step()
        running_loss += loss.item()
        i+=1  # `i` must be initialised outside this snippet

Can anyone help?
Thanks

I am also facing a similar issue. Were you able to solve the problem?

Yeah, I was able to solve it by applying StandardScaler to my data and filling all NaN values. Large input values seem to cause the loss to explode, even more so than with traditional RNNs.