NaN loss appears after training the model for a while

My model looks like this:

class MyModel(nn.Module):
    """Four-branch CNN text classifier.

    Each input field (uid, name, signature, comment) is embedded with its
    own table, passed through its own bank of 1-D convolutions via
    ``pre_model`` (defined elsewhere in the original module), and the four
    pooled feature vectors are concatenated and classified by a small
    fully connected head.
    """

    def __init__(self, vocab_uid, vocab_name, vocab_signature, vocab_comment, config):
        super(MyModel, self).__init__()
        self.vocab_uid = vocab_uid
        self.vocab_name = vocab_name
        self.vocab_signature = vocab_signature
        self.vocab_comment = vocab_comment
        self.config = config
        # One embedding table per field; by convention here, the last vocab
        # index is the padding token.
        self.embedding_uid = nn.Embedding(len(vocab_uid), config.embed_size, padding_idx=len(vocab_uid) - 1)
        self.embedding_name = nn.Embedding(len(vocab_name), config.embed_size, padding_idx=len(vocab_name) - 1)
        self.embedding_signature = nn.Embedding(len(vocab_signature), config.embed_size, padding_idx=len(vocab_signature) - 1)
        self.embedding_comment = nn.Embedding(len(vocab_comment), config.embed_size, padding_idx=len(vocab_comment) - 1)

        # BUG FIX: the original code called xavier_uniform_ on .weight.data,
        # which overwrites the zero row nn.Embedding reserves for padding_idx.
        # Gradients for the padding row are always zero, so that row would keep
        # its random non-zero value forever and padded positions would inject
        # garbage into the convolutions. Re-zero the padding row after init,
        # and do the init under no_grad instead of poking .data.
        with torch.no_grad():
            for emb in (self.embedding_uid, self.embedding_name,
                        self.embedding_signature, self.embedding_comment):
                nn.init.xavier_uniform_(emb.weight)
                emb.weight[emb.padding_idx].fill_(0)

        # One conv bank per field: a Conv1d per kernel size in filter_sizes,
        # each mapping embed_size channels -> num_filters channels.
        self.convs_uid = nn.ModuleList([nn.Conv1d(config.embed_size, config.num_filters, kernel_size) for kernel_size in config.filter_sizes])
        self.convs_name = nn.ModuleList([nn.Conv1d(config.embed_size, config.num_filters, kernel_size) for kernel_size in config.filter_sizes])
        self.convs_signature = nn.ModuleList([nn.Conv1d(config.embed_size, config.num_filters, kernel_size) for kernel_size in config.filter_sizes])
        self.convs_comment = nn.ModuleList([nn.Conv1d(config.embed_size, config.num_filters, kernel_size) for kernel_size in config.filter_sizes])

        # NOTE(review): self.input_dim is read here but never assigned in this
        # snippet — presumably set elsewhere (likely
        # config.num_filters * len(config.filter_sizes), the size of one conv
        # branch's pooled output). Confirm against the full source.
        self.fc = nn.Sequential(
            nn.Linear(3 * self.input_dim + config.embed_size, config.hidden_size),
            nn.BatchNorm1d(config.hidden_size),
            nn.LeakyReLU(0.01),
            nn.Dropout(config.dropout),
            nn.Linear(config.hidden_size, config.hidden_size // 4),
            nn.BatchNorm1d(config.hidden_size // 4),
            nn.LeakyReLU(0.01),
            nn.Dropout(config.dropout),
            nn.Linear(config.hidden_size // 4, len(config.labels)),
        )

    def forward(self, x):
        """Run all four branches and classify.

        Args:
            x: 8-tuple of (uid_x, uid_mask, name_x, name_mask, signature_x,
               signature_mask, comment_x, comment_mask). The *_mask entries
               are unpacked but unused here — presumably consumed by
               pre_model in the full source; verify.

        Returns:
            Unnormalized class logits, shape (batch, len(config.labels)).
        """
        uid_x, uid_mask, name_x, name_mask, signature_x, signature_mask, comment_x, comment_mask = x
        uid_embed = self.embedding_uid(uid_x)
        name_embed = self.embedding_name(name_x)
        signature_embed = self.embedding_signature(signature_x)
        comment_embed = self.embedding_comment(comment_x)

        # pre_model (defined outside this snippet) applies a conv bank to one
        # field's embeddings and pools it to a fixed-size vector.
        uid_out = self.pre_model(uid_embed, self.convs_uid)
        name_out = self.pre_model(name_embed, self.convs_name)
        signature_out = self.pre_model(signature_embed, self.convs_signature)
        comment_out = self.pre_model(comment_embed, self.convs_comment)

        # Concatenate all four branch features along the feature dimension.
        out = torch.cat([uid_out, name_out, signature_out, comment_out], dim=1)
        out = self.fc(out)
        return out

and the loss function is F.cross_entropy…
I printed the loss output:

tensor(0.8520, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6428, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7526, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8432, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.5514, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8035, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6737, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7228, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7483, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6396, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8970, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7002, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7178, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6543, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.5708, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7416, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6376, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6778, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8271, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6978, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.5308, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8856, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7801, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7459, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6802, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6753, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8960, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7238, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7120, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8334, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.5827, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.0080, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7410, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8163, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6670, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.5345, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8395, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6059, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6882, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7274, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6992, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6757, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8750, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6141, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6805, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8153, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7314, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.5208, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7060, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.9566, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8059, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8620, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7142, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8262, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6450, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6933, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8161, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8882, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7261, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6931, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7386, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8716, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.9373, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.9689, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7818, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6070, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7214, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8461, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8139, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.5316, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6324, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7920, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6427, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6935, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8860, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)

I tried almost all of the solutions on the forum, but still couldn't solve this problem.
I then considered that the size of the vocab_uid dictionary is 100,000 (the others are 20,000), which may be too big, so I changed the concatenation to the following to remove the vocab_uid part:

out=torch.cat([name_out,signature_out,comment_out],dim=1)

and the problem of the NaN loss was solved:

tensor(0.9186, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7103, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7542, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7021, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.9060, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8179, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8151, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7942, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.9203, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7924, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6114, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.1044, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7693, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7954, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6667, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.9488, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7380, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.9334, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7521, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6514, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7088, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.0585, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6585, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8773, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8796, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7429, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.9010, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.5841, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.0587, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.0045, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7762, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6410, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.1958, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6005, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.9324, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.0253, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7470, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7483, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8799, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7111, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.1603, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.9201, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7401, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8063, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7628, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8120, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.9181, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.9712, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.4098, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8301, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6993, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7944, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6429, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6885, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7231, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6550, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6640, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.5790, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6554, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8883, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.0812, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.4727, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8472, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.9151, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6278, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6851, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8824, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8724, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8245, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7162, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.0181, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8741, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.0740, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.5420, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8620, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8351, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6989, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.9398, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.4831, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8195, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8160, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7738, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.9108, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.0457, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.4809, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.9541, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.9704, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8382, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.9873, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8535, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.5409, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8447, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.9713, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.8365, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7024, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.1927, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.5514, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7880, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6342, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.9768, device='cuda:0', grad_fn=<NllLossBackward>)

But I want to keep the vocab_uid branch in the model — how can I do that?

Could you run your code with anomaly detection and post the stack trace here, please?
This could give us some information where the NaN was created.

What do you mean by "anomaly detection and post the stack trace"? I'm a beginner — could you point me to a tutorial?

autograd.detect_anomaly is a utility to help you debug runs in the autograd engine.

If you run the code with the decorator, you should get an error message, which you could post here so that we could have a look.