NaN values in model parameters and loss

import numpy as np
import torch
import torch.nn as nn
import math
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence-first tensor.

    A table of shape ``(max_len, 1, d_model)`` is precomputed once and stored
    as the non-trainable buffer ``pe``. ``forward`` slices that table with
    ``x.size(0)`` and adds it to ``x``, so the input must be laid out as
    ``(seq_len, batch, d_model)``.

    Args:
        d_model: size of the last (feature) dimension of the input.
        dropout: dropout probability applied to the sum in ``forward``.
        max_len: number of positions that are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        # Honour the caller's rate; it used to be hard-coded to 0.1.
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        # An odd d_model has one fewer cosine column than sine columns, so
        # trim div_term to the number of odd-indexed columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so it broadcasts over batch.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])`` for ``x`` of shape
        ``(seq_len, batch, d_model)``.

        Raises:
            ValueError: if ``x.size(0)`` exceeds the precomputed ``max_len``.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                "sequence length {} exceeds max_len {}".format(seq_len, self.pe.size(0))
            )
        # The original line had lost its `self.pe` operand (`x +[:x.size(0), :]`).
        x = x + self.pe[:seq_len, :]
        return self.dropout(x)

class SelfAttentionPooling(nn.Module):
    """Collapse the time axis of a sequence with a learned weighted sum.

    A single linear layer scores every time step; the scores are normalised
    with a softmax over the time axis and used as weights for summing the
    hidden vectors.

    Args:
        input_dim: hidden dimension H of the incoming representation.
    """

    # The pasted source read `def init` / `.init()`: the double underscores
    # had been stripped, so the constructor never ran and `self.W` was missing.
    def __init__(self, input_dim):
        super(SelfAttentionPooling, self).__init__()
        self.W = nn.Linear(input_dim, 1)

    def forward(self, batch_rep):
        """
        batch_rep : size (N, T, H), N: batch size, T: sequence length, H: Hidden dimension
        att_w : size (N, T, 1)
        utter_rep: size (N, H)
        """
        scores = self.W(batch_rep).squeeze(-1)  # (N, T)
        # Explicit dim: implicit-dim softmax is deprecated. For the (N, T)
        # scores the last axis is the time axis.
        att_w = nn.functional.softmax(scores, dim=-1).unsqueeze(-1)
        utter_rep = torch.sum(batch_rep * att_w, dim=1)

        return utter_rep

class TransformerModel(nn.Module):
    """Conv1d front end, Transformer encoder and attention-pooled classifier.

    ``forward`` takes a batch of 12-channel signals (``conv1`` has
    ``in_channels=12``), downsamples them with convolutions and max-pooling,
    runs the result through a Transformer encoder, pools over time and maps
    the pooled vector to ``n_class`` logits.

    Args:
        d_model: embedding size used by the encoder, pooling and decoder.
        nhead: number of attention heads per encoder layer.
        dim_feedforward: hidden size of each encoder layer's feed-forward part.
        nlayers: number of stacked encoder layers.
        n_conv_layers: how many times the shared ``conv`` + max-pool stage is
            applied in ``forward``.
        n_class: number of output logits.
        dropout: dropout probability for the positional encoding and the
            encoder layers.
        dropout_other: dropout probability for the decoder head and
            ``self.dropout`` (previously hard-coded to the same default, 0.1).
    """

    def __init__(self, d_model, nhead, dim_feedforward, nlayers, n_conv_layers=2, n_class=2, dropout=0.5, dropout_other=0.1):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.n_class = n_class
        self.n_conv_layers = n_conv_layers
        self.relu = torch.nn.ReLU()
        # Sized by d_model (was a hard-coded 310) because the encoding is now
        # added to [sequence, batch, d_model] tensors; see forward().
        # NOTE(review): this changes the shape of the `pos_encoder.pe` buffer,
        # so older checkpoints will not load that entry.
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.self_att_pool = SelfAttentionPooling(d_model)
        # NOTE(review): this call was truncated in the source after
        # `d_model=d_model,`; the remaining arguments are reconstructed from
        # the otherwise unused constructor parameters — confirm.
        encoder_layers = TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.d_model = d_model
        self.flatten_layer = torch.nn.Flatten()
        self.decoder = nn.Sequential(nn.Linear(d_model, d_model), nn.Dropout(dropout_other),
                                     nn.Linear(d_model, d_model), nn.Dropout(dropout_other),
                                     nn.Linear(d_model, n_class))
        # Transformer Conv. layers
        self.conv1 = torch.nn.Conv1d(in_channels=12, out_channels=128, kernel_size=3, stride=1, padding=0)
        self.conv2 = torch.nn.Conv1d(in_channels=128, out_channels=d_model, kernel_size=3, stride=1, padding=1)
        self.conv = torch.nn.Conv1d(in_channels=d_model, out_channels=d_model, kernel_size=3, stride=1, padding=0)
        self.maxpool = torch.nn.MaxPool1d(kernel_size=2)
        self.dropout = torch.nn.Dropout(p=dropout_other)

    def init_weights(self):
        """Re-initialise the decoder's linear layers uniformly in ±0.1.

        NOTE(review): the source line was garbled
        (``initrange = 0.1, initrange), initrange)``). Only the value 0.1 and
        two ``(-initrange, initrange)``-style calls can be inferred; which
        tensors were targeted is an assumption — confirm before relying on it.
        This method is not called from ``__init__``.
        """
        initrange = 0.1
        for layer in self.decoder:
            if isinstance(layer, nn.Linear):
                layer.weight.data.uniform_(-initrange, initrange)
                layer.bias.data.zero_()

    def forward(self, src):
        """Map ``src`` of shape [batch, 12, signal_length] to [batch, n_class] logits."""
        src = self.relu(self.conv1(src))
        src = self.relu(self.conv2(src))
        # The same `conv` module (shared weights) is applied on every pass.
        for _ in range(self.n_conv_layers):
            src = self.relu(self.conv(src))
            src = self.maxpool(src)

        # reshape from [batch, embedding dim., sequence] --> [sequence, batch, embedding dim.]
        src = src.permute(2, 0, 1)
        # Encode positions only after the permute: the encoder indexes its
        # table by dim 0 and adds along the last dim. Doing it on the
        # [batch, embedding, sequence] layout indexed positions by batch and
        # forced the sequence length to equal the table width.
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)  # output: [sequence, batch, embedding dim.]
        output = output.permute(1, 0, 2)  # [batch, sequence, embedding dim.]
        output = self.self_att_pool(output)  # [batch, embedding dim.]
        logits = self.decoder(output)  # output: [batch, n_class]

        return logits

Here is my PyTorch implementation of a Transformer model, which I am using for ECG disease classification. The input to the model has shape [12, signal_length], one row for each of the 12 leads.
While training, I am getting NaN values in the loss and also in the model parameters.
I cannot figure out what is wrong.

Try reducing your learning rate by one or more orders of magnitude.