Masking unrelated input with zeros in an LSTM causes NaN gradients

My PyTorch version is 0.3.
When I train an LSTM with a reinforcement learning method (A2C), the input of one time step is the output of the previous time step. However, some entries of the previous step's output are unrelated, so I have to modify the output variable by masking them with zeros. I have tried two ways to implement this. The first uses an in-place operation, like the following code:

    def forward(self, x):
        hidden, cell = self.init_hidden()
        outputs = []
        for i in range(1, self.seq_len+1):
            hidden, cell = self.lstmcell(x, (hidden, cell))
            output = F.sigmoid(self.classfier(hidden))
            if i < self.seq_len:
                output = output.clone()   # clone first: writing zeros in place into the sigmoid output itself breaks its backward
                output[:, i:] = 0         # zero the unrelated positions
            
            output = output.clamp(1e-15, 1-1e-15)
            outputs.append(output)
            x = output

        outputs = torch.cat(outputs, dim=0)
        states = (hidden, cell)
        return outputs, states

The other way is like this:

    def forward(self, x):
        hidden, cell = self.init_hidden()
        outputs = []
        for i in range(1, self.seq_len+1):
            hidden, cell = self.lstmcell(x, (hidden, cell))
            output = F.sigmoid(self.classfier(hidden))
            if i < self.seq_len:
                mask = torch.cat([torch.ones(output.size(0), i), \
                                  torch.zeros(output.size(0), self.seq_len-i)], dim=1)
                mask = Variable(mask.cuda())
                output = output*mask

            output = output.clamp(1e-15, 1-1e-15)
            outputs.append(output)
            x = output

        outputs = torch.cat(outputs, dim=0)
        states = (hidden, cell)
        return outputs, states
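
For what it is worth, the two variants give identical forward values: the masked columns are zero either way, and only the way the zeros enter the autograd graph differs. A minimal standalone check with toy sizes (not my real model):

    import torch
    from torch.autograd import Variable

    out = Variable(torch.rand(4, 6), requires_grad=True)   # stand-in for the sigmoid output
    i = 3                                                   # current time step

    # variant 1: clone, then write zeros in place
    masked_a = out.clone()
    masked_a[:, i:] = 0

    # variant 2: multiply by a constant 0/1 mask
    mask = Variable(torch.cat([torch.ones(4, i), torch.zeros(4, 6 - i)], dim=1))
    masked_b = out * mask

    print((masked_a.data - masked_b.data).abs().max())      # prints 0.0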

The code runs normally, but after training for tens of epochs the gradient becomes NaN, which then turns all parameters of the LSTM into NaN. While debugging I have checked that the forward computation results and the loss are still normal right before the gradient becomes NaN.
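
To pin down when the gradients blow up, a check like the following can be run between loss.backward() and the optimizer step (check_grads is just a throwaway debugging helper, not part of the model):

    import math

    def check_grads(model):
        # Report any parameter whose gradient contains NaN/Inf and
        # return the overall gradient norm (call after loss.backward()).
        total_norm = 0.0
        for name, p in model.named_parameters():
            if p.grad is None:
                continue
            g = p.grad.data
            if (g != g).any() or (g.abs() == float('inf')).any():
                print('NaN/Inf gradient in', name)
            total_norm += g.norm() ** 2
        return math.sqrt(total_norm)

Incidentally, torch.nn.utils.clip_grad_norm returns the total norm it computed before clipping, so logging its return value inside train() also shows whether the norm grows gradually or jumps straight to NaN.
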
Of course, I have tried

         torch.nn.utils.clip_grad_norm(agent.parameters(), 0.25)

and it does not seem to help. The complete code is shown here (args and get_reward are defined elsewhere in my script):

    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    from torch.autograd import Variable
    from torch.distributions import Bernoulli
    import tqdm


    class Agent(nn.Module):
        def __init__(self, batch_size, seq_len, input_size, hidden_size, num_class):
            super(Agent, self).__init__()
            self.batch_size = batch_size
            self.seq_len = seq_len
            self.input_size = input_size
            self.cell_size = self.hidden_size = hidden_size
            self.lstmcell = nn.LSTMCell(self.input_size, self.hidden_size)
            self.classfier = nn.Linear(self.hidden_size, num_class)

        def init_hidden(self):
            return (Variable(torch.zeros(self.batch_size, self.hidden_size).cuda()),\
                    Variable(torch.zeros(self.batch_size, self.cell_size).cuda()))

        def forward(self, x):
            hidden, cell = self.init_hidden()
            outputs = []
            for i in range(1, self.seq_len+1):
                hidden, cell = self.lstmcell(x, (hidden, cell))
                output = F.sigmoid(self.classfier(hidden))
                if i < self.seq_len:
                    # output = output.clone()    # the in-place variant needs clone() first
                    # output[:, i:] = 0
                    mask = torch.cat([torch.ones(output.size(0), i), \
                                      torch.zeros(output.size(0), self.seq_len-i)], dim=1)
                    mask = Variable(mask.cuda())
                    output = output*mask

                output = output.clamp(1e-15, 1-1e-15)
                outputs.append(output)
                x = output

            outputs = torch.cat(outputs, dim=0)
            states = (hidden, cell)
            return outputs, states


    def train(trainloader, epoch, num_blocks, dnet, agent, agent_optimizer, value_network, value_network_optimizer, block_config, growth_rate):
        agent.train()
        value_network.train()
        dnet.eval()
        layer_num = sum(block_config)
        for batch_idx, (inputs, targets) in tqdm.tqdm(enumerate(trainloader), total=len(trainloader)):
            inputs, targets = Variable(inputs.cuda()), Variable(targets.cuda(async=True))
            empty_embedding = Variable(torch.zeros(1, sum(block_config)).cuda())
            # hook = agent.output.register_hook(agent.print_grad)
            probs, _ = agent(empty_embedding)
            distribution = Bernoulli(probs)
            policy = distribution.sample()
            if args.cl_step < num_blocks:
                for i in range(layer_num-args.cl_step):
                    policy[i, 0:(i+1)] = 1          # in order to calculate the reward
                policy_mask = Variable(torch.ones(policy.size(0)).cuda())
                policy_mask[:-args.cl_step] = 0     # in order to calculate the loss
            else:
                policy_mask = None

            preds = dnet.forward(inputs, policy.data.cpu().numpy())
            qs, match = get_reward(preds, targets, policy.data, growth_rate, block_config)
            qs = Variable(qs)
            v_inputs = Variable(empty_embedding.data)
            vs = value_network(v_inputs).detach()
            advantage = qs - vs

            # agent loss
            loss = -distribution.log_prob(policy)
            loss = loss.sum(dim=1)
            if policy_mask is not None:
                loss = policy_mask.type(torch.cuda.FloatTensor)*loss
            loss = loss.sum()*advantage.sum()
            entropy_loss = -probs * torch.log(probs)
            entropy_loss = 0.01*args.beta * entropy_loss.sum()
            loss = loss/inputs.size(0) - entropy_loss
            # train the agent
            agent_optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm(agent.parameters(), 0.25)
            agent_optimizer.step()

            # value network loss
            criterion = nn.MSELoss().cuda()
            values = value_network(Variable(empty_embedding.data))
            values = values.repeat(inputs.size(0), 1)
            target_values = qs
            value_network_loss = criterion(values, target_values)
            # train the value network
            value_network_optimizer.zero_grad()
            value_network_loss.backward()
            value_network_optimizer.step()
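
Regarding the commented-out register_hook line in train(): registering a hook directly on probs prints the gradient that flows back into it during backward(), which makes it easier to see in which batch the NaN first appears. A sketch (the hook body here is only an illustration):

    def print_grad(grad):
        g = grad.data
        print('probs grad: min %e, max %e, has NaN: %s'
              % (float(g.min()), float(g.max()), bool((g != g).any())))

    probs, _ = agent(empty_embedding)
    hook = probs.register_hook(print_grad)
    # ... compute the loss and call loss.backward() as above ...
    hook.remove()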

I have been stuck on this problem for a few days. I hope someone can help me. Thanks very much!