Action values go to NaN for all actions while training A3C

I am using PyTorch to build an A3C with 4 processes, as in the code below.

To my surprise, while training the A3C the action values become NaN for all actions. This was not the case at the start of training.

But after training overnight they turn to NaN. Can someone please help me figure out what the issue is?

import traceback

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


class SharedAdam(torch.optim.Adam):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.99), eps=1e-8,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        # State initialization
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = 0
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)
    
                # share in memory
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()
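The 4 worker processes are started along these lines, using the ActorCritic and train defined below (a simplified sketch, not the exact launcher from my script; params and data come from config and dataset loading that are not shown):

import torch.multiprocessing as mp

shared_model = ActorCritic(params.state_dim, params.action_dim).double()  # double() because data is a DoubleTensor in train()
shared_model.share_memory()                       # parameters live in shared memory, visible to every worker
optimizer = SharedAdam(shared_model.parameters())

processes = []
for rank in range(4):
    p = mp.Process(target=train, args=(rank, params, shared_model, optimizer, data))
    p.start()
    processes.append(p)
for p in processes:
    p.join()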



class ActorCritic(torch.nn.Module):

    def __init__(self, num_inputs, action_space):
        super(ActorCritic, self).__init__()

        self.num_inputs = num_inputs
        self.action_space = action_space
        self.lstm = nn.LSTMCell(num_inputs, num_inputs)  # hidden size == num_inputs, so hx/cx are (batch, num_inputs)
        num_outputs = action_space
        self.fc1 = nn.Linear(num_inputs, 256)
        self.fc1.apply(init_weights)
        self.fc2 = nn.Linear(256, 256)
        self.fc2.apply(init_weights)
        self.critic_linear = nn.Linear(256, 1)
        self.critic_linear.apply(init_weights)
        self.actor_linear = nn.Linear(256, num_outputs)
        self.actor_linear.apply(init_weights)
        self.lstm.bias_ih.data.fill_(0)
        self.lstm.bias_hh.data.fill_(0)
        self.sig1 = nn.Sigmoid()
        self.train()

    def forward(self, inputs):
        inputs, (hx, cx) = inputs
        hx, cx = self.lstm(inputs, (hx, cx))
        x = self.sig1(self.fc1(hx))
        x = torch.tanh(self.fc2(x))
        return self.critic_linear(x), self.actor_linear(x), (hx, cx)
    
    def save(self, filename, directory):
        torch.save(self.state_dict(), '%s/%s_actor.pth' % (directory, filename))

    def load(self, filename, directory):
        self.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
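For reference, a single forward pass through this network looks like the sketch below (dummy sizes, only to show the expected shapes; init_weights is a small weight-initialisation helper defined elsewhere in my script):

state_dim, action_dim = 10, 3              # example sizes only
model = ActorCritic(state_dim, action_dim)

state = torch.zeros(1, state_dim)          # (batch, num_inputs)
hx = torch.zeros(1, state_dim)             # LSTM hidden state, same width as the input
cx = torch.zeros(1, state_dim)             # LSTM cell state

value, action_values, (hx, cx) = model((state, (hx, cx)))
# value:         shape (1, 1)           critic estimate
# action_values: shape (1, action_dim)  actor logits, turned into probabilities in train()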

Below is the code for training:

def train(rank, params, model, optimizer, data):
    try:
        data = data.dropna()

        count = 0

        data = torch.DoubleTensor(np.asarray(data))

        env = ENV(params.state_dim, params.action_dim, data)
        print("env created\n")
        # init training variables
        max_timesteps = data.shape[0] - 1
        state = env.reset()
        done = True
        episode_length = 0
        count = 0
        # outer loop: one pass per rollout
        while count < max_timesteps - 1:
            episode_length += 1
            if done:
                cx = Variable(torch.zeros(1, params.state_dim))
                hx = Variable(torch.zeros(1, params.state_dim))
            else:
                cx = Variable(cx.data)
                hx = Variable(hx.data)

            values = []
            log_probs = []
            rewards = []
            entropies = []
            # inner loop: collect one rollout of transitions
            while count < max_timesteps - 1:
                value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
                prob = F.softmax(action_values, dim=-1)
                log_prob = F.log_softmax(action_values, dim=-1).reshape(-1,)
                entropy = -(log_prob * prob).sum(1, keepdim=True)
                entropies.append(entropy)
                
                action = sample(prob)
                
                
                log_prob = log_prob.gather(0, Variable(action))
         
                state, reward, done = env.step(action)
                done = (done or count == max_timesteps-2)
                reward = max(min(reward, 1), -1)
                
                count += 1
                
                if done:
                    episode_length = 0
                    state = env.reset()
                    
                
                values.append(value)
                log_probs.append(log_prob)
                rewards.append(reward)
                print(ticker, "rank ", rank, " action:", action, "reward ", reward)

                if done:
                    break
                
            # bootstrap the return from the critic unless the rollout ended in a terminal state
            R = torch.zeros(1, 1)
            if not done:
                value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
                R = value.data
            values.append(Variable(R))
            policy_loss = 0
            value_loss = 0
            R = Variable(R)
            gae = torch.zeros(1, 1)
            # walk the rollout backwards: accumulate discounted returns and GAE
            for i in reversed(range(len(rewards))):
                R = params.gamma * R + rewards[i]
                advantage = R - values[i]
                value_loss = value_loss + 0.5 * advantage.pow(2)
                TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data
                gae = gae * params.gamma * params.tau + TD
                policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i]

            optimizer.zero_grad()
            (policy_loss + 0.5 * value_loss).backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 40)
            optimizer.step()
            
    except:
        traceback.print_exc()
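To narrow down where the NaNs first appear, a check can be dropped in right after the forward pass in the inner loop, e.g. (debugging sketch only, not part of the original code):

                value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
                if torch.isnan(action_values).any() or torch.isnan(value).any():
                    print("rank", rank, "step", count, "NaN detected:",
                          "action_values =", action_values.data, "value =", value.data)
                    raise RuntimeError("NaN in network output")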

Below is the code for sampling an action:

def sample(logits):
    noise = torch.rand(logits.shape)
    return torch.argmax(logits - torch.log(-torch.log(noise)), 1)

Your sample function might return invalid outputs.
torch.rand samples from [0, 1), so if it draws a 0, torch.log(noise) returns -Inf.
Try adding a small eps value to this operation, or check for other operations that could produce an invalid value.
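For example, clamping the uniform noise away from 0 before taking the logs would prevent the -Inf (a minimal sketch of that fix):

def sample(logits):
    eps = 1e-8
    # keep the noise strictly inside (0, 1) so neither log() produces -Inf/+Inf
    noise = torch.rand(logits.shape).clamp(eps, 1 - eps)
    return torch.argmax(logits - torch.log(-torch.log(noise)), 1)

As an aside, this Gumbel-max style sampling normally expects logits or log-probabilities, while the training loop passes in the softmax output prob; sampling directly with torch.multinomial(prob, 1).view(-1) would sidestep both issues.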