torch.nn.Conv1d producing NaNs in A3C


I am trying to extract some features from time-series data with a window size of 50. However, after some A3C training, the outputs of nn.Conv1d turn into NaNs.

Can someone please help me figure out what I am missing here?

Below is my code for the actor-critic model:

# Actor-critic network: two Conv1d layers feed an LSTMCell, followed by optional
# fully connected layers and separate critic / actor output heads.
# NOTE(review): the method definitions below have lost their class-level
# indentation in this paste; they are meant to be members of ActorCritic.
class ActorCritic(nn.Module):

# Build all layers from the hyper-parameters carried by `params`.
# Reads: window, state_dim, action_dim, hidden_size, out_channels,
# kernel_size1, kernel_size2, layers. Writes: params.lstm_size (side effect).
def __init__(self, params):
    super(ActorCritic, self).__init__()

    self.window = params.window
    self.state_dim = params.state_dim
    self.action_space = params.action_dim
    self.hidden_size = params.hidden_size
    # Both convolutions derive stride and padding from the kernel size
    # (stride = k - 1, padding = k - 2), so k must be at least 2 for these
    # values to be valid.
    self.conv1 = nn.Conv1d(1, params.out_channels, params.kernel_size1, stride = params.kernel_size1 - 1, padding = params.kernel_size1 - 2)
    self.conv2 = nn.Conv1d(params.out_channels, params.out_channels, params.kernel_size2, stride = params.kernel_size2 - 1, padding = params.kernel_size2 - 2)
    # The LSTM width is probed by running a dummy window through the conv stack.
    self.lstm_size = self.calc_inputs(params.window)
    lstm_size = self.lstm_size
    # Publish the computed size on params so code outside the model can read it.
    params.lstm_size = lstm_size
    # Input size and hidden size of the LSTM cell are the same value.
    self.lstm = nn.LSTMCell(lstm_size, lstm_size)
    # `lst` holds the layer widths of the fully connected stack, starting at
    # the LSTM output width.
    lst = [lstm_size]
    # NOTE(review): the body of this loop is missing from the paste; presumably
    # it appended one hidden width to `lst` per layer -- confirm against the
    # original source.
    for i in range(params.layers):
    self.hidden = nn.ModuleList()
    # One Linear layer per consecutive pair of widths in `lst`.
    for k in range(len(lst)-1):
        self.hidden.append(nn.Linear(lst[k], lst[k+1]))
    # NOTE(review): the body of this loop is missing as well; possibly a
    # per-layer weight initialisation -- confirm against the original source.
    for layer in self.hidden:

    # Critic head emits a single value; actor head emits one value per action.
    self.critic_linear = nn.Linear(lst[-1], 1)
    self.actor_linear = nn.Linear(lst[-1], self.action_space)

def forward(self, inputs):
    """Run one actor-critic step through the conv stack, LSTM cell and heads.

    Args:
        inputs: ``(state, (hx, cx))``. ``state[1]`` is passed through both
            convolutions; ``state[0]`` is concatenated in front of the
            flattened convolution output along dim 1. ``(hx, cx)`` is the
            previous LSTM cell state.
            Assumes ``state[0]`` is a ``(1, k)`` tensor and ``state[1]`` is
            ``(1, 1, window)`` -- TODO confirm against the environment.

    Returns:
        ``(value, action_values, (hx, cx))``: the critic output, the actor
        output and the updated LSTM state.
    """
    state, (hx, cx) = inputs
    conv1_out = self.conv1(state[1])
    # reshape(1, -1) flattens every channel into a single row, which fixes the
    # batch size at 1.
    conv2_out = self.conv2(conv1_out).reshape(1, -1)
    # The pasted line had lost its `torch.cat(` call (unbalanced parenthesis);
    # the list argument and `dim=1` keyword are kept as they were.
    lstm_input = torch.cat([state[0], conv2_out], dim=1)
    hx, cx = self.lstm(lstm_input, (hx, cx))
    x = hx
    for layer in self.hidden:
        x = F.elu(layer(x))
    return self.critic_linear(x), self.actor_linear(x), (hx, cx)

def calc_inputs(self,num_inputs):
    """Return the LSTM input width for a window of ``num_inputs`` samples.

    A random probe of shape ``(1, 1, num_inputs)`` is pushed through both
    convolutions; the result is the number of elements they produce plus 3.
    """
    probe = torch.rand(num_inputs).reshape(1, 1, -1)
    features = self.conv1(probe)
    features = self.conv2(features)
    # NOTE(review): the constant 3 presumably accounts for the extra features
    # concatenated in forward() -- confirm.
    return features.numel() + 3
def save(self, filename, directory):
    """Write this module's state dict to ``<directory>/<filename>_actor.pth``.

    The pasted line had the ``def`` header fused with the tail of its body;
    the ``torch.save(self.state_dict(), ...)`` call is restored so that the
    file written here is the one ``load`` reads back.
    """
    torch.save(self.state_dict(), '%s/%s_actor.pth' % (directory, filename))

def load(self, filename, directory):
    """Restore parameters from ``<directory>/<filename>_actor.pth``."""
    checkpoint_path = '%s/%s_actor.pth' % (directory, filename)
    state = torch.load(checkpoint_path)
    self.load_state_dict(state)

And here is the code for training:

            # A3C worker training loop: roll out up to params.num_steps steps,
            # then compute value / policy losses and push gradients to the
            # shared model.
            # NOTE(review): this paste is truncated in several places (flagged
            # below); the indentation jump after `info = False` suggests an
            # enclosing loop header is missing.
            state = env.reset()
            count = 0
            done = True
            info = False
                # Per-rollout buffers.
                # NOTE(review): the statements that append to these lists are
                # not present in the paste.
                values = []
                log_probs = []
                rewards = []
                entropies = []

                for step in range(params.num_steps):
                    # When `info` is set, the LSTM state is zeroed and the
                    # environment is reset.
                    # NOTE(review): the re-wrap of h_out via `.data` just below
                    # looks like it belonged to a missing `else:` branch
                    # (detach state between rollouts) -- confirm. h_out is also
                    # read before any visible assignment when `info` is False.
                    if(info == True):
                        h_out = (Variable(torch.zeros([1, params.lstm_size])), Variable(torch.zeros([1, params.lstm_size])))
                        state = env.reset()
                        h_out = (Variable(h_out[0].data), Variable(h_out[1].data))

                    h_in = h_out
                    value, action_values, h_out = model(((state), h_in))
                    action_values = action_values.reshape(-1,)
                    # The max is subtracted before softmax to keep the
                    # exponentials bounded.
                    # NOTE(review): taking torch.log of a softmax output can
                    # still give -inf when a probability underflows to 0, and
                    # `log_prob * prob` then evaluates to NaN; F.log_softmax
                    # computes the same quantity without that failure mode.
                    prob = F.softmax(action_values - max(action_values), dim = 0)    
                    log_prob = torch.log(prob)
                    entropy = -(log_prob * prob).sum()
                    # epsilon_greedy is defined outside this snippet.
                    action = epsilon_greedy(prob, epsilon)
                    # action = Categorical(prob).sample().reshape(-1,)
                    # Pick the log-probability of the chosen action.
                    log_prob_a = log_prob.gather(0, Variable(action))
                    # print("action_values:",action_values)
                    # print("prob:",prob)
                    # print("log_prob:",log_prob)
                    # print("action:",action, "log_prob_a:",log_prob_a)
                    # env.step returns five values here; the last one is ignored.
                    (state), reward, done, info, _ = env.step(action)
                    # Clip the reward to the range [-1, 1].
                    reward = max(min(reward, 1), -1) 
                    count +=1
                    if done:
                        (state) = env.reset()

                    # NOTE(review): the body of this `if` is missing from the
                    # paste (likely a `break`, plus the buffer appends before it).
                    if done:
                # Bootstrap value for the return: zero at episode end, otherwise
                # taken from the critic.
                R = torch.zeros(1, 1)
                if not done:
                    value, _, _ = model(((state), h_out))
                    # NOTE(review): the right-hand side is cut off in the paste;
                    # presumably the critic's `value` -- confirm.
                    R =
                policy_loss = 0
                value_loss = 0
                R = Variable(R)
                gae = torch.zeros(1, 1).to(device)
                # Walk the rollout backwards, accumulating the discounted return
                # and the generalized advantage estimate.
                for i in reversed(range(len(rewards))):
                    R = params.gamma * R + rewards[i]
                    advantage = R - values[i]
                    value_loss = value_loss + 0.5 * advantage.pow(2)
                    # TD residual uses detached values (.data); values[i + 1]
                    # requires `values` to hold one more entry than `rewards`.
                    TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data
                    gae = gae * params.gamma * params.tau + TD
                    # Policy-gradient term plus an entropy bonus weighted by 0.01.
                    policy_loss = policy_loss - log_probs[i] * gae - 0.01 * entropies[i]

                (policy_loss + 0.5 * value_loss).mean().backward()
                # Clip the global gradient norm to 40 before handing gradients
                # to the shared model.
                nn.utils.clip_grad_norm_(model.parameters(), 40)
                # ensure_shared_grads is defined outside this snippet.
                ensure_shared_grads(model, shared_model)
  # NOTE(review): orphaned tail of a truncated call (possibly a model save).
  , directory="./pytorch_models")

Is your loss exploding before you are seeing the NaN values?
If not, check all torch.log calls etc. for valid input values and make sure the output is not -Inf or NaN.


Everywhere in the code, torch.log is preceded by a softmax layer.
So I am assuming it will receive valid inputs.

Also, the value function should range between -10 and 20. Therefore, the loss function should not explode either.

But let me try to confirm whether loss is exploding.

Besides, I am unable to understand why the convolution output is NaN for valid inputs. Does it have weights that are updated?

Also, is the way I am sampling the action OK?
Sometimes it raises an error saying that it has been asked to sample from a negative probability. But I am using softmax to calculate the probabilities. Let me try to reproduce the error and check what the probability values are.

Yes, convolution layers are trainable and have a weight (filters) and bias parameter.
If the loss is exploding, and thus the gradients are large in magnitude, the parameter updates might lead to overflows.


Thanks for the reply.

Using the tanh activation function seems to have resolved the issue.
The negative probability was also caused by the exploding values.