Why am I getting a NaN in Normal(mu, std).rsample?

I am having a hard time debugging this issue. Roughly 100-150 iterations into training, I consistently get a NaN, which anomaly detection reports as coming from the Normal rsample call shown below. I don’t understand how the NaN could originate there and not in the mu, std computation itself. Why does it not pop up during selection_net's forward pass?

What makes this especially aggravating is that it only happens 100-150 iterations in, which means everything runs fine up to that point.

Synopsis of failing code:

selection_net = PolicySelectionNormal(...)
mu, std = selection_net(batch_data, node_sample, category_sample)
pi_distribution = Normal(mu, std)
action = pi_distribution.rsample()
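
For debugging, here is a guarded version of the same call (just a sketch, using the same names as in the snippet above) that checks the parameters before sampling:

import torch
from torch.distributions import Normal

mu, std = selection_net(batch_data, node_sample, category_sample)

# Sketch: confirm the distribution parameters are finite and positive,
# so a bad mu/std fails here rather than somewhere downstream.
assert torch.isfinite(mu).all(), "non-finite mu"
assert torch.isfinite(std).all() and (std > 0).all(), "non-finite or non-positive std"

pi_distribution = Normal(mu, std, validate_args=True)
action = pi_distribution.rsample()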

Error I get:

/home/me/miniconda3/envs/myenv/lib/python3.7/site-packages/torch/autograd/__init__.py:132: UserWarning: Error detected in AddBackward0. Traceback of forward call that caused the error:
  File "main.py", line 459, in <module>
    run(opts)
  File "main.py", line 138, in run
    return train(opts, miner)
  File "main.py", line 223, in train
    log_ = miner.train()
  File "/home/me/Code/mydir/rl/pg.py", line 34, in train
    loss, log = self.compute_loss()
  File "/home/me/Code/mydir/rl/pg.py", line 64, in compute_loss
    obs, masks, acts)
  File "/home/me/Code/mydir/rl/policies/base.py", line 793, in get_probs_values_entropies
    pi_distribution)
  File "/home/me/Code/mydir/rl/policies/base.py", line 702, in get_squashed_action_prob_entropy
    action = distribution.rsample()
  File "/home/me/miniconda3/envs/myenv/lib/python3.7/site-packages/torch/distributions/normal.py", line 68, in rsample
    return self.loc + eps * self.scale
 (Triggered internally at  /opt/conda/conda-bld/pytorch_1607370156314/work/torch/csrc/autograd/python_anomaly_mode.cpp:104.)
  allow_unreachable=True)  # allow_unreachable flag
Traceback (most recent call last):
  File "main.py", line 459, in <module>
    run(opts)
  File "main.py", line 138, in run
    return train(opts, miner)
  File "main.py", line 223, in train
    log_ = miner.train()
  File "/home/me/Code/mydir/rl/pg.py", line 35, in train
    loss.backward()
  File "/home/me/miniconda3/envs/myenv/lib/python3.7/site-packages/comet_ml/monkey_patching.py", line 293, in wrapper
    return_value = original(*args, **kwargs)
  File "/home/me/miniconda3/envs/myenv/lib/python3.7/site-packages/torch/tensor.py", line 221, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/me/miniconda3/envs/myenv/lib/python3.7/site-packages/torch/autograd/__init__.py", line 132, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: Function 'AddBackward0' returned nan values in its 0th output.

Network that has the issue:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool


class PolicySelectionNetwork(nn.Module):
    """Continuous policy for selection."""

    def __init__(self, num_node_features, hidden_size, embedding_dim,
                 num_node_out, num_choice_out, num_layers):
        super().__init__()

        self.node_embeddings = nn.Embedding(num_node_out, embedding_dim)
        self.choice_embeddings = nn.Embedding(num_choice_out, embedding_dim)
        layers = [GCNConv(num_node_features, hidden_size)]
        for i in range(num_layers):
            layers.append(GCNConv(hidden_size, hidden_size))
        self.layers = nn.ModuleList(layers)

        self.conversion_layer = nn.Linear(hidden_size, embedding_dim)
        layers = [nn.Linear(embedding_dim, hidden_size)]
        self.combined_layers = nn.ModuleList(layers)

        self.mu_layer = nn.Linear(hidden_size, 1)
        self.log_std_layer = nn.Linear(hidden_size, 1)

    def forward(self, observations, nodes, choices):
        """Compute the policy and the logprobs.

        Args:
        - observations: torch.geometric Batch.
        - nodes: A batch of one-hot node actions.
        - choices: A batch of one-hot choice actions.
        """
        px, edge_index = observations.x, observations.edge_index
        for layer in self.layers[:-1]:
            px = layer(px, edge_index)
            px = F.relu(px)
        px = self.layers[-1](px, edge_index)
        global_px = global_mean_pool(px, observations.batch)
        global_px = self.conversion_layer(global_px)

        nodes = self.node_embeddings(nodes)
        choices = self.choice_embeddings(choices)

        policy = global_px + nodes + choices
        for layer in self.combined_layers:
            policy = layer(policy)
            policy = F.relu(policy)

        mu = self.mu_layer(policy)
        log_std = self.log_std_layer(policy)
        log_std = torch.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX)
        std = torch.exp(log_std)
        return mu, std

Is LOG_STD_MIN negative? In any case, it would be great to see some of the values of std from the first code snippet.

LOG_STD_MIN = -20
LOG_STD_MAX = 2
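
For reference, those bounds put std (after torch.exp) roughly between 2.1e-9 and 7.39:

import math

# Quick check of the std range implied by the clamp bounds above:
print(math.exp(LOG_STD_MIN))  # ~2.06e-09
print(math.exp(LOG_STD_MAX))  # ~7.39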

Here’s a sample from iter 105:

mu: tensor([[ 1.3312],
        [ 1.2513],
        [-0.2568],
        [ 0.3249],
        [ 1.4733],
        [ 0.5431],
        [ 0.3800],
        [ 1.1068]])
logstd: tensor([[1.9672],
        [2.0967],
        [0.7903],
        [1.8929],
        [2.8613],
        [1.8218],
        [1.7235],
        [1.9602]])

I’m printing out everything that comes before this computation in that function and not seeing any NaNs anywhere. Pretty baffled by this right now. Any ideas?
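
One more thing I can try (just a sketch; it assumes the training loop in pg.py can see selection_net and loss, and that anomaly detection is switched off so backward() completes) is checking whether a gradient, rather than a forward value, is the first thing to go non-finite:

# Sketch: let backward() run without anomaly mode, then look for the first
# parameter whose gradient contains NaN/inf.
loss.backward()
for name, p in selection_net.named_parameters():
    if p.grad is not None and not torch.isfinite(p.grad).all():
        print(f"non-finite grad in {name}, weight max {p.detach().abs().max().item():.3g}")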

Any solutions? I’ve been getting a similar issue, but while training a simple VAE (sampling from a Normal when reparameterizing).

It runs for some epochs before giving NaNs:

ValueError                                Traceback (most recent call last)
/tmp/ipykernel_17275/3251967212.py in <module>
     25 
     26             # forward propagation
---> 27             x_hat, mu, logvar, z, y_pred = model(x)
     28 
     29             # calculate total loss

~/anaconda3/envs/torch/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1100         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1101                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102             return forward_call(*input, **kwargs)
   1103         # Do not call functions when jit is used
   1104         full_backward_hooks, non_full_backward_hooks = [], []

/tmp/ipykernel_17275/3191899913.py in forward(self, x)
     60     def forward(self, x):
     61         mu, logvar = self.encode(x)
---> 62         z = self.reparameterize(mu, logvar)
     63         x_hat = self.decode(z)
     64         y = self.pred_net(z)

/tmp/ipykernel_17275/3191899913.py in reparameterize(self, mu, logvar)
     51     def reparameterize(self, mu, logvar):
     52         std = torch.exp(logvar/2)
---> 53         z = torch.distributions.Normal(mu, std).rsample()
     54         return z
     55 

~/anaconda3/envs/torch/lib/python3.9/site-packages/torch/distributions/normal.py in __init__(self, loc, scale, validate_args)
     48         else:
     49             batch_shape = self.loc.size()
---> 50         super(Normal, self).__init__(batch_shape, validate_args=validate_args)
     51 
     52     def expand(self, batch_shape, _instance=None):

~/anaconda3/envs/torch/lib/python3.9/site-packages/torch/distributions/distribution.py in __init__(self, batch_shape, event_shape, validate_args)
     53                 valid = constraint.check(value)
     54                 if not valid.all():
---> 55                     raise ValueError(
     56                         f"Expected parameter {param} "
     57                         f"({type(value).__name__} of shape {tuple(value.shape)}) "

ValueError: Expected parameter loc (Tensor of shape (256, 15)) of distribution Normal(loc: torch.Size([256, 15]), scale: torch.Size([256, 15])) to satisfy the constraint Real(), but found invalid values:
tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
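
In my case the ValueError already shows that loc (the encoder's mu) is full of NaNs before Normal is even constructed, so the NaN must be produced upstream of rsample. A guard like this (just a sketch, reusing the reparameterize method from the traceback) at least fails a step earlier and points at the encoder:

def reparameterize(self, mu, logvar):
    # Sketch: fail fast if the encoder already produced non-finite values,
    # so the error points at encode() instead of the sampling line.
    if not (torch.isfinite(mu).all() and torch.isfinite(logvar).all()):
        raise RuntimeError("encoder produced non-finite mu/logvar")
    std = torch.exp(logvar / 2)
    return torch.distributions.Normal(mu, std).rsample()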