Gradients blow up when trying to get info about gradients

Hello,
The network I am training works fine on its own, but when I try to log the norms or the distributions of the gradients, it blows up towards the end of training. I checked GPU memory usage and went over my code, but could not find the reason. Below are the snippets I tried adding (the register_hook one is also extremely slow). I am not 100% sure these lines are the cause, but the problem shows up consistently whenever I add them.

    import functools
    import operator

    def sq_tensor_norm(M):
        # mean of the squared entries (add ** 0.5 to get an actual norm)
        num_elements = functools.reduce(operator.mul, list(M.size()))  # same as M.numel()
        return torch.sum(M ** 2) / num_elements

    for name, param in N.named_parameters():
        cur_w_norm = sq_tensor_norm(param)
        cur_g_norm = sq_tensor_norm(param.grad)

        cur_w_name = 'weights/N_' + name + '_norm'
        writer.add_scalar(cur_w_name, cur_w_norm.item(), epoch)

        cur_g_name = 'grads/N_' + name + '_norm'
        writer.add_scalar(cur_g_name, cur_g_norm.item(), epoch)
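
For context, the loop above needs param.grad to be populated, so it runs after loss.backward() and before the gradients are zeroed again. Schematically it sits in the training loop roughly like this (loader, criterion, optimizer and num_epochs are placeholders for my actual setup, and log_weight_and_grad_norms is just the norm-logging loop above wrapped in a function):

    for epoch in range(num_epochs):
        for x, y in loader:
            optimizer.zero_grad()
            loss = criterion(N(x), y)
            loss.backward()
            optimizer.step()
        # logging happens here, once per epoch, while param.grad still holds
        # the gradients of the last batch of the epoch
        log_weight_and_grad_norms(N, writer, epoch)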

    import matplotlib.pyplot as plt

    for name, param in N.named_parameters():
        grad_array = param.grad.cpu().numpy().flatten()
        # 'normed' was removed in newer matplotlib versions; 'density' replaces it
        n, bins, patches = plt.hist(grad_array, 50, facecolor='g', density=True, alpha=0.75)
        del grad_array

        plt.xlabel('grads ' + name)
        plt.title('Histogram of grads for ' + name)
        plt.grid(True)
        writer.add_figure('grad_histograms/histogram_grads_' + name, plt.gcf(),
                          global_step=None, close=True, walltime=None)
        plt.close('all')
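
The same plot can also be drawn on an explicit figure object instead of going through pyplot's implicit current figure; I would expect this to behave the same, so the version below is just a sketch of that alternative:

    for name, param in N.named_parameters():
        grad_array = param.grad.cpu().numpy().flatten()
        fig, ax = plt.subplots()
        ax.hist(grad_array, 50, facecolor='g', density=True, alpha=0.75)
        ax.set_xlabel('grads ' + name)
        ax.set_title('Histogram of grads for ' + name)
        ax.grid(True)
        # add_figure with close=True releases the figure after rendering it
        writer.add_figure('grad_histograms/histogram_grads_' + name, fig, close=True)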

    with torch.no_grad():
        for name, param in N.named_parameters():
            writer.add_histogram('grads/N_' + name, param.grad.cpu().numpy(), epoch, bins=20)

    for name, param in N.named_parameters():
        writer.add_histogram('grads/N_' + name, param.grad.detach().cpu().numpy(), epoch, bins=20)

    def per_layer_grad_hist(writer, grad, name):
        grad_array = grad.cpu().numpy().flatten()
        n, bins, patches = plt.hist(grad_array, 50, facecolor='g', density=True, alpha=0.75)
        plt.xlabel('grads ' + name)
        plt.title('Histogram of grads for ' + name)
        plt.grid(True)
        writer.add_figure('grads/histogram_grads_' + name, plt.gcf(),
                          global_step=None, close=True, walltime=None)
        plt.close('all')

    def per_layer_grad_histograms(writer, name):
        # factory so each hook closes over its own layer name
        return lambda grad: per_layer_grad_hist(writer, grad, name)

    for name, param in N.named_parameters():
        F_cur = per_layer_grad_histograms(writer, name)
        param.register_hook(F_cur)
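
As far as I understand, once registered these hooks fire on every backward() call, once per parameter, which is presumably why this variant is so slow. register_hook also returns a handle, so the hooks can be detached again when the histograms are not needed; the bookkeeping below is only a sketch, not part of my actual code:

    hook_handles = []
    for name, param in N.named_parameters():
        hook_handles.append(param.register_hook(per_layer_grad_histograms(writer, name)))

    # ... run a few epochs with the histogram hooks active ...

    # detach the hooks again
    for handle in hook_handles:
        handle.remove()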