Computing Gradients with respect to intermediate activations

Hi,

I am looking to do the following in C++:

auto x = torch::zeros({5}, torch::requires_grad());
auto out_1 = torch::relu( W_1.matmul(x) + b_1);
auto out_2 = torch::relu( W_2.matmul(out_1) + b_2);
auto logits = torch::log_softmax(out_2, /*dim=*/0);
auto loss = logits[0] - logits[1];

I want to find the values of d(loss)/d(out_1) and d(loss)/d(out_2).

How do I do this? Attempts to do

loss.backward();

and then accessing

out_2.grad()

have failed: I get a runtime error that says

libc++abi.dylib: terminating with uncaught exception of type c10::Error: tensor with backend UNKNOWN_BACKEND does not have a device

I can provide a minimal reproducing example if need be.

Make sure to push your tensors on a particular device!
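
For example, a minimal sketch (adapt the device and dtype to your setup):

auto opts = torch::TensorOptions().device(torch::kCPU).requires_grad(true);
auto x = torch::zeros({5}, opts);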

I think I have done that.
Here’s my code:

#include <iostream>
#include <vector>
#include <string>
#include <cassert>

#include <torch/torch.h>

#include "TorchNeuralNet.hpp"

// std::pair< std::vector<torch::Tensor>, std::vector<torch::Tensor> >
void getActsAndGrads(TorchNeuralNetwork& net, torch::Tensor& x, torch::Device device) //, torch::nn::Functional& L) 
{
    assert(x.numel() == net.getLayerInputSize(0));
    auto out = x;
    std::cout << "output_init" << std::endl;
    std::cout << out << std::endl;
    std::cout << out.options() << std::endl;
    std::cout << out.sizes() << std::endl;

    std::vector<torch::Tensor> activations;
    std::vector<torch::Tensor> gradients;
    // std::vector<torch::Tensor> act_gradients;

    //outputs
    for(unsigned i=0; i < net.getNumLayers(); i++) {

        std::cout << i << std::endl;

        auto w = net.getWeightTensor(i, device);
        auto b = net.getBiasTensor(i, device);

        // std::cout << "w" << i << std::endl;
        // std::cout << w << std::endl;
        // std::cout << w.options() << std::endl;

        // std::cout << "b" << i << std::endl;
        // std::cout << b << std::endl;
        // std::cout << b.options() << std::endl;

        out = w.matmul(out) + b;
        out = torch::relu(out);
        activations.push_back(out);

        assert(out.dtype() == torch::kFloat64);
        assert(out.layout() == torch::kStrided);
        assert(out.device().type() == torch::kCPU);
        assert(out.requires_grad());
        std::cout << "out_" << i << " asserts success!" << std::endl;

        // std::cout << "out" << i << std::endl;
        // std::cout << out << std::endl;
        // std::cout << out.options() << std::endl;
        // std::cout << out.sizes() << std::endl << std::endl;
        
        // std::cout << "\n\n\n";
    }

    out = out[0];
    std::cout << "out_final" << std::endl;
    std::cout << out << std::endl;
    std::cout << out.options() << std::endl;
    std::cout << out.sizes() << std::endl << std::endl;

    assert(out.dtype() == torch::kFloat64);
    assert(out.layout() == torch::kStrided);
    assert(out.device().type() == torch::kCPU);
    assert(out.requires_grad());
    std::cout << "out_final asserts success!" << std::endl;

    int k = 0;
    out.backward();
    for(auto& o: activations) {
        std::cout << "act_grad_" << k << std::endl;
        auto g = o.grad();
        gradients.push_back(g);
        std::cout << g << std::endl;
        std::cout << g.options() << std::endl;
        std::cout << g.sizes() << std::endl << std::endl;
        std::cout << "\n\n\n";
        k++;
    }
}

int main()
{
    const std::string FILE_NAME = "../nnet/ACASXU_run2a_1_1_batch_2000.nnet";
    TorchNeuralNetwork net(FILE_NAME);

    torch::DeviceType device_type;
    if (torch::cuda::is_available()) {
        std::cout << "CUDA available! Training on GPU." << std::endl;
        device_type = torch::kCUDA;
    } else {
        std::cout << "Training on CPU." << std::endl;
        device_type = torch::kCPU;
    }
    torch::Device device(device_type);

    auto options = torch::TensorOptions()
                    .dtype(torch::kFloat64)
                    .layout(torch::kStrided)
                    .device(device)
                    .requires_grad(true);

    auto x = torch::zeros({5}, options);

    assert(x.dtype() == torch::kFloat64);
    assert(x.layout() == torch::kStrided);
    assert(x.device().type() == torch::kCPU);
    assert(x.requires_grad());
    std::cout << "x asserts success!" << std::endl;


    std::cout << "x" << std::endl;
    std::cout << x << std::endl;
    std::cout << x.options() << std::endl;
    std::cout << x.sizes() << std::endl;
    // auto grads = 
    getActsAndGrads(net, x, device);
    return 0;
}

All the asserts succeed; even the gradients.push_back(g) works (I have verified this by separating the loops). Here’s the output:

Training on CPU.
x asserts success!
x
 0
 0
 0
 0
 0
[ Variable[CPUDoubleType]{5} ]
TensorOptions(dtype=double, device=cpu, layout=Strided, requires_grad=false)
[5]
output_init
 0
 0
 0
 0
 0
[ Variable[CPUDoubleType]{5} ]
TensorOptions(dtype=double, device=cpu, layout=Strided, requires_grad=false)
[5]
0
out_0 asserts success!
1
out_1 asserts success!
2
out_2 asserts success!
3
out_3 asserts success!
4
out_4 asserts success!
5
out_5 asserts success!
6
out_6 asserts success!
out_final
0
[ Variable[CPUDoubleType]{} ]
TensorOptions(dtype=double, device=cpu, layout=Strided, requires_grad=false)
[]

out_final asserts success!
act_grad_0
[ Tensor (undefined) ]
terminate called after throwing an instance of 'c10::Error'
  what():  tensor with backend UNKNOWN_BACKEND does not have a device (device at /nobackup/libtorch/include/c10/core/TensorImpl.h:442)
frame #0: std::function<std::string ()>::operator()() const + 0x11 (0x7f56da498b91 in /nobackup/libtorch/lib/libc10.so)
frame #1: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x2a (0x7f56da497d1a in /nobackup/libtorch/lib/libc10.so)
frame #2: c10::TensorImpl::device() const + 0x100 (0x559e4e318974 in ./int_grad)
frame #3: at::Tensor::device() const + 0x20 (0x559e4e319548 in ./int_grad)
frame #4: at::Tensor::options() const + 0x7d (0x559e4e31912b in ./int_grad)
frame #5: getActsAndGrads(TorchNeuralNetwork&, at::Tensor&, c10::Device) + 0x7a3 (0x559e4e3171ca in ./int_grad)
frame #6: main + 0x3d7 (0x559e4e317747 in ./int_grad)
frame #7: __libc_start_main + 0xe7 (0x7f56ce780b97 in /lib/x86_64-linux-gnu/libc.so.6)
frame #8: _start + 0x2a (0x559e4e31434a in ./int_grad)

@cndn can you help? I see that the topic "Is retain_grad() supported in new C++ API?" is quite close to my issue. Thanks a lot!
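
For reference, the usage I am after would look roughly like this (just a sketch, assuming retain_grad() is exposed on the C++ tensor in my libtorch version):

auto out_1 = torch::relu( W_1.matmul(x) + b_1);
out_1.retain_grad();                      // keep the grad of this non-leaf tensor
auto out_2 = torch::relu( W_2.matmul(out_1) + b_2);
out_2.retain_grad();
auto logits = torch::log_softmax(out_2, /*dim=*/0);
auto loss = logits[0] - logits[1];
loss.backward();
std::cout << out_1.grad() << std::endl;   // d(loss)/d(out_1)
std::cout << out_2.grad() << std::endl;   // d(loss)/d(out_2)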

I have to accumulate these gradients in a list data structure, which I want to pass into the constructor of the hook:

struct GradSaveHook : public torch::autograd::FunctionPreHook {
  GradSaveHook(std::vector<std::vector<torch::autograd::Variable> >& g, torch::autograd::Variable* x)
      : v(x), gradients(g) {}
  torch::autograd::variable_list operator()(const torch::autograd::variable_list& grads) override {
    for (const auto& g : grads)
      std::cout << "grad:\n" << g << std::endl;
    gradients.push_back(grads);
    return grads;
  }
  torch::autograd::Variable* v;
  // store a reference so the grads end up in the caller's list, not in a copy
  std::vector<std::vector<torch::autograd::Variable> >& gradients;
};

I then run the following loop:

for(unsigned i=0; i < net.getNumLayers(); i++) {
        auto w = net.getWeightTensor(i, device);
        auto b = net.getBiasTensor(i, device);

        out = w.matmul(out) + b;
        out = torch::relu(out);
        activations.push_back(out);

        auto hook_ptr = std::make_shared<GradSaveHook>(gradients, &out);
        out.add_hook(hook_ptr);
    }

Then, I define the loss:

auto loss = out[0];
loss.backward();
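
After backward() returns, any grads the hooks captured should be sitting in gradients, so reading them back would be roughly (a sketch, assuming the hooks actually fire):

for (size_t i = 0; i < gradients.size(); ++i) {
    std::cout << "hook grads " << i << std::endl;
    for (const auto& g : gradients[i])
        std::cout << g << std::endl;
}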

@teja5832 Do you have equivalent Python code that you successfully ran? Looking at the C++ code you shared, there seem to be some issues (e.g. out.backward should be called with gradients). It would be best to work out the correct logic in Python first, and then translate it to C++.
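
For a non-scalar out, that call would look roughly like this (a sketch):

// a non-scalar output needs an explicit gradient of the same shape
out.backward(torch::ones_like(out));
// a scalar loss such as out[0] can keep using plain loss.backward()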