Computing Gradients with respect to intermediate activations

(Surya Teja Chavali) #1

Hi,

I am looking to do the following in C++:

auto x = torch::zeros({5}, torch::requires_grad());
auto out_1 = torch::relu(W_1.matmul(x) + b_1);
auto out_2 = torch::relu(W_2.matmul(out_1) + b_2);
auto logits = torch::log_softmax(out_2, /*dim=*/0);
auto loss = logits[0] - logits[1];

I want to find the values of d(loss)/d(out_1) and d(loss)/d(out_2).

How do I do this? Attempts to do

loss.backward();

and then accessing

out_2.grad()

have seemingly failed: there is a runtime error which says

libc++abi.dylib: terminating with uncaught exception of type c10::Error: tensor with backend UNKNOWN_BACKEND does not have a device

I can provide a minimal breaking example if need be.
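
For context, in Python I would just call torch.autograd.grad(loss, [out_1, out_2]). A sketch of what I am hoping works in C++ (assuming torch::autograd::grad is exposed in my libtorch version) would be:

auto grads = torch::autograd::grad(
    /*outputs=*/{loss},
    /*inputs=*/{out_1, out_2},
    /*grad_outputs=*/{},
    /*retain_graph=*/true);
// grads[0] should be d(loss)/d(out_1), grads[1] should be d(loss)/d(out_2).
std::cout << grads[0] << "\n" << grads[1] << std::endl;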

(Siddhesh Thakur) #2

Make sure to push your tensors on a particular device!
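
For example (just a sketch; pick whatever device you actually want to run on):

torch::Device device(torch::cuda::is_available() ? torch::kCUDA : torch::kCPU);
auto x = torch::zeros({5}, torch::TensorOptions().device(device).requires_grad(true));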

(Surya Teja Chavali) #3

I think I have done that.
Here’s my code:

#include <iostream>
#include <vector>
#include <string>
#include <cassert>

#include <torch/torch.h>

#include "TorchNeuralNet.hpp"

// std::pair< std::vector<torch::Tensor>, std::vector<torch::Tensor> >
void getActsAndGrads(TorchNeuralNetwork& net, torch::Tensor& x, torch::Device device) //, torch::nn::Functional& L) 
{
    assert(x.numel() == net.getLayerInputSize(0));
    auto out = x;
    std::cout << "output_init" << std::endl;
    std::cout << out << std::endl;
    std::cout << out.options() << std::endl;
    std::cout << out.sizes() << std::endl;

    std::vector<torch::Tensor> activations;
    std::vector<torch::Tensor> gradients;
    // std::vector<torch::Tensor> act_gradients;

    //outputs
    for(unsigned i=0; i < net.getNumLayers(); i++) {

        std::cout << i << std::endl;

        auto w = net.getWeightTensor(i, device);
        auto b = net.getBiasTensor(i, device);

        // std::cout << "w" << i << std::endl;
        // std::cout << w << std::endl;
        // std::cout << w.options() << std::endl;

        // std::cout << "b" << i << std::endl;
        // std::cout << b << std::endl;
        // std::cout << b.options() << std::endl;

        out = w.matmul(out) + b;
        out = torch::relu(out);
        activations.push_back(out);

        assert(out.dtype() == torch::kFloat64);
        assert(out.layout() == torch::kStrided);
        assert(out.device().type() == torch::kCPU);
        assert(out.requires_grad());
        std::cout << "out_" << i << " asserts success!" << std::endl;

        // std::cout << "out" << i << std::endl;
        // std::cout << out << std::endl;
        // std::cout << out.options() << std::endl;
        // std::cout << out.sizes() << std::endl << std::endl;
        
        // std::cout << "\n\n\n";
    }

    out = out[0];
    std::cout << "out_final" << std::endl;
    std::cout << out << std::endl;
    std::cout << out.options() << std::endl;
    std::cout << out.sizes() << std::endl << std::endl;

    assert(out.dtype() == torch::kFloat64);
    assert(out.layout() == torch::kStrided);
    assert(out.device().type() == torch::kCPU);
    assert(out.requires_grad());
    std::cout << "out_final asserts success!" << std::endl;

    int k = 0;
    out.backward();
    for(auto& o: activations) {
        std::cout << "act_grad_" << k << std::endl;
        auto g = o.grad();
        gradients.push_back(g);
        std::cout << g << std::endl;
        std::cout << g.options() << std::endl;
        std::cout << g.sizes() << std::endl << std::endl;
        std::cout << "\n\n\n";
        k++;
    }
}

int main()
{
    const std::string FILE_NAME = "../nnet/ACASXU_run2a_1_1_batch_2000.nnet";
    TorchNeuralNetwork net(FILE_NAME);

    torch::DeviceType device_type;
    if (torch::cuda::is_available()) {
        std::cout << "CUDA available! Training on GPU." << std::endl;
        device_type = torch::kCUDA;
    } else {
        std::cout << "Training on CPU." << std::endl;
        device_type = torch::kCPU;
    }
    torch::Device device(device_type);

    auto options = torch::TensorOptions()
                    .dtype(torch::kFloat64)
                    .layout(torch::kStrided)
                    .device(device)
                    .requires_grad(true);

    auto x = torch::zeros({5}, options);

    assert(x.dtype() == torch::kFloat64);
    assert(x.layout() == torch::kStrided);
    assert(x.device().type() == torch::kCPU);
    assert(x.requires_grad());
    std::cout << "x asserts success!" << std::endl;


    std::cout << "x" << std::endl;
    std::cout << x << std::endl;
    std::cout << x.options() << std::endl;
    std::cout << x.sizes() << std::endl;
    // auto grads = 
    getActsAndGrads(net, x, device);
    return 0;
}

All asserts are successful; even the gradients.push_back(g) succeeds (I have verified this by separating the loops). Here’s the output:

Training on CPU.
x asserts success!
x
 0
 0
 0
 0
 0
[ Variable[CPUDoubleType]{5} ]
TensorOptions(dtype=double, device=cpu, layout=Strided, requires_grad=false)
[5]
output_init
 0
 0
 0
 0
 0
[ Variable[CPUDoubleType]{5} ]
TensorOptions(dtype=double, device=cpu, layout=Strided, requires_grad=false)
[5]
0
out_0 asserts success!
1
out_1 asserts success!
2
out_2 asserts success!
3
out_3 asserts success!
4
out_4 asserts success!
5
out_5 asserts success!
6
out_6 asserts success!
out_final
0
[ Variable[CPUDoubleType]{} ]
TensorOptions(dtype=double, device=cpu, layout=Strided, requires_grad=false)
[]

out_final asserts success!
act_grad_0
[ Tensor (undefined) ]
terminate called after throwing an instance of 'c10::Error'
  what():  tensor with backend UNKNOWN_BACKEND does not have a device (device at /nobackup/libtorch/include/c10/core/TensorImpl.h:442)
frame #0: std::function<std::string ()>::operator()() const + 0x11 (0x7f56da498b91 in /nobackup/libtorch/lib/libc10.so)
frame #1: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x2a (0x7f56da497d1a in /nobackup/libtorch/lib/libc10.so)
frame #2: c10::TensorImpl::device() const + 0x100 (0x559e4e318974 in ./int_grad)
frame #3: at::Tensor::device() const + 0x20 (0x559e4e319548 in ./int_grad)
frame #4: at::Tensor::options() const + 0x7d (0x559e4e31912b in ./int_grad)
frame #5: getActsAndGrads(TorchNeuralNetwork&, at::Tensor&, c10::Device) + 0x7a3 (0x559e4e3171ca in ./int_grad)
frame #6: main + 0x3d7 (0x559e4e317747 in ./int_grad)
frame #7: __libc_start_main + 0xe7 (0x7f56ce780b97 in /lib/x86_64-linux-gnu/libc.so.6)
frame #8: _start + 0x2a (0x559e4e31434a in ./int_grad)
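
Update: the undefined tensor at act_grad_0 makes me suspect the intermediate outputs are non-leaf tensors, so autograd never populates their .grad(). A guarded readout (plus a retain_grad() call on each activation, assuming retain_grad() is exposed on C++ tensors in my build) would look roughly like:

// In the forward loop, before storing the activation:
out.retain_grad();           // assumption: retain_grad() exists in this libtorch version
activations.push_back(out);

// After out.backward(), guard against undefined gradients:
for (auto& o : activations) {
    auto g = o.grad();
    if (g.defined()) {
        std::cout << g << std::endl;
    } else {
        std::cout << "no gradient retained for this activation" << std::endl;
    }
}
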
(Surya Teja Chavali) #4

@cndn can you help? I see that the thread “Is retain_grad() supported in new C++ API?” is kinda close to my issue. Thanks a lot!

I have to accumulate these gradients in a list data structure, which I want to pass into the constructor of the hook:

struct GradSaveHook : public torch::autograd::FunctionPreHook {
  // Hold a reference to the caller's vector so the gradients accumulate there
  // rather than in a private copy inside the hook.
  GradSaveHook(std::vector<std::vector<torch::autograd::Variable> >& g, torch::autograd::Variable* x)
      : v(x), gradients(g) {}
  torch::autograd::variable_list operator()(const torch::autograd::variable_list& grads) override {
    for (const auto& g : grads) {
      std::cout << "grad\n" << g << "\n" << std::endl;
    }
    gradients.push_back(grads);
    return grads;
  }
  torch::autograd::Variable* v;
  std::vector<std::vector<torch::autograd::Variable> >& gradients;
};

I then run the following loop:

for (unsigned i = 0; i < net.getNumLayers(); i++) {
    auto w = net.getWeightTensor(i, device);
    auto b = net.getBiasTensor(i, device);

    out = w.matmul(out) + b;
    out = torch::relu(out);
    activations.push_back(out);

    // make_shared forwards its arguments straight to the constructor.
    auto hook_ptr = std::make_shared<GradSaveHook>(gradients, &out);
    out.add_hook(hook_ptr);
}

Then, I define the loss:

auto loss = out[0];
loss.backward();
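
After backward(), the idea is to read the captured gradients back out of the gradients vector (a sketch, assuming each hook fires exactly once and that grads[0] is d(loss)/d(out) for the layer the hook was attached to):

for (size_t i = 0; i < gradients.size(); ++i) {
    for (const auto& g : gradients[i]) {
        if (g.defined()) {
            std::cout << "layer " << i << " grad:\n" << g << std::endl;
        }
    }
}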