Hi again.
I’m attempting to minimise x*x using autograd in c++:
torch::Tensor dynamic_parameters = torch::tensor(/*value=*/{17.2}, torch::dtype(torch::kFloat64).requires_grad(true));
torch::Tensor target = torch::tensor(/*value=*/{0}, torch::dtype(torch::kFloat64).requires_grad(false));
torch::optim::SGD optimizer({dynamic_parameters}, /*lr=*/0.001);
pybind11::gil_scoped_release no_gil;
for (int s = 0; s < 20; s ++)
{
torch::Tensor prediction = dynamic_parameters * dynamic_parameters;
torch::Tensor loss = torch::nn::functional::mse_loss(prediction, target);
std::cout << "guess: " << dynamic_parameters << std::endl;
std::cout << "loss: " << loss << std::endl;
optimizer.zero_grad();
loss.backward();
optimizer.step();
}
Steps 10-20 output:
guess: -2.5730
[ CPUDoubleType{1} ]
loss: 43.827
[ CPUDoubleType{} ]
guess: -2.5048
[ CPUDoubleType{1} ]
loss: 39.3659
[ CPUDoubleType{} ]
guess: -2.4420
[ CPUDoubleType{1} ]
loss: 35.5603
[ CPUDoubleType{} ]
guess: -2.3837
[ CPUDoubleType{1} ]
loss: 32.2869
[ CPUDoubleType{} ]
guess: -2.3295
[ CPUDoubleType{1} ]
loss: 29.4501
[ CPUDoubleType{} ]
guess: -2.2790
[ CPUDoubleType{1} ]
loss: 26.9751
[ CPUDoubleType{} ]
guess: -2.2316
[ CPUDoubleType{1} ]
loss: 24.8023
[ CPUDoubleType{} ]
guess: -2.1872
[ CPUDoubleType{1} ]
loss: 22.8843
[ CPUDoubleType{} ]
guess: -2.1453
[ CPUDoubleType{1} ]
loss: 21.1824
[ CPUDoubleType{} ]
guess: -2.1058
[ CPUDoubleType{1} ]
loss: 19.6651
[ CPUDoubleType{} ]
guess: -2.0685
[ CPUDoubleType{1} ]
loss: 18.3065
[ CPUDoubleType{} ]
guess: -2.0331
[ CPUDoubleType{1} ]
loss: 17.0851
[ CPUDoubleType{} ]
guess: -1.9995
[ CPUDoubleType{1} ]
loss: 15.9829
[ CPUDoubleType{} ]
-
It explodes with a learning rate of 0.005, which I guess surprises me a little.
-
The convergence seems quite slow - do we just expect to be running an optimiser step hundreds of times even for a simple function?
-
I note the loss from MSE is not… the prediction minus the target squared. Purely out of interest, what does that loss represent?
Thanks