Why is it a bad idea?

What if we updated the learning rate itself using backprop? Why would that be a bad idea?

something like this:

"""Toy experiment: linear regression where the learning rate itself is
updated by backprop (a crude form of hypergradient descent).

Each iteration:
  1. Compute the loss and gradients w.r.t. the weights, with
     ``create_graph=True`` so the SGD step remains differentiable.
  2. Take a *virtual* SGD step, recompute the loss, and backprop through
     the step to obtain d(loss)/d(learning_rate).
  3. Apply the real weight update with the saved gradients and the
     freshly adjusted learning rate.
"""
import torch
import numpy as np

torch.manual_seed(42)
np.random.seed(42)

N = 64  # number of synthetic samples

# Ground-truth parameters of the synthetic model y = x @ beta + alpha.
alpha = 1.3
beta = np.array([[1.9], [1.5]])

x_data = np.random.randn(N, 2)
y_data = x_data.dot(beta) + alpha

x = torch.from_numpy(x_data).float()
y = torch.from_numpy(y_data).float()

# Trainable weights, plus the learning rate itself as a trainable leaf.
w_beta = torch.randn((2, 1), requires_grad=True)
w_alpha = torch.randn(1, requires_grad=True)
learning_rate = torch.tensor(0.01, requires_grad=True)

for t in range(10):
    # Re-leaf the tensors (dropping last iteration's graph) and clear the
    # .grad buffers. backward() ACCUMULATES into .grad, so without zeroing,
    # the weight gradients from the two backward passes would mix and
    # learning_rate.grad would sum over all iterations.
    for p in (learning_rate, w_alpha, w_beta):
        p.detach_().requires_grad_()
        p.grad = None

    # First pass: loss and gradients w.r.t. the current weights.
    # create_graph=True keeps the backward pass differentiable so the
    # virtual SGD step below carries gradient w.r.t. learning_rate.
    y_pred = x.mm(w_beta).add(w_alpha)
    loss = (y_pred - y).pow(2).sum()
    loss.backward(create_graph=True)
    print(loss.item())

    # Save the weight gradients before the second backward() adds to them.
    saved_w_beta_grad = w_beta.grad.detach().clone()
    saved_w_alpha_grad = w_alpha.grad.detach().clone()

    # Virtual SGD step — a function of learning_rate.
    w_beta2 = w_beta - w_beta.grad * learning_rate
    w_alpha2 = w_alpha - w_alpha.grad * learning_rate

    # Second pass: loss after the virtual step; backprop through the step
    # populates learning_rate.grad = d(loss)/d(learning_rate).
    y_pred2 = x.mm(w_beta2).add(w_alpha2)
    loss = (y_pred2 - y).pow(2).sum()
    loss.backward()

    print("HERE", learning_rate, learning_rate.grad)

    # Apply the real updates outside the autograd graph.
    with torch.no_grad():
        learning_rate -= 0.0000001 * learning_rate.grad  # meta learning rate
        w_beta = w_beta - saved_w_beta_grad * learning_rate
        w_alpha = w_alpha - saved_w_alpha_grad * learning_rate

There is a paper that does a similar thing:

Besides, it needs too much memory, and I suspect the meta learning rate (the one used to update the learned learning rate) is very sensitive.