I am implementing a paper for which I need to compute the Hessian and the gradient of a logistic regression model. I am referring to the two formulas given in the CMU ML class lecture notes, page 57.
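For reference, the two quantities I am trying to compute are the standard gradient and Hessian of L2-regularized logistic regression, which is what I believe the lecture formulas correspond to (I am assuming the mean-over-n convention here, since my loss uses a mean reduction):

    ∇w L(w)  = (1/n) · Xᵀ(σ(Xw) − y) + λw
    ∇²w L(w) = (1/n) · Xᵀ S X + λI,   where S = diag(σ(xᵢᵀw)(1 − σ(xᵢᵀw)))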
My code is as follows:
import torch
import torch.nn as nn

def gradient_by_hand(loader, model, criterion, LAMBDA):
    """Average the L2-regularized gradient over all batches."""
    grad = []
    for i, (data, target) in enumerate(loader):
        model.zero_grad()  # reset accumulated gradients before each backward pass
        y_hat = model(data)
        target[target == -1] = 0  # map labels from {-1, 1} to {0, 1} for BCE
        loss = criterion(y_hat, target.to(torch.float32))
        loss.backward()
        # gradient of the data loss plus the L2 penalty term
        g = model.linear.weight.grad + LAMBDA * model.linear.weight.data
        grad.append(g)
    return torch.stack(grad, dim=0).mean(dim=0)
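To sanity-check the closed-form gradient against what backward() gives me, I use this small standalone snippet (a rough sketch: the toy data X, y, w here are made up, and I assume criterion is nn.BCELoss with the default mean reduction):

import torch

# hypothetical toy problem, just for checking conventions
n, d = 32, 5
X = torch.randn(n, d)
y = torch.randint(0, 2, (n,)).to(torch.float32)
w = torch.randn(d, requires_grad=True)

loss = torch.nn.functional.binary_cross_entropy(torch.sigmoid(X @ w), y)
loss.backward()

# closed form with mean reduction: (1/n) * X^T (sigmoid(Xw) - y)
g_closed = X.T @ (torch.sigmoid(X @ w.detach()) - y) / n
print(torch.allclose(w.grad, g_closed, atol=1e-6))  # expect True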
def hessian_by_hand(loader, model, LAMBDA):
    """Average the regularized Hessian X^T S X + lambda*I over all batches."""
    hessian = []
    for i, (data, target) in enumerate(loader):
        y_hat = model(data).detach()  # no autograd needed for the closed-form Hessian
        S = torch.diag(y_hat * (1 - y_hat))  # diagonal of sigmoid'(x_i^T w)
        h = torch.matmul(torch.matmul(data.T, S), data) + LAMBDA * torch.eye(data.shape[1])
        hessian.append(h)
    return torch.stack(hessian, dim=0).mean(dim=0)
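Similarly, to check the Hessian scaling I compare the closed form against autograd on the same toy data (a sketch; X, y, w, n as in the snippet above; note the 1/n factor that comes with a mean-reduced loss):

from torch.autograd.functional import hessian

def mean_bce(w_):
    return torch.nn.functional.binary_cross_entropy(torch.sigmoid(X @ w_), y)

H_auto = hessian(mean_bce, w.detach())

p = torch.sigmoid(X @ w.detach())
S = torch.diag(p * (1 - p))
H_closed = X.T @ S @ X / n  # (1/n) X^T S X for a mean-reduced loss
print(torch.allclose(H_auto, H_closed, atol=1e-5))  # expect True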
where the model is as follows:
class LogisticRegression(nn.Module):
    def __init__(self, input_size, num_classes):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_size, num_classes, bias=False)

    def forward(self, x):
        # squeeze assumes a single output unit (num_classes == 1)
        out = torch.squeeze(torch.sigmoid(self.linear(x)))
        return out
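For completeness, this is how I instantiate it for binary classification (num_classes=1, which is what the squeeze in forward assumes):

model = LogisticRegression(input_size=d, num_classes=1)
criterion = nn.BCELoss()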
Does this seem like a valid implementation of the Hessian and gradient for the model above?
EDIT to add some context:
I am using the standard Newton method to optimize the model:
def standard_newton(n_epochs, loader, model, criterion, LAMBDA, eps):
    log.info("Running standard newton method")
    for i in tqdm(range(n_epochs)):
        g = gradient_by_hand(loader, model, criterion, LAMBDA)
        h = hessian_by_hand(loader, model, LAMBDA)
        # update weights: w <- w - H^{-1} g
        model.linear.weight.data -= torch.matmul(torch.inverse(h), torch.squeeze(g))
        # check for convergence
        if torch.norm(g) < eps:
            log.info("Converged because gradient norm is less than {}".format(eps))
            return model
    return model
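One thing I was unsure about in the update step: instead of forming the explicit inverse, I could solve the linear system, which I understand is numerically preferable. A sketch of the alternative update line:

# alternative update: solve h @ delta = g rather than inverting h
delta = torch.linalg.solve(h, torch.squeeze(g))
model.linear.weight.data -= delta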