I am implementing a paper for which I need to compute the Hessian and the gradient of a logistic regression model. I am referring to the two formulas given in the CMU ML class lecture notes, page 57.
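For reference, the two quantities I am trying to compute are the standard gradient and Hessian of L2-regularized logistic regression, which is what I believe the lecture formulas correspond to (I am assuming the mean-over-n convention here, since my loss uses a mean reduction):

    ∇w L(w)  = (1/n) · Xᵀ(σ(Xw) − y) + λw
    ∇²w L(w) = (1/n) · Xᵀ S X + λI,   where S = diag(σ(xᵢᵀw)(1 − σ(xᵢᵀw)))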
My code is as follows:
import torch
import torch.nn as nn

def gradient_by_hand(loader, model, criterion, LAMBDA):
    """Average the L2-regularized gradient over all batches."""
    grad = []
    for i, (data, target) in enumerate(loader):
        model.zero_grad()  # reset accumulated gradients before each backward pass
        y_hat = model(data)
        target[target == -1] = 0  # map labels from {-1, 1} to {0, 1} for BCE
        loss = criterion(y_hat, target.to(torch.float32))
        loss.backward()
        # gradient of the data loss plus the L2 penalty term
        g = model.linear.weight.grad + LAMBDA * model.linear.weight.data
        grad.append(g)
    return torch.stack(grad, dim=0).mean(dim=0)
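To sanity-check the closed-form gradient against what backward() gives me, I use this small standalone snippet (a rough sketch: the toy data X, y, w here are made up, and I assume criterion is nn.BCELoss with the default mean reduction):

import torch

# hypothetical toy problem, just for checking conventions
n, d = 32, 5
X = torch.randn(n, d)
y = torch.randint(0, 2, (n,)).to(torch.float32)
w = torch.randn(d, requires_grad=True)

loss = torch.nn.functional.binary_cross_entropy(torch.sigmoid(X @ w), y)
loss.backward()

# closed form with mean reduction: (1/n) * X^T (sigmoid(Xw) - y)
g_closed = X.T @ (torch.sigmoid(X @ w.detach()) - y) / n
print(torch.allclose(w.grad, g_closed, atol=1e-6))  # expect True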
def hessian_by_hand(loader, model, LAMBDA):
    """Average the regularized Hessian X^T S X + lambda*I over all batches."""
    hessian = []
    for i, (data, target) in enumerate(loader):
        y_hat = model(data).detach()  # no autograd needed for the closed-form Hessian
        S = torch.diag(y_hat * (1 - y_hat))  # diagonal of sigmoid'(x_i^T w)
        h = torch.matmul(torch.matmul(data.T, S), data) + LAMBDA * torch.eye(data.shape[1])
        hessian.append(h)
    return torch.stack(hessian, dim=0).mean(dim=0)
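Similarly, to check the Hessian scaling I compare the closed form against autograd on the same toy data (a sketch; X, y, w, n as in the snippet above; note the 1/n factor that comes with a mean-reduced loss):

from torch.autograd.functional import hessian

def mean_bce(w_):
    return torch.nn.functional.binary_cross_entropy(torch.sigmoid(X @ w_), y)

H_auto = hessian(mean_bce, w.detach())

p = torch.sigmoid(X @ w.detach())
S = torch.diag(p * (1 - p))
H_closed = X.T @ S @ X / n  # (1/n) X^T S X for a mean-reduced loss
print(torch.allclose(H_auto, H_closed, atol=1e-5))  # expect True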
where the model is as follows:
class LogisticRegression(nn.Module):
    def __init__(self, input_size, num_classes):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_size, num_classes, bias=False)

    def forward(self, x):
        # squeeze assumes a single output unit (num_classes == 1)
        out = torch.squeeze(torch.sigmoid(self.linear(x)))
        return out
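For completeness, this is how I instantiate it for binary classification (num_classes=1, which is what the squeeze in forward assumes):

model = LogisticRegression(input_size=d, num_classes=1)
criterion = nn.BCELoss()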
Does this seem like a valid implementation of the Hessian and gradient for the model above?
EDIT to add some context:
I am using the standard Newton method to optimize the model:
def standard_newton(n_epochs, loader, model, criterion, LAMBDA, eps):
    log.info("Running standard newton method")
    for i in tqdm(range(n_epochs)):
        g = gradient_by_hand(loader, model, criterion, LAMBDA)
        h = hessian_by_hand(loader, model, LAMBDA)
        # update weights: w <- w - H^{-1} g
        model.linear.weight.data -= torch.matmul(torch.inverse(h), torch.squeeze(g))
        # check for convergence
        if torch.norm(g) < eps:
            log.info("Converged because gradient norm is less than {}".format(eps))
            return model
    return model
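One thing I was unsure about in the update step: instead of forming the explicit inverse, I could solve the linear system, which I understand is numerically preferable. A sketch of the alternative update line:

# alternative update: solve h @ delta = g rather than inverting h
delta = torch.linalg.solve(h, torch.squeeze(g))
model.linear.weight.data -= delta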