I would like to implement a custom optimizer class in PyTorch so that I have more control over the gradient descent step. Here is my approach:
class SGD:
    """Minimal stochastic gradient descent optimizer with classical momentum.

    For every parameter that has a gradient, ``step()`` applies::

        v <- momentum * v + learning_rate * grad
        p <- p - v
    """

    def __init__(self, model, learning_rate, momentum):
        """Store the hyperparameters and allocate the velocity buffers.

        Args:
            model: Object whose ``parameters()`` yields the tensors to update.
            learning_rate: Factor applied to each gradient.
            momentum: Factor applied to the previous velocity.
        """
        self.model = model
        self.learning_rate = learning_rate
        self.momentum = momentum
        # One zero-initialised velocity buffer per parameter, kept in the
        # same order as model.parameters() so the two can be zipped later.
        self.v_old = [torch.zeros_like(p.data) for p in model.parameters()]

    def step(self):
        """Apply one momentum update to every parameter that has a gradient."""
        with torch.no_grad():
            for param, velocity in zip(self.model.parameters(), self.v_old):
                # Parameters without a gradient (frozen, or unused in the
                # last forward pass) are skipped instead of raising on
                # `learning_rate * None`.
                if param.grad is None:
                    continue
                # Update the buffer in place so no new tensors are
                # allocated on every step.
                velocity.mul_(self.momentum).add_(
                    param.grad, alpha=self.learning_rate
                )
                param -= velocity
I use this class as follows:
# Training script: fits Model() for 10 epochs with the custom SGD optimizer
# and prints the average per-sample training loss after each epoch.
train_loader, test_loader = get_data_loader(batch_size)
model = Model()
cross_entropy_loss = nn.CrossEntropyLoss()
optimizer = SGD(model=model, learning_rate=0.01, momentum=0.9)

for epoch in range(10):
    running_loss = 0.0
    running_counter = 0
    for data in train_loader:
        inputs, labels = data[0], data[1]
        # Clear gradients from the previous batch before the new backward pass.
        model.zero_grad()
        outputs = model(inputs)
        loss = cross_entropy_loss(outputs, labels)
        loss.backward()
        optimizer.step()
        # nn.CrossEntropyLoss() averages over the batch by default, so the
        # batch loss is weighted by the batch size before accumulating;
        # dividing by the sample count below then gives a true per-sample
        # mean instead of a value shrunk by roughly the batch size.
        num_samples = labels.size(0)
        running_loss += loss.item() * num_samples
        running_counter += num_samples
    running_loss = running_loss / running_counter
    print(f"epoch {epoch} loss {running_loss}")
I would like to know whether I can improve the SGD class and whether this approach is considered good style. Are there more elegant ways to create custom optimizer classes?