I have an external function that constructs a tensor from a set of parameters. I then pass this tensor, together with the parameters, to a torch.nn.Module during initialization. The problem is that the parameters are not being learned. Below is a minimal test case in which the module is a softmax classifier and the externally constructed tensor is its weight (parameter) tensor.
import torch
from torch.autograd import Variable
import torchvision.transforms as transforms
import torchvision.datasets as dsets
from collections import OrderedDict
train_dataset = dsets.MNIST(root='./data', train=True, transform=transforms.ToTensor(), download=True)
test_dataset = dsets.MNIST(root='./data', train=False, transform=transforms.ToTensor())
batch_size = 100
n_iters = 3000
epochs = n_iters / (len(train_dataset) / batch_size)
input_dim = 784
output_dim = 10
lr_rate = 0.001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
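# helpers: one (output_dim, 1) Parameter per input pixel, plus a separate bias Parameter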
def create_parameters(input_dim, output_dim):
    parameters = OrderedDict()
    for i in range(input_dim):
        parameters[f"parameter_{i}"] = torch.nn.Parameter(torch.rand(size=(output_dim, 1), requires_grad=True, device=device))
    return parameters
def create_bias(output_dim):
    bias = torch.nn.Parameter(torch.rand(size=(output_dim,), requires_grad=True, device=device))
    return bias
def construct_logit_parameter(parameters):
    p = torch.hstack(tuple(parameters.values()))
    return p
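# build the individual parameters, the bias, and the stacked weight tensor outside the module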
ps = create_parameters(input_dim, output_dim)
bias = create_bias(output_dim)
p = construct_logit_parameter(ps)
class SoftMax(torch.nn.Module):
    def __init__(self, parameters, bias, logit_parameter):
        super(SoftMax, self).__init__()
        self.params = torch.nn.ParameterDict(parameters)
        self.logit_parameter = logit_parameter
        self.bias = bias

    def forward(self, x):
        outputs = x @ self.logit_parameter.T + self.bias
        return outputs
model = SoftMax(ps, bias, p)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr_rate)
iter = 0
for epoch in range(int(epochs)):
    for i, (images, labels) in enumerate(train_loader):
        images = images.view(-1, 28 * 28).to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        iter += 1
        if iter % 500 == 0:
            ps = list(model.parameters())
            print(ps[1].grad)
            print(ps[0].grad)
            # calculate accuracy on the test set
            correct = 0
            total = 0
            for images, labels in test_loader:
                images = images.view(-1, 28 * 28).to(device)
                outputs = model(images)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels.to(device)).sum()
            accuracy = 100 * correct / total
            print("Iteration: {}. Loss: {}. Accuracy: {}.".format(iter, loss.item(), accuracy))
Inspecting the gradients (simply checking .grad on each parameter in model.parameters()) shows that the bias parameter has a non-zero gradient, but none of the other parameters do. How do I fix this? My guess is that something is detached from the computation graph somewhere, but I am not sure where or what.
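For reference, a standalone version of that check (run right after loss.backward(), roughly equivalent to the prints in the loop above) looks like this:

for name, param in model.named_parameters():
    # print the total gradient magnitude per parameter, or None if no gradient was populated
    print(name, None if param.grad is None else param.grad.abs().sum().item())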
Note: yes, I know I could just construct the parameters inside the nn.Module, but the code base I am working on makes that a bit tricky, and I figured it might be easier to solve it this way.
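For clarity, the in-module construction I am trying to avoid would look roughly like the sketch below (SoftMaxInline is a hypothetical name, not something from my actual code base):

class SoftMaxInline(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        # weight and bias are created directly as Parameters of the module
        self.weight = torch.nn.Parameter(torch.rand(output_dim, input_dim, device=device))
        self.bias = torch.nn.Parameter(torch.rand(output_dim, device=device))

    def forward(self, x):
        return x @ self.weight.T + self.bias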