I have an external function that constructs a tensor from a set of parameters. I then pass this tensor and the parameters to a torch.nn module during initilzation. The problem is that the parameters are not being learned. I have included below a minimal test case where the module is a softmax classifier. The (externally) constructed tensor is the parameter tensor of the softmax.

```
import torch
from torch.autograd import Variable
import torchvision.transforms as transforms
import torchvision.datasets as dsets
from collections import OrderedDict
train_dataset = dsets.MNIST(root='./data', train=True, transform=transforms.ToTensor(), download=False)
test_dataset = dsets.MNIST(root='./data', train=False, transform=transforms.ToTensor())
batch_size = 100
n_iters = 3000
epochs = n_iters / (len(train_dataset) / batch_size)
input_dim = 784
output_dim = 10
lr_rate = 0.001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
def create_parameters(input_dim, output_dim):
parameters = OrderedDict()
for i in range(input_dim):
parameters[f"parameter_{i}"] = torch.nn.Parameter(torch.rand( size=(output_dim,1), requires_grad=True, device=device))
return parameters
def create_bias(output_dim):
bias = torch.nn.Parameter(torch.rand( size=(output_dim,), requires_grad=True, device=device))
return bias
def construct_logit_parameter(parameters):
p = torch.hstack(tuple(parameters.values()))
return p
ps = create_parameters(input_dim, output_dim)
bias = create_bias(output_dim)
p = construct_logit_parameter(ps)
class SoftMax(torch.nn.Module):
def __init__(self, parameters, bias, logit_parameter):
super(SoftMax, self).__init__()
self.params = torch.nn.ParameterDict(parameters)
self.logit_parameter = logit_parameter
self.bias = bias
def forward(self, x):
outputs = x @ self.logit_parameter.T + bias
return outputs
model = SoftMax(ps, bias, p)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr_rate)
iter = 0
for epoch in range(int(epochs)):
for i, (images, labels) in enumerate(train_loader):
images = images.view(-1, 28 * 28).to(device)
labels = labels.to(device)
optimizer.zero_grad()
outputs = model(images)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
iter+=1
if iter%500==0:
ps = list(model.parameters())
print(ps[1].grad)
print(ps[0].grad)
# calculate Accuracy
correct = 0
total = 0
for images, labels in test_loader:
images = images.view(-1, 28*28).to(device)
outputs = model(images)
_, predicted = torch.max(outputs.data, 1)
total+= labels.size(0)
correct+= (predicted == labels.to(device)).sum()
accuracy = 100 * correct/total
print("Iteration: {}. Loss: {}. Accuracy: {}.".format(iter, loss.item(), accuracy))
```

Inspecting the gradient (simply checking .grad for the parameters in model.parameters()) shows me that the bias parameter has a non-zero gradient but all the other ones don’t. How do I fix this? My guess is that something is detached from the computation graph somewhere but I am not sure where and what.

Note, yes I know I could just construct the parameters in the nn.Module but the code base I am working on makes this a little bit tricky and I figured it might be easier to solve it this way.