Prediction has .grad = None

import torch
from torch import nn

class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.flatten = nn.Flatten()
        # despite the name, this stack uses Sigmoid activations rather than ReLU
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(10, 16),
            nn.Sigmoid(),  
            nn.Linear(16, 16),
            nn.Sigmoid(),
            nn.Linear(16, 16),
            nn.Sigmoid(),
            nn.Linear(16, 3),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

def nn_train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to('cpu'), y.to('cpu') 

        # Compute prediction error
        pred = model(X.float())  # call the model directly instead of model.forward()
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

pred.grad is None during training. I am using pandas to read in the word data, Word2Vec to convert each word to a vector, and NumPy to store the vectors; I then created a DataLoader from the array of NumPy vectors to pass into the training function.

Also, I am using CrossEntropyLoss as the loss function and torch.optim.SGD as the optimizer.
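
For reference, a minimal sketch of how such a pipeline could be wired together; the file name, column names, vector size, learning rate, and batch size below are illustrative assumptions, not values from the original setup:

import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from gensim.models import Word2Vec

df = pd.read_csv("words.csv")  # hypothetical file with 'word' and integer 'label' columns

# hypothetical: train Word2Vec on the word list (real preprocessing will differ)
w2v = Word2Vec(sentences=[df["word"].tolist()], vector_size=10, min_count=1)
vectors = np.stack([w2v.wv[w] for w in df["word"]])  # shape: (num_samples, 10)
labels = df["label"].to_numpy()                      # assumed integer class labels 0..2

dataset = TensorDataset(torch.from_numpy(vectors).float(), torch.from_numpy(labels).long())
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

model = Network()
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
nn_train(dataloader, model, loss_fn, optimizer)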

.grad is only populated for leaf tensors that require grad. These are usually just the parameters of your model: "leaf" simply means the tensor is an input to the computation graph, not the output of another operation that requires grad. Gradients are kept only for those tensors because they are the weights that need to be updated on each iteration; if PyTorch had to save .grad for every intermediate result, that would be far more memory intensive.
If you check next(model.parameters()).grad (the first parameter tensor of the model) after loss.backward(), you should see that the .grad field is populated there.
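
As a quick illustration, here is a minimal sketch using the Network class defined above (the input shape and dummy labels are arbitrary); after backward(), the output tensor has no .grad, while the leaf parameters do:

import torch

model = Network()
x = torch.randn(4, 10)               # dummy batch of 4 samples with 10 features
target = torch.tensor([0, 1, 2, 0])  # dummy class labels

pred = model(x)
loss = torch.nn.CrossEntropyLoss()(pred, target)
loss.backward()

print(pred.grad)                      # None (PyTorch also warns: .grad of a non-leaf is not populated)
print(pred.is_leaf)                   # False: pred is the output of other operations
print(next(model.parameters()).grad)  # populated: parameters are leaf tensors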

What can I use instead of .grad?