```
class Network(nn.Module):
    """Small fully connected classifier mapping 10 features to 3 class scores.

    The network flattens each sample, then applies three hidden
    ``Linear`` + ``Sigmoid`` layers of width 16 and a final ``Linear`` layer
    that produces one raw score (logit) per class.

    The output layer deliberately has no activation: losses such as
    ``nn.CrossEntropyLoss`` apply log-softmax internally and expect
    unnormalized logits. Squashing the scores through a sigmoid first would
    confine them to (0, 1), which limits how confident the model can become
    and weakens the gradients.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        # The attribute name is kept as-is (although the activations are
        # sigmoids, not ReLUs) so existing state_dict keys keep working.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(10, 16),
            nn.Sigmoid(),
            nn.Linear(16, 16),
            nn.Sigmoid(),
            nn.Linear(16, 16),
            nn.Sigmoid(),
            # Final layer emits raw logits; no activation here on purpose.
            nn.Linear(16, 3),
        )

    def forward(self, x):
        """Return the raw class logits for a batch of inputs.

        Args:
            x: Batch tensor; every dimension after the first is flattened,
                and the flattened size must be 10 to match the first layer.

        Returns:
            Tensor with one row per sample and 3 unnormalized class scores.
        """
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
def nn_train(dataloader, model, loss_fn, optimizer, device="cpu"):
    """Run one pass over ``dataloader``, updating ``model`` after each batch.

    Args:
        dataloader: Iterable yielding ``(X, y)`` batches; it must expose a
            ``dataset`` attribute supporting ``len()`` (used for progress
            output only).
        model: Module to train. It is called on ``X.float()`` and is expected
            to already live on ``device``.
        loss_fn: Callable taking ``(pred, y)`` and returning a scalar loss
            tensor.
        optimizer: Optimizer holding the parameters of ``model``.
        device: Device the batches are moved to before the forward pass.
            Defaults to ``"cpu"``.

    Returns:
        None. Progress is printed every 100 batches.
    """
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error. Call the module itself rather than
        # ``model.forward`` so that registered hooks are run.
        # NOTE: ``pred`` is a non-leaf tensor, so ``pred.grad`` stays None
        # after ``backward()`` unless ``pred.retain_grad()`` is called first;
        # the gradients used for the update live on ``model.parameters()``.
        pred = model(X.float())
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss_value, current = loss.item(), batch * len(X)
            print(f"loss: {loss_value:>7f} [{current:>5d}/{size:>5d} ]")
```

`pred.grad` is `None` during training. I am using pandas to read in the word data, Word2Vec to convert each word to a vector, and NumPy to store the vectors. I then created a DataLoader from the array of NumPy vectors and pass it into the training function.

Also, I am using `CrossEntropyLoss` as the loss function and `torch.optim.SGD` as the optimizer.