Hello all,

I am currently building a classifier with a defer option on my own and I am running into an issue. As far as I can tell the model is learning, but after a while the cost goes to NaN. I printed the intermediate values and they are not blowing up to huge numbers or vanishing to tiny ones, so I am wondering what could be causing this.
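To narrow down where the NaN first shows up, I have been running with PyTorch's anomaly detection plus an explicit finiteness check. This is just a minimal sketch of that idea; `assert_finite` is a helper I wrote for debugging, not part of the training code below:

```
import torch

# make backward() report the op that first produced a NaN/Inf gradient
torch.autograd.set_detect_anomaly(True)

def assert_finite(name, t):
    # fail fast as soon as any entry of t is NaN or +/-Inf
    if not torch.isfinite(t).all():
        raise RuntimeError(f"{name} contains NaN/Inf: {t}")

# called inside the training loop, e.g.:
#   assert_finite("outputs", outputs)
#   assert_finite("loss", loss)
```

With `set_detect_anomaly(True)` enabled, a backward pass that produces a NaN gradient raises an error pointing at the offending op; it slows training down but helps localize the problem.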

```
import torch

class linear_model_defer(torch.nn.Module):
    def __init__(self, n_features, k_classes):
        super(linear_model_defer, self).__init__()
        self.n_features = n_features
        self.k_classes = k_classes + 1  # class k+1 is the defer class
        self.linear = torch.nn.Linear(self.n_features, self.k_classes)
        self.softmax = torch.nn.Softmax(dim=0)

    def forward(self, x):
        out = self.linear(x)
        return out

    # note: this shadows nn.Module.train(), which normally toggles training mode
    def train(self, x, y, expert: synth_expert, epochs, lr=0.001):
        loss_fn = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        for epoch in range(epochs):
            epoch_cost = 0
            for curr_x, curr_y in zip(x, y):
                expert_pred = torch.argmax(expert.predict(curr_y))
                true_label = torch.argmax(curr_y)
                outputs = self.forward(torch.flatten(curr_x))
                # extra term that rewards the defer output when the expert is correct
                if expert_pred.item() == true_label.item():
                    loss_ex = -torch.log(outputs[-1] + 1e-10)
                else:
                    loss_ex = 0
                # cross-entropy over the k real classes, plus the defer term
                loss = loss_fn(outputs[:-1].unsqueeze(0), true_label.unsqueeze(0)) + loss_ex
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                epoch_cost += loss.item()
            if epoch % 5 == 0:
                total = 0
                correct = 0
                with torch.no_grad():
                    for curr_x, curr_y in zip(x, y):
                        outputs = self.forward(torch.flatten(curr_x))
                        predicted_label = int(torch.argmax(self.softmax(outputs)).item())
                        true_label = int(torch.argmax(curr_y).item())
                        if predicted_label == true_label:
                            correct += 1
                        total += 1
                accuracy = correct / total
                print(f'Training accuracy after {epoch} epochs: {accuracy:.4f} Cost: {epoch_cost:.4f}')
```
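For reference, this is roughly how I instantiate and call it. The data and the `synth_expert` stub below are made up for illustration (my real expert and dataset are different), but the shapes match what the training loop expects:

```
import torch

# hypothetical stand-in for my synth_expert: an oracle that always
# returns the true one-hot label, so expert_pred == true_label every time
class synth_expert:
    def predict(self, one_hot_label):
        return one_hot_label

n_features, k_classes, n_samples = 784, 10, 100
x = torch.randn(n_samples, n_features)  # fake inputs
y = torch.nn.functional.one_hot(
    torch.randint(0, k_classes, (n_samples,)), k_classes
).float()  # fake one-hot labels

model = linear_model_defer(n_features, k_classes)
model.train(x, y, expert=synth_expert(), epochs=20)
```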