Hi, I am training a simple model on the GTSRB dataset with the PyTorch C++ Frontend. An “INTERNAL ASSERT FAILED” error is raised partway through epoch 5, as shown in the following log output:
...
Train Epoch: 5 1208/13200 Loss: 0.0776928 Acc: 0.831126
Train Epoch: 5 1288/13200 Loss: 0.0787623 Acc: 0.831522
Train Epoch: 5 1368/13200 Loss: 0.0887302 Acc: 0.826023
terminate called after throwing an instance of 'c10::Error'
what(): !std::isnan(loss.template item<float>()) INTERNAL ASSERT FAILED at "../trainGTSRB.cpp":184, please report a bug to PyTorch.
Exception raised from train at ../trainGTSRB.cpp:184 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x6c (0x7ffff7f9c8bc in /home/amorais/pytorch/torch/lib/libc10.so)
...
I would appreciate it if you could help me solve it.
I am classifying 9 classes of 28x28 images. Here is my network:
// Small CNN for 28x28 images, expressed as a torch::nn::Sequential subclass.
// numChannels / numClasses are defined elsewhere in the file — presumably
// 3 input channels and 9 classes per the question text; TODO confirm.
struct NetImpl : torch::nn::SequentialImpl {
NetImpl() {
using namespace torch::nn;
auto stride = torch::ExpandingArray<2>({2, 2});
// input images have size 28x28
// 28x28 -> conv(k=3, s=1, p=0) -> 26x26, 32 channels
push_back(Conv2d(Conv2dOptions(numChannels, 32, 3).stride(1).padding(0)));
// Functional binds torch::max_pool2d(input, kernel=2, stride={2,2},
// padding=0, dilation=1, ceil_mode=false): 26x26 -> 13x13
push_back(Functional(torch::max_pool2d, 2, stride,0, 1, false));
push_back(Functional(torch::relu));
// 13x13 -> conv3 -> 11x11, 64 channels
push_back(Conv2d(Conv2dOptions(32, 64, 3).stride(1).padding(0)));
// 11x11 -> pool2 -> 5x5 (floor division, ceil_mode=false)
push_back(Functional(torch::max_pool2d, 2, stride, 0, 1, false));
push_back(Functional(torch::relu));
// 5x5 -> conv3 -> 3x3, 128 channels
push_back(Conv2d(Conv2dOptions(64, 128, 3).stride(1).padding(0)));
push_back(Functional(torch::relu));
// Flattened size: 128 * 3 * 3 = 1152, matching the first Linear below.
push_back(Flatten());
push_back(Linear(1152, 600));
push_back(Functional(torch::relu));
push_back(Linear(600, 320));
push_back(Functional(torch::relu));
push_back(Linear(320, numClasses));
// log_softmax over the class dimension; pairs with torch::nll_loss in
// train(). The static_cast selects the (Tensor, dim, dtype) overload.
push_back(Functional(static_cast<torch::Tensor(&)(const torch::Tensor&, int64_t, torch::optional<torch::ScalarType> )>(torch::log_softmax), 1, torch::nullopt));
}
};
// Generates the Net wrapper (shared_ptr-holder) around NetImpl.
TORCH_MODULE(Net);
And here is the training function:
template <typename DataLoader>
void train(
size_t epoch,
Net model,
torch::Device device,
DataLoader& data_loader,
torch::optim::Optimizer& optimizer,
size_t dataset_size) {
model->train();
size_t batch_idx = 0;
float Loss = 0, Acc = 0;
for (auto& batch : data_loader) {
auto data = batch.data.to(device);
auto targets = batch.target.to(device).view({-1});
optimizer.zero_grad();
auto output = model->forward(data);
auto loss = torch::nll_loss(output, targets);
AT_ASSERT(!std::isnan(loss.template item<float>()));
auto acc = output.argmax(1).eq(targets).sum();
loss.backward();
optimizer.step();
Loss += loss.template item<float>();
Acc += acc.template item<float>();
if (batch_idx++ % kLogInterval == 0) {
auto end = std::min(dataset_size, batch_idx * batch.data.size(0));
std::cout << "Train Epoch: " << epoch << " " << end << "/" << dataset_size
<< "\tLoss: " << Loss / end << "\tAcc: " << Acc / end
<< std::endl;
}
}
}