[W TensorImpl.h:1156] Named tensors warning and the process is killed while training

I’m trying to build a VGG-16-style network for digit recognition using the PyTorch C++ API.

I am getting the following warning, and the process gets killed:

Using CPU for training & Testing!
[W TensorImpl.h:1156] Warning: Named tensors and all their associated APIs are an experimental feature and subject to change. Please do not use them for anything important until they are released as stable. (function operator())
zsh: killed     ./vgg-16

I have seen a few PyTorch issues about this warning, and they say the problem is resolved. They say the fix went into the latest release, PyTorch 1.9.0, but I am not sure that it did.

Issue: Irrelevant named tensor warnings · Issue #54846 · pytorch/pytorch · GitHub
As far as I can understand, that issue is for:

  • Internal error in Aten’s Channel Shuffle code.
  • Another thing they talked about is PyTorch’s max_pool2d function.

Could anyone please explain this error to me? What does "function operator()" in the message mean? Where can I find the source code behind it? And what mistake did I make in my code? Is there still an internal error in the PyTorch code, or is it my fault?

I read a few Stack Overflow discussions about processes getting killed during training.
Here: tensorflow - process killed while training - Stack Overflow
According to it, I calculated the memory usage as:

MemTotal:       16184328 kB

I tried reducing the batch size and re-training my model, but it didn’t work. I would be glad if anyone could help me resolve this error. Thanks!

The following is my code:

// GitHub Warnings: [W TensorImpl.h:1156] Warning: Named tensors and all their associated APIs are an experimental feature and subject to change. Please do not use them for anything important until they are released as stable. (function operator())
// Issue: https://github.com/pytorch/pytorch/issues/54846

#include <torch/torch.h>
#include <cstddef>        // utility library (NULL, offsetof, size_t, ptrdiff_t, nullptr_t, max_align_t, byte)
#include <cstdio>         // input / output operations
#include <iostream>
#include <string>
#include <vector>

// Training configuration. Everything here is a compile-time constant, so it
// is declared constexpr; in particular the data_path pointer itself can no
// longer be reassigned at runtime.
constexpr int64_t num_epochs = 10;        // number of passes over the training set
constexpr int64_t train_batch_size = 64;  // intended mini-batch size for training
constexpr int64_t test_batch_size = 5;    // intended mini-batch size for evaluation
// Directory handed to torch::data::datasets::MNIST.
constexpr const char* data_path = "../../../../datasets/mnist";
constexpr int64_t interval = 10;          // train() logs progress every `interval` batches

struct Net : torch::nn::Module {
		: conv1_1(torch::nn::Conv2dOptions(/*in_channels=*/ 1, /*out_channels=*/ 10, /*kernel_size=*/ 5).padding(1)),
		conv1_2(torch::nn::Conv2dOptions(/*in_channels=*/ 10, /*out_channels=*/ 20, /*kernel_size=*/ 5).padding(1)),
		conv2_1(torch::nn::Conv2dOptions(/*in_channels=*/ 20, /*out_channels=*/ 30, /*kernel_size=*/ 5).padding(1)),
		conv2_2(torch::nn::Conv2dOptions(/*in_channels=*/ 30, /*out_channels=*/ 40, /*kernel_size=*/ 5).padding(1)),

		conv3_1(torch::nn::Conv2dOptions(/*in_channels=*/ 40, /*out_channels=*/ 50, /*kernel_size=*/ 5).padding(1)),
		conv3_2(torch::nn::Conv2dOptions(/*in_channels=*/ 50, /*out_channels=*/ 60, /*kernel_size=*/ 5).padding(1)),
		conv3_3(torch::nn::Conv2dOptions(/*in_channels=*/ 60, /*out_channels=*/ 70, /*kernel_size=*/ 5).padding(1)),

		conv4_1(torch::nn::Conv2dOptions(/*in_channels=*/ 70, /*out_channels=*/ 80, /*kernel_size=*/ 5).padding(1)),
		conv4_2(torch::nn::Conv2dOptions(/*in_channels=*/ 80, /*out_channels=*/ 90, /*kernel_size=*/ 5).padding(1)),
		conv4_3(torch::nn::Conv2dOptions(/*in_channels=*/ 90, /*out_channels=*/ 100, /*kernel_size=*/ 5).padding(1)),

		conv5_1(torch::nn::Conv2dOptions(/*in_channels=*/ 100, /*out_channels=*/ 110, /*kernel_size=*/ 5).padding(1)),
		conv5_2(torch::nn::Conv2dOptions(/*in_channels=*/ 110, /*out_channels=*/ 120, /*kernel_size=*/ 5).padding(1)),
		conv5_3(torch::nn::Conv2dOptions(/*in_channels=*/ 120, /*out_channels=*/ 130, /*kerne;_size=*/ 5).padding(1)),

		fc1(130, 50),
		fc2(50, 20),
		fc3(20, 10) {
			register_module("conv1_1", conv1_1);
			register_module("conv1_2", conv1_2);
			register_module("conv2_1", conv2_1);
			register_module("conv2_2", conv2_2);
			register_module("conv3_1", conv3_1);
			register_module("conv3_2", conv3_2);
			register_module("conv3_3", conv3_3);
			register_module("conv4_1", conv4_1);
			register_module("conv4_2", conv4_2);
			register_module("conv4_3", conv4_3);
			register_module("conv5_1", conv5_1);
			register_module("conv5_2", conv5_2);
			register_module("conv5_3", conv5_3);
			register_module("fc1", fc1);
			register_module("fc2", fc2);
			register_module("fc3", fc3);

	torch::Tensor forward(torch::Tensor x) {
		x = torch::relu(conv1_1->forward(x));
		x = torch::relu(conv1_2->forward(x));
		x = torch::max_pool2d(x, 2);

		x = torch::relu(conv2_1->forward(x));
		x = torch::relu(conv2_2->forward(x));
		x = torch::max_pool2d(x, 2);

		x = torch::relu(conv3_1->forward(x));
		x = torch::relu(conv3_2->forward(x));
		x = torch::relu(conv3_3->forward(x));
		x = torch::max_pool2d(x, 2);

		x = torch::relu(conv4_1->forward(x));
		x = torch::relu(conv4_2->forward(x));
		x = torch::relu(conv4_3->forward(x));
		x = torch::max_pool2d(x, 2);

		x = torch::relu(conv5_1->forward(x));
		x = torch::relu(conv5_2->forward(x));
		x = torch::relu(conv5_3->forward(x));

		x = x.view({-1, 130});

		x = torch::relu(fc1->forward(x));
		x = torch::relu(fc2->forward(x));
		x = fc3->forward(x);

		return torch::log_softmax(x, 1);

	torch::nn::Conv2d conv1_1;
	torch::nn::Conv2d conv1_2;
	torch::nn::Conv2d conv2_1;
	torch::nn::Conv2d conv2_2;
	torch::nn::Conv2d conv3_1;
	torch::nn::Conv2d conv3_2;
	torch::nn::Conv2d conv3_3;
	torch::nn::Conv2d conv4_1;
	torch::nn::Conv2d conv4_2;
	torch::nn::Conv2d conv4_3;
	torch::nn::Conv2d conv5_1;
	torch::nn::Conv2d conv5_2;
	torch::nn::Conv2d conv5_3;
	torch::nn::Linear fc1;
	torch::nn::Linear fc2;
	torch::nn::Linear fc3;

/**
 * Trains `net` for one epoch over `train_loader`.
 *
 * For every batch: moves data and targets to `device`, clears the gradients,
 * runs the forward pass, computes the NLL loss, back-propagates and applies
 * one optimizer step. Progress is printed every `interval` batches.
 *
 * @param net           model to train (its forward() must return log-probabilities).
 * @param device        device the batch tensors are moved to.
 * @param train_loader  data loader yielding batches with `.data` and `.target`.
 * @param optimizer     optimizer holding the parameters of `net`.
 * @param epoch         current epoch number, used only for logging.
 * @param dataset_size  total number of training samples, used only for logging.
 */
template <typename DataLoader>
void train(
	Net& net,
	torch::Device device,
	DataLoader& train_loader,
	torch::optim::Optimizer& optimizer,
	size_t epoch,
	size_t dataset_size) {
	net.train();

	size_t batch_index = 0;
	for (auto& batch : train_loader) {
		auto data = batch.data.to(device), targets = batch.target.to(device);
		// Reset gradients
		optimizer.zero_grad();
		// Execute model
		auto output = net.forward(data);
		// Compute loss
		auto loss = torch::nll_loss(output, targets);
		AT_ASSERT(!std::isnan(loss.template item<float>()));
		// Compute gradients
		loss.backward();
		// Update parameters
		optimizer.step();

		if (batch_index++ % interval == 0) {
			// Cast explicitly so the arguments match the %ld conversions
			// regardless of how size_t / int64_t are defined on the platform.
			std::printf(
				"\rTrain Epoch: %ld [%5ld/%5ld] Loss: %.4f",
				static_cast<long>(epoch),
				static_cast<long>(batch_index * batch.data.size(0)),
				static_cast<long>(dataset_size),
				loss.template item<float>());
		}
	}
}

/**
 * Evaluates `net` on `test_loader` without tracking gradients and prints the
 * average loss and the accuracy.
 *
 * @param net           model to evaluate.
 * @param device        device the batch tensors are moved to.
 * @param test_loader   data loader yielding batches with `.data` and `.target`.
 * @param dataset_size  total number of test samples; divisor for both metrics.
 */
template <typename DataLoader>
void test(
	Net& net,
	torch::Device device,
	DataLoader& test_loader,
	size_t dataset_size) {
	torch::NoGradGuard no_grad;
	net.eval();

	double loss = 0;
	int64_t accuracy = 0;  // count of correct predictions; matches item<int64_t>()
	for (const auto& batch : test_loader) {
		auto data = batch.data.to(device), targets = batch.target.to(device);
		// Executing Model
		auto output = net.forward(data);
		// Calculating loss: sum over the batch (not the mean), because the
		// total is divided by dataset_size once all batches are done.
		loss += torch::nll_loss(
				output,
				targets,
				/*weight=*/{},
				torch::Reduction::Sum)
				.template item<float>();
		auto pred = output.argmax(1);
		accuracy += pred.eq(targets).sum().template item<int64_t>();
	}

	loss /= dataset_size;
	std::printf(
		"\nTest set: Average loss: %.4f | Accuracy: %.3f\n",
		loss,
		static_cast<double>(accuracy) / dataset_size);
}

auto main() -> int {

	// Selecting Data type
	torch::DeviceType device_type;
	if (torch::cuda::is_available()) {
		std::cout << "CUDA available! Using GPU for training & testing." << std::endl;
		device_type = torch::kCUDA;
	} else {
		std::cout << "Using CPU for training & Testing!" << std::endl;
		device_type = torch::kCPU;
	torch::Device device(device_type);
	Net net;


	auto train_dataset = torch::data::datasets::MNIST(data_path)
				.map(torch::data::transforms::Normalize<>(0.13707, 0.3081))

	const size_t train_dataset_size = train_dataset.size().value();
	auto train_loader = torch::data::make_data_loader<torch::data::samplers::SequentialSampler>(
			std::move(train_dataset), train_dataset_size);
	auto test_dataset = torch::data::datasets::MNIST(
			data_path, torch::data::datasets::MNIST::Mode::kTest)
			.map(torch::data::transforms::Normalize<>(0.13707, 0.3081))

	const size_t test_dataset_size = test_dataset.size().value();
	auto test_loader = torch::data::make_data_loader(std::move(test_dataset), test_dataset_size);

	// Initializing optimizer, here using SGD 
        torch::optim::SGD optimizer(
			net.parameters(), torch::optim::SGDOptions(0.01).momentum(0.5));

	// Training
	for (size_t epoch = 1; epoch <= num_epochs; epoch++) {
		train(net, device, *train_loader, optimizer, epoch, train_dataset_size);
		test(net, device, *test_loader, test_dataset_size);


  • PyTorch: 1.9.0
  • LibTorch: Stable 1.9.0 (cxx11 ABI)

You could ignore the warning and focus on debugging the killed process.
To do so, run the script via gdb:

gdb --args python script.py args

and check the backtrace to isolate the issue further.