Loss with zero value and empty size; no change in parameters after optimizer.step()

Hello, I’m learning how to use torch::nn::functional::cross_entropy to compute a loss, and I’ve encountered some strange loss results.

Code snippet:

/*Get Single Data from a batch*/
auto data_tensor = test_loader->begin()->data[0].to(device);
auto target_tensor = test_loader->begin()->target[0].to(device);
data_tensor = data_tensor.unsqueeze(0);
target_tensor = target_tensor.unsqueeze(0);
target_tensor = target_tensor.unsqueeze(0);
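/*After the two unsqueeze calls the target goes from [300, 300] to [1, 1, 300, 300], matching the printed sizes below*/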

/*Model*/
auto in_channel = 3;
auto out_channel = 1;
auto double_conv = DoubleConv(in_channel,out_channel);

/*Optimizer*/
torch::optim::Adam optimizer(double_conv->parameters(), torch::optim::AdamOptions(learning_rate));
  
/*Forward*/
data_tensor.requires_grad_(true);
double_conv->to(device);
std::cout<<"data_tensor[size]: "<<data_tensor.sizes()<<std::endl;
std::cout<<"target_tensor[size]: "<<target_tensor.sizes()<<std::endl;
auto output_conv_double = double_conv->forward(data_tensor);
std::cout<<"output_conv_double[size]: "<<output_conv_double.sizes()<<std::endl;
auto loss = torch::nn::functional::cross_entropy(output_conv_double,target_tensor);
std::cout<<"loss: "<<loss<<std::endl;
std::cout<<"loss[size]: "<<loss.sizes()<<"\n"<<std::endl;

/*Backward and step*/
optimizer.zero_grad();
loss.backward();
std::cout<<"double_conv[params]: "<<double_conv->parameters()<<std::endl;
std::cout<<"\n========STEP========\n"<<std::endl;
optimizer.step();
std::cout<<"double_conv[params]: "<<double_conv->parameters()<<std::endl;

Terminal output:

data_tensor[size]: [1, 3, 300, 300]
target_tensor[size]: [1, 1, 300, 300]
output_conv_double[size]: [1, 1, 300, 300]
loss: -0
[ CUDAFloatType{} ]
loss[size]: []

double_conv[params]: (1,1,.,.) = 
  0.0234  0.0282 -0.0839
 -0.1429  0.1813 -0.0405
 -0.1422  0.0471  0.0029

(1,2,.,.) = 
  0.0984 -0.1271  0.1176
  0.1223  0.1840 -0.1340
  0.1659 -0.0649 -0.0265

(1,3,.,.) = 
 -0.1895  0.1487  0.1844
 -0.1412  0.1739  0.1029
 -0.1922 -0.1089 -0.1594
[ CUDAFloatType{1,3,3,3} ]  1
[ CUDAFloatType{1} ]  0
[ CUDAFloatType{1} ] (1,1,.,.) = 
  0.2240  0.1544 -0.2749
  0.2366 -0.0623  0.0697
  0.0303  0.1510 -0.1503
[ CUDAFloatType{1,1,3,3} ]  1
[ CUDAFloatType{1} ]  0
[ CUDAFloatType{1} ]

========STEP========

double_conv[params]: (1,1,.,.) = 
  0.0234  0.0282 -0.0839
 -0.1429  0.1813 -0.0405
 -0.1422  0.0471  0.0029

(1,2,.,.) = 
  0.0984 -0.1271  0.1176
  0.1223  0.1840 -0.1340
  0.1659 -0.0649 -0.0265

(1,3,.,.) = 
 -0.1895  0.1487  0.1844
 -0.1412  0.1739  0.1029
 -0.1922 -0.1089 -0.1594
[ CUDAFloatType{1,3,3,3} ]  1
[ CUDAFloatType{1} ]  0
[ CUDAFloatType{1} ] (1,1,.,.) = 
  0.2240  0.1544 -0.2749
  0.2366 -0.0623  0.0697
  0.0303  0.1510 -0.1503
[ CUDAFloatType{1,1,3,3} ]  1
[ CUDAFloatType{1} ]  0
[ CUDAFloatType{1} ]

The cross-entropy loss is computed between output_conv_double and target_tensor:

auto loss = torch::nn::functional::cross_entropy(output_conv_double,target_tensor);

The loss value is -0:

loss: -0

The size of the loss is empty:

[ CUDAFloatType{} ]
loss[size]: []

Any idea which part I’ve missed?

cc: @ptrblck

Check the outputs and targets manually to see if a zero loss is expected.
Also, posting code snippets wrapped in three backticks ``` is always better than posting screenshots.
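For example, something along these lines (just a sketch reusing the tensor and model names from your snippet) would show whether a zero loss and zero gradients are actually expected; note that with a single output channel, softmax over the channel dimension is identically 1, so its log is 0 everywhere:

/*Sketch of a manual check, reusing the names from the snippet above*/
auto log_probs = torch::log_softmax(output_conv_double, /*dim=*/1);
std::cout<<"log_probs min/max: "<<log_probs.min().item<float>()<<" / "<<log_probs.max().item<float>()<<std::endl;
std::cout<<"target min/max: "<<target_tensor.min().item<float>()<<" / "<<target_tensor.max().item<float>()<<std::endl;

/*A constant zero loss gives zero gradients, so optimizer.step() cannot move the parameters*/
for (const auto& p : double_conv->parameters()) {
  if (p.grad().defined()) {
    std::cout<<"grad abs sum: "<<p.grad().abs().sum().item<float>()<<std::endl;
  }
}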

Good idea, I’ve edited the post.
Thanks!

If I change the loss criterion to torch::nn::BCEWithLogitsLoss(), I can get rid of the loss = -0 issue.

The parameters of the double_conv layers now change when torch::nn::BCEWithLogitsLoss() is used as the loss criterion.

auto criterion = torch::nn::BCEWithLogitsLoss();
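For context (a sketch with made-up shapes, not part of the original run): BCEWithLogitsLoss treats every output element as an independent binary prediction, so it expects a floating-point target with the same shape as the logits, which matches the [1, 1, 300, 300] tensors used here.

#include <torch/torch.h>

int main() {
  /*Hypothetical shapes, only to illustrate the expected inputs*/
  auto logits = torch::randn({1, 1, 4, 4});
  auto target = torch::rand({1, 1, 4, 4});   // float targets in [0, 1], same shape as the logits
  auto criterion = torch::nn::BCEWithLogitsLoss();
  auto loss = criterion(logits, target);
  std::cout<<"loss: "<<loss.item<float>()<<std::endl;
  return 0;
}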

Do I need to worry about loss.sizes() == []?

Code:

/*Get Single Data from a batch*/
auto data_tensor = test_loader->begin()->data[0].to(device);
auto target_tensor = test_loader->begin()->target[0].to(device);
data_tensor = data_tensor.unsqueeze(0);
target_tensor = target_tensor.unsqueeze(0);
target_tensor = target_tensor.unsqueeze(0);
data_tensor.requires_grad_(true);

/*Model*/
auto in_channel = 3;
auto out_channel = 1;
auto double_conv = DoubleConv(in_channel,out_channel);
double_conv->to(device);

/*Optimizer*/
torch::optim::Adam optimizer(double_conv->parameters(), torch::optim::AdamOptions(learning_rate));

/*Loss*/
auto criterion = torch::nn::BCEWithLogitsLoss();

/*Forward*/
std::cout<<"data_tensor[size]: "<<data_tensor.sizes()<<std::endl;
std::cout<<"target_tensor[size]: "<<target_tensor.sizes()<<std::endl;
auto output_conv_double = double_conv->forward(data_tensor);
std::cout<<"output_conv_double[size]: "<<output_conv_double.sizes()<<std::endl;
auto loss = criterion(output_conv_double,target_tensor);
std::cout<<"loss: "<<loss<<std::endl;
std::cout<<"loss[size]: "<<loss.sizes()<<"\n"<<std::endl;

/*Backward and step*/
optimizer.zero_grad();
loss.backward();
std::cout<<"double_conv[params]: "<<double_conv->parameters()<<std::endl;
std::cout<<"\n========STEP========\n"<<std::endl;
optimizer.step();
std::cout<<"double_conv[params]: "<<double_conv->parameters()<<std::endl;

Terminal:

data_tensor[size]: [1, 3, 300, 300]
target_tensor[size]: [1, 1, 300, 300]
output_conv_double[size]: [1, 1, 300, 300]
loss: 0.749804
loss[size]: []

double_conv[params]: (1,1,.,.) = 
 -0.0965  0.0491 -0.1019
 -0.1215  0.1222 -0.1157
 -0.1352 -0.1835 -0.0463

(1,2,.,.) = 
  0.1872 -0.1846 -0.0210
  0.1778 -0.0371  0.0465
  0.0213 -0.0963 -0.0462

(1,3,.,.) = 
 0.01 *
  4.7169  2.6069 -19.2072
   4.0723 -15.5102 -7.2282
  -0.0160  7.9688 -16.0768
[ CUDAFloatType{1,3,3,3} ]  1
[ CUDAFloatType{1} ]  0
[ CUDAFloatType{1} ] (1,1,.,.) = 
  0.0704 -0.3255 -0.2237
  0.2020  0.2314 -0.2285
  0.2173  0.2519 -0.3084
[ CUDAFloatType{1,1,3,3} ]  1
[ CUDAFloatType{1} ]  0
[ CUDAFloatType{1} ]

========STEP========

double_conv[params]: (1,1,.,.) = 
 -0.0955  0.0501 -0.1009
 -0.1205  0.1232 -0.1147
 -0.1342 -0.1825 -0.0473

(1,2,.,.) = 
  0.1882 -0.1836 -0.0200
  0.1788 -0.0361  0.0475
  0.0223 -0.0953 -0.0472

(1,3,.,.) = 
 0.01 *
  4.8169  2.7069 -19.1072
   4.1723 -15.4102 -7.1282
   0.0840  8.0688 -16.1767
[ CUDAFloatType{1,3,3,3} ]  0.9990
[ CUDAFloatType{1} ] 0.0001 *
-10.0000
[ CUDAFloatType{1} ] (1,1,.,.) = 
  0.0714 -0.3245 -0.2227
  0.2030  0.2324 -0.2275
  0.2183  0.2529 -0.3074
[ CUDAFloatType{1,1,3,3} ]  0.9990
[ CUDAFloatType{1} ] 0.0001 *
-10.0000
[ CUDAFloatType{1} ]

No, a scalar loss value is expected and is the default whenever the loss is reduced.
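For illustration (a sketch with made-up shapes, not the tensors from the post): the default reduction averages all per-element losses into a 0-dim tensor, which is why loss.sizes() prints []; passing torch::kNone keeps one loss value per element.

#include <torch/torch.h>

int main() {
  auto logits = torch::randn({1, 1, 4, 4});
  auto target = torch::rand({1, 1, 4, 4});

  /*Default reduction (mean): the loss is a 0-dim scalar, so sizes() prints []*/
  auto criterion_mean = torch::nn::BCEWithLogitsLoss();
  std::cout<<criterion_mean(logits, target).sizes()<<std::endl;   // []

  /*reduction = none: one loss value per element, same shape as the logits*/
  auto criterion_none = torch::nn::BCEWithLogitsLoss(
      torch::nn::BCEWithLogitsLossOptions().reduction(torch::kNone));
  std::cout<<criterion_none(logits, target).sizes()<<std::endl;   // [1, 1, 4, 4]
  return 0;
}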
