Optimization and correctness of Yolo v1 in c++

Hello there i have a few questions about my model.

I want to implement YOLO v1 to detect my cats, on a Raspberry pi. But im still developing on my pc.
First i have a performance issue, it takes about 1 second for one training image (on a very good pc).
(I think my loss function takes ages, probably you have some ideas to increase the performance)

Another problem is, the outputs of my model does not change, after 100 of runs.

Some infos, i only want to detect my two cats, therefor the calculation of the best boxes is very diffrent ( I choose simple the highest value for each cat) , but this does not impact on the loss function.
input resolution is 3x480x640 output is 14x15x20 the first dimension is defined as (C1, X1, Y1, W1, H1, Cat1, Cat2) times two :slight_smile:

i programmed it after reading this: https://medium.com/oracledevs/final-layers-and-loss-functions-of-single-stage-detectors-part-1-4abbfa9aa71c

My Model:

#define LAYER1 16
#define LAYER2 32
#define LAYER3 64
#define LAYER4 128
#define LAYER5 256
#define LAYER6 512
#define LAYER7 1024
#define LAYER8 1024
#define LAYER9 14

struct MyYoloV1Impl : nn::Module
{
  MyYoloV1Impl()
      : convblock1(nn::Conv2dOptions(3, LAYER1, 3).padding(1).stride(2)),
        convblock2(nn::Conv2dOptions(LAYER1, LAYER2, 3).padding(1).stride(2)),
        convblock3(nn::Conv2dOptions(LAYER2, LAYER3, 3).padding(1).stride(2)),
        convblock4(nn::Conv2dOptions(LAYER3, LAYER4, 3).padding(1).stride(2)),
        convblock5(nn::Conv2dOptions(LAYER4, LAYER5, 3).padding(1).stride(2)),
        convblock6(nn::Conv2dOptions(LAYER5, LAYER6, 3).padding(1)),
        convblock7(nn::Conv2dOptions(LAYER6, LAYER7, 3).padding(1)),
        convblock8(nn::Conv2dOptions(LAYER7, LAYER8, 3).padding(1)),
        convblock9(nn::Conv2dOptions(LAYER8, LAYER9, 1)),
            batch_norm_block1(LAYER1),
            batch_norm_block2(LAYER2),
            batch_norm_block3(LAYER3),
            batch_norm_block4(LAYER4),
            batch_norm_block5(LAYER5),
            batch_norm_block6(LAYER6),
            batch_norm_block7(LAYER7),
            batch_norm_block8(LAYER8),
            batch_norm_block9(LAYER9)
  {
      register_module("convblock1", convblock1);
      register_module("convblock2", convblock2);
      register_module("convblock3", convblock3);
      register_module("convblock4", convblock4);
      register_module("convblock5", convblock5);
      register_module("convblock6", convblock6);
      register_module("convblock7", convblock7);
      register_module("convblock8", convblock8);
      register_module("convblock9", convblock9);

      register_module("batch_norm_block1", batch_norm_block1);
      register_module("batch_norm_block2", batch_norm_block2);
      register_module("batch_norm_block3", batch_norm_block3);
      register_module("batch_norm_block4", batch_norm_block4);
      register_module("batch_norm_block5", batch_norm_block5);
      register_module("batch_norm_block6", batch_norm_block6);
      register_module("batch_norm_block7", batch_norm_block7);
      register_module("batch_norm_block8", batch_norm_block8);
      register_module("batch_norm_block9", batch_norm_block9);

  }

    torch::Tensor forward(torch::Tensor x, bool train)
    {
        x = torch::relu(batch_norm_block1(convblock1(x)));
        //std::cout << "0 " << x.size(0) << " " << x.size(1) << " " << x.size(2) << " " << x.size(3) <<  " " << x.size(3)/(double)x.size(2) << std::endl;
        x = torch::relu(batch_norm_block2(convblock2(x)));
        //std::cout << "2 " << x.size(0) << " " << x.size(1) << " " << x.size(2) << " " << x.size(3) <<  " " << x.size(3)/(double)x.size(2) << std::endl;
        x = torch::relu(batch_norm_block3(convblock3(x)));
        //std::cout << "3 " << x.size(0) << " " << x.size(1) << " " << x.size(2) << " " << x.size(3) <<  " " << x.size(3)/(double)x.size(2) << std::endl;
        x = torch::relu(batch_norm_block4(convblock4(x)));
        //std::cout << "4 " << x.size(0) << " " << x.size(1) << " " << x.size(2) << " " << x.size(3) <<  " " << x.size(3)/(double)x.size(2) << std::endl;
        x = torch::relu(batch_norm_block5(convblock5(x)));
        //std::cout << "5 " << x.size(0) << " " << x.size(1) << " " << x.size(2) << " " << x.size(3) <<  " " << x.size(3)/(double)x.size(2) << std::endl;
        x = torch::relu(batch_norm_block6(convblock6(x)));
        //std::cout << "6 " << x.size(0) << " " << x.size(1) << " " << x.size(2) << " " << x.size(3) <<  " " << x.size(3)/(double)x.size(2) << std::endl;
        x = torch::relu(batch_norm_block7(convblock7(x)));
        //std::cout << "7 " << x.size(0) << " " << x.size(1) << " " << x.size(2) << " " << x.size(3) <<  " " << x.size(3)/(double)x.size(2) << std::endl;
        x = torch::relu(batch_norm_block8(convblock8(x)));
        //std::cout << "8 " << x.size(0) << " " << x.size(1) << " " << x.size(2) << " " << x.size(3) <<  " " << x.size(3)/(double)x.size(2) << std::endl;
        x = torch::relu(batch_norm_block9(convblock9(x)));
        //std::cout << "9 " << x.size(0) << " " << x.size(1) << " " << x.size(2) << " " << x.size(3) <<  " " << x.size(3)/(double)x.size(2) << std::endl;
        x = torch::sigmoid(x);
        return x;
    }
    nn::Conv2d convblock1;
    nn::Conv2d convblock2;
    nn::Conv2d convblock3;
    nn::Conv2d convblock4;
    nn::Conv2d convblock5;
    nn::Conv2d convblock6;
    nn::Conv2d convblock7;
    nn::Conv2d convblock8;
    nn::Conv2d convblock9;
    nn::BatchNorm2d batch_norm_block1;
    nn::BatchNorm2d batch_norm_block2;
    nn::BatchNorm2d batch_norm_block3;
    nn::BatchNorm2d batch_norm_block4;
    nn::BatchNorm2d batch_norm_block5;
    nn::BatchNorm2d batch_norm_block6;
    nn::BatchNorm2d batch_norm_block7;
    nn::BatchNorm2d batch_norm_block8;
    nn::BatchNorm2d batch_norm_block9;
};

TORCH_MODULE(MyYoloV1);

Constructor:

NeuralNetwork::NeuralNetwork() :
    device(torch::kCPU),
    nnYolo_optimizer(nnYolo->parameters(), torch::optim::AdamOptions(2e-4).betas(std::make_tuple(0.5, 0.9999)))
{
    if (torch::cuda::is_available())
        device = torch::kCUDA;

    nnYolo->to(device);
    loadMatrix();
}

learn function:

Cats NeuralNetwork::learnObjectDetector(Cats solution, BYTE* inputs)
{
    static bool f = false;
    if(!f)
    {
        f = true;
        nnYolo->to(device);
    }
    at::Tensor matrix = torch::from_blob(inputs, {1, 3, yResolution, xResolution}, at::kByte);
    matrix = matrix.to(at::kFloat).to(device);
    matrix = matrix/255.0;
    //std::cout << "Image" << matrix << std::endl;
    Tensor output;
    try
    {
        nnYolo_optimizer.zero_grad();
        output = nnYolo(matrix, true);
        Tensor d_loss;
        Cats result = lossAndResult(solution, &d_loss, &output);
        d_loss = d_loss.set_requires_grad(true);
        d_loss = d_loss.to(device);
        d_loss.backward();

        nnYolo_optimizer.step();
        return result;
    }
    catch (const c10::Error& e)
    {
        std::cout << "error" << std::endl;
        std::cout << e.what() << std::endl;
        return solution;
    }

}

pretty bad loss function:

{
    Cats result;
    *loss = torch::zeros({1,});
    *loss = loss->to(kFloat);
    int xRes = 20;
    int yRes = 15;
    float noobj = 0.5;
    float coord = 1.0;

    for(int y = 0; y < yRes ; y++)
    {
        for(int x = 0; x < xRes ; x++)
        {
            float xPos = x * xResolution/xRes;
            float yPos = y * yResolution/yRes;

            float xOffsetFindus = solution.findusX1 - xPos;
            float yOffsetFindus = solution.findusY1 - yPos;
            float xOffsetGeorgie = solution.georgianaX1 - xPos;
            float yOffsetGeorgie = solution.georgianaY1 - yPos;

            if(solution.findus <= 0.5)
                xOffsetFindus = -1;
            if(solution.georgiana <= 0.5)
                xOffsetGeorgie = -1;
            bool georgie = false;
            bool findus = false;

            if(xOffsetFindus <= xRes && xOffsetFindus >= 0 &&
                    yOffsetFindus <= yRes && yOffsetFindus >= 0)
            {
                findus = true;
            }

            if(xOffsetGeorgie <= xRes && xOffsetGeorgie >= 0 &&
                    yOffsetGeorgie <= yRes && yOffsetGeorgie >= 0)
            {
                georgie = true;
            }

            if(findus && !georgie)
            {
                *loss += pow((0.999 - (*modelOutput)[0][0][y][x].item<float>()), 2); // 1 object value
                *loss += coord*pow((xOffsetFindus/xRes - (*modelOutput)[0][1][y][x].item<float>()), 2); // 1 xoffset
                *loss += coord*pow((yOffsetFindus/yRes - (*modelOutput)[0][2][y][x].item<float>()), 2); // 1 yoffset
                *loss += coord*pow((sqrt(solution.findusX2 / xResolution) - sqrt((*modelOutput)[0][3][y][x].item<float>())), 2); // 1 width
                *loss += coord*pow((sqrt(solution.findusY2 / yResolution) - sqrt((*modelOutput)[0][4][y][x].item<float>())), 2); // 1 height
                *loss += pow((0.001 - (*modelOutput)[0][5][y][x].item<float>()), 2); // 1 georgie
                *loss += pow((0.999 - (*modelOutput)[0][6][y][x].item<float>()), 2); // 1 findus

                *loss += pow((0.999 - (*modelOutput)[0][7][y][x].item<float>()), 2); // 2 object value
                *loss += coord*pow((xOffsetFindus/xRes - (*modelOutput)[0][8][y][x].item<float>()), 2); // 2 xoffset
                *loss += coord*pow((yOffsetFindus/yRes - (*modelOutput)[0][9][y][x].item<float>()), 2); // 2 yoffset
                *loss += coord*pow((sqrt(solution.findusX2 / xResolution) - sqrt((*modelOutput)[0][10][y][x].item<float>())), 2); // 2 width
                *loss += coord*pow((sqrt(solution.findusY2 / yResolution) - sqrt((*modelOutput)[0][11][y][x].item<float>())), 2); // 2 height
                *loss += pow((0.001 - (*modelOutput)[0][12][y][x].item<float>()), 2); // 2 georgie
                *loss += pow((0.999 - (*modelOutput)[0][13][y][x].item<float>()), 2); // 2 findus
            }
            else if(!findus && georgie)
            {
                *loss += pow((0.999 - (*modelOutput)[0][0][y][x].item<float>()), 2); // 1 object value
                *loss += coord*pow((xOffsetGeorgie/xRes - (*modelOutput)[0][1][y][x].item<float>()), 2); // 1 xoffset
                *loss += coord*pow((yOffsetGeorgie/yRes - (*modelOutput)[0][2][y][x].item<float>()), 2); // 1 yoffset
                *loss += coord*pow((sqrt(solution.georgianaX2 / xResolution) - sqrt((*modelOutput)[0][3][y][x].item<float>())), 2); // 1 width
                *loss += coord*pow((sqrt(solution.georgianaY2 / yResolution) - sqrt((*modelOutput)[0][4][y][x].item<float>())), 2); // 1 height
                *loss += pow((0.001 - (*modelOutput)[0][5][y][x].item<float>()), 2); // 1 georgie
                *loss += pow((0.999 - (*modelOutput)[0][6][y][x].item<float>()), 2); // 1 findus

                *loss += pow((0.999 - (*modelOutput)[0][7][y][x].item<float>()), 2); // 2 object value
                *loss += coord*pow((xOffsetGeorgie/xRes - (*modelOutput)[0][8][y][x].item<float>()), 2); // 2 xoffset
                *loss += coord*pow((yOffsetGeorgie/yRes - (*modelOutput)[0][9][y][x].item<float>()), 2); // 2 yoffset
                *loss += coord*pow((sqrt(solution.georgianaX2 / xResolution) - sqrt((*modelOutput)[0][10][y][x].item<float>())), 2); // 2 width
                *loss += coord*pow((sqrt(solution.georgianaY2 / yResolution) - sqrt((*modelOutput)[0][11][y][x].item<float>())), 2); // 2 height
                *loss += pow((0.001 - (*modelOutput)[0][12][y][x].item<float>()), 2); // 2 georgie
                *loss += pow((0.999 - (*modelOutput)[0][13][y][x].item<float>()), 2); // 2 findus
            }
            else if(findus && georgie)
            {
                *loss += pow((0.999 - (*modelOutput)[0][0][y][x].item<float>()), 2); // 1 object value
                *loss += coord*pow((xOffsetFindus/xRes - (*modelOutput)[0][1][y][x].item<float>()), 2); // 1 xoffset
                *loss += coord*pow((yOffsetFindus/yRes - (*modelOutput)[0][2][y][x].item<float>()), 2); // 1 yoffset
                *loss += coord*pow((sqrt(solution.findusX2 / xResolution) - sqrt((*modelOutput)[0][3][y][x].item<float>())), 2); // 1 width
                *loss += coord*pow((sqrt(solution.findusY2 / yResolution) - sqrt((*modelOutput)[0][4][y][x].item<float>())), 2); // 1 height
                *loss += pow((0.001 - (*modelOutput)[0][5][y][x].item<float>()), 2); // 1 georgie
                *loss += pow((0.999 - (*modelOutput)[0][6][y][x].item<float>()), 2); // 1 findus

                *loss += pow((0.999 - (*modelOutput)[0][7][y][x].item<float>()), 2); // 2 object value
                *loss += coord*pow((xOffsetGeorgie/xRes - (*modelOutput)[0][8][y][x].item<float>()), 2); // 2 xoffset
                *loss += coord*pow((yOffsetGeorgie/yRes - (*modelOutput)[0][9][y][x].item<float>()), 2); // 2 yoffset
                *loss += coord*pow((sqrt(solution.georgianaX2 / xResolution) - sqrt((*modelOutput)[0][10][y][x].item<float>())), 2); // 2 width
                *loss += coord*pow((sqrt(solution.georgianaY2 / yResolution) - sqrt((*modelOutput)[0][11][y][x].item<float>())), 2); // 2 height
                *loss += pow((0.001 - (*modelOutput)[0][12][y][x].item<float>()), 2); // 2 georgie
                *loss += pow((0.999 - (*modelOutput)[0][13][y][x].item<float>()), 2); // 2 findus
            }
            else
            {
                *loss += noobj * pow((0.999 - (*modelOutput)[0][0][y][x].item<float>()), 2); // 1 object value
                *loss += noobj * pow((0.999 - (*modelOutput)[0][7][y][x].item<float>()), 2); // 2 object value
            }

            if((*modelOutput)[0][0][y][x].item<float>() >= 0.5)
            {
                if((*modelOutput)[0][5][y][x].item<float>() > (*modelOutput)[0][6][y][x].item<float>())
                {
                    if(result.findus <= (*modelOutput)[0][0][y][x].item<float>())
                    {
                        result.findus = (*modelOutput)[0][0][y][x].item<float>();
                        result.findusX1 = (*modelOutput)[0][1][y][x].item<float>()*xRes + x * xRes;
                        result.findusY1 = (*modelOutput)[0][2][y][x].item<float>()*yRes + y * yRes;
                        result.findusX2 = pow((*modelOutput)[0][3][y][x].item<float>(),2);
                        result.findusY2 = pow((*modelOutput)[0][4][y][x].item<float>(),2);
                    }
                }
                else
                {
                    if(result.georgiana <= (*modelOutput)[0][0][y][x].item<float>())
                    {
                        result.georgiana = (*modelOutput)[0][0][y][x].item<float>();
                        result.georgianaX1 = (*modelOutput)[0][1][y][x].item<float>()*xRes + x * xRes;
                        result.georgianaY1 = (*modelOutput)[0][2][y][x].item<float>()*yRes + y * yRes;
                        result.georgianaX2 = pow((*modelOutput)[0][3][y][x].item<float>(),2);
                        result.georgianaY2 = pow((*modelOutput)[0][4][y][x].item<float>(),2);
                    }
                }
            }
            else if((*modelOutput)[0][7][y][x].item<float>() >= 0.5)
            {
                if((*modelOutput)[0][12][y][x].item<float>() > (*modelOutput)[0][13][y][x].item<float>())
                {
                    if(result.findus <= (*modelOutput)[0][7][y][x].item<float>())
                    {
                        result.findus = (*modelOutput)[0][7][y][x].item<float>();
                        result.findusX1 = (*modelOutput)[0][8][y][x].item<float>()*xRes + x * xRes;
                        result.findusY1 = (*modelOutput)[0][9][y][x].item<float>()*yRes + y * yRes;
                        result.findusX2 = pow((*modelOutput)[0][10][y][x].item<float>(),2);
                        result.findusY2 = pow((*modelOutput)[0][11][y][x].item<float>(),2);
                    }
                }
                else
                {
                    if(result.georgiana <= (*modelOutput)[0][7][y][x].item<float>())
                    {
                        result.georgiana = (*modelOutput)[0][7][y][x].item<float>();
                        result.georgianaX1 = (*modelOutput)[0][8][y][x].item<float>()*xRes + x * xRes;
                        result.georgianaY1 = (*modelOutput)[0][9][y][x].item<float>()*yRes + y * yRes;
                        result.georgianaX2 = pow((*modelOutput)[0][10][y][x].item<float>(),2);
                        result.georgianaY2 = pow((*modelOutput)[0][11][y][x].item<float>(),2);
                    }
                }
            }
        }
    }
    return result;
}

thanks for your support!!. i have some trouble understanding the c++ interface and porting phyton examples to c++ :slight_smile: