How to copy NN models?

I have tried for a while now, but no matter what, the parameters seem not to be copied over.
Pulling the model from a central one:

void ThreadHandler::updateNet(MyNet extern_net)
{
    m_net.lock();
    // MyNet cloned_net = MyNet(options.input_size, options.hidden_size, options.hidden_size2, options.hidden_size3, options.output_size); // should change the parameters to be dynamic
    // auto temp = *std::dynamic_pointer_cast<MyNet>(main_net->clone()); // just a pointer
    auto paramsA = main_net->named_parameters(true);
    auto paramsB = extern_net->named_parameters(true);

    extern_net->to(options.device);
    torch::manual_seed(options.seed);

    bool brek = false;

    torch::autograd::GradMode::set_enabled(false);
    for (auto& paramA : paramsA) {
        for (auto& paramB : paramsB) {
            if (paramA.key() == paramB.key()) {
                if (!paramA.value().grad().defined()) {
                    std::cout << "Error: Main Net " << paramA.key() << " gradient is not defined" << std::endl;
                    brek = true;
                    break;
                }
                if (!paramB.value().grad().defined()) {
                    std::cout << "Error: Extern Net " << paramB.key() << " gradient is not defined" << std::endl;
                    brek = true;
                    break;
                }
                // paramB.value().data().copy_(paramA.value().data().clone());
                paramB.value().grad().data() = paramA.value().grad().data().clone();
                break;
            }
        }
        if (brek)
        {
            break;
        }
    }

    torch::autograd::GradMode::set_enabled(true);

    // std::cout << "Seed is : " << options.seed << std::endl;
    m_net.unlock();
}
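
For comparison, here is a minimal sketch of copying parameter values (rather than gradients), guarded with torch::NoGradGuard; copyParams is just an illustrative helper name, and it assumes both nets share exactly the same architecture and parameter names:

void copyParams(MyNet& source, MyNet& target)
{
    torch::NoGradGuard no_grad;  // no autograd tracking while mutating parameters
    auto src = source->named_parameters(true);
    auto dst = target->named_parameters(true);
    for (auto& p : dst) {
        auto* s = src.find(p.key());  // OrderedDict::find returns nullptr if the key is missing
        if (s != nullptr) {
            p.value().copy_(*s);  // in-place copy of the parameter value
        }
    }
}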

A primitive attempt at averaging across multiple models:

void ThreadHandler::average(MyNet extern_net)
{
    std::cout << "Averaging" << std::endl;
    // MyNet mainnet = MyNet(options.input_size, options.hidden_size, options.hidden_size2, options.hidden_size3, options.output_size);
    // cloneNet(mainnet);
    m_net.lock();
    auto paramsA = extern_net->named_parameters(true);
    auto paramsB = main_net->named_parameters(true);

    bool brek = false;

    torch::autograd::GradMode::set_enabled(false);
    for (auto& paramA : paramsA) {
        for (auto& paramB : paramsB) {
            if (paramA.key() == paramB.key()) {
                if (!paramA.value().grad().defined()) {
                    // note: paramsA holds the extern net here
                    std::cout << "Error: Extern Net " << paramA.key() << " gradient is not defined" << std::endl;
                    brek = true;
                    break;
                }
                if (!paramB.value().grad().defined()) {
                    std::cout << "Error: Main Net " << paramB.key() << " gradient is not defined" << std::endl;
                    brek = true;
                    break;
                }
                // paramB.value().data().copy_((paramA.value().data() + paramB.value().data()) / 2.0);
                paramB.value().grad().data() = (paramA.value().grad().data() + paramB.value().grad().data()) / 2.0;
                break;
            }
        }
        if (brek)
        {
            break;
        }
    }

    // main_net = mainnet;
    torch::autograd::GradMode::set_enabled(true);

    m_net.unlock();
}
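
The same idea sketched for averaging, again operating on the parameter values instead of the gradients (averageParams is a hypothetical helper, not part of my code):

void averageParams(MyNet& source, MyNet& target)
{
    torch::NoGradGuard no_grad;
    auto src = source->named_parameters(true);
    auto dst = target->named_parameters(true);
    for (auto& p : dst) {
        auto* s = src.find(p.key());
        if (s != nullptr) {
            p.value().copy_((p.value() + *s) / 2.0);  // elementwise mean of the two values
        }
    }
}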

Maybe my train method helps:

void ThreadHandler::train(MyNet extern_net, Options options)
{
    // for debugging only
    std::vector<int64_t> expected_size = { 29, 1, 4 };
    extern_net->train();
    auto train_set = ClimateDataset(data).map(torch::data::transforms::Stack<>());
    torch::manual_seed(options.seed);
    extern_net->to(options.device);

    torch::optim::Adam optimizer(extern_net->parameters(), torch::optim::AdamOptions(options.learning_rate));

    auto train_loader = torch::data::make_data_loader<torch::data::samplers::DistributedRandomSampler>(
        std::move(train_set),
        torch::data::DataLoaderOptions().batch_size(options.train_batch_size));

    for (int i = 0; i < options.iterations; i++) {
        for (auto& batch : *train_loader) {

            optimizer.zero_grad();

            auto inputs = batch.data.to(options.device);
            auto targets = batch.target.to(options.device);
            std::vector<int64_t> input_size = inputs.sizes().vec();
            std::vector<int64_t> target_size = targets.sizes().vec();
            // for debugging and conditional breakpoints only
            int64_t ist = input_size[2];
            int64_t tes = target_size[2];
            TORCH_CHECK(input_size == expected_size);

            torch::Tensor prediction = extern_net->forward(inputs);

            auto loss = torch::l1_loss(prediction, targets);
            std::cout << "Loss: " << loss.item<float>() << std::endl;
            loss.backward();
            optimizer.step();
        }
    }

    if (options.notwhole) {
        average(extern_net);
    }
    else {
        averageWhole(extern_net);
    }

    stop();
    // monitor();
}

Since half of this is just copying the module (in PyTorch terminology), I did the following, based on older threads:

#pragma once
#include <iostream>
#include <torch/torch.h>
#include <string>

struct MyNetImpl : public torch::nn::Cloneable<MyNetImpl> {

int64_t input_size, hidden_size, hidden_size2, hidden_size3, output_size;
torch::nn::Linear i2h, h2h, h2h2, h2h3, h2o;
std::string name = "MyNet";


MyNetImpl(int64_t input_size, int64_t hidden_size, int64_t hidden_size2, int64_t hidden_size3, int64_t output_size) :
    i2h(register_module("i2h", torch::nn::Linear(input_size, hidden_size))),
    h2h(register_module("h2h", torch::nn::Linear(hidden_size, hidden_size2))),
    h2h2(register_module("h2h2", torch::nn::Linear(hidden_size2, hidden_size3))),
    h2h3(register_module("h2h3", torch::nn::Linear(hidden_size3, hidden_size3))),
    h2o(register_module("h2o", torch::nn::Linear(hidden_size3, output_size))) {

    this->input_size = input_size;
    this->hidden_size = hidden_size;
    this->hidden_size2 = hidden_size2;
    this->hidden_size3 = hidden_size3;
    this->output_size = output_size; // without this, reset() rebuilds h2o from an uninitialized output_size
}
    
void reset() override {
    // clone() clears the fresh copy's submodules before calling reset(),
    // so re-registering the layers under the same names is safe in that path.

    // Re-construct the layers
    i2h = torch::nn::Linear(input_size, hidden_size);
    h2h = torch::nn::Linear(hidden_size, hidden_size2);
    h2h2 = torch::nn::Linear(hidden_size2, hidden_size3);
    h2h3 = torch::nn::Linear(hidden_size3, hidden_size3);
    h2o = torch::nn::Linear(hidden_size3, output_size);

    // Re-register the layers to the module
    register_module("i2h", i2h);
    register_module("h2h", h2h);
    register_module("h2h2", h2h2);
    register_module("h2h3", h2h3);
    register_module("h2o", h2o);
}

torch::Tensor forward(torch::Tensor x1) {
    // for debugging only
    std::vector<int64_t> expected_size = { 29, 1, 4 };
    std::vector<int64_t> input_size = x1.sizes().vec();
    TORCH_CHECK(input_size == expected_size);

    // torch::Tensor xi = torch::cat({ x1, x2, x3 }, 1);
    torch::Tensor xm = torch::relu(i2h->forward(x1));
    torch::Tensor xm2 = torch::relu(h2h->forward(xm));
    torch::Tensor xm3 = torch::relu(h2h2->forward(xm2));
    torch::Tensor xo = torch::relu(h2h3->forward(xm3));
    torch::Tensor x = h2o->forward(xo);

    return x;
}

};
TORCH_MODULE(MyNet);
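
As far as I understand the Cloneable pattern, the constructor is supposed to delegate all construction and registration to reset(), since clone() clears the submodules of the fresh copy and then calls reset() on it. A minimal sketch of that layout (TinyNet is just an illustrative name, not part of my code):

struct TinyNetImpl : torch::nn::Cloneable<TinyNetImpl> {
    int64_t in_size, hidden_size, out_size;
    torch::nn::Linear fc1{nullptr}, fc2{nullptr};  // empty holders, filled in reset()

    TinyNetImpl(int64_t in_size, int64_t hidden_size, int64_t out_size)
        : in_size(in_size), hidden_size(hidden_size), out_size(out_size) {
        reset();  // all construction/registration happens in reset()
    }

    void reset() override {
        fc1 = register_module("fc1", torch::nn::Linear(in_size, hidden_size));
        fc2 = register_module("fc2", torch::nn::Linear(hidden_size, out_size));
    }

    torch::Tensor forward(torch::Tensor x) {
        return fc2->forward(torch::relu(fc1->forward(x)));
    }
};
TORCH_MODULE(TinyNet);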

And then I try to use it with:

void ThreadHandler::cloneNet(MyNet& extern_net) // holder taken by reference so the caller sees the new clone
{
    m_net.lock();

    main_net->name = "main_net";
    std::cout << "cloning main " << main_net->name << std::endl;
    auto clone = main_net->clone();
    std::shared_ptr<MyNetImpl> myNetImplPtr = std::dynamic_pointer_cast<MyNetImpl>(clone);
    extern_net = MyNet(std::move(myNetImplPtr));
    std::cout << "cloned main " << extern_net->name << std::endl;
    extern_net->name = "extern_net";
    m_net.unlock();
}
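
Since MyNet is a shared_ptr-style holder, a variant that returns the clone instead of assigning through a parameter might be easier to reason about (cloneNetCopy is a hypothetical name, assuming m_net is a std::mutex):

MyNet ThreadHandler::cloneNetCopy()
{
    std::lock_guard<std::mutex> lock(m_net);
    auto impl = std::dynamic_pointer_cast<MyNetImpl>(main_net->clone());
    return MyNet(std::move(impl));  // caller receives its own holder
}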

The result is an "Unhandled exception at 0x00007FFDC6BBCF19 in ClimaMultithreadedDebug.exe: Microsoft C++ exception: c10::Error at memory location 0x000000F3A8BBB6C0."

Searching for the error code brought me to many forum threads here and elsewhere. A few say the project settings are the cause, but I have done everything humanly possible to make sure they are correct.

So is this a bug, or what is going on?