Why do 2 identical neural networks produce different forward results?

my torch::nn::Module:

struct Net : torch::nn::Module {
	Net(int numIn, int numOut, int numHid, const size_t hid_count=1) {
		assert(hid_count >= 1);
		first = register_parameter("inputW", torch::rand({numIn, numHid}))/numHid;
		middle = new torch::Tensor[hid_count]; // index 0 is unused; the loop below fills indices 1..hid_count-1
		for (int i = 1; i != hid_count; i++)
			middle[i] = register_parameter("hidW"+std::to_string(i), torch::rand({numHid, numHid}))/numHid;
		last = register_parameter("outputW", torch::rand({numHid, numOut}))/numOut;
		h_c = hid_count;
		n_h = numHid;
	}
	torch::Tensor forward(torch::Tensor input) {
		torch::Tensor output_layer,h;
		h = (torch::mm(input, first));
		for (int i = 1; i != h_c; i++)
			h = torch::sigmoid(torch::mm(h, middle[i]));
		output_layer = (torch::mm(h, last));
		return output_layer;
	}
	torch::Tensor first, last, *middle;
	size_t h_c, n_h;
};

my unit test:

{
		const int input_nodes = 20, output_nodes=10, hidden_nodes = 2, hidden_count = 1;
		Net nn0 (input_nodes, output_nodes, hidden_nodes, hidden_count);
		Net nn1 (input_nodes, output_nodes, hidden_nodes, hidden_count);
		for (size_t i = 0; i < nn0.parameters().size(); i++ ){//copy all parameters
			torch::Tensor t = nn0.parameters()[i].detach().clone();
			nn1.parameters()[i].set_data(t);
			//nn1.parameters()[i] = nn0.parameters()[i].detach().clone();
		}
		for (size_t i = 0; i < nn0.parameters().size(); i++ )//assert that all the parameters are equal
			assert(torch::sum(nn1.parameters()[i] == nn0.parameters()[i]).item<float>() == nn0.parameters()[i].numel());
		const torch::Tensor input = torch::ones({1,input_nodes});
		std::cout << nn0.forward(input) - nn1.forward(input) << std::endl;
		assert(torch::sum(nn0.forward(input) == nn1.forward(input)).item<float>() == (float) output_nodes);
	}

Unit test explanation:
First, we create two nn::Modules, then we copy all of the parameters (Tensors) from one to the other.
Second, we assert that all the parameters were copied successfully.
Third, we run both nn::Modules (which now have equal parameters) with the same input and observe different results.

Sample output (the delta of the two forward passes):

 0.3192  0.2746 -0.0047  0.2926 -0.4053 -0.1266 -0.2181 -0.4602  0.8585  0.0089

The following unit test also shows that the same NN, given the same input, produces the same output:

	{//assert that forwarding the same NN with the same inputs provides the same output
		const int input_nodes = 20, output_nodes=10, hidden_nodes = 2, hidden_count = 1;
		Net nn0 (input_nodes, output_nodes, hidden_nodes, hidden_count);
		const torch::Tensor input = torch::rand({1,input_nodes});
		assert(torch::sum(nn0.forward(input) == nn0.forward(input)).item<float>() == (float) output_nodes);
	}

Update:
The source of the issue appears to be that the tensors returned by torch::nn::Module::parameters() do not reference the same underlying torch::Tensor as the corresponding member variables (e.g. nn1.first):

struct Net : torch::nn::Module {
	Net(int numIn, int numOut, int numHid, const size_t hid_count=1) {
		...
		first = register_parameter("inputW", torch::rand({numIn, numHid}))/numHid;
		...
	}
	...
};
{//in some unit test
...
		assert(torch::sum(nn1.parameters()[0] == nn1.first).item<float>() == nn1.parameters()[0].numel());
...
}//this fails
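
One quick way to see why that assert fails is to compare the storage addresses of the two tensors; a minimal check (data_ptr() returns the address of a tensor's underlying buffer):

	// Prints 0 here: the division applied after register_parameter gave 'first'
	// its own storage, separate from the registered parameter.
	std::cout << (nn1.parameters()[0].data_ptr() == nn1.first.data_ptr()) << std::endl;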

Any suggestions on how to fix this?

Your problem is caused by dividing the result of register_parameter: the division creates a different tensor, which is then assigned to the member variable, so the member no longer shares storage with the registered parameter and can't be affected from outside the module by editing .parameters().
The offending part is the trailing "/numHid":

first = register_parameter("inputW", torch::rand({numIn, numHid}))/numHid;
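
If the scaling is meant to stay, one possible fix is to apply the division to the tensor before registering it, so the member variable and the registered parameter share the same storage. A minimal sketch of the constructor under that assumption:

	Net(int numIn, int numOut, int numHid, const size_t hid_count=1) {
		assert(hid_count >= 1);
		// Divide before registering: 'first' now aliases the registered parameter.
		first = register_parameter("inputW", torch::rand({numIn, numHid}) / numHid);
		middle = new torch::Tensor[hid_count]; // index 0 unused; indices 1..hid_count-1 hold hidden weights
		for (int i = 1; i != hid_count; i++)
			middle[i] = register_parameter("hidW" + std::to_string(i), torch::rand({numHid, numHid}) / numHid);
		last = register_parameter("outputW", torch::rand({numHid, numOut}) / numOut);
		h_c = hid_count;
		n_h = numHid;
	}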

Full example using the approach from How to copy parameters:

#include <torch/torch.h>
#include <iostream>
#include <cstdlib>

struct NetImpl : torch::nn::Module {
	NetImpl(int numIn, int numOut, int numHid, const size_t hid_count = 1) {
		assert(hid_count >= 1);
		first = register_parameter("inputW", torch::rand({ numIn, numHid }));
		middle = new torch::Tensor[hid_count]; // index 0 is unused; the loop below fills indices 1..hid_count-1
		for (int i = 1; i != hid_count; i++)
			middle[i] = register_parameter("hidW" + std::to_string(i), torch::rand({ numHid, numHid }));
		last = register_parameter("outputW", torch::rand({ numHid, numOut }));
		h_c = hid_count;
		n_h = numHid;
	}
	torch::Tensor forward(torch::Tensor input)
	{
		torch::Tensor output_layer, h;
		h = (torch::mm(input, first));
		for (int i = 1; i != h_c; i++)
			h = torch::sigmoid(torch::mm(h, middle[i]));
		output_layer = (torch::mm(h, last));
		return output_layer;
	}
	torch::Tensor first, last, *middle;
	size_t h_c, n_h;
};

TORCH_MODULE(Net);

int main(int argc, char** argv)
{
	try
	{
		const int input_nodes = 20, output_nodes = 10, hidden_nodes = 2, hidden_count = 1;

		Net nn0(input_nodes, output_nodes, hidden_nodes, hidden_count);
		Net nn1(input_nodes, output_nodes, hidden_nodes, hidden_count);

		{
			torch::autograd::GradMode::set_enabled(false);
			auto src = nn0->named_parameters(true /*recurse*/);
			auto dst = nn1->named_parameters(true /*recurse*/);

			for (auto& val : src)
			{
				auto name = val.key();
				auto* t = dst.find(name);
				if (t != nullptr)
				{
					t->copy_(val.value());
				}
			}

			torch::autograd::GradMode::set_enabled(true);
		}

		const torch::Tensor input = torch::ones({ 1,input_nodes });

		std::cout << "Diff: " << nn0->forward(input) - nn1->forward(input) << std::endl;
	}
	catch (const c10::Error& e)
	{
		std::cout << e.what() << std::endl;
	}
	catch (const std::runtime_error& e)
	{
		std::cout << e.what() << std::endl;
	}
	
	system("PAUSE");
}
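
As a side note, instead of toggling GradMode::set_enabled around the copy loop, the same thing can be done with an RAII guard. A sketch assuming torch::NoGradGuard is available in your LibTorch version:

	{
		torch::NoGradGuard no_grad; // autograd disabled for this scope
		auto src = nn0->named_parameters(true /*recurse*/);
		auto dst = nn1->named_parameters(true /*recurse*/);
		for (auto& val : src)
		{
			auto* t = dst.find(val.key());
			if (t != nullptr)
				t->copy_(val.value());
		}
	} // autograd is re-enabled automatically when the guard is destroyed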