Hey there,
I implemented a four-player card game using PyTorch and reinforcement learning (PPO). To train the agents, I make four exact copies of the network and let them play against each other. After a certain number of updates, I would now like to share the weights between these networks.
I found this procedure (I think it is for Lua Torch rather than PyTorch):

1. Make all your modules.
2. Make all your clones.
3. Add all the modules and clones to a single nn.Container.
4. Call :getParameters on the nn.Container to get params and grads. This will preserve any sharing of parameters between modules inside the nn.Container.

Now using the modules and clones as normal will play nice with optim, because all of the params and grads reference the same storage as the tensors from :getParameters.
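
If I understand the idea correctly, the PyTorch analogue might simply be to let all four players hold a reference to one and the same network object, so the parameters are shared automatically (just a sketch with placeholder dimensions and a made-up lr, using the ActorCritic class shown further down):

import torch

shared = ActorCritic(240, 60, 64)  # placeholder dimensions
players = [shared] * 4             # all four entries reference the same object
optimizer = torch.optim.Adam(shared.parameters(), lr=0.002)  # one optimizer step updates every "copy"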
Still, I tried to implement the quoted procedure as literally as possible (ppo is the list of the four models):
container = nn.Container()   # does nn.Container even exist in PyTorch, or is it Lua Torch only?
for i in range(4):
    container.add(ppo[i])    # the original procedure uses Lua syntax: container:add(...)
params = container.parameters()
How do I now apply the parameters back to each model? And is the above method even the correct approach in PyTorch?
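
In case it helps to see what I am after, this is roughly the synchronisation I would write naively (sync_weights is just a name I made up, and broadcasting player 0's weights to everyone is only one possible interpretation of "sharing"):

def sync_weights(agents):
    # naive idea: copy player 0's weights into the other three players
    source = agents[0].policy.state_dict()
    for agent in agents[1:]:
        agent.policy.load_state_dict(source)
        agent.policy_old.load_state_dict(source)

# called every now and then during training:
# sync_weights(ppo)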
Further snippets for context:

import torch
import torch.nn as nn

class PPO:
    def __init__(self, state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip, lr_decay=1000000):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.policy = ActorCritic(state_dim, action_dim, n_latent_var)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas, eps=1e-5)  # no eps before!
        self.policy_old = ActorCritic(state_dim, action_dim, n_latent_var)
        self.policy_old.load_state_dict(self.policy.state_dict())
        # to decay the learning rate during training:
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=lr_decay, gamma=0.9)
        self.MseLoss = nn.MSELoss()
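
For completeness, the four copies are then created roughly like this (the concrete hyperparameter values here are placeholders, not my real settings):

state_dim, action_dim, n_latent_var = 240, 60, 64  # placeholders
ppo = [PPO(state_dim, action_dim, n_latent_var,
           lr=0.002, betas=(0.9, 0.999), gamma=0.99,
           K_epochs=4, eps_clip=0.2)
       for _ in range(4)]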
class ActorMod(nn.Module):
    def __init__(self, state_dim, action_dim, n_latent_var):
        super(ActorMod, self).__init__()
        self.l1 = nn.Linear(state_dim, n_latent_var)
        self.l1_tanh = nn.PReLU()  # name is historic, the activation is actually PReLU
        self.l2 = nn.Linear(n_latent_var, n_latent_var)
        self.l2_tanh = nn.PReLU()
        self.l3 = nn.Linear(n_latent_var + 60, action_dim)

    def forward(self, input):
        x = self.l1(input)
        x = self.l1_tanh(x)
        x = self.l2(x)
        out1 = self.l2_tanh(x)  # shape (n_latent_var,) or (batch, n_latent_var)
        if len(input.shape) == 1:  # single, unbatched state
            out2 = input[180:240]  # these 60 entries are the available options of the active player
            output = torch.cat([out1, out2], 0)
        else:  # batched states
            out2 = input[:, 180:240]
            output = torch.cat([out1, out2], 1)
        x = self.l3(output)
        return x.softmax(dim=-1)  # dim=-1 instead of dim=1, so the unbatched branch works too
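
A quick shape check of the actor, the way I call it (toy values; state_dim=240 matches the input[..., 180:240] slice, the rest is made up):

net = ActorMod(state_dim=240, action_dim=60, n_latent_var=64)
print(net(torch.rand(240)).shape)     # unbatched state -> torch.Size([60])
print(net(torch.rand(8, 240)).shape)  # batch of states -> torch.Size([8, 60])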
class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim, n_latent_var):
        super(ActorCritic, self).__init__()
        # actor
        # TODO see question: https://discuss.pytorch.org/t/pytorchmultipleinputsinsequential/74040
        self.action_layer = ActorMod(state_dim, action_dim, n_latent_var)
        # critic
        self.value_layer = nn.Sequential(
            nn.Linear(state_dim, n_latent_var),
            nn.PReLU(),
            nn.Linear(n_latent_var, n_latent_var),
            nn.PReLU(),
            nn.Linear(n_latent_var, 1)
        )