Hey there,
I want to use Policy Gradients (see REINFORCE and probability distributions) to train an agent for a very simple 4-player card game. I have a lot of questions about this algorithm (part of the code is below; the full code is available in reinforcement_learning.py).
Questions
- How to design the network?
- I have only a little experience with this and would like to hear your suggestions
- How many hidden layers, conv2d or linear etc?
- How to calculate the discounted rewards?
- I am currently not using discounted rewards
- Why should I use discounted rewards? (I have 2 discounted reward functions in the code below; which of them would you use?)
- The algorithm learns very slowly (or nothing at all?)
- After 10000 games the result is: [-61199.0, -52777.0, -58842.0, -60025.0]; Player 2 is the reinforcement player.
- How to measure if the algorithm learns something? -> monitor losses.mean() ?
- How long should I train?
- What improvements can I make to learn faster? (Including more reinforcement players? Sharing policies?)
- Batches
- I do not use any batches as you can see in the code
- In other examples I saw that many are using batches - why?
Rules of the game explained:
[8] 0 Laura RANDOM Card 9 of R Hand Index 3
[8] 1 Alfons REINFO Card 13 of R Hand Index 5
[8] 2 Frank RANDOM Card 6 of R Hand Index 4
[8] 3 Lea RANDOM Card 5 of R Hand Index 4
Update rewards: [0, -4, 0, 0]
- Round 8, Player Index 0 = Laura plays a Random possible card
- Alfons (Player Index 1) plays as the Reinforcement Player; he wins this round (highest card)
- In this round he earns 4 minus points
General Rules
- All red cards give -1 point
- 60 cards in total (one game has 15 rounds)
State vector: a 180x1 binary vector consisting of
- 60x1 binary vector of played cards
- 60x1 binary vector of cards in hand of the ai player
- 60x1 binary vector of cards currently played
Code
# tested with python 3.7.5
import json

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import stdout
from gameClasses import card, deck, player, game
class PolicyGradientLoss(nn.Module):
    """REINFORCE loss: the mean of ``-reward * log_prob`` over all steps."""

    def forward(self, log_action_probabilities, discounted_rewards):
        """Return the scalar policy-gradient loss.

        Args:
            log_action_probabilities: log-probabilities of the actions that
                were taken; the original author documents the layout as
                (B, timesteps, 1).
            discounted_rewards: reward credited to each of those actions,
                documented with the same (B, timesteps, 1) layout.

        Returns:
            A 0-dim tensor holding the mean of the per-step losses.
        """
        if torch.is_tensor(discounted_rewards):
            # Rewards are constants with respect to the policy parameters;
            # detaching guarantees no gradient flows back into them.
            discounted_rewards = discounted_rewards.detach()
            # Same number of elements but a different layout (e.g. (T,) vs
            # (T, 1)) would broadcast to an outer product and silently
            # average T*T terms instead of T.  Align the layouts instead.
            if (
                torch.is_tensor(log_action_probabilities)
                and discounted_rewards.shape != log_action_probabilities.shape
                and discounted_rewards.numel() == log_action_probabilities.numel()
            ):
                discounted_rewards = discounted_rewards.reshape(
                    log_action_probabilities.shape
                )

        losses = -discounted_rewards * log_action_probabilities
        loss = losses.mean()
        print("Mean Loss :", round(loss.item(), 5), "Shape losses:", losses.shape)
        return loss
### TODO GAME HERE
class TestReinforce:
    """Drives games of `game` and updates a `WitchesPolicy` after each one."""

    def __init__(self, parent=None):
        """Create the policy, load the game options and create the game.

        Args:
            parent: unused; kept so existing callers keep working.
        """
        self.witchesPolicy = WitchesPolicy()
        self.options_file_path = "../data/reinforce_options.json"
        with open(self.options_file_path, encoding="utf-8") as json_file:
            self.options = json.load(json_file)
        self.my_game = game(self.options)

    def notifyTrick(self, value):
        """Normalize a trick reward and pass it to the policy as feedback.

        Args:
            value: raw reward of the finished trick for the learning player.

        Raises:
            ValueError: if the normalized reward lies outside [-1, 1].
        """
        # Original author's note: the worst value is -17 (g10, g5, r1, r2),
        # unless the "x2" card is added as well, then it is probably 21.
        # (The divisor was 21 previously, otherwise 26.)
        normalized_reward = value / 21
        if abs(normalized_reward) > 1:
            # Make sure the offending value is visible even when printing
            # is currently disabled, then stop the run.
            stdout.enable()
            print(normalized_reward)
            raise ValueError(
                "normalized reward {} is outside [-1, 1]".format(normalized_reward)
            )
        self.witchesPolicy.feedback(normalized_reward)

    def selectAction(self):
        """Choose the move of the currently active player.

        Returns:
            The index of the chosen card within the active player's hand
            (a hand index, not an absolute card index).

        Raises:
            ValueError: if the active player is neither a RANDOM nor a
                REINFO player.
        """
        current_player = self.my_game.active_player
        player_type = self.my_game.ai_player[current_player]

        if "RANDOM" in player_type:
            return self.my_game.getRandomOption_()

        if "REINFO" in player_type:
            # State of the active player plus the options it may choose from.
            _, state, options = self.my_game.getState()
            policy_output = self.witchesPolicy(
                torch.tensor(state).float(), torch.tensor(options)
            )
            # Column 0 is the absolute action index; convert it to an index
            # into the player's hand.
            action_idx = int(policy_output[:, 0])
            seat = self.my_game.players[current_player]
            chosen_card = seat.getIndexOfCard(action_idx)
            return seat.specificIndexHand(chosen_card)

        raise ValueError("unsupported player type: {!r}".format(player_type))

    def play(self, num_batches=200, games_per_batch=100, warmup_batches=100):
        """Play batches of games, updating the policy after every game.

        Rewards of batches whose index is greater than `warmup_batches` are
        summed per player and printed at the end.

        Args:
            num_batches: number of batches to play.
            games_per_batch: number of games in each batch.
            warmup_batches: batches with an index up to and including this
                value are not counted in the printed totals.
        """
        total_points = [0, 0, 0, 0]
        for batch in range(num_batches):
            games_done = 0
            while games_done < games_per_batch:
                action = self.selectAction()
                current_player = self.my_game.active_player
                played_card = self.my_game.players[current_player].hand[action]
                print("[{}] {} {}\t{}\tCard {}\tHand Index {}".format(self.my_game.current_round, current_player, self.my_game.names_player[current_player], self.my_game.ai_player[current_player], played_card, action))
                rewards, round_finished = self.my_game.step_idx(action, auto_shift=False)
                if not round_finished:
                    continue

                # Index 1 is the seat of the reinforcement player.
                self.notifyTrick(rewards[1])
                print("Update rewards: ", rewards, "\n")

                # NOTE(review): the game is treated as finished once a trick
                # is complete and the last player's hand is empty - confirm
                # this nesting against the full source.
                if len(self.my_game.players[current_player].hand) == 0:
                    print("update policy at end of one game!")
                    self.witchesPolicy.updatePolicy()
                    stdout.enable()
                    if games_done == games_per_batch - 1:
                        print("game finished with:::", self.my_game.total_rewards, "\n")
                    stdout.disable()
                    self.my_game.reset_game()
                    games_done += 1

            if batch > warmup_batches:
                total_points = [
                    total + earned
                    for total, earned in zip(total_points, self.my_game.total_rewards)
                ]
            self.my_game.total_rewards = [0, 0, 0, 0]

        # NOTE(review): assumed to run once after all batches - confirm.
        stdout.enable()
        print(total_points)
if __name__ == "__main__":
    # Script entry point: build the trainer and run the training loop.
    TestReinforce().play()