import concurrent.futures

import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

lenobs = 100800  # flattened Pong-v0 frame: 210 * 160 * 3

class ActorCritic(nn.Module):
    def __init__(self, ran):
        super(ActorCritic, self).__init__()
        torch.random.manual_seed(ran)
        self.l1 = nn.Linear(lenobs, 25)
        self.l2 = nn.Linear(25, 50)
        self.actor_lin1 = nn.Linear(50, 6)   # actor head: 6 Pong actions
        self.l3 = nn.Linear(50, 25)
        self.critic_lin1 = nn.Linear(25, 1)  # critic head: state value

    def forward(self, x):
        x = F.normalize(x, dim=0)
        y = F.relu(self.l1(x))
        y = F.normalize(y, dim=0)
        y = F.relu(self.l2(y))
        y = F.normalize(y, dim=0)
        actor = F.log_softmax(self.actor_lin1(y), dim=0)
        c = F.relu(self.l3(y.detach()))      # critic branch cut off from the shared trunk
        critic = F.hardtanh(self.critic_lin1(c))
        return actor, critic
def doTrain(model, ran):
    env = gym.make('Pong-v0')
    mi = model(ran)
    optimizer = optim.Adam(lr=1e-4, params=mi.parameters())
    values, rewards, logprobs = [], [], []
    observation = env.reset()
    done = False
    N = 0
    # Roll out at most 10 steps in the environment.
    while not done and N < 10:
        N += 1
        pobservation = torch.from_numpy(observation)
        flattened_pobservation = pobservation.view(-1).float()
        policy, value = mi(flattened_pobservation)
        values.append(value.item())
        sampler = Categorical(policy)
        action = sampler.sample()
        logprobs.append(policy[action.item()].item())
        observation, reward, done, log = env.step(action.item())
        if done:
            rewards.append(1.0)
        else:
            rewards.append(reward)
    torch_values = torch.Tensor(values).view(-1)
    torch_rewards = torch.Tensor(rewards)
    torch_logprobs = torch.Tensor(logprobs)  # .flip(0)
    # Discounted returns.
    returns = []
    gamma = 0.90
    clc = 0.1
    ret = torch.Tensor([0])
    for r in torch_rewards:
        ret = r + gamma * ret
        returns.append(ret)
    returns = torch.tensor(returns, requires_grad=True).view(-1)
    returns = F.normalize(returns, dim=0)
    # Actor-critic loss.
    actor_loss = -1 * torch_logprobs * (returns - torch_values.detach())
    critic_loss = torch.pow(torch_values - returns, 2)
    loss = actor_loss.sum() + clc * critic_loss.sum()
    optimizer.zero_grad()
    loss.backward()
    # Collect the gradients to inspect them after training.
    gradients = []
    for i in mi.parameters():
        try:
            gradients.append(i.grad)
        except:
            gradients.append('No Grad')
    optimizer.step()
    return gradients
updatedParams = []
results = []
with concurrent.futures.ProcessPoolExecutor() as executor:
    for i in range(5):
        results.append(executor.submit(doTrain, ActorCritic, int((torch.randn(1)**2)*200)))
    for f in concurrent.futures.as_completed(results):
        updatedParams.append(f.result())

updatedParams
[[None, None, None, None, None, None, None, None, None, None],
[None, None, None, None, None, None, None, None, None, None],
[None, None, None, None, None, None, None, None, None, None],
[None, None, None, None, None, None, None, None, None, None],
[None, None, None, None, None, None, None, None, None, None]]
I am trying to implement A2C for Pong.
I can't figure out why I am getting None for the gradient values. Have I broken the computation graph somewhere?
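For reference, this is the kind of break I suspect. A minimal standalone sketch (a toy nn.Linear, not my actual model): converting an output to a Python float with .item() and rebuilding a tensor from it leaves the layer's parameters with grad=None after backward, which looks a lot like what I am seeing.

import torch
import torch.nn as nn

layer = nn.Linear(4, 1)          # stand-in for the model
x = torch.randn(4)

# Going through .item() yields a plain Python float, so the tensor built
# from it has no connection to layer's parameters.
out_value = layer(x).item()
loss = torch.tensor([out_value], requires_grad=True).sum()
loss.backward()
print(layer.weight.grad)         # None

# Keeping everything as tensors preserves the graph.
loss2 = layer(x).sum()
loss2.backward()
print(layer.weight.grad)         # an actual gradient tensor

Is this the same thing that is happening in my doTrain above?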