I am using PyTorch to build an A3C agent with 4 worker processes, as in the code below.
To my surprise, during training the action values become NaN for all actions. This was not the case initially; the values only turn to NaN after an overnight training run. Can someone please help me figure out what the issue is?
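For debugging, this is the minimal sketch I plan to use to localize where the NaNs first appear (autograd anomaly detection plus a finiteness assert on the model outputs); it is not part of the training code below:

import torch

# report the op that produced the first NaN/Inf during backward()
torch.autograd.set_detect_anomaly(True)

def assert_finite(name, tensor):
    # fail fast as soon as a non-finite value shows up
    if not torch.isfinite(tensor).all():
        raise RuntimeError('%s contains NaN/Inf: %s' % (name, tensor))

# inside the rollout loop I would call, for example:
# assert_finite('action_values', action_values)
# assert_finite('value', value)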
class SharedAdam(torch.optim.Adam):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.99), eps=1e-8,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps,
                                         weight_decay=weight_decay)
        # State initialization
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = 0
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)
                # share in memory
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()
class ActorCritic(torch.nn.Module):
    def __init__(self, num_inputs, action_space):
        super(ActorCritic, self).__init__()
        self.num_inputs = num_inputs
        self.action_space = action_space
        self.lstm = nn.LSTMCell(num_inputs, num_inputs)
        num_outputs = action_space
        self.fc1 = nn.Linear(num_inputs, 256)
        self.fc1.apply(init_weights)
        self.fc2 = nn.Linear(256, 256)
        self.fc2.apply(init_weights)
        self.critic_linear = nn.Linear(256, 1)
        self.critic_linear.apply(init_weights)
        self.actor_linear = nn.Linear(256, num_outputs)
        self.actor_linear.apply(init_weights)
        self.lstm.bias_ih.data.fill_(0)
        self.lstm.bias_hh.data.fill_(0)
        self.sig1 = nn.Sigmoid()
        self.train()

    def forward(self, inputs):
        inputs, (hx, cx) = inputs
        hx, cx = self.lstm(inputs, (hx, cx))
        x = self.sig1(self.fc1(hx))
        x = torch.tanh(self.fc2(x))
        # critic value, actor logits, and the updated LSTM state
        return self.critic_linear(x), self.actor_linear(x), (hx, cx)

    def save(self, filename, directory):
        torch.save(self.state_dict(), '%s/%s_actor.pth' % (directory, filename))

    def load(self, filename, directory):
        self.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
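For reference, this is how I call the model; a quick shape check with dummy inputs (the dimensions 10 and 3 here are just example values, and init_weights is my own weight initializer defined elsewhere):

model = ActorCritic(num_inputs=10, action_space=3)
state = torch.zeros(1, 10)
hx, cx = torch.zeros(1, 10), torch.zeros(1, 10)
value, action_values, (hx, cx) = model((state, (hx, cx)))
print(value.shape)          # torch.Size([1, 1])
print(action_values.shape)  # torch.Size([1, 3])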
Below is the code for training:
def train(rank, params, model, optimizer, data):
    try:
        data = data.dropna()
        count = 0
        data = torch.DoubleTensor(np.asarray(data))
        env = ENV(params.state_dim, params.action_dim, data)
        print("env created\n")
        # init training variables
        max_timesteps = data.shape[0] - 1
        state = env.reset()
        done = True
        episode_length = 0
        count = 0
        while count < max_timesteps - 1:
            episode_length += 1
            if done:
                # start of a new episode: reset the LSTM state
                cx = Variable(torch.zeros(1, params.state_dim))
                hx = Variable(torch.zeros(1, params.state_dim))
            else:
                # otherwise detach the LSTM state from the previous rollout
                cx = Variable(cx.data)
                hx = Variable(hx.data)
            values = []
            log_probs = []
            rewards = []
            entropies = []
            # collect a rollout
            while count < max_timesteps - 1:
                value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
                prob = F.softmax(action_values, dim=-1)
                log_prob = F.log_softmax(action_values, dim=-1).reshape(-1,)
                entropy = -(log_prob * prob).sum(1, keepdim=True)
                entropies.append(entropy)
                action = sample(prob)
                log_prob = log_prob.gather(0, Variable(action))
                state, reward, done = env.step(action)
                done = (done or count == max_timesteps - 2)
                reward = max(min(reward, 1), -1)
                count += 1
                if done:
                    episode_length = 0
                    state = env.reset()
                values.append(value)
                log_probs.append(log_prob)
                rewards.append(reward)
                print(ticker, "rank ", rank, " action:", action, "reward ", reward)
                if done:
                    break
            # bootstrap the return from the critic if the episode did not end
            R = torch.zeros(1, 1)
            if not done:
                value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
                R = value.data
            values.append(Variable(R))
            policy_loss = 0
            value_loss = 0
            R = Variable(R)
            gae = torch.zeros(1, 1)
            for i in reversed(range(len(rewards))):
                R = params.gamma * R + rewards[i]
                advantage = R - values[i]
                value_loss = value_loss + 0.5 * advantage.pow(2)
                # generalized advantage estimation for the policy gradient
                TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data
                gae = gae * params.gamma * params.tau + TD
                policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i]
            optimizer.zero_grad()
            (policy_loss + 0.5 * value_loss).backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 40)
            optimizer.step()
    except:
        traceback.print_exc()
Below is the code for sampling an action:
def sample(logits):
    # add Gumbel noise to the given scores and take the argmax over actions
    noise = torch.rand(logits.shape)
    return torch.argmax(logits - torch.log(-torch.log(noise)), 1)
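For comparison, the same draw could also be made with torch.distributions.Categorical directly on the softmax output; this is only a sketch of an alternative I could switch to while narrowing the problem down, not what the run above used:

def sample_categorical(prob):
    # prob is the softmax output of shape (1, num_actions);
    # Categorical draws an action index from those probabilities
    dist = torch.distributions.Categorical(probs=prob)
    return dist.sample()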