Hi,
I am trying to extract features from time-series data with a window size of 50. However, after some A3C training, the outputs of nn.Conv1d turn into NaNs.
Can someone please help me figure out what I am missing here?
Below is my code for the actor-critic model:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ActorCritic(nn.Module):
    def __init__(self, params):
        super(ActorCritic, self).__init__()
        self.window = params.window
        self.state_dim = params.state_dim
        self.action_space = params.action_dim
        self.hidden_size = params.hidden_size
        # two Conv1d layers over the raw window (1 input channel)
        self.conv1 = nn.Conv1d(1, params.out_channels, params.kernel_size1,
                               stride=params.kernel_size1 - 1, padding=params.kernel_size1 - 2)
        self.conv2 = nn.Conv1d(params.out_channels, params.out_channels, params.kernel_size2,
                               stride=params.kernel_size2 - 1, padding=params.kernel_size2 - 2)
        # flattened conv output size plus the 3 extra state features
        self.lstm_size = self.calc_inputs(params.window)
        lstm_size = self.lstm_size
        params.lstm_size = lstm_size
        self.lstm = nn.LSTMCell(lstm_size, lstm_size)
        self.lstm.bias_ih.data.fill_(0)
        self.lstm.bias_hh.data.fill_(0)
        lst = [lstm_size]
        for i in range(params.layers):
            lst.append(params.hidden_size)
        self.hidden = nn.ModuleList()
        for k in range(len(lst) - 1):
            self.hidden.append(nn.Linear(lst[k], lst[k + 1]))
        for layer in self.hidden:
            layer.apply(init_weights)
        self.critic_linear = nn.Linear(lst[-1], 1)
        self.critic_linear.apply(init_weights)
        self.actor_linear = nn.Linear(lst[-1], self.action_space)
        self.actor_linear.apply(init_weights)
        self.train()

    def forward(self, inputs):
        # state[1]: the window as a (1, 1, window) tensor fed to the convs
        # state[0]: the remaining 3 state features, concatenated with the conv output
        (state), (hx, cx) = inputs
        conv1_out = self.conv1(state[1])
        conv2_out = self.conv2(conv1_out).reshape(1, -1)
        inputs = torch.cat([state[0], conv2_out], dim=1)
        # print(inputs)
        hx, cx = self.lstm(inputs, (hx, cx))
        x = hx
        for layer in self.hidden:
            x = F.elu(layer(x))
        return self.critic_linear(x), self.actor_linear(x), (hx, cx)

    def calc_inputs(self, num_inputs):
        # run a dummy window through the convs to get the flattened feature size
        a = torch.rand(num_inputs).reshape(1, 1, -1)
        a = self.conv2(self.conv1(a)).reshape(-1,)
        return a.shape[0] + 3

    def save(self, filename, directory):
        torch.save(self.state_dict(), '%s/%s_actor.pth' % (directory, filename))

    def load(self, filename, directory):
        self.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
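(`init_weights` is just a small helper applied to the Linear layers; roughly something like the sketch below, with Xavier weights and zero biases, though the exact details may differ.)

    def init_weights(m):
        # assumed stand-in: Xavier-init weights and zero biases on Linear layers
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            nn.init.zeros_(m.bias)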
And here is the training code:
state = env.reset()
count = 0
done = True
info = False
while info != True:
    values = []
    log_probs = []
    rewards = []
    entropies = []
    # collect an n-step rollout
    for step in range(params.num_steps):
        if info == True:
            break
        if done:
            h_out = (Variable(torch.zeros([1, params.lstm_size])),
                     Variable(torch.zeros([1, params.lstm_size])))
            state = env.reset()
        else:
            h_out = (Variable(h_out[0].data), Variable(h_out[1].data))
        h_in = h_out
        value, action_values, h_out = model(((state), h_in))
        action_values = action_values.reshape(-1,)
        prob = F.softmax(action_values - max(action_values), dim=0)
        log_prob = torch.log(prob)
        entropy = -(log_prob * prob).sum()
        entropies.append(entropy)
        action = epsilon_greedy(prob, epsilon)
        # action = Categorical(prob).sample().reshape(-1,)
        log_prob_a = log_prob.gather(0, Variable(action))
        # print("action_values:", action_values)
        # print("prob:", prob)
        # print("log_prob:", log_prob)
        # print("action:", action, "log_prob_a:", log_prob_a)
        (state), reward, done, info, _ = env.step(action)
        reward = max(min(reward, 1), -1)
        count += 1
        if done:
            (state) = env.reset()
        values.append(value)
        log_probs.append(log_prob_a)
        rewards.append(reward)
        if done:
            break

    # bootstrap the return from the critic if the episode did not end
    R = torch.zeros(1, 1)
    if not done:
        value, _, _ = model(((state), h_out))
        R = value.data
    values.append(Variable(R))

    # GAE-based policy loss and value loss over the rollout
    policy_loss = 0
    value_loss = 0
    R = Variable(R)
    gae = torch.zeros(1, 1).to(device)
    for i in reversed(range(len(rewards))):
        R = params.gamma * R + rewards[i]
        advantage = R - values[i]
        value_loss = value_loss + 0.5 * advantage.pow(2)
        TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data
        gae = gae * params.gamma * params.tau + TD
        policy_loss = policy_loss - log_probs[i] * gae - 0.01 * entropies[i]

    optimizer.zero_grad()
    (policy_loss + 0.5 * value_loss).mean().backward()
    nn.utils.clip_grad_norm_(model.parameters(), 40)
    ensure_shared_grads(model, shared_model)
    optimizer.step()

model.save(filename, directory="./pytorch_models")
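To narrow down where the NaNs first show up, this is roughly what I can add for debugging (a minimal sketch reusing `model` from the code above; `set_detect_anomaly` and forward hooks are standard PyTorch, `nan_hook` is just a name I made up):

    # make backward() report the op that produced a NaN gradient
    torch.autograd.set_detect_anomaly(True)

    # forward hook that flags the first conv whose output contains a NaN
    def nan_hook(module, inp, out):
        if torch.isnan(out).any():
            print("NaN in output of", module.__class__.__name__)

    model.conv1.register_forward_hook(nan_hook)
    model.conv2.register_forward_hook(nan_hook)

    # after each optimizer.step(), check whether the weights themselves are NaN
    for name, p in model.named_parameters():
        if torch.isnan(p).any():
            print("NaN in parameter:", name)

With this in place I can at least tell whether the conv weights blow up after an update or whether a bad input/gradient is coming from somewhere else.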