Problem with the gradients

Hello there,

I just started using PyTorch, but I'm having some problems with my gradients. Hopefully you can help me out :slight_smile: I'm trying to train the networks below, but when I call the backward function it always prints None, and the parameters of my network don't seem to change at all (it keeps predicting the same thing). Any idea what the problem is?

class Variable(autograd.Variable):
    def __init__(self, data, *args, **kwargs):
        if USE_CUDA:
            data = data.cuda()
        super(Variable, self).__init__(data, *args, **kwargs)

def init_fanin(tensor):
    fanin = tensor.size(1)
    v = 1.0 / np.sqrt(fanin)
    init.uniform(tensor, -v, v)

class Actor(nn.Module):
    def __init__(self, num_feature, num_action):
        """
        Initialize an Actor for a low dimensional environment.
        num_feature: number of features of the input.
        num_action: number of available actions in the environment.
        """
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(num_feature, 400)
        init_fanin(self.fc1.weight)
        self.fc2 = nn.Linear(400, 300)
        init_fanin(self.fc2.weight)
        self.fc3 = nn.Linear(300, num_action)
        init.uniform(self.fc3.weight, -3e-3, 3e-3)
        init.uniform(self.fc3.bias, -3e-3, 3e-3)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.tanh(self.fc3(x))
        return x

class Critic(nn.Module):
    def __init__(self, num_feature, num_action):
        """
        Initialize a Critic for a low dimensional environment.
        num_feature: number of features of the input.
        num_action: number of available actions in the environment.
        """
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(num_feature, 400)
        init_fanin(self.fc1.weight)
        # Actions were not included until the 2nd hidden layer of Q.
        self.fc2 = nn.Linear(400 + num_action, 300)
        init_fanin(self.fc2.weight)
        self.fc3 = nn.Linear(300, 1)
        init.uniform(self.fc3.weight, -3e-3, 3e-3)
        init.uniform(self.fc3.bias, -3e-3, 3e-3)

    def forward(self, states, actions):
        x = F.relu(self.fc1(states))
        # Actions were not included until the 2nd hidden layer of Q.
        x = torch.cat((x, actions), 1)
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

def train(self):
    batch_X = np.zeros((self.batch_size, self.number_of_features))
    batch_reward = np.zeros((self.batch_size, 1))
    batch_actions = np.zeros((self.batch_size, len(self.action_space)))
    batch_next = np.zeros((self.batch_size, self.number_of_features))
    mask = np.ones(self.batch_size)
    for n in range(self.batch_size):
        train_state, train_action, train_reward, train_newState, check = self.replay_memory[np.random.randint(0, len(self.replay_memory))]
        batch_X[n] = train_state
        batch_reward[n] = train_reward
        batch_actions[n] = train_action
        batch_next[n] = train_newState
        if check == True:
            mask[n] = 0

    state_batch = Variable(torch.from_numpy(batch_X).type(torch.FloatTensor))
    action_batch = Variable(torch.from_numpy(batch_actions).type(torch.FloatTensor))
    reward_batch = Variable(torch.cat(torch.from_numpy(batch_reward).type(torch.FloatTensor)))
    next_state_batch = Variable(torch.from_numpy(batch_next).type(torch.FloatTensor))
    not_done_mask = Variable(torch.from_numpy(mask).type(torch.FloatTensor))
   
    ### Critic ###
    # Compute current Q value; the critic takes the state and the chosen action
    current_Q_values = self.critic(state_batch, action_batch)

    # Compute next Q value based on which action target actor would choose
    # Detach the variable from the current graph since we don't want gradients for next Q to be propagated
    target_actions = self.target_actor(state_batch)

    next_max_q = self.target_critic(next_state_batch, target_actions).detach().max(1)[0]
    next_Q_values = not_done_mask * next_max_q

    # Compute the target of the current Q values
    target_Q_values = reward_batch + (self.gamma * next_Q_values)
    # Compute Bellman error (using Huber loss)
    critic_loss = F.smooth_l1_loss(current_Q_values, target_Q_values)
    # Optimize the critic
    print(self.critic_optimizer)
    print(self.critic_optimizer.zero_grad()) ## ALWAYS NONE?
    sys.exit()
    critic_loss.backward()
    self.critic_optimizer.step()



    actor_loss = -self.critic(state_batch, self.actor(state_batch)).mean()
    # Optimize the actor
    self.actor_optimizer.zero_grad()
    print(actor_loss.backward())
    print(actor_loss.grad)

    self.actor_optimizer.step()

    # Update the target networks
    self.update_target(self.target_critic, self.critic)
    self.update_target(self.target_actor, self.actor)
    return critic_loss

Format your code first.

Please format your code as @jdhao commented above.

Also, self.critic_optimizer.zero_grad() just clears all the accumulated gradients, so it is not supposed to return anything, i.e. it returns None.
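
The same applies to loss.backward(): it returns None and instead accumulates the gradients into the .grad attribute of each leaf parameter, which is why print(actor_loss.backward()) always shows None (and actor_loss.grad stays None too, since .grad is only retained for leaf tensors such as the network parameters). A minimal sketch of how you could verify that gradients are actually being computed, using a toy model just for illustration (written against a recent PyTorch version; with the old Variable API the idea is the same):

import torch
import torch.nn as nn

# Toy model and loss, only to show where the gradients end up.
model = nn.Linear(4, 2)
x = torch.randn(8, 4)
loss = model(x).sum()

ret = loss.backward()  # backward() returns None ...
print(ret)             # -> None

# ... but it fills the .grad attribute of every parameter that requires grad.
for name, param in model.named_parameters():
    print(name, param.grad.norm())

If the .grad values are filled in here but your real network still doesn't learn, it is also worth double checking that the optimizer's step() is actually reached; in the posted snippet, sys.exit() stops train() before critic_loss.backward() ever runs.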
