My PyTorch version is 0.3.
When I train an LSTM with a reinforcement learning method (A2C), the input at one time step is the output of the previous time step. However, some entries of the previous step's output are irrelevant, so I have to modify the output variable. I have tried two ways to implement this. The first uses in-place operations, like the following code:
def forward(self, x):
    hidden, cell = self.init_hidden()
    outputs = []
    for i in range(1, self.seq_len + 1):
        hidden, cell = self.lstmcell(x, (hidden, cell))
        output = F.sigmoid(self.classfier(hidden))
        if i < self.seq_len:
            output = output.clone()  # clone before the in-place write so autograd keeps the original
            output[:, i:] = 0
        output = output.clamp(1e-15, 1 - 1e-15)
        outputs.append(output)
        x = output
    outputs = torch.cat(outputs, dim=0)
    states = (hidden, cell)
    return outputs, states
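My understanding of why the clone() is needed, as a minimal standalone sketch (made-up tensors, not my model): sigmoid saves its own output for backward, so an in-place write into it raises an autograd error unless I copy first.

    import torch
    import torch.nn.functional as F
    from torch.autograd import Variable

    y = Variable(torch.randn(2, 4), requires_grad=True)
    out = F.sigmoid(y)       # sigmoid's backward reuses its own output...
    out = out.clone()        # ...so clone before writing into it in place
    out[:, 2:] = 0           # the write only touches the clone
    out.sum().backward()
    print(y.grad)            # nonzero grad in columns 0-1, zero in columns 2-3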
The other way uses a multiplicative mask instead:
def forward(self, x):
    hidden, cell = self.init_hidden()
    outputs = []
    for i in range(1, self.seq_len + 1):
        hidden, cell = self.lstmcell(x, (hidden, cell))
        output = F.sigmoid(self.classfier(hidden))
        if i < self.seq_len:
            mask = torch.cat([torch.ones(output.size(0), i),
                              torch.zeros(output.size(0), self.seq_len - i)], dim=1)
            mask = Variable(mask.cuda())
            output = output * mask
        output = output.clamp(1e-15, 1 - 1e-15)
        outputs.append(output)
        x = output
    outputs = torch.cat(outputs, dim=0)
    states = (hidden, cell)
    return outputs, states
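For concreteness, what the mask looks like (made-up sizes): at step i = 2 with seq_len = 4, every row is [1, 1, 0, 0], so the multiplication zeroes the same positions as the in-place version's output[:, i:] = 0.

    import torch

    i, seq_len, batch = 2, 4, 3   # made-up sizes for illustration
    mask = torch.cat([torch.ones(batch, i),
                      torch.zeros(batch, seq_len - i)], dim=1)
    print(mask)                   # each row: [1, 1, 0, 0]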
The code runs normally, but after a few tens of epochs of training the gradient becomes NaN, which then turns all parameters of the LSTM into NaN. By debugging I have confirmed that the forward results and the loss are still normal right before the gradient becomes NaN.
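I checked this with gradient hooks of roughly the following shape (check_grad is just an illustrative name for this sketch; NaN is the only value that compares unequal to itself):

    def check_grad(name):
        # Illustrative helper: attach with variable.register_hook(check_grad('output'))
        # inside forward() to see where a NaN gradient first shows up.
        def hook(grad):
            if (grad != grad).data.sum() > 0:   # only NaN satisfies x != x
                print('NaN gradient flowing through', name)
        return hook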
Of course, I have tried gradient clipping:

    torch.nn.utils.clip_grad_norm(agent.parameters(), 0.25)

but it does not seem to help.
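As far as I understand, clipping cannot repair a NaN that has already appeared during backward(); a tiny made-up reproduction:

    import torch
    from torch.autograd import Variable

    p = Variable(torch.ones(3), requires_grad=True)
    out = torch.sqrt(p * 0).sum()   # d/dp sqrt(0*p) = inf * 0 = nan
    out.backward()
    print(p.grad)                   # nan everywhere
    torch.nn.utils.clip_grad_norm([p], 0.25)
    print(p.grad)                   # still nan after clipping

The complete code is shown here: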
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class Agent(nn.Module):
    def __init__(self, batch_size, seq_len, input_size, hidden_size, num_class):
        super(Agent, self).__init__()
        self.batch_size = batch_size
        self.seq_len = seq_len
        self.input_size = input_size
        self.cell_size = self.hidden_size = hidden_size
        self.lstmcell = nn.LSTMCell(self.input_size, self.hidden_size)
        self.classfier = nn.Linear(self.hidden_size, num_class)

    def init_hidden(self):
        return (Variable(torch.zeros(self.batch_size, self.hidden_size).cuda()),
                Variable(torch.zeros(self.batch_size, self.cell_size).cuda()))

    def forward(self, x):
        hidden, cell = self.init_hidden()
        outputs = []
        for i in range(1, self.seq_len + 1):
            hidden, cell = self.lstmcell(x, (hidden, cell))
            output = F.sigmoid(self.classfier(hidden))
            if i < self.seq_len:
                # output = output.clone()  # the in-place variant must use clone()
                # output[:, i:] = 0
                mask = torch.cat([torch.ones(output.size(0), i),
                                  torch.zeros(output.size(0), self.seq_len - i)], dim=1)
                mask = Variable(mask.cuda())
                output = output * mask
            output = output.clamp(1e-15, 1 - 1e-15)
            outputs.append(output)
            x = output
        outputs = torch.cat(outputs, dim=0)
        states = (hidden, cell)
        return outputs, states
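A quick smoke test of the module above (made-up sizes; it needs a GPU because the module hard-codes .cuda(), and num_class, seq_len, and input_size must all match since the output is fed back as the next input):

    agent = Agent(batch_size=1, seq_len=4, input_size=4,
                  hidden_size=8, num_class=4).cuda()
    x = Variable(torch.zeros(1, 4).cuda())
    outputs, (h, c) = agent(x)
    print(outputs.size())   # (seq_len * batch, num_class) from the cat along dim 0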
def train(trainloader, epoch, num_blocks, dnet, agent, agent_optimizer, value_network, value_network_optimizer, block_config, growth_rate):
    agent.train()
    value_network.train()
    dnet.eval()
    layer_num = sum(block_config)
    for batch_idx, (inputs, targets) in tqdm.tqdm(enumerate(trainloader), total=len(trainloader)):
        inputs, targets = Variable(inputs.cuda()), Variable(targets.cuda(async=True))
        empty_embedding = Variable(torch.zeros(1, sum(block_config)).cuda())
        # hook = agent.output.register_hook(agent.print_grad)
        probs, _ = agent(empty_embedding)
        distribution = Bernoulli(probs)
        policy = distribution.sample()
        if args.cl_step < num_blocks:
            for i in range(layer_num - args.cl_step):
                policy[i, 0:(i + 1)] = 1  # in order to calculate the reward
            policy_mask = Variable(torch.ones(policy.size(0)).cuda())
            policy_mask[:-args.cl_step] = 0  # in order to calculate the loss
        else:
            policy_mask = None
        preds = dnet.forward(inputs, policy.data.cpu().numpy())
        qs, match = get_reward(preds, targets, policy.data, growth_rate, block_config)
        qs = Variable(qs)
        v_inputs = Variable(empty_embedding.data)
        vs = value_network(v_inputs).detach()
        advantage = qs - vs
        # agent loss
        loss = -distribution.log_prob(policy)
        loss = loss.sum(dim=1)
        if policy_mask is not None:
            loss = policy_mask.type(torch.cuda.FloatTensor) * loss
        loss = loss.sum() * advantage.sum()
        entropy_loss = -probs * torch.log(probs)
        entropy_loss = 0.01 * args.beta * entropy_loss.sum()
        loss = loss / inputs.size(0) - entropy_loss
        # train the agent
        agent_optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm(agent.parameters(), 0.25)
        agent_optimizer.step()
        # value network loss
        criterion = nn.MSELoss().cuda()
        values = value_network(Variable(empty_embedding.data))
        values = values.repeat(inputs.size(0), 1)
        target_values = qs
        value_network_loss = criterion(values, target_values)
        # train the value network
        value_network_optimizer.zero_grad()
        value_network_loss.backward()
        value_network_optimizer.step()
I have been stuck on this problem for a few days. I hope someone can help me. Thanks very much!