First, let me briefly describe what I am trying to achieve:
I have two nested FOR loops. The outer loop updates lambda; for each value of lambda, the inner loop trains a network to minimize the loss.
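In a minimal, runnable form, the structure I am after looks like the sketch below (a toy network and a dummy "violation" stand in for my actual objective; none of these names appear in my real code):

import torch

# Toy stand-in so the two-loop structure is runnable on its own.
net = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(net.parameters(), lr=0.01)
lam, c = 0.0, 2.0  # multiplier and penalty weight

for outer_step in range(3):  # outer loop: update lambda
    for inner_step in range(5):  # inner loop: fit the network for this lambda
        x = torch.randn(8, 4)
        violation = net(x).mean()  # placeholder for the constraint violation
        loss = lam * violation + (c / 2) * violation ** 2
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    lam = lam + c * float(violation)  # multiplier update, detached from the graph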
Here is my actual code:
import torch
import numpy.random as npr
import torch.nn as nn
from models.nn_model import Network
from torch.utils.tensorboard import SummaryWriter
from examples.wumpus import paras
from examples.wumpus.robot_mdp import RobotMDP
from examples.wumpus.wumpus_mdp import WumpusMDP
from examples.wumpus.wumpus_pomdp import WumpusGridwrold
from examples.wumpus.product_wumpus_pomdp import ProductWumpusPOMDP
from examples.wumpus.wumpus_pomdp import *
import pickle
from common import *
def detachToNumpy(tensor):
    return tensor.to(torch.device('cpu')).detach().numpy()
class UBVOModel():
    def e_vio(self, trajs):
        f = torch.zeros(len(trajs))
        for i in range(len(trajs)):
            f[i] = self.traj_v(trajs[i])
        return torch.mean(f)

    def traj_v(self, traj):
        h_g = torch.zeros(len(traj) - 1)
        for i in range(len(traj) - 1):
            y = traj[i][-1]
            ny = traj[i + 1][-1]
            h_g[i] = self.step_v(y, ny)
        return torch.sum(h_g)
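
    # step_v: squared positive part of the one-step soft Bellman residual,
    # g = mu * log(sum_a exp((r(y, a) + V(ny)) / mu)) - V(y).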
    def step_v(self, y, ny):
        f = torch.zeros(len(self.env.action_set))
        for i in range(len(self.env.action_set)):
            a = list(self.env.action_set.keys())[i]
            f[i] = torch.exp((self.env.product_belief_reward(y, a) + self.value.getValue(ny)) / self.value.mu)
        g = self.value.mu * torch.log(torch.sum(f)) - self.value.getValue(y)
        return torch.max(g, torch.Tensor([0])) ** 2
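
    # traj_l: per-step augmented-Lagrangian term,
    # V(y) + lam * g + (c / 2) * g ** 2, summed over the trajectory.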
    def traj_l(self, traj):
        l = torch.zeros(len(traj))
        for i in range(len(traj) - 1):  # for each state in the simulated history
            y = traj[i][-1]
            ny = traj[i + 1][-1]
            step_v = self.step_v(y, ny)
            l[i] = self.value.getValue(y) + self.value.lam * step_v + self.value.c / 2 * torch.abs(step_v) ** 2
        return torch.sum(l)
    def entire_l(self, trajs):
        l = torch.zeros(len(trajs))
        for i in range(len(trajs)):
            l[i] = self.traj_l(trajs[i])  # for the ith history
        return torch.sum(l)
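
    # train: the inner loop fits the network for a fixed (lam, c); the outer
    # loop then increases the penalty c and updates the multiplier lam.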
    def train(self, env):
        torch.autograd.set_detect_anomaly(True)
        self.env = env
        # set hyperparameters
        self.num_traj = paras.num_traj  # the number of trajectories
        self.max_time = paras.max_time
        self.max_inner_iter = paras.max_inner_iter
        self.max_outer_iter = paras.max_outer_iter
        self.beta = paras.beta
        self.eta = paras.eta
        # Create the network
        input_size = 66  # 64 inputs for b, 1 input for q, 1 input for a
        hidden_size = 128
        output_size = 1
        net = Network(input_size, hidden_size, output_size)
        # create a value obj
        self.value = PytorchValue(net, env)
        self.value.mu = paras.mu  # temperature
        self.value.lam = paras.lam
        self.value.c = paras.c
        # print the parameters' shapes
        for name, param in self.value.model.named_parameters():
            print(name, '\t\t', param.shape)
        OPTIMIZER_CONSTRUCTOR = torch.optim.SGD  # This is the SGD algorithm.
        ### TensorBoard Writer Setup ###
        log_name = str(self.eta) + str(OPTIMIZER_CONSTRUCTOR.__name__)
        writer = SummaryWriter(log_dir="../logs/" + log_name)
        print("To see tensorboard, run: tensorboard --logdir=logs/")
        # add the model to the tensorboard
        x = torch.randn(1, input_size)
        writer.add_graph(net, x)
        # Create the optimizer
        optimizer = OPTIMIZER_CONSTRUCTOR(self.value.model.parameters(), lr=self.eta)
        # Create the learning rate scheduler
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=paras.max_inner_iter, gamma=0.9)
        outer_step = 0
        epochs = 0
        e_v = np.Inf
        while outer_step < self.max_outer_iter:
            o_e_v = e_v  # keep the expectation from the previous round
            inner_step = 0
            print("-------------------------------------------------------------------------------------------")
            print("Outer iteration", '\t\t', outer_step, '\t\t', 'C', '\t\t', self.value.c, '\t\t', 'lambda', '\t\t',
                  self.value.lam)
            while inner_step < self.max_inner_iter:
                epochs += 1
                inner_step += 1
                # sample some trajectories
                trajs = list()
                for i in range(self.num_traj):
                    y = self.env.sample_belief()
                    trajs.append(self.env.sample_trajectory(y, self.value, self.max_time))
                # compute the loss
                loss = self.entire_l(trajs)
                loss.backward(retain_graph=True)
                # check if the weights are updated
                a = list(self.value.model.parameters())[0].clone()
                with torch.no_grad():
                    optimizer.zero_grad()
                    optimizer.step()
                b = list(self.value.model.parameters())[0].clone()
                print("Weights Updated: ", not torch.equal(a.data, b.data))
                # compute the expectation of h of g
                e_v = self.e_vio(trajs)
                print('Inner iteration', '\t\t', inner_step, '\t\t', 'loss:', loss.data, '\t\t', 'e_v:', e_v)
                writer.add_scalar('Expected violation', e_v, global_step=epochs)
                writer.add_scalar('L', loss, global_step=epochs)
                writer.add_scalar('Value of' + str((self.env.b0, 0)), self.value.getValue((self.env.b0, 0)),
                                  global_step=epochs)
                # Decay Learning Rate
                scheduler.step()
            # if the expectation has not decreased, increase the penalty term
            if abs(e_v) > 0.9 * abs(o_e_v):
                self.value.c = self.beta * self.value.c  # c is increasing to infinity
            else:
                self.value.c = self.value.c
            # update the slack variable
            self.value.lam = self.value.lam + self.value.c * e_v
            outer_step += 1
        print("Finish the training!")
        writer.close()
class Value(object):
    """
    The policy object for genetic algorithms
    """
    def __init__(self, srl_model=True):
        self.srl_model = srl_model

    def getAction(self, obs):
        raise NotImplementedError

    def getParamSpace(self):
        raise NotImplementedError

    def setParam(self, param):
        raise NotImplementedError
class PytorchValue(Value):
    """
    The value object for genetic algorithms, using Pytorch networks

    :param model: (Pytorch nn.Module) make sure there is no Sequential, as it breaks the .shape function
    :param env: the environment the value function is defined over
    :param mu: (real) temperature
    :param cuda: (bool)
    """
    def __init__(self, model, env, cuda=False):
        super(PytorchValue, self).__init__()
        self.model = model
        self.param_len = np.sum([np.prod(x.shape) for x in self.model.parameters()])
        self.env = env
        self.cuda = cuda
        self.device = torch.device("cuda" if torch.cuda.is_available() and cuda else "cpu")
        self.policy = {}
        self.model = self.model.to(self.device)
    def __getstate__(self):
        d = self.__dict__.copy()
        d['model'] = d['model'].to(torch.device('cpu'))
        if 'device' in d:
            d['device'] = 'cpu'
        return d

    def __setstate__(self, d):
        if 'device' in d:
            d['device'] = torch.device("cuda" if torch.cuda.is_available() and d['cuda'] else "cpu")
            d['model'] = d['model'].to(d['device'])
        self.__dict__.update(d)
    def getAction(self, y):
        """
        Returns an action for the given observation

        :return: the action
        """
        # if y in self.policy:
        #     actions = list(self.policy[y].keys())
        #     prob = list(self.policy[y].values())
        #     normed = [i / sum(prob) for i in prob]
        #     return actions[npr.choice(range(len(actions)), p=normed)]
        # else:
        actions = list(self.env.action_set.keys())
        normed = [1 / len(actions)] * len(actions)
        return actions[npr.choice(range(len(actions)), p=normed)]
    def toTensor(self, arr):
        """
        Returns a pytorch Tensor object from a numpy array

        :param arr: ([float])
        :return: (Tensor)
        """
        tensor = torch.from_numpy(arr).to(torch.float).to(self.device)
        return tensor

    def getParamSpace(self):
        """
        Returns the size of the parameters for the pytorch network

        :return: (int)
        """
        return self.param_len

    def setParam(self, param):
        """
        Set the network bias and weights

        :param param: ([float])
        """
        nn.utils.vector_to_parameters(self.toTensor(param).contiguous(), self.model.parameters())
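
    # getQValue: concatenate the 64-dim belief vector, the automaton state q,
    # and the action index a into the 66-dim network input.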
    def getQValue(self, y, a):
        (b, q) = y
        b_v = self.belifdict2vector(b)  # convert the belief dict into a 64-dimensional vector
        input = torch.cat((b_v, torch.tensor([q]), torch.tensor([a])))
        value = self.model(input)
        return value
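
    # getValue: soft (log-sum-exp) value, V(y) = mu * log(sum_a exp(Q(y, a) / mu)).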
    def getValue(self, y):
        temp = torch.zeros(len(self.env.action_set))
        for i in range(len(list(self.env.action_set.keys()))):
            temp[i] = torch.exp(self.getQValue(y, i) / self.mu)
        return self.mu * torch.log(torch.sum(temp))
    def updatePolicy(self):
        for y in self.env.visited:
            for i in range(len(list(self.env.action_set.keys()))):
                a = list(self.env.action_set.keys())[i]
                q_v = detachToNumpy(self.getQValue(y, i) - self.getValue(y))[0][0]
                self.policy = put_into_dict2(self.policy, y, a, np.exp(q_v / self.mu))

    def belifdict2vector(self, b):
        b_v = torch.zeros(len(self.env.wumpus_grid.state_set))
        for i in range(len(self.env.wumpus_grid.state_set)):
            s = self.env.wumpus_grid.state_set[i]
            if s not in b:
                b_v[i] = 0
            else:
                b_v[i] = b[s]
        return b_v
if __name__ == '__main__':
    # create the two MDPs
    robot = RobotMDP()
    wumpus = WumpusMDP()
    # create the grid world
    wumpus_grid = WumpusGridwrold(robot, wumpus)
    # load the DFA
    with open('../examples/wumpus/dfa.pkl', 'rb') as f:
        dfa = pickle.load(f)
    product_pomdp = ProductWumpusPOMDP(wumpus_grid, dfa)
    # create an agent to optimize
    agent = UBVOModel()
    agent.train(product_pomdp)
However, I get the following error:
-------------------------------------------------------------------------------------------
Outer iteration 0 C 2 lambda 0
Weights Updated: False
Inner iteration 1 loss: tensor(782.4988) e_v: tensor(9.7637, grad_fn=<MeanBackward0>)
Weights Updated: False
Inner iteration 2 loss: tensor(790.7100) e_v: tensor(9.8262, grad_fn=<MeanBackward0>)
-------------------------------------------------------------------------------------------
Outer iteration 1 C 2 lambda tensor(19.6525, grad_fn=<AddBackward0>)
Weights Updated: False
Inner iteration 1 loss: tensor(6560.8262) e_v: tensor(9.7936, grad_fn=<MeanBackward0>)
C:\Users\lli4\Anaconda3\lib\site-packages\torch\autograd\__init__.py:130: UserWarning: Error detected in MmBackward. Traceback of forward call that caused the error:
  File "D:/workspace/POMDP_CODE/ubvo/ubvo.py", line 298, in <module>
    agent.train(product_pomdp)
  File "D:/workspace/POMDP_CODE/ubvo/ubvo.py", line 135, in train
    e_v = self.e_vio(trajs)
  File "D:/workspace/POMDP_CODE/ubvo/ubvo.py", line 24, in e_vio
    f[i] = self.traj_v(trajs[i])
  File "D:/workspace/POMDP_CODE/ubvo/ubvo.py", line 32, in traj_v
    h_g[i] = self.step_v(y, ny)
  File "D:/workspace/POMDP_CODE/ubvo/ubvo.py", line 41, in step_v
    g = self.value.mu * torch.log(torch.sum(f)) - self.value.getValue(y)
  File "D:/workspace/POMDP_CODE/ubvo/ubvo.py", line 261, in getValue
    temp[i] = torch.exp(self.getQValue(y, i) / self.mu)
  File "D:/workspace/POMDP_CODE/ubvo/ubvo.py", line 255, in getQValue
    value = self.model(input)
  File "C:\Users\lli4\Anaconda3\lib\site-packages\torch\nn\modules\module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "D:\workspace\POMDP_CODE\models\nn_model.py", line 15, in forward
    x = self.l3(x)
  File "C:\Users\lli4\Anaconda3\lib\site-packages\torch\nn\modules\module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "C:\Users\lli4\Anaconda3\lib\site-packages\torch\nn\modules\linear.py", line 93, in forward
    return F.linear(input, self.weight, self.bias)
  File "C:\Users\lli4\Anaconda3\lib\site-packages\torch\nn\functional.py", line 1692, in linear
    output = input.matmul(weight.t())
 (Triggered internally at ..\torch\csrc\autograd\python_anomaly_mode.cpp:104.)
  Variable._execution_engine.run_backward(
Traceback (most recent call last):
  File "D:/workspace/POMDP_CODE/ubvo/ubvo.py", line 298, in <module>
    agent.train(product_pomdp)
  File "D:/workspace/POMDP_CODE/ubvo/ubvo.py", line 124, in train
    loss.backward(retain_graph=True)
  File "C:\Users\lli4\Anaconda3\lib\site-packages\torch\tensor.py", line 221, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "C:\Users\lli4\Anaconda3\lib\site-packages\torch\autograd\__init__.py", line 130, in backward
    Variable._execution_engine.run_backward(
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [128, 1]], which is output 0 of TBackward, is at version 4; expected version 3 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
Can anyone help me with this? I have been stuck on it for days.