The following runs with PyTorch 0.4 (note that Variables are no longer needed, since they were merged with tensors in 0.4). For lower versions you will have to make some minor changes.

```
import torch
torch.manual_seed(42)
from torch import nn
from torch.distributions import Categorical
import gym
class Policy(nn.Module):
    """Policy network for CartPole: maps a 4-dim observation to a
    probability distribution over the 2 discrete actions.

    The instance also stores the per-step (negated) log-probabilities and
    rewards of the current episode so the update step can build the
    REINFORCE loss from them.
    """

    def __init__(self):
        super(Policy, self).__init__()
        # dim=-1 softmaxes over the action logits, which is correct both for
        # a single 1-D state (shape (4,)) and for a batch of states
        # (shape (B, 4)); the original dim=0 was only correct unbatched.
        self.network = nn.Sequential(
            nn.Linear(4, 128),
            nn.ReLU(),
            nn.Linear(128, 2),
            nn.Softmax(dim=-1),
        )
        # negated log-probabilities of the actions taken this episode
        self.saved_log_probs = []
        # rewards observed this episode
        self.rewards = []

    def forward(self, state):
        # Propagate the state through the network; PyTorch records the
        # graph so gradients can flow back later.
        return self.network(state)
def select_action(model, state):
    """
    Sample an action from the policy's output distribution for one state.

    :param model: policy network mapping a state to action probabilities
    :param state: current observation tensor
    :return: the sampled action as a plain Python int
    """
    distribution = Categorical(model(state))
    chosen = distribution.sample()
    # keep the negated log-probability (as a tensor, so the graph survives)
    # for the loss built later in the update step
    model.saved_log_probs.append(-distribution.log_prob(chosen))
    return chosen.item()
def update_model(model: Policy, optimizer):
    """
    Perform one REINFORCE gradient step from the episode stored in ``model``.

    Uses the rewards and (negated) log-probabilities accumulated during the
    rollout, then clears them for the next episode.

    :param model: policy holding this episode's rewards and log-probs
    :param optimizer: optimizer over the policy's parameters
    :return: None
    """
    # discounted returns, accumulated back-to-front (gamma = 0.99)
    returns = []
    running = 0
    for r in reversed(model.rewards):
        running = r + 0.99 * running
        returns.insert(0, running)
    # weight each saved (negated) log-prob by its return; their sum is the
    # REINFORCE loss, and the stored tensors carry the gradient path
    per_step = [ret * logp for ret, logp in zip(returns, model.saved_log_probs)]
    loss = torch.stack(per_step).sum()
    # standard update: clear old grads, backprop the loss, step the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # reset episode storage for the next rollout
    model.saved_log_probs, model.rewards = [], []
def train(render=False):
    """
    Main training loop: roll out CartPole episodes and run one REINFORCE
    update after each episode.

    :param render: whether to draw the environment every step
    """
    env = gym.make('CartPole-v0')
    env.seed(42)
    # run on the GPU when one is available, otherwise the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # model lives on `device`; the optimizer sees its parameters
    policy = Policy().to(device)
    optimizer = torch.optim.SGD(policy.parameters(), lr=1e-3)
    # runs forever unless max_episodes is set to a number
    episode = 1
    max_episodes = None
    while episode:
        state = env.reset()
        # cap a single rollout at 10000 steps
        for _step in range(10000):
            # numpy state -> float tensor on the model's device
            state_tensor = torch.from_numpy(state).to(torch.float).to(device)
            action = select_action(policy, state_tensor)
            # advance the environment; `done` flags a toppled pole
            state, reward, done, _ = env.step(action)
            if render:
                env.render()
            # remember the reward for the update step
            policy.rewards.append(reward)
            if done:
                break
        # one REINFORCE update from the episode just collected
        update_model(policy, optimizer)
        # optional sanity check: print(policy.network[0].weight) here to
        # confirm the parameters actually move between updates
        if max_episodes is not None and episode >= max_episodes:
            break
        episode += 1
if __name__ == '__main__':
    # entry point: train with on-screen rendering enabled
    train(render=True)
```

EDIT: I just noticed that this code is quite similar to the REINFORCE example in the PyTorch examples repository, but I hope the explanations are still helpful.