RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation - REINFORCE Algorithm

I get a `RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation` when I run the following, supposedly simple, reinforcement learning code (REINFORCE on a toy Hangman environment).

```python
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import numpy as np

# Constants
NUM_ALPHABET = 26
ORDINAL_LOWER_A = ord('a')


# Hangman environment (replace this with your own Hangman environment)
class HangmanEnvironment:
    def __init__(self, words):
        self.words = words
        self.reset()

    def reset(self):
        self.target_word = np.random.choice(self.words)
        # 0 = hidden letter, 1 = revealed letter
        self.state = torch.zeros(len(self.target_word), dtype=torch.long)
        self.correct_guesses = set()
        self.incorrect_guesses = set()
        self.attempts = 0
        self.max_attempts = 6  # maximum number of wrong guesses in Hangman
        self.game_over = False

    def get_state(self):
        return self.state

    def step(self, action):
        if action in self.correct_guesses or action in self.incorrect_guesses:
            # Penalize repeated guesses
            reward = -1
        elif action in self.target_word:
            # Correct guess: reveal every occurrence of the letter
            reward = 1
            self.correct_guesses.add(action)
            for i, letter in enumerate(self.target_word):
                if letter == action:
                    self.state[i] = 1
        else:
            # Incorrect guess
            reward = -1
            self.incorrect_guesses.add(action)
            self.attempts += 1

        if torch.all(self.state == 1) or self.attempts == self.max_attempts:
            # Game-over conditions: word fully revealed or attempts exhausted
            self.game_over = True

        return self.state.clone(), reward, self.game_over


# Policy network
class Policy(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, action_size):
        super(Policy, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.rnn = nn.GRU(embedding_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, action_size)

    def forward(self, input_indices):
        embedded_letters = self.embedding(input_indices)
        _, hidden_state = self.rnn(embedded_letters)

        # Use the final hidden state as input to the fully connected layer
        x = self.fc(hidden_state.squeeze(0))

        return F.softmax(x, dim=1)

    def act(self, state):
        state = state.unsqueeze(0)  # add a batch dimension
        probs = self.forward(state).cpu()
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)


# Training loop for REINFORCE on the Hangman environment
def reinforce(env, policy, optimizer, num_episodes=1000, gamma=0.99):
    for episode in range(1, num_episodes + 1):
        state = env.get_state()
        print("printing state", state)
        log_probs = []
        rewards = []

        while not env.game_over:
            action, log_prob = policy.act(state)
            log_probs.append(log_prob)
            state, reward, game_over = env.step(chr(action + ORDINAL_LOWER_A))
            rewards.append(reward)

        # Calculate discounted returns and normalize them
        returns = []
        R = 0
        for r in reversed(rewards):
            R = r + gamma * R
            returns.insert(0, R)
        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        # Calculate the policy loss and update the policy
        policy_loss = [-log_prob * R for log_prob, R in zip(log_probs, returns)]
        policy_loss = torch.cat(policy_loss).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        # Print the total reward for each episode
        print(f"Episode {episode}, Total Reward: {sum(rewards)}")

        # Reset the environment for the next episode
        env.reset()


# Example usage
if __name__ == "__main__":
    # Sample words for Hangman
    word_list = ["hangman", "python", "programming", "challenge", "learning"]

    # Hangman environment
    hangman_env = HangmanEnvironment(word_list)

    # Policy network
    vocab_size = NUM_ALPHABET + 1  # 26 letters + 1 for "_"
    embedding_size = 16
    hidden_size = 32
    action_size = NUM_ALPHABET
    max_word_length = 30
    policy_net = Policy(vocab_size, embedding_size, hidden_size, action_size)

    # Optimizer
    optimizer = optim.Adam(policy_net.parameters(), lr=0.001)

    # Train with REINFORCE
    reinforce(hangman_env, policy_net, optimizer, num_episodes=1000)
```
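
To narrow it down, I tried to reproduce what I suspect is the same failure mode in isolation: `get_state()` returns the live `self.state` tensor, the policy's `nn.Embedding` forward saves those index values for the backward pass, and `step()` later writes into that same tensor with `self.state[i] = 1` before `backward()` ever runs. The following minimal sketch of that pattern (standalone toy code, not part of my program) raises the same error for me:

```python
import torch
import torch.nn as nn

emb = nn.Embedding(27, 4)
idx = torch.zeros(5, dtype=torch.long)  # stands in for the live state tensor

out = emb(idx).sum()  # forward pass saves `idx` for the backward pass
idx[0] = 1            # in-place write, like `self.state[i] = 1` in step()
out.backward()        # RuntimeError: ... modified by an inplace operation
```

If that really is the cause, is the right fix to return a copy from `get_state()` (the way `step()` already returns `self.state.clone()`), or is there a more idiomatic way to structure this?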