Struggling with a RuntimeError related to in-place operations

@ptrblck Thanks for taking a look at it. I made a super simple environment class so everything can run together. The setup still works with linear layers, just not with the LSTM.
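
For reference, the only thing I change between the working and failing runs is the 'type_' entry in the params dict; something along these lines (using the classes defined below) finishes for me without the error, so the problem seems specific to the LSTM path:

###sketch: same pipeline with 'type_' switched to 'linear' (this path runs fine for me)
params_linear = dict(default_params, type_ = 'linear')
de_linear = debug_env(params_linear)
trainer_linear = trainer_tester(de_linear, params = params_linear)
trainer_linear.train(1)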

Simple environment and some parameters

###imports so the snippets below run as-is
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical, Normal

default_params = {
    'discrete' : False,
    'lr' : 3e-4,
    'num_steps' : 300,
    'mini_batch_size' : 30,
    'ppo_epochs' : 6,
    'action_dim' : 2,
    'seq_len' : 10,
    'type_' : 'LSTM',

    'dropout' : False,
    'std' : 0.0,
    'hidden_dim' : 20,
    'dim_1' : 64,
    'dim_2' : 32
}


class debug_env():

    def __init__(self, params = default_params):

        self.action_dim = 2
        self.state_dim = 4
        self.n_pts = 50000
        self.seq_len = params['seq_len']
        self.data_df = self.build()

    def build(self):
        return pd.DataFrame(np.random.normal(size=(self.n_pts, self.state_dim)))

    def initial_state(self, random = False):
        ###'random' is accepted so train() can call initial_state(random = True); this stripped-down env just starts at 0 either way
        i = 0
        return self.data_df.iloc[i:i + self.seq_len].values, i + self.seq_len - 1

    def step(self, action, state, idx):
        return self.rewards(), self.data_df.iloc[idx].values.reshape(1, -1)

    def rewards(self):
        return np.random.normal()
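
Just to show the shapes the environment produces (a quick sanity check I ran, nothing more):

###quick shape check on the debug environment
env = debug_env()
state, j = env.initial_state()
print(state.shape)                        # (seq_len, state_dim) -> (10, 4)
reward, next_state = env.step(None, state, j + 1)   # step() ignores the action in this stub, so None is fine here
print(next_state.shape)                   # (1, 4)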

Training Loop and What not

###used for single actor###
class trainer_tester():
    '''
    class for implementing training/testing functionality
    '''
    def __init__(self, envs, params = default_params):
        ###make the environment
        self.envs = envs
        self.discrete = params['discrete']
        
        ### initialize some hyper parameters
        self.state_dim = envs.state_dim 
        self.action_dim = envs.action_dim

        ###hyper params:
        self.lr               = params['lr']
        self.num_steps        = params['num_steps']
        self.mini_batch_size  = params['mini_batch_size']
        self.ppo_epochs       = params['ppo_epochs']
        self.seq_len          = envs.seq_len
        self.type_            = params['type_']


        if self.type_ == 'linear':
            self.sf = training_functionality(params)
            ###initialize actors and critics
            self.actor_ = actor(self.state_dim, self.action_dim, params = params)
            self.critic_ = critic(self.state_dim, params = params)

        elif self.type_ == 'LSTM':
            self.sf = training_functionality_LSTM(params)
            ###initialize actors and critics
            self.actor_ = actor_LSTM(self.state_dim, self.action_dim, params = params)
            self.critic_ = critic_LSTM(self.state_dim, params = params)
            
        self.optimizer_actor = torch.optim.Adam(self.actor_.parameters(), lr = self.lr)
        self.optimizer_critic = torch.optim.Adam(self.critic_.parameters(), lr = self.lr)
       
    def add_states(self, actor_, envs, state, idx, entropy):
        '''
        add a step to the batch of state, action, reward ... etc. pairs

        input(s): actor network, environment, tensor with the current state, current index in the data set, and the running entropy value
        '''
        state = torch.FloatTensor(state)#.to(device)
        
        if self.type_ == 'LSTM':
            ###create batch dimension
            state = state.unsqueeze(1)
        
        dist = actor_(state)
        value = self.critic_(state)

        action = dist.sample()

        if self.discrete:
            temp = action.numpy().item()
        else:
            temp = F.softmax(action, dim = 1)
            temp = temp.numpy()[0]

        if self.type_ == 'LSTM':
            state = torch.index_select(state, 0, torch.tensor([state.size(0) -1])).squeeze(1)
            
        reward, next_state = envs.step(temp, state.numpy(), idx)

        log_prob = dist.log_prob(action)

        entropy += dist.entropy().mean()
        
        return log_prob.unsqueeze(1), value, torch.FloatTensor(np.array([reward])).unsqueeze(1), state, action.unsqueeze(1), next_state, entropy 

    
    def train(self, multiple_runs):
        
        self.rewards_all = []
        update_counter = 0
        for i in range(multiple_runs):
            if i == 0:
                state, j = self.envs.initial_state()
            else:
                state, j = self.envs.initial_state(random = True)
            max_idx = self.envs.data_df.shape[0] - j
            idx  = j + 1
        
            while idx < max_idx:
                log_probs = []
                values    = []
                states    = []
                actions   = []
                rewards   = []
                entropy = 0

                # self.actor_.eval()
                # self.critic_.eval()
                # with torch.no_grad():

                if self.type_ == 'LSTM':
                    saved_cell_actor = (self.actor_.hidden_cell[0].clone(),self.actor_.hidden_cell[1].clone())
                    saved_cell_critic = (self.critic_.hidden_cell[0].clone(),self.critic_.hidden_cell[1].clone())

                    self.actor_.hidden_cell = (self.actor_.hidden_cell[0][:, -1:, :].clone(), self.actor_.hidden_cell[1][:, -1:, :].clone())
                    self.critic_.hidden_cell = (self.critic_.hidden_cell[0][:, -1:, :].clone(), self.critic_.hidden_cell[1][:, -1:, :].clone())

                    # self.actor_.reset_hidden_cell(1)
                    # self.critic_.reset_hidden_cell(1)

                    for k in range(self.seq_len-1):
                        states.append(torch.FloatTensor(state[k:k+1,:]))

                for _ in range(self.num_steps):

                    if self.type_ == 'LSTM':
                        temp_state = state[1:, :]

                    log_prob, value, reward, state, action, next_state, entropy = self.add_states(self.actor_, self.envs, state, idx, entropy)
                    
                    log_probs.append(log_prob)
                    values.append(value)
                    rewards.append(reward)#.to(device))
                    states.append(state)
                    actions.append(action)

                    state = next_state


                    if self.type_ == 'LSTM':
                        state = np.concatenate((temp_state, state), axis = 0)

                    self.rewards_all.append(reward)

                    idx += 1
                    if idx >= max_idx:
                        break

                next_state = torch.FloatTensor(next_state)



                if self.type_ == 'LSTM':     
                    next_state = torch.FloatTensor(np.concatenate((state[1:, :], next_state), axis = 0)).unsqueeze(1)#.to(device)

                next_value = self.critic_(next_state)
                next_value = next_value.item()

                returns = self.sf.compute_gae(next_value, rewards, values)
                
                
                returns   = torch.cat(returns).detach()
                log_probs = torch.cat(log_probs).detach()
                values    = torch.cat(values).detach()
                states    = torch.cat(states)
                actions   = torch.cat(actions)
                advantage = returns - values

                if self.type_ == 'LSTM':
                    # self.actor_.reset_hidden_cell(self.mini_batch_size)
                    # self.critic_.reset_hidden_cell(self.mini_batch_size)
                    self.actor_.hidden_cell = saved_cell_actor 
                    self.critic_.hidden_cell = saved_cell_critic

                
                # self.actor_.train()
                # self.critic_.train()

                update_counter += 1
                if update_counter % 3 == 0:
                    print('{} % done.'.format((idx/max_idx)*100))       
                    
                self.sf.ppo_update(states, actions, log_probs, returns, advantage, self.actor_, self.critic_, self.optimizer_actor, self.optimizer_critic)
                
    def test(self, envs):
        '''
        Implementation to test some of the data without updating any parameters of the networks
        '''
        self.envs = envs
        self.rewards_all_test = []
        state, j = self.envs.initial_state()
        max_idx = self.envs.data_df.shape[0] - j
        idx  = j + 1
        rewards_all = []
        entropy = 0
        while idx < max_idx:
            log_prob, value, reward, state, action, next_state, entropy = self.add_states(self.actor_, envs, state, idx, entropy)
            state = next_state
            self.rewards_all_test.append(reward)
            idx += 1
            if idx >= max_idx:
                break

Supplementary functionality for training

class training_functionality():
    '''
    wrapper class for implementing training supplementary functions
    '''
    def __init__(self, params):
        self.mini_batch_size = params['mini_batch_size']
        self.ppo_epochs      = params['ppo_epochs']
        self.seq_len         = params['seq_len']


    def compute_gae(self, next_value, rewards, values, gamma=0.99, tau=0.95):
        '''
        Generalized Advantage Estimation from the paper: used to estimate the advantage the actor gains
            from a particular action
        input(s): value estimate of the next state, list of rewards, list of value estimates
        '''
        values = values + [next_value]
        gae = 0
        returns = []
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * values[step + 1]  - values[step]
            gae = delta + gamma * tau * gae
            returns.insert(0, gae + values[step])
        return returns


    def ppo_iter(self, states, actions, log_probs, returns, advantage):
        '''
        iterate through a set of the data using minibatches
        input(s): batched states, actions, log probs, returns and advantages
        '''

        batch_size = states.size(0)
        for _ in range(batch_size // self.mini_batch_size):
            rand_ids = np.random.randint(0, batch_size, self.mini_batch_size)
            #rand_ids = np.random.randint(0, batch_size - mini_batch_size)
            yield states[rand_ids, :], actions[rand_ids, :], log_probs[rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :]
            #yield states[rand_ids:rand_ids+mini_batch_size:, :], actions[rand_ids:rand_ids+mini_batch_size, :], log_probs[rand_ids:rand_ids+mini_batch_size, :], returns[rand_ids:rand_ids+mini_batch_size, :], advantage[rand_ids:rand_ids+mini_batch_size, :]

    def ppo_update(self, states, actions, log_probs, returns, advantages, actor_, critic_, optimizer_actor, optimizer_critic, clip_param=0.2):
        '''
        training loop: iterate through batches of states, actions, returns and advantages 
        '''
        for _ in range(self.ppo_epochs):
            for state, action, old_log_probs, return_, advantage in self.ppo_iter(states, actions, log_probs, returns, advantages):
                #with torch.autograd.set_detect_anomaly(True):
    
                ###obtain outputs from neural nets
                dist = actor_(state)
                value = critic_(state)

                ###calculate entropy and log_prob
                entropy = dist.entropy().mean()
                new_log_probs = dist.log_prob(action)

                # print(action.size())
                # print(new_log_probs.size())
                # print(advantage.size())

                ###calculate ppo ratio from paper
                ratio = (new_log_probs - old_log_probs).exp()
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage

                ###actor and critic losses
                actor_loss  = - torch.min(surr1, surr2).mean() - 0.001 * entropy
                mse_loss = nn.MSELoss()
                critic_loss = mse_loss(value, return_) - 0.001 * entropy
        
                ###update
                optimizer_actor.zero_grad()
                optimizer_critic.zero_grad()
                

                actor_loss.backward(retain_graph=True)
                critic_loss.backward(retain_graph=True)
                
                optimizer_actor.step()
                optimizer_critic.step()



class training_functionality_LSTM(training_functionality):
    '''
    class for the LSTM training functionality 
    needed to overwrite a couple of functions
    '''
    def __init__(self, params):
        super(training_functionality_LSTM, self).__init__(params)

    def f(self, temp_tensor):
        ###deprecated
        '''
        helper function for building out the sequences
        '''
        temp = [temp_tensor[j:j+self.seq_len, :].clone().unsqueeze(1) for j in range(self.mini_batch_size)]
        # for i in temp:
        #     print(i.size())
        return torch.cat(temp, 1)

    def ppo_iter(self, states, actions, log_probs, returns, advantage):
        '''
        iterate through a set of the data using minibatches, rebuilding sequences for the LSTM
        input(s): batched states, actions, log probs, returns and advantages
        '''
        #with torch.autograd.set_detect_anomaly(True):
        batch_size = states.size(0) - self.mini_batch_size
        for _ in range(batch_size // self.mini_batch_size):
            #rand_ids = np.random.randint(0, batch_size, mini_batch_size)
            #lump = self.mini_batch_size + self.seq_len 
            rand_ids = np.random.randint(self.mini_batch_size+self.seq_len, batch_size)
            #print(rand_ids)
            inds = np.array([[(rand_ids - 1) - i - j for i in reversed(range(self.seq_len))] for j in reversed(range(self.mini_batch_size))])
            inds= torch.tensor(inds.T)
            inds = inds.reshape(inds.size(0)*inds.size(1))
            temp = states.unsqueeze(1)
            #print(inds)
            x = torch.index_select(temp, 0, inds).view(self.seq_len, self.mini_batch_size, -1)
            #yield states[rand_ids, :], actions[rand_ids, :], log_probs[rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :]
            yield x, actions[rand_ids- self.mini_batch_size:rand_ids, :], log_probs[rand_ids- self.mini_batch_size:rand_ids, :], returns[rand_ids- self.mini_batch_size:rand_ids, :], advantage[rand_ids- self.mini_batch_size:rand_ids, :]
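
In case the shapes matter, this is roughly how I've been sanity-checking compute_gae on its own (dummy (1, 1) tensors, mimicking what train() collects; just a sketch, not part of the training run):

###minimal sketch: compute_gae on dummy data, one return per reward
sf = training_functionality(default_params)
dummy_rewards = [torch.randn(1, 1) for _ in range(5)]
dummy_values  = [torch.randn(1, 1) for _ in range(5)]
dummy_returns = sf.compute_gae(0.0, dummy_rewards, dummy_values)
print(len(dummy_returns), dummy_returns[0].shape)   # 5, torch.Size([1, 1])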

Network classes


def base_net(state_dim, dim_1, dim_2, type_, drop_out):
    '''
    function to define base_network
    '''
    if type_ == 'linear':
        layers = [
            nn.Linear(state_dim, dim_1),
            nn.Tanh(),
            nn.Linear(dim_1, dim_2),
            nn.Tanh()
        ]
        if drop_out:
            layers.insert(2, nn.Dropout(p = 0.3))
            layers.append(nn.Dropout(p = 0.3))
        
    
    return layers


###actor build###       
class actor(nn.Module):
    '''
    The actor class
    '''
    def __init__(self, state_dim, action_dim, params = default_params):
        '''
        input(s): dimensions of the state and action spaces
        '''
        super(actor, self).__init__()
        
        ###for a discrete action space
        self.discrete = params['discrete']
        self.type_ = params['type_']
        self.dropout = params['dropout']
        self.std = params['std']
        self.hidden_dim = params['hidden_dim']
        self.dim_1 = params['dim_1']
        self.dim_2 = params['dim_2']
        self.mini_batch_size = params['mini_batch_size']
        
        self.log_std = nn.Parameter(torch.ones(1, action_dim) * self.std)
        
        
        self.set_up_layers(state_dim, action_dim)
        
        ###apply the initial weights if needed
        #self.apply(init_weights)
    
    def set_up_layers(self, state_dim, action_dim):
        '''
        make structure for the actor network
        '''
        if self.type_ == 'linear':

            layers = base_net(state_dim, self.dim_1, self.dim_2, self.type_, self.dropout)
            layers.append(nn.Linear(self.dim_2, action_dim))
        
            self.model = nn.Sequential(*layers)
        
        elif self.type_ == 'LSTM':
            self.lstm = nn.LSTM(state_dim, hidden_size = self.hidden_dim)
            self.lin_cap = nn.Linear(self.hidden_dim, action_dim)
            

    
    def forward(self, state):
        '''
        forward propagate the network
        input(s): state
        output(s): action distribution from the actor network
        '''
        out = self.model(state)
        if self.discrete:
            probs = F.softmax(out, dim = 1)
            return Categorical(probs)
        else:
            mu   = out
            std  = self.log_std.exp().expand_as(mu)
            dist = Normal(mu, std)
            return dist
            

###critic build###
class critic(nn.Module):
    '''
    critic class
    '''
    def __init__(self, state_dim, params = default_params):
        '''
        input(s): dimension of the state
        '''
        super(critic, self).__init__()
        self.type_ = params['type_']
        self.dropout = params['dropout']
        self.hidden_dim = params['hidden_dim']
        self.dim_1 = params['dim_1']
        self.dim_2 = params['dim_2']
        self.mini_batch_size = params['mini_batch_size']
        
        self.set_up_layers(state_dim)
        
    
    def set_up_layers(self, state_dim):
        '''
        make structure for the critic network
        '''
        if self.type_ == 'linear':
            layers = base_net(state_dim, self.dim_1, self.dim_2, self.type_, self.dropout)
            layers.append(nn.Linear(self.dim_2, 1))
            self.model = nn.Sequential(*layers)

        elif self.type_ == 'LSTM':
            self.lstm = nn.LSTM(state_dim, hidden_size = self.hidden_dim)
            self.lin_cap = nn.Linear(self.hidden_dim, 1)
        
    
    def forward(self, state):
        '''
        forward propagate the network
        input(s): state
        output(s): result of the critic network
        '''
        return self.model(state)




class actor_LSTM(actor):
    '''
    LSTM actor class
    need to overwrite the forward function
    '''
    def __init__(self, state_dim, action_dim, params = default_params):
        
        super(actor_LSTM, self).__init__(state_dim, action_dim, params = params)

        self.reset_hidden_cell(self.mini_batch_size)

    def reset_hidden_cell(self, mini_batch_size):
        '''
        resets the hidden cell state, or initializes it
        '''
        self.hidden_cell = (torch.zeros(1,mini_batch_size,self.hidden_dim), torch.zeros(1,mini_batch_size,self.hidden_dim))

    def forward(self, x):
        
        lstm_out, self.hidden_cell = self.lstm(x, self.hidden_cell)
        mu = self.lin_cap(lstm_out[-1])

        if self.discrete:
            probs = F.softmax(mu, dim = 1)
            return Categorical(probs)
        else: 
            std   = self.log_std.exp().expand_as(mu)
            dist  = Normal(mu, std)
            return dist

class critic_LSTM(critic):
    '''
    LSTM critic class
    need to overwrite the forward function
    '''
    def __init__(self, state_dim, params = default_params):
        
        super(critic_LSTM, self).__init__(state_dim, params = params)
        
        self.reset_hidden_cell(self.mini_batch_size)
    
    def reset_hidden_cell(self, mini_batch_size):
        '''
        resets the hidden cell state, or initializes it
        '''
        self.hidden_cell = (torch.zeros(1,mini_batch_size,self.hidden_dim), torch.zeros(1,mini_batch_size, self.hidden_dim))
    
    def forward(self, x):
        #.view(len(x) ,1, -1)
        lstm_out, self.hidden_cell = self.lstm(x, self.hidden_cell)
        #lstm_out[-1].clone()
        return self.lin_cap(lstm_out[-1])
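
And this is the kind of dummy forward pass I use to check shapes on the LSTM nets (a rough sketch; the input layout follows nn.LSTM's default (seq_len, batch, features), and I reset the hidden cells to batch size 1 first):

###rough shape check on the LSTM actor/critic with a single dummy sequence
a = actor_LSTM(4, 2, params = default_params)
c = critic_LSTM(4, params = default_params)
a.reset_hidden_cell(1)
c.reset_hidden_cell(1)
x = torch.randn(default_params['seq_len'], 1, 4)    # (seq_len, batch, state_dim)
dist = a(x)
value = c(x)
print(dist.sample().size(), value.size())           # (1, 2), (1, 1)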

Calls

de = debug_env()

trainer = trainer_tester(de)
trainer.train(1)
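
I've also been running the same calls under anomaly detection to try to localize which in-place operation the autograd engine is complaining about:

###same calls, wrapped in anomaly detection
with torch.autograd.set_detect_anomaly(True):
    de = debug_env()
    trainer = trainer_tester(de)
    trainer.train(1)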

Thanks again for the help. I've been stumped for days and have already gone through all the similar errors other people have run into.