RuntimeError mat1 dim 1 must match mat2 dim 0

I have seen a couple of similar posts on the forum, but I'm having a hard time applying them to my own problem. Here's the error:

Traceback:

File "Pmain.py", line 244, in Pmain
    optimize_model()
  File "Pmain.py", line 114, in optimize_model
    state_action_values = Policy_Net(state_batch).gather(1, action_batch)
  File "Programs\Python\Python37\lib\site-packages\torch\nn\modules\module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "OriNet.py", line 59, in forward
    h1 = F.relu(self.fc1(x)) 
  File "Programs\Python\Python37\lib\site-packages\torch\nn\modules\module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "Programs\Python\Python37\lib\site-packages\torch\nn\modules\linear.py", line 93, in forward
    return F.linear(input, self.weight, self.bias)
  File "Programs\Python\Python37\lib\site-packages\torch\nn\functional.py", line 1692, in linear
    output = input.matmul(weight.t())
RuntimeError: mat1 dim 1 must match mat2 dim 0

Network

class DQN(nn.Module):
    
    def __init__(self, num_states, num_actions):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(num_states, 32)
        self.fc2 = nn.Linear(32, 32)
        self.fc3 = nn.Linear(32, 32)
        self.fc4 = nn.Linear(32, 12)
        self.fc5 = nn.Linear(12, num_actions)

    
    def forward(self, x):
        h1 = F.relu(self.fc1(x)) 
        h2 = F.relu(self.fc2(h1))
        h3 = F.relu(self.fc3(h2))
        h4 = F.relu(self.fc4(h3))
        output = self.fc5(h4)
        return output

optimize_model

def optimize_model():
        if len(memory) < BATCH_SIZE:
            return
        transitions = memory.sample(BATCH_SIZE)
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        
#        state_batch = torch.unsqueeze(state_batch,1)
#        state_batch = torch.unsqueeze(state_batch,0)
#        state_batch = torch.unsqueeze(state_batch,0)

        state_action_values = Policy_Net(state_batch).gather(1, action_batch)

        next_state_values = torch.zeros(BATCH_SIZE, device=device)

        non_final_mask = torch.ByteTensor(
            tuple(map(lambda s: s is not None, batch.next_state)))
        non_final_next_states = torch.cat([s for s in batch.next_state
                                                    if s is not None])
        next_state_values[non_final_mask] = Target_Net(
            non_final_next_states).max(1)[0].detach()

        expected_state_action_values = (next_state_values * GAMMA) + reward_batch

        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        optimizer.zero_grad()
        loss.backward()
        for param in Policy_Net.parameters():
            param.grad.data.clamp_(-1, 1)
        optimizer.step()

DQN input: [A, B, C, D, E, F, G, H] (a 1-D FloatTensor with 8 variables)

state_batch:

tensor([  0., 181., 171.,  ..., 181., 196., 171.], device='cuda:0')

action_batch:

tensor([[0],
        [0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]], device='cuda:0')

I tried torch.unsqueeze, but the program returned the same error.
Can anyone help me solve this problem?

Assuming num_states is set to 8, the posted model expects an input in the shape [batch_size, 8].
Could you check the input shape and make sure it matches the expected one?
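For illustration, here is a minimal standalone sketch with random values (not your exact setup, just the first nn.Linear(8, 32) layer of your model, run on the CPU) showing the working and the failing case:

import torch
import torch.nn as nn

# fc1 of the posted DQN: expects the last input dimension to be num_states = 8
fc1 = nn.Linear(8, 32)

x = torch.randn(128, 8)    # [batch_size, num_states]
print(fc1(x).shape)        # torch.Size([128, 32])

flat = torch.randn(1024)   # a flattened 1-D input, not [batch_size, 8]
fc1(flat)                  # raises the RuntimeError from the traceback above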

Thank you for the advice.
I'm not very good at English, so I apologize if I misinterpreted your sentence.
num_states is set to 8, and batch_size is set to 128.

print(state_batch.size())

torch.Size([1024])

By “input”, do you mean the model’s declaration?
InputNumber is set to 8, AttackNumber is set to 6.


Policy_Net = OriNet.DQN(InputNumber,AttackNumber).to(device)
Target_Net = OriNet.DQN(InputNumber,AttackNumber).to(device)
Target_Net.load_state_dict(Policy_Net.state_dict())
Target_Net.eval()
optimizer = optim.Adam(Policy_Net.parameters(), lr=0.001)
memory = OriNet.ReplayMemory(10000)

I understand that the output torch.Size([1024]) is wrong; it needs to become torch.Size([128, 8]).

It appears to be flattened.
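As a quick sanity check (only an assumption on my side: that the 1024 values are the 128 states of 8 values laid out one after another), reshaping the flattened batch gives the expected size:

# hypothetical check: valid only if the 1024 values really are
# 128 consecutive states of 8 values each
print(state_batch.view(128, 8).size())   # torch.Size([128, 8])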

I don't use gym, PLE, etc.
I am following the tutorial (Reinforcement Learning (DQN) Tutorial — PyTorch Tutorials 1.7.1 documentation).

inputs corresponds to the tutorial's state.

inputs =  [A,B,C,D,E,F,G,H]
inputs = np.array(inputs)
inputs = torch.from_numpy(inputs).type(torch.FloatTensor).to(device)

I can't run my program yet.
Please help me.

The ReplayMemory module is the same as the tutorial's ReplayMemory.

class ReplayMemory(object):

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

I’m still unsure how the input is created. Are you seeing the same issue if you run the tutorial directly?
If not, could you post an executable code snippet with random values, which would reproduce this issue?

I can run the tutorial.

I made a simple program, and it outputs the same error.

import Battle
import re
import numpy as np
import os
import copy
import random
from copy import deepcopy
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from collections import namedtuple
import math
import OriNet


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

BATCH_SIZE = 128
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 500
InputNumber = 8
AttackNumber = 6
steps_done = 0
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

AttackList = [1,2,3,4,5,6]

def snippet():
    EPISODE = 2000
    n_actions = 6
    Policy_Net = OriNet.DQN(InputNumber,AttackNumber).to(device)
    Target_Net = OriNet.DQN(InputNumber,AttackNumber).to(device)
    Target_Net.load_state_dict(Policy_Net.state_dict())
    Target_Net.eval()
    optimizer = optim.Adam(Policy_Net.parameters(), lr=0.001)
    memory = OriNet.ReplayMemory(10000)

    def select_action(state):
        global steps_done
        sample = random.random()
        eps_threshold = EPS_END + (EPS_START - EPS_END) * \
            math.exp(-1. * steps_done / EPS_DECAY)
        steps_done += 1
        if sample > eps_threshold:
            with torch.no_grad():
#                print(Policy_Net(state)[0])
                return Policy_Net(state)
        else:
            return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)


    def optimize_model():
        if len(memory) < BATCH_SIZE:
            return
        transitions = memory.sample(BATCH_SIZE)
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

#        state_batch = torch.unsqueeze(state_batch,1)


        state_action_values = Policy_Net(state_batch).gather(1, action_batch)

        next_state_values = torch.zeros(BATCH_SIZE, device=device)
#        next_state_values[non_final_mask] = Target_Net(non_final_next_states).max(1)[0].detach()

        non_final_mask = torch.ByteTensor(
            tuple(map(lambda s: s is not None, batch.next_state)))
        non_final_next_states = torch.cat([s for s in batch.next_state
                                                    if s is not None])
        next_state_values[non_final_mask] = Target_Net(
            non_final_next_states).max(1)[0].detach()

        expected_state_action_values = (next_state_values * GAMMA) + reward_batch

        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        optimizer.zero_grad()
        loss.backward()
        for param in Policy_Net.parameters():
            param.grad.data.clamp_(-1, 1)
        optimizer.step()
        print('aaa')

    for i in range(EPISODE):
        inputs = [100,200,300,400,500,600,700,800]
        inputs = np.array(inputs)
        inputs = torch.from_numpy(inputs).type(torch.FloatTensor).to(device)

        Action = select_action(inputs)
        AttackNum = Action[0].to('cpu').detach().numpy().copy()
        Attack = AttackList[np.argmax(AttackNum)]

#       at this point, the real program updates the game state
        NowState = [101,201,301,401,501,601,701,801]
        NowState = np.array(NowState)
        NowState = torch.from_numpy(NowState).type(torch.FloatTensor).to(device)

        reward = 1
        reward = torch.tensor([reward],device = device)

        AttackNum = torch.tensor([[Attack]],device = device)

        memory.push(inputs,AttackNum,NowState,reward)
        optimize_model()



snippet()

OriNet module (ReplayMemory and DQN)

class ReplayMemory(object):

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

class DQN(nn.Module):
    
    def __init__(self, num_states, num_actions):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(num_states, 32)
        self.fc2 = nn.Linear(32, 32)
        self.fc3 = nn.Linear(32, 32)
        self.fc4 = nn.Linear(32, 12)
        self.fc5 = nn.Linear(12, num_actions)

    
    def forward(self, x):
        h1 = F.relu(self.fc1(x)) 
        h2 = F.relu(self.fc2(h1))
        h3 = F.relu(self.fc3(h2))
        h4 = F.relu(self.fc4(h3))
        output = self.fc5(h4)
        return output


Thanks for the code snippet.
The error is raised because torch.cat(batch.state) and torch.cat([s for s in batch.next_state if s is not None]) return flattened tensors, since each state is a 1-D tensor. Use torch.stack instead and the code should work fine.
I haven't checked the exact differences between your code and the tutorial, but I guess your states contain multiple state values, which would yield this error.
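For reference, a small sketch with random values (assuming 1-D states with 8 elements each, as in your snippet) shows the difference:

import torch

# 128 states, each a 1-D tensor with 8 elements
states = [torch.randn(8) for _ in range(128)]

print(torch.cat(states).shape)    # torch.Size([1024])  -> flattened, breaks nn.Linear(8, 32)
print(torch.stack(states).shape)  # torch.Size([128, 8]) -> [batch_size, num_states]

torch.stack adds a new batch dimension, while torch.cat only concatenates along an existing one.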

Thank you for your advice.
My program appears to be working now.
For the benefit of future readers, here is what I corrected:

before

state_batch = torch.cat(batch.state) 
non_final_next_states = torch.cat([s for s in batch.next_state
                                                    if s is not None])

after

state_batch = torch.stack(batch.state)
non_final_next_states = torch.stack([s for s in batch.next_state
                                                    if s is not None])

My program ran once, but returned an error when I ran it again.

non_final_next_states).max(1)[0].detach()
RuntimeError: CUDA error: device-side assert triggered

When I tried to use the CPU, it returned the following error.

state_action_values = Policy_Net(state_batch).gather(1, action_batch)
RuntimeError: index 8 is out of bounds for dimension 1 with size 6

def optimize_model():
        if len(memory) < BATCH_SIZE:
            return
        transitions = memory.sample(BATCH_SIZE)
        batch = Transition(*zip(*transitions))

        non_final_mask = torch.ByteTensor(
            tuple(map(lambda s: s is not None, batch.next_state)))
        non_final_next_states = torch.stack([s for s in batch.next_state
                                                if s is not None])
        
        state_batch = torch.stack(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        state_action_values = Policy_Net(state_batch).gather(1, action_batch)

        next_state_values = torch.zeros(BATCH_SIZE, device=device)
        next_state_values[non_final_mask] = Target_Net(non_final_next_states).max(1)[0].detach()

        expected_state_action_values = (next_state_values * GAMMA) + reward_batch

        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        optimizer.zero_grad()
        loss.backward()
        for param in Policy_Net.parameters():
            param.grad.data.clamp_(-1, 1)
        optimizer.step()