# DQN always gives the same output regardless of input

I am solving a combinatorial optimization problem with a DQN.
My goal is to reach the optimal state.

I made only minor changes to the code from the following PyTorch DQN tutorial:

https://tutorials.pytorch.kr/intermediate/reinforcement_q_learning.html

``````
# Convolutional Q-network: three Conv2d + BatchNorm2d stages, each followed by
# a ReLU in forward().
# NOTE(review): indentation was lost in this paste. The excerpt also shows no
# output (linear) layer and no return statement in forward() -- presumably
# elided from the post; confirm against the full source.
class DQN(nn.Module):

# NOTE(review): the parameters h, w and outputs are not used anywhere in the
# visible code.
def __init__(self, h, w, outputs):
super(DQN, self).__init__()
# Number of input channels expected by conv1.
self.channel = 7
self.conv1 = nn.Conv2d(self.channel, 16, kernel_size=3, stride=2)
self.bn1 = nn.BatchNorm2d(16)
self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=2)
self.bn2 = nn.BatchNorm2d(32)
# kernel_size=1 with stride=2: each output value is computed from a single
# input position, and every other position is skipped.
self.conv3 = nn.Conv2d(32, 32, kernel_size=1, stride=2)
self.bn3 = nn.BatchNorm2d(32)

# Output length of one convolution along a single dimension (matches the
# Conv2d output-shape formula for padding=0, dilation=1).
# NOTE(review): this helper is never called in the visible code, and its
# default kernel_size=5 does not match the kernel sizes (3, 3, 1) used above.
def conv2d_size_out(size, kernel_size=5, stride=2):
return (size - (kernel_size - 1) - 1) // stride + 1

# NOTE(review): the feature-map width/height are hard-coded to 2 instead of
# being derived from w and h; verify they match the real conv3 output size.
convw = 2
convh = 2
# Flattened feature count (width * height * 32 channels from conv3).
# NOTE(review): not used in the visible code.
linear_input_size = convw * convh * 32

# Applies conv -> batch norm -> ReLU three times to x.
# NOTE(review): no value is returned in this excerpt.
def forward(self, x):
x = F.relu(self.bn1(self.conv1(x)))
x = F.relu(self.bn2(self.conv2(x)))
x = F.relu(self.bn3(self.conv3(x)))

``````
``````
def select_action(state):
global steps_done
sample = random.random()
#sample = 1.0
eps_threshold = EPS_END + (EPS_START - EPS_END) * \
math.exp(-1. * steps_done / EPS_DECAY)
steps_done += 1
if sample > eps_threshold:
# t.max(1) returns the largest column value of each row.
# The second element of the max result is the index where the max element was found,
# so we pick the action with the larger expected reward.
return policy_net(state).max(1)[1].view(1, 1)
else:

def optimize_model():
if len(memory) < BATCH_SIZE:
return
transitions = memory.sample(BATCH_SIZE)

batch = Transition(*zip(*transitions))

non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
batch.next_state)), device=device, dtype=torch.bool)

non_final_next_states = torch.cat([s for s in batch.next_state
if s is not None])
state_batch = torch.cat(batch.state)
action_batch = torch.cat(batch.action)
reward_batch = torch.cat(batch.reward)

state_action_values = policy_net(state_batch).gather(1, action_batch)  # double DQN

next_state_values = torch.zeros(BATCH_SIZE, device=device, dtype=torch.double)

expected_state_action_values = (next_state_values * GAMMA) + reward_batch

loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

loss.backward()
for param in policy_net.parameters():