After exploration, my RL agent keeps repeating the same actions

I am trying to train an agent for a game I designed in a pygame environment. The input state has 25 values describing the 5x5 grid around the agent, and there are 4 allowed actions. Empty cells give a reward of 0, obstacles a reward of -7, and the target a reward of +7. For exploration I use a simple linear decay of epsilon during the initial phase of training. The code is adapted, with small modifications, from the DQN tutorial on the PyTorch website. My problem is that after the exploration phase, the agent gets stuck choosing only 2 actions and oscillates between them. I would really appreciate any help!
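
The select_action function is not included in the code below; roughly, my linear epsilon decay works along these lines (a simplified sketch, not the exact code; EPS_START, EPS_END, EXPLORE_STEPS and steps_done are placeholder names):

def select_action(state):
    global steps_done
    # Epsilon falls linearly from EPS_START to EPS_END over the first EXPLORE_STEPS steps
    eps_threshold = max(EPS_END, EPS_START - (EPS_START - EPS_END) * steps_done / EXPLORE_STEPS)
    steps_done += 1
    if random.random() > eps_threshold:
        with torch.no_grad():
            # Exploit: pick the action with the highest predicted Q-value
            return policy_net(state).max(1)[1].view(1, 1)
    else:
        # Explore: pick one of the 4 actions at random
        return torch.tensor([[random.randrange(4)]], device=device, dtype=torch.long)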

Here is the code:

import random
from collections import namedtuple, deque

import torch
import torch.nn as nn
import torch.nn.functional as F


class DQN(nn.Module):

    def __init__(self, n_observations, n_actions):
        super(DQN, self).__init__()
        # Two hidden layers of 200 units; output is one Q-value per action
        self.layer1 = nn.Linear(n_observations, 200)
        self.layer2 = nn.Linear(200, 200)
        self.layer3 = nn.Linear(200, n_actions)

    # Called with either a single state to pick the next action, or with a batch
    # during optimization. Returns a tensor of Q-values, one column per action.
    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)
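
For this grid the network has 25 inputs and 4 outputs; a quick shape check of the forward pass looks like this (illustration only, with a dummy zero state):

net = DQN(25, 4)                               # 25 grid values in, 4 Q-values out
dummy_state = torch.zeros(1, 25)               # batch of one 5x5 observation
q_values = net(dummy_state)                    # shape [1, 4]
greedy_action = q_values.max(1)[1].view(1, 1)  # index of the highest-valued action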


Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
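
For context, the memory is filled and sampled like this (a tiny illustration; the capacity of 10000 is just an assumed value, and state/action/next_state/reward are the tensors built in the training loop below):

memory = ReplayMemory(10000)                 # assumed capacity
memory.push(state, action, next_state, reward)
if len(memory) >= BATCH_SIZE:
    transitions = memory.sample(BATCH_SIZE)  # list of BATCH_SIZE random Transition tuples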


The optimization function:

def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))    
    # Mask of transitions whose next state is non-terminal
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                            batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                       if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s_t, a) for the actions that were actually taken
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # max_a Q_target(s_{t+1}, a), with 0 for terminal next states
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    with torch.no_grad():
        next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0]

    # Bellman target: r + GAMMA * max_a Q_target(s_{t+1}, a)
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()
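
optimize_model and the training loop rely on several globals (BATCH_SIZE, GAMMA, TAU, device, optimizer, memory, policy_net, target_net, num_episodes) that are not shown above; they are set up roughly as in the PyTorch tutorial. The concrete values here are placeholders, not necessarily the ones I use:

import torch.optim as optim

BATCH_SIZE = 128      # transitions per optimization step
GAMMA = 0.99          # discount factor
TAU = 0.005           # soft-update rate for the target network
LR = 1e-4             # learning rate
num_episodes = 600

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

policy_net = DQN(25, 4).to(device)   # 25 state values, 4 actions
target_net = DQN(25, 4).to(device)
target_net.load_state_dict(policy_net.state_dict())

optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)
memory = ReplayMemory(10000)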

The training loop:

for i_episode in range(num_episodes):
    # Initialize the environment and get its state
    state = game_.get_state()
    action = select_action(state)
    
    previous_action=action.item()
    game_.move_agent(action.item())
    game_.move_agent(action.item())
    done,reward = game_.get_reward()
    reward = torch.tensor([reward], device=device)
    next_state=game_.get_state()

    # Store the transition in memory
    memory.push(state, action, next_state, reward)


    # Perform one step of the optimization (on the policy network)
    optimize_model()

    # Soft update of the target network's weights
    # θ′ ← τ θ + (1 − τ) θ′
    target_net_state_dict = target_net.state_dict()
    policy_net_state_dict = policy_net.state_dict()
    for key in policy_net_state_dict:
        target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
    target_net.load_state_dict(target_net_state_dict)
    
    if done:
        game_.reset()