Deep Attention Q Network

Hi, does anyone know how to implement the following structure in PyTorch?

[image: architecture diagram]

The problem is that an LSTM (the decoder) accepts input as Batch_Size x Seq_Length x Input_Size, whereas a convolutional neural network (the encoder) takes input as Batch_Size x Input_Size. Is there any way to give the CNN encoder input as Batch_Size x Seq_Length x Input_Size and get output as Batch_Size x Seq_Length x Output_Size, which can then be fed into the LSTM? I was able to do this in Torch using sequencers, but I don't know how to do it in PyTorch.

I assume your input data is image frames and your convolutions are 2D.

If your data is of shape (batch, seq, image_channels, height, width), then you will need the CNN to treat seq as an additional batch dimension. This sounds hard to achieve, but there is an easy alternative: combine the batch and seq dimensions before feeding the data to the CNN. Something like this will do:

cnn_input = input.view(batch_size*seq_len, channels, height, width)

Then you can separate the batch and seq dimensions of the CNN output before feeding the data to the LSTM. Since nn.LSTM expects a 3-D input, flatten each frame's features into a single dimension:

lstm_input = cnn_output.view(batch_size, seq_len, -1)
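
Putting the two reshapes together, here is a minimal self-contained sketch of the pattern (the module sizes and names below are illustrative, not taken from your code):

import torch
import torch.nn as nn

batch_size, seq_len = 4, 5
frames = torch.randn(batch_size, seq_len, 1, 210, 210)  # (batch, seq, C, H, W)

# A stand-in encoder: any CNN that maps one frame to a flat feature vector.
cnn = nn.Sequential(
    nn.Conv2d(1, 8, kernel_size=9, stride=3),
    nn.ReLU(),
    nn.AdaptiveAvgPool2d(1),
    nn.Flatten(),
    nn.Linear(8, 32),
)
lstm = nn.LSTM(input_size=32, hidden_size=64, batch_first=True)

# Merge batch and seq so the CNN treats each frame as an independent sample.
cnn_input = frames.view(batch_size * seq_len, 1, 210, 210)
cnn_output = cnn(cnn_input)                            # (batch * seq, 32)

# Split the merged dimension back apart for the sequence model.
lstm_input = cnn_output.view(batch_size, seq_len, -1)  # (batch, seq, 32)
lstm_output, _ = lstm(lstm_input)                      # (batch, seq, 64)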

Hi, thank you for your response. I followed what you said. It makes sense, but my network gets stuck in a local minimum, meaning the training error stops decreasing after 3 epochs. Initially, I thought my data loading was wrong, but I trained a CNN classifier with the same data loading and it gave 90%+ accuracy. Here is my model.py and train.py code (for now, I haven't used any attention (g) function).
##model.py
import math
import torch.nn as nn
import torch.nn.functional as F

class EncoderCNN(nn.Module):
    def __init__(self, rnn_input_size):
        super(EncoderCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 64, kernel_size=9, stride=3, padding=0)
        self.pool1 = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=7, stride=1, padding=0)
        self.pool2 = nn.MaxPool2d(2)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=5, stride=1, padding=0)
        self.pool3 = nn.MaxPool2d(2)
        self.fc = nn.Linear(256 * 5 * 5, rnn_input_size)

    def forward(self, x):
        # (batch * seq, 1, 210, 210)
        out = self.conv1(x)
        out = F.relu(out)
        out = self.pool1(out)
        out = self.conv2(out)
        out = F.relu(out)
        out = self.pool2(out)
        out = self.conv3(out)
        out = F.relu(out)
        out = self.pool3(out)
        # (batch * seq, 256, 5, 5)
        out = out.view(out.size(0), -1)
        # (batch * seq, 256 * 5 * 5)
        out = self.fc(out)
        return out

class DecoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(DecoderRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # (batch * seq, input_size) -> (batch, seq=5, input_size)
        out = x.view(-1, 5, self.input_size)
        out, _ = self.lstm(out)
        # (batch, seq, hidden_size) -> (batch * seq, hidden_size)
        out = out.contiguous().view(-1, out.size(2))
        out = self.fc(out)
        return out
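
As a quick sanity check of the shapes, something like this can be run on the two modules (hidden_size, output_size, and num_layers here are arbitrary placeholders):

import torch

encoder = EncoderCNN(rnn_input_size=256)
decoder = DecoderRNN(input_size=256, hidden_size=128, output_size=10, num_layers=1)

frames = torch.randn(2 * 5, 1, 210, 210)  # batch=2, seq=5, already merged
features = encoder(frames)                # (10, 256)
logits = decoder(features)                # (10, 10): one prediction per frame
print(features.shape, logits.shape)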

##train.py (snippet)

for epoch in range(args.num_epochs):
    avg_loss = 0
    for i in range(0, len(dataset), args.batch_size):
        d, t = get_input(i, dataset, targets, args.batch_size)
        d = to_var(d)
        t = to_var(t)

        # Forward, backward, and optimize
        decoder.zero_grad()
        encoder.zero_grad()
        features = encoder(d.view(-1, 1, d.size(2), d.size(3)).float())
        outputs = decoder(features)
        loss = criterion(outputs, t.view(-1).long())
        avg_loss = avg_loss + loss.data[0]
        loss.backward()
        optimizer.step()
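
One thing the snippet doesn't show is how optimizer is built. For both modules to actually be updated by optimizer.step(), it has to be constructed over both parameter sets; something along these lines (a guess, since the definition isn't in the snippet, and args.learning_rate is a placeholder name):

# Hypothetical optimizer setup covering both the encoder and the decoder.
params = list(encoder.parameters()) + list(decoder.parameters())
optimizer = torch.optim.Adam(params, lr=args.learning_rate)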