Hi, thank you for your response. I followed your suggestion. It makes sense, but my network gets stuck in a local minimum, meaning the training error stops decreasing after 3 epochs. Initially I suspected my data loading was wrong, but I trained a CNN classifier with the same data loading and it reached 90%+ accuracy. Here are my model.py and train.py code (for now, I haven't used any attention (g) function).

## model.py

import math

import torch.nn as nn

import torch.nn.functional as F

class EncoderCNN(nn.Module):
    """CNN feature extractor.

    Maps a batch of single-channel 210x210 frames, shape
    (batch * seq, 1, 210, 210), to feature vectors of shape
    (batch * seq, rnn_input_size) for the downstream RNN decoder.
    """

    def __init__(self, rnn_input_size):
        super(EncoderCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 64, kernel_size=9, stride=3, padding=0)
        self.pool1 = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=7, stride=1, padding=0)
        self.pool2 = nn.MaxPool2d(2)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=5, stride=1, padding=0)
        self.pool3 = nn.MaxPool2d(2)
        # Spatial trace for a 210x210 input:
        # 210 -conv1(k9,s3)-> 68 -pool-> 34 -conv2(k7)-> 28 -pool-> 14
        #     -conv3(k5)-> 10 -pool-> 5
        # conv3 has 256 output channels, so the flattened size is 256*5*5
        # (the original `256x5x5` was a markdown-mangled `256*5*5`).
        self.fc = nn.Linear(256 * 5 * 5, rnn_input_size)

    def forward(self, x):
        # x: (batch * seq, 1, 210, 210)
        out = self.pool1(F.relu(self.conv1(x)))
        out = self.pool2(F.relu(self.conv2(out)))
        out = self.pool3(F.relu(self.conv3(out)))
        # (batch * seq, 256, 5, 5) -- 256 channels from conv3, not 64 as the
        # original comment claimed.
        out = out.view(out.size(0), -1)
        # (batch * seq, 256 * 5 * 5)
        out = self.fc(out)
        return out

class DecoderRNN(nn.Module):
    """LSTM decoder over encoder features.

    Reshapes a flat feature batch of shape (batch * seq, input_size) into
    sequences and produces per-timestep logits of shape
    (batch * seq, output_size).
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(DecoderRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Sequence length is hard-coded to 5 here; assumes the caller always
        # feeds batches whose first dim is a multiple of 5 -- TODO confirm.
        out = x.view(-1, 5, self.input_size)
        out, _ = self.lstm(out)          # (batch, 5, hidden_size)
        # Flatten timesteps so one Linear scores every step.
        out = out.contiguous().view(-1, out.size(2))
        out = self.fc(out)               # (batch * 5, output_size)
        return out

## snippet of train.py

# Training loop: every frame of every sequence goes through the CNN encoder,
# the LSTM decoder scores each timestep, and both nets are optimized jointly.
for epoch in range(args.num_epochs):
    avg_loss = 0
    for i in range(0, len(dataset), args.batch_size):
        d, t = get_input(i, dataset, targets, args.batch_size)
        d = to_var(d)
        t = to_var(t)
        # Forward, Backward and Optimize
        decoder.zero_grad()
        encoder.zero_grad()
        # Collapse (batch, seq) into one dim so each frame is encoded
        # independently; assumes d is (batch, seq, H, W) -- TODO confirm.
        features = encoder(d.view(-1, 1, d.size(2), d.size(3)).float())
        outputs = decoder(features)
        loss = criterion(outputs, t.view(-1).long())
        # loss.item() replaces the deprecated loss.data[0], which raises on
        # 0-dim tensors in PyTorch >= 0.5.
        avg_loss = avg_loss + loss.item()
        loss.backward()
        optimizer.step()