I have created a custom nn module, which I added to a VGG layer. However, the program runs very slowly after adding the custom LSTM layer. Essentially, the LSTM layer makes an LSTM cell for each row, where the weights are two types of learnable convolutions. One convolution takes the input and convolves, while the other convolution convolves the previous LSTM cell’s hidden layer. These two inputs are then added, and using the equations defined in DeepMind’s Pixel RNN paper, the next gates for the next LSTM cell are computed. I am wondering why the model learns so slowly? I have included code below.
Here is the function which creates the convolutions and LSTM cells, along with the the definition of my custom LSTM cell.
class RLSTM(nn.Module):
def __init__(self,ch):
super(RLSTM,self).__init__()
self.ch=ch
self.input_to_state = torch.nn.Conv2d(self.ch,4*self.ch,kernel_size=(1,3),padding=(0,1)).cuda()
self.state_to_state = torch.nn.Conv2d(self.ch,4*self.ch,kernel_size=(1,3),padding=(0,1)).cuda() # error is here: hidPrev is an array - not a valid number of input channel
self.cell_list = []
def forward(self, image):
size = image.size()
b = size[0]
indvs = list(image.split(1,0))
tensor_array = []
for i in range(b):
tensor_array.append(self.RowLSTM(indvs[i]))
seq=tuple(tensor_array)
trans = torch.cat(seq,0)
return trans.cuda()
def RowLSTM(self, image):
# input-to-state (K_is * x_i) : 1x3 convolution. generate h x n x n tensor. hxnxn tensor contains all i -> s info
self.cell_list=[]
igates = []
n = image.size()[2]
ch=image.size()[1]
for i in range(n):
if i==0:
isgates = self.splitIS(self.input_to_state(image)) # convolve, then split into gates (4 per row)
cell=RowLSTMCell(0,isgates[0][0],isgates[0][1],isgates[0][2],isgates[0][3],torch.zeros(ch,n,1).cuda(),torch.zeros(ch,n,1).cuda())
cell.c=isgates[0][0]*isgates[0][3]
cell.h=torch.tanh(cell.c)*isgates[0][1]
# now have dummy variables for first row
self.cell_list.append(cell)
else:
cell_prev = self.cell_list[i-1]
hid_prev = cell_prev.getHiddenState()
ssgates = self.splitSS(self.state_to_state(hid_prev.unsqueeze(0)))
gates = self.addGates(isgates, ssgates,i)
ig, og, fg, gg = gates[0], gates[1], gates[2], gates[3]
cell = RowLSTMCell(cell_prev, ig, og, fg, gg, 0 ,0) #MORE zeros
cell.compute()
self.cell_list.append(cell)
# now have a list of all cell data, concatenate hidden state into 1 x h x n x n
hidden_layers = []
for i in range(n):
hid = self.cell_list[i].h
hidden_layers.append(torch.unsqueeze(hid,0))
seq = tuple(hidden_layers)
tensor = torch.cat(seq,3)
return tensor
def splitIS(self, tensor): #always going to be splitting into 4 pieces, so no need to add extra parameters
inputStateGates={}
size=tensor.size() # 1 x 4h x n x n
out_ft=size[1] # get 4h for the nxnx4h tensor
num=size[2] # get n for the nxn image
hh=out_ft/4 # we want to split the tensor into 4, for the gates
tensor = torch.squeeze(tensor).cuda() # 4h x n x n
# First, split by row: Creates n tensors of 4h x n x 1
rows = list(tensor.split(1,2))
for i in range(num):
# Each row is a tensor of 4h x n x 1, split it into 4 of h x n x 1
row=rows[i]
# print("Each row using cuda: "+str(row.is_cuda))
inputStateGates[i]=list(row.split(hh,0))
return inputStateGates
def splitSS(self, tensor): # 1 x 4h x n x 1, create 4 of 1 x h x n x 1
size=tensor.size()
out_ft=size[1] # get 4h for the 1x4hxn tensor
num=size[2] # get n for the 1xhxn row
hh=out_ft/4 # we want to split the tensor into 4, for the gates
tensor = tensor.squeeze(0).cuda() # 4h x n x 1
splitted=list(tensor.split(hh,0))
return splitted
def addGates(self, i2s,s2s,key):
""" these dictionaries are of form {key : [[i], [o], [f], [g]]}
we want to add pairwise elemeents """
# i2s is of form key: [[i], [o], [f], [g]] where each gate is hxn
# s2s is of form [[h,n],[h,n],[h,n], [h,n]]
gateSum = []
for i in range(4): # always of length 4, representing the gates
gateSum.append(torch.sigmoid(i2s[key][i] + s2s[i]))
return gateSum
Next, here is the definition of the LSTM cell.
class RowLSTMCell(): #inherit torch.nn.LSTM?
def __init__(self,prev_row, i, o, f, g, c, h):
self.c=c
self.h=h
self.i=i
self.i = self.i.cuda()
self.o=o
self.o = self.o.cuda()
self.g=g
self.g = self.g.cuda()
self.f=f
self.f = self.f.cuda()
self.prev_row=prev_row
def getStateSize(self):
return self._state_size
def getOutputSize(self):
return self._output_size
def compute(self):
c_prev = self.prev_row.getCellState()
h_prev = self.prev_row.getHiddenState()
self.c = self.f * c_prev + self.i * self.g
self.h = torch.tanh(self.c) * self.o
def getHiddenState(self):
return self.h
def getCellState(self):
return self.c
I add the RLSTM layer in between two specific convolutions of the VGG-16 model. However, now the training process becomes very slow. I am wondering why that is? Any help is much appreciated.