Size mismatch error when using custom LSTM cell

Roni_Kobrosly · May 13, 2018, 10:59pm

Hi, I’m trying to create a custom LSTM cell with a modified forget gate. Before I can do that, I need to create add a working template of a standard LSTM layer and cell to my code and get it to run. But I’m getting an error with this.

As you can see, I’m working with sequences of max length 100, with 1000 unique integers in the sequences. It’s a binary classification.

I’m hitting the following error: "RuntimeError: size mismatch, m1: [1 x 10900], m2: [50 x 400] at /opt/conda/conda-bld/pytorch_1501971235237/work/pytorch-0.1.12/torch/lib/TH/generic/THTensorMath.c:1237
"

When I remove the custom LSTM code and replace

LSTM(embedding_dim,hidden_dim)

with

nn.LSTM(embedding_dim,hidden_dim)

There is no error.

The complete code (data and model included) is below. Intended for Anaconda python3. Do you have any suggestions?


from keras.datasets import imdb
import re
import unicodedata
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
import pickle
import math

# Get test dataset
top_words = 1000
max_review_length = 100
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
vocabLimit = 1000
max_sequence_len = 100









#### CUSTOM LSTM HERE
############################################################################################################
############################################################################################################

class LSTM(nn.Module):

    """
    An implementation of Hochreiter & Schmidhuber:
    'Long-Short Term Memory'
    http://www.bioinf.jku.at/publications/older/2604.pdf
    Special args:
    dropout_method: one of
            * pytorch: default dropout implementation
            * gal: uses GalLSTM's dropout
            * moon: uses MoonLSTM's dropout
            * semeniuta: uses SemeniutaLSTM's dropout
    """

    def __init__(self, input_size, hidden_size, bias=True, dropout=0.0, dropout_method='pytorch'):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        self.dropout = dropout
        self.i2h = nn.Linear(input_size, 4 * hidden_size, bias=bias)
        self.h2h = nn.Linear(hidden_size, 4 * hidden_size, bias=bias)
        self.reset_parameters()
        assert(dropout_method.lower() in ['pytorch', 'gal', 'moon', 'semeniuta'])
        self.dropout_method = dropout_method

    def sample_mask(self):
        keep = 1.0 - self.dropout
        self.mask = V(th.bernoulli(T(1, self.hidden_size).fill_(keep)))

    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def forward(self, x, hidden):
        do_dropout = self.training and self.dropout > 0.0
        h, c = hidden
        h = h.view(h.size(1), -1)
        c = c.view(c.size(1), -1)
        x = x.view(x.size(1), -1)

        # Linear mappings
        preact = self.i2h(x) + self.h2h(h)

        # activations
        gates = preact[:, :3 * self.hidden_size].sigmoid()
        g_t = preact[:, 3 * self.hidden_size:].tanh()
        i_t = gates[:, :self.hidden_size]
        f_t = gates[:, self.hidden_size:2 * self.hidden_size]
        o_t = gates[:, -self.hidden_size:]

        # cell computations
        if do_dropout and self.dropout_method == 'semeniuta':
            g_t = F.dropout(g_t, p=self.dropout, training=self.training)

        c_t = th.mul(c, f_t) + th.mul(i_t, g_t)

        if do_dropout and self.dropout_method == 'moon':
                c_t.data.set_(th.mul(c_t, self.mask).data)
                c_t.data *= 1.0/(1.0 - self.dropout)

        h_t = th.mul(o_t, c_t.tanh())

        # Reshape for compatibility
        if do_dropout:
            if self.dropout_method == 'pytorch':
                F.dropout(h_t, p=self.dropout, training=self.training, inplace=True)
            if self.dropout_method == 'gal':
                    h_t.data.set_(th.mul(h_t, self.mask).data)
                    h_t.data *= 1.0/(1.0 - self.dropout)

        h_t = h_t.view(1, h_t.size(0), -1)
        c_t = c_t.view(1, c_t.size(0), -1)
        return h_t, (h_t, c_t)

############################################################################################################
############################################################################################################










class Model(torch.nn.Module) :
	def __init__(self,embedding_dim,hidden_dim) :
		super(Model,self).__init__()
		self.hidden_dim = hidden_dim
		self.embeddings = nn.Embedding(vocabLimit+1, embedding_dim)
		self.lstm = LSTM(embedding_dim,hidden_dim)
		self.linearOut = nn.Linear(hidden_dim,2)
	def forward(self,inputs,hidden) :
		x = self.embeddings(inputs).view(len(inputs),1,-1)
		lstm_out,lstm_h = self.lstm(x,hidden)
		x = lstm_out[-1]
		x = self.linearOut(x)
		x = F.log_softmax(x)
		return x,lstm_h
	def init_hidden(self) :
		return (Variable(torch.zeros(1, 1, self.hidden_dim)),Variable(torch.zeros(1, 1, self.hidden_dim)))	


model = Model(50,100)

loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

epochs = 1

print('starting training')

for i in range(epochs) :
	avg_loss = 0.0
	for idx,lines in enumerate(X_train):
		input_data = lines
		input_data = Variable(torch.LongTensor(input_data))
		target = int(y_train[idx])
		target_data = Variable(torch.LongTensor([target]))
		hidden = model.init_hidden()
		y_pred,_ = model(input_data,hidden)
		model.zero_grad()
		loss = loss_function(y_pred,target_data)
		avg_loss += loss.data[0]
		
		if idx%10 == 0 or idx == 0:
			print('epoch :%d iterations :%d loss :%g'%(i,idx,loss.data[0]))

			
		loss.backward()
		optimizer.step()			
	print('the average loss after completion of %d epochs is %g'%((i+1),(avg_loss/len(f))))