I want to implement Multiplicative LSTM as described in [Krause et al. 2016]. But It seems there isn’t some useful tutorial for implementing customised RNNs. Anayone have some tutorial for it ?
Thanks a lot.
there isn’t a specific tutorial just for implementing customized RNNs.
Thanks a lot. I will work on it.
Hi @jacob,
I implemented a couple of custom RNNs recently by simply modifying this code,
Hope that helps,
Best,
Ajay
I’m new to PyTorch as well but I think this should do it?
class mLSTM(nn.Module):
def __init__(self, input_size, hidden_size, embed_size, output_size):
super(mLSTM, self).__init__()
self.hidden_size = hidden_size
# input embedding
self.encoder = nn.Embedding(input_size, embed_size)
# lstm weights
self.weight_fm = nn.Linear(hidden_size, hidden_size)
self.weight_im = nn.Linear(hidden_size, hidden_size)
self.weight_cm = nn.Linear(hidden_size, hidden_size)
self.weight_om = nn.Linear(hidden_size, hidden_size)
self.weight_fx = nn.Linear(embed_size, hidden_size)
self.weight_ix = nn.Linear(embed_size, hidden_size)
self.weight_cx = nn.Linear(embed_size, hidden_size)
self.weight_ox = nn.Linear(embed_size, hidden_size)
# multiplicative weights
self.weight_mh = nn.Linear(hidden_size, hidden_size)
self.weight_mx = nn.Linear(embed_size, hidden_size)
# decoder
self.decoder = nn.Linear(hidden_size, output_size)
def forward(self, inp, h_0, c_0):
# encode the input characters
inp = self.encoder(inp)
# calculate the multiplicative matrix
m_t = self.weight_mx(inp) * self.weight_mh(h_0)
# forget gate
f_g = F.sigmoid(self.weight_fx(inp) + self.weight_fm(m_t))
# input gate
i_g = F.sigmoid(self.weight_ix(inp) + self.weight_im(m_t))
# output gate
o_g = F.sigmoid(self.weight_ox(inp) + self.weight_om(m_t))
# intermediate cell state
c_tilda = F.tanh(self.weight_cx(inp) + self.weight_cm(m_t))
# current cell state
cx = f_g * c_0 + i_g * c_tilda
# hidden state
hx = o_g * F.tanh(cx)
out = self.decoder(hx.view(1,-1))
return out, hx, cx
def init_hidden(self):
h_0 = Variable(torch.zeros(1, self.hidden_size)).cuda()
c_0 = Variable(torch.zeros(1, self.hidden_size)).cuda()
return h_0, c_0
Thank you for your kindly help:wink:
You are nice, thanks!
@Flo Why we need self.encoder = nn.Embedding(input_size, embed_size)
suppose I have 3D tensor [64, 10, 128] where 64 is batch size, 10 is sequence length, and 128 is feature size. What should I do with inp
?
Here is a version that I extracted from https://github.com/MaximumEntropy/Seq2Seq-PyTorch/blob/master/model.py The original code contains attention mechanism.
class LSTM(nn.Module):
def __init__(self, input_size, hidden_size, batch_first=True):
"""Initialize params."""
super(PersonaLSTMAttentionDot, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = 1
self.batch_first = batch_first
self.input_weights = nn.Linear(input_size, 4 * hidden_size)
self.hidden_weights = nn.Linear(hidden_size, 4 * hidden_size)
def forward(self, input, hidden, ctx, ctx_mask=None):
"""Propogate input through the network."""
# tag = None #
def recurrence(input, hidden):
"""Recurrence helper."""
hx, cx = hidden # n_b x hidden_dim
gates = self.input_weights(input) + \
self.hidden_weights(hx)
ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
ingate = F.sigmoid(ingate)
forgetgate = F.sigmoid(forgetgate)
cellgate = F.tanh(cellgate) # o_t
outgate = F.sigmoid(outgate)
cy = (forgetgate * cx) + (ingate * cellgate)
hy = outgate * F.tanh(cy) # n_b x hidden_dim
return hy, cy
if self.batch_first:
input = input.transpose(0, 1)
output = []
steps = range(input.size(0))
for i in steps:
hidden = recurrence(input[i], hidden)
if isinstance(hidden, tuple):
output.append(hidden[0])
else:
output.append(hidden)
# output.append(hidden[0] if isinstance(hidden, tuple) else hidden)
# output.append(isinstance(hidden, tuple) and hidden[0] or hidden)
output = torch.cat(output, 0).view(input.size(0), *output[0].size())
if self.batch_first:
output = output.transpose(0, 1)
return output, hidden
In this implementation, h and x are combined using summation instead of concatenation. I found the two approaches are mathematically equivalent. And I think the implementation I provided is more efficient because it has higher dimensional multiplications.