Greetings,

I am building my own model in PyTorch. It is based on an LSTM cell combined with 1D convolutions. The model is still incomplete, but it works reasonably well. The problem is that it keeps getting slower, forward pass by forward pass.

I also call .backward(retain_graph=True), as instructed by the following error message:

“Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.”

The full code for the cell and the wrapper module is below, followed by a rough sketch of how I call it.
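To make the error easier to see, here is a tiny stand-alone snippet (placeholder code, not my model) showing the kind of situation in which this message appears and where I add the flag: the output of one step still depends on the graph built in the previous step.

import torch
import torch.nn as nn

# Toy stand-in, not my real code: "state" survives across iterations,
# so the graph of step t is chained onto the graph of step t-1.
lin = nn.Linear(4, 4)
state = torch.zeros(1, 4)

for step in range(3):
    state = lin(state)                  # new graph keeps a reference to the old one
    loss = state.pow(2).sum()
    loss.backward(retain_graph=True)    # without retain_graph=True, step 1 raises the error above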
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


class DLSTMCell(nn.Module):
    def __init__(self, input_size, hidden_size, x_kernel_size, h_kernel_size, residual_length_, stride=1):
        super(DLSTMCell, self).__init__()
        pad_x = math.floor(x_kernel_size / 2)
        pad_h = math.floor(h_kernel_size / 2)
        self.hidden_size = hidden_size
        self.stride = stride
        self.residual_length = residual_length_
        self.output_size = int(hidden_size * residual_length_)
        # input gate
        self.conv_i_x = nn.Parameter(torch.FloatTensor(input_size, self.hidden_size)).cuda()
        self.conv_i_x_bias = nn.Parameter(torch.FloatTensor(self.hidden_size)).cuda()
        self.batchnorm_i_x = nn.BatchNorm1d(hidden_size)
        self.conv_i_h = nn.Conv1d(self.output_size, hidden_size, h_kernel_size, stride=1, padding=pad_h)
        self.batchnorm_i_h = nn.BatchNorm1d(hidden_size)
        # forget gate
        self.conv_f_x = nn.Parameter(torch.FloatTensor(input_size, self.hidden_size)).cuda()
        self.conv_f_x_bias = nn.Parameter(torch.FloatTensor(self.hidden_size)).cuda()
        self.batchnorm_f_x = nn.BatchNorm1d(hidden_size)
        self.conv_f_h = nn.Conv1d(self.output_size, hidden_size, h_kernel_size, stride=1, padding=pad_h)
        self.batchnorm_f_h = nn.BatchNorm1d(hidden_size)
        # cell gate
        self.conv_c_x = nn.Parameter(torch.FloatTensor(input_size, self.hidden_size)).cuda()
        self.conv_c_x_bias = nn.Parameter(torch.FloatTensor(self.hidden_size)).cuda()
        self.batchnorm_c_x = nn.BatchNorm1d(hidden_size)
        self.conv_c_h = nn.Conv1d(self.output_size, hidden_size, h_kernel_size, stride=1, padding=pad_h)
        self.batchnorm_c_h = nn.BatchNorm1d(hidden_size)
        # output gate
        self.conv_o_x = nn.Parameter(torch.FloatTensor(input_size, self.hidden_size)).cuda()
        self.conv_o_x_bias = nn.Parameter(torch.FloatTensor(self.hidden_size)).cuda()
        self.batchnorm_o_x = nn.BatchNorm1d(hidden_size)
        self.conv_o_h = nn.Conv1d(self.output_size, hidden_size, h_kernel_size, stride=1, padding=pad_h)
        self.batchnorm_o_h = nn.BatchNorm1d(hidden_size)
        self.last_cell = None

    def reset_state(self):
        self.last_cell = None

    def reset_h_list(self):
        # history of hidden states; the first dimension (8) is the hard-coded batch size
        self.register_buffer('h_list', torch.zeros(8, self.residual_length, self.hidden_size))
        self.h_list = self.h_list.cuda()
    def forward(self, x):
        batch_size = x.size(0)
        # first sequence
        if self.last_cell is None:
            self.last_cell = Variable(torch.zeros(batch_size, self.hidden_size))
            self.conv_i_x_bias = self.conv_i_x_bias.unsqueeze(0).expand(batch_size, *self.conv_i_x_bias.size())
            self.conv_f_x_bias = self.conv_f_x_bias.unsqueeze(0).expand(batch_size, *self.conv_f_x_bias.size())
            self.conv_c_x_bias = self.conv_c_x_bias.unsqueeze(0).expand(batch_size, *self.conv_c_x_bias.size())
            self.conv_o_x_bias = self.conv_o_x_bias.unsqueeze(0).expand(batch_size, *self.conv_o_x_bias.size())
            self.last_cell = self.last_cell.cuda()
            self.h_list = self.h_list.cuda()
        h = self.h_list[:, -self.residual_length:, :].contiguous().view(batch_size, self.output_size, -1)
        # input gate
        input_h = self.batchnorm_i_h(self.conv_i_h(h))
        input_x = self.batchnorm_i_x(torch.mm(x, self.conv_i_x) + self.conv_i_x_bias)
        input_h = torch.squeeze(input_h)
        input_gate = F.sigmoid(input_x + input_h)
        # forget gate
        forget_x = self.batchnorm_f_x(torch.mm(x, self.conv_f_x) + self.conv_f_x_bias)
        forget_h = self.batchnorm_f_h(self.conv_f_h(h))
        forget_h = torch.squeeze(forget_h)
        forget_gate = F.sigmoid(forget_x + forget_h)
        # cell gate
        cell_x = self.batchnorm_c_x(torch.mm(x, self.conv_c_x) + self.conv_c_x_bias)
        cell_h = self.batchnorm_c_h(self.conv_c_h(h))
        cell_h = torch.squeeze(cell_h)
        cell_intermediate = F.tanh(cell_x + cell_h)  # g
        cell_gate = (forget_gate * self.last_cell) + (input_gate * cell_intermediate)
        # output gate
        output_x = self.batchnorm_o_x(torch.mm(x, self.conv_o_x) + self.conv_o_x_bias)
        output_h = self.batchnorm_o_h(self.conv_o_h(h))
        output_h = torch.squeeze(output_h)
        output_gate = F.sigmoid(output_x + output_h)
        next_h = output_gate * F.tanh(cell_gate)
        self.last_cell = cell_gate
        next_h = torch.unsqueeze(next_h, dim=1)
        self.h_list = torch.cat((self.h_list, next_h), dim=1)
        # print("self.h_list shape: {}".format(self.h_list.shape))
        return next_h
class LSTM(nn.Module):
    """A module that runs multiple steps of LSTM."""

    def __init__(self, cell_class, input_size, hidden_size, x_kernel_size, h_kernel_size, num_layers=1,
                 residual_length=5, use_bias=True, batch_first=True, dropout=0, **kwargs):
        super(LSTM, self).__init__()
        self.cell_class = cell_class
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.x_kernel_size = x_kernel_size
        self.h_kernel_size = h_kernel_size
        self.num_layers = num_layers
        self.use_bias = use_bias
        self.batch_first = batch_first
        self.dropout = dropout
        self.residual_length = residual_length
        self.output_size = hidden_size * residual_length
        for layer in range(num_layers):
            layer_input_size = input_size if layer == 0 else hidden_size
            cell = cell_class(input_size=layer_input_size, hidden_size=hidden_size,
                              x_kernel_size=x_kernel_size, h_kernel_size=h_kernel_size,
                              residual_length_=residual_length, **kwargs)
            setattr(self, 'cell_{}'.format(layer), cell)
        self.dropout_layer = nn.Dropout(dropout)
        self.reset_parameters()

    def get_cell(self, layer):
        return getattr(self, 'cell_{}'.format(layer))

    def reset_parameters(self):
        for layer in range(self.num_layers):
            cell = self.get_cell(layer)

    @staticmethod
    def _forward_rnn(cell, input_, energy_, length):
        # energy_: the amount of information in each sequence step (attention)
        assert input_.size(0) == energy_.size(0), "Sequence length of input and energy must be the same"
        max_time = input_.size(0)
        output = []
        for time in range(max_time):
            h_next = cell(x=input_[time])
            output.append(h_next)
        output = torch.stack(output, 0)
        # print("output shape: {}".format(output.shape))
        return output

    def forward(self, input_, energy_, length=None, hx=None):
        if self.batch_first:
            input_ = input_.transpose(0, 1)
            energy_ = energy_.transpose(0, 1)
        max_time, batch_size, _ = input_.size()
        if length is None:
            length = Variable(torch.LongTensor([max_time] * batch_size))
            if input_.is_cuda:
                device = input_.get_device()
                length = length.cuda(device)
        layer_output = None
        for layer in range(self.num_layers):
            cell = self.get_cell(layer)
            cell.reset_h_list()
            layer_output = LSTM._forward_rnn(cell=cell, input_=input_, energy_=energy_, length=length)
        layer_output = layer_output[-self.residual_length:, :, :].transpose(0, 1)
        layer_output = layer_output.contiguous().view(batch_size, -1)
        return layer_output
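And here is the rough usage sketch mentioned above. The sizes are placeholders rather than my real configuration (the batch size of 8 matches the hard-coded h_list buffer), but the call pattern is the same:

# Placeholder sizes, not my actual configuration.
model = LSTM(DLSTMCell, input_size=64, hidden_size=32,
             x_kernel_size=3, h_kernel_size=3,
             num_layers=1, residual_length=5).cuda()

inputs = torch.randn(8, 20, 64).cuda()   # (batch, time, features), batch_first=True
energy = torch.randn(8, 20).cuda()       # per-step "energy" passed alongside the input
output = model(inputs, energy)           # this call keeps getting slower, as described above
print(output.shape)                      # (batch, hidden_size * residual_length)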