I am trying to build an "LSTM"-like network with the following structure:
```python
import torch
import torch.nn as nn


class DGM2(nn.Module):  # the data includes batch dimensions [batch, dim1, dim2]
    def __init__(self, num_inputs, L, M, phi):
        super(DGM2, self).__init__()
        self.num_inputs = num_inputs
        self.L = L      # how many layers
        self.M = M      # neurons per hidden layer
        self.phi = phi  # activation function

        init_weight = lambda *shape: nn.init.xavier_uniform_(
            nn.Parameter(torch.empty(*shape)), gain=nn.init.calculate_gain('tanh'))
        double_1 = lambda: (init_weight(M, num_inputs),
                            init_weight(M, 1))
        double_2 = lambda: (init_weight(1, M),
                            init_weight(1, 1))
        triple = lambda: (init_weight(M, num_inputs),
                          init_weight(M, M),
                          init_weight(M, 1))

        self.W_1, self.bias_1 = double_1()
        # self.U_z, self.W_z, self.b_z = triple()
        # self.U_g, self.W_g, self.b_g = triple()
        # self.U_r, self.W_r, self.b_r = triple()
        # self.U_h, self.W_h, self.b_h = triple()
        self.W, self.bias = double_2()

        # Because we have more layers we need a ParameterDict to keep track of all
        # the learnable parameters. !!Find more efficient ways to implement this!!
        self.dict_Z = nn.ParameterDict()
        self.dict_G = nn.ParameterDict()
        self.dict_R = nn.ParameterDict()
        self.dict_H = nn.ParameterDict()
        for i in range(L):
            self.U_z, self.W_z, self.b_z = triple()
            self.dict_Z["U_z{k}".format(k=i)] = self.U_z
            self.dict_Z["W_z{k}".format(k=i)] = self.W_z
            self.dict_Z["b_z{k}".format(k=i)] = self.b_z

            self.U_g, self.W_g, self.b_g = triple()
            self.dict_G["U_g{k}".format(k=i)] = self.U_g
            self.dict_G["W_g{k}".format(k=i)] = self.W_g
            self.dict_G["b_g{k}".format(k=i)] = self.b_g

            self.U_r, self.W_r, self.b_r = triple()
            self.dict_R["U_r{k}".format(k=i)] = self.U_r
            self.dict_R["W_r{k}".format(k=i)] = self.W_r
            self.dict_R["b_r{k}".format(k=i)] = self.b_r

            self.U_h, self.W_h, self.b_h = triple()
            self.dict_H["U_h{k}".format(k=i)] = self.U_h
            self.dict_H["W_h{k}".format(k=i)] = self.W_h
            self.dict_H["b_h{k}".format(k=i)] = self.b_h

    def forward(self, x):
        # S holds every layer's hidden state: S[0] is the first one, S[L] the last one
        S = torch.empty(self.L + 1, x.shape[0], self.M, 1)
        S[0] = self.phi(torch.matmul(self.W_1, torch.transpose(x, 1, 2)) + self.bias_1)
        # pre-allocated buffers for the gate activations, filled in-place below
        Z = torch.empty(self.L, x.shape[0], self.M, 1)
        G = torch.empty(self.L, x.shape[0], self.M, 1)
        R = torch.empty(self.L, x.shape[0], self.M, 1)
        H = torch.empty(self.L, x.shape[0], self.M, 1)
        for layer in range(self.L):
            Z[layer] = self.phi(torch.matmul(self.dict_Z.get("U_z{k}".format(k=layer)), torch.transpose(x, 1, 2))
                                + torch.matmul(self.dict_Z.get("W_z{k}".format(k=layer)), S[layer])
                                + self.dict_Z.get("b_z{k}".format(k=layer)))
            G[layer] = self.phi(torch.matmul(self.dict_G.get("U_g{k}".format(k=layer)), torch.transpose(x, 1, 2))
                                + torch.matmul(self.dict_G.get("W_g{k}".format(k=layer)), S[0])
                                + self.dict_G.get("b_g{k}".format(k=layer)))
            R[layer] = self.phi(torch.matmul(self.dict_R.get("U_r{k}".format(k=layer)), torch.transpose(x, 1, 2))
                                + torch.matmul(self.dict_R.get("W_r{k}".format(k=layer)), S[layer])
                                + self.dict_R.get("b_r{k}".format(k=layer)))
            H[layer] = self.phi(torch.matmul(self.dict_H.get("U_h{k}".format(k=layer)), torch.transpose(x, 1, 2))
                                + torch.matmul(self.dict_H.get("W_h{k}".format(k=layer)), torch.mul(S[layer], R[layer]))
                                + self.dict_H.get("b_h{k}".format(k=layer)))
            S[layer + 1] = torch.mul((1 - G[layer]), H[layer]) + torch.mul(Z[layer], S[layer])
        output = torch.matmul(self.W, S[self.L]) + self.bias  # .squeeze(-1)
        return output
```
I have two questions, one about the initialisation of the parameters and one about the forward function.
- Is a `ParameterDict` a good and efficient way to initialise and keep track of all the learnable parameters? (A rough alternative I have been considering is sketched after this list.)
- In the forward method I create `torch.empty` tensors that I later fill inside the for loop with in-place operations. This raises `RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation`. Using `torch.autograd.set_detect_anomaly(True)` I traced the error to the for loop, specifically to the line `S[layer+1] = torch.mul((1 - G[layer]), H[layer]) + torch.mul(Z[layer], S[layer])`. If I replace the `torch.empty` tensors with Python lists the error goes away (see the second sketch after this list), but is that an efficient way of writing PyTorch code, and how does it affect the cost of the code when training?
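For the first question, this is a rough, untested sketch of the kind of alternative I had in mind: one `nn.ParameterList` per weight, indexed by the layer number instead of a formatted string key. The `DGM2Lists` name and the standalone `init_weight` helper are just placeholders for illustration, not code I am actually using yet.

```python
import torch
import torch.nn as nn


def init_weight(*shape):
    # same Xavier initialisation as in the class above
    return nn.Parameter(nn.init.xavier_uniform_(
        torch.empty(*shape), gain=nn.init.calculate_gain('tanh')))


class DGM2Lists(nn.Module):
    def __init__(self, num_inputs, L, M, phi):
        super().__init__()
        self.L, self.M, self.phi = L, M, phi
        self.W_1, self.bias_1 = init_weight(M, num_inputs), init_weight(M, 1)
        self.W, self.bias = init_weight(1, M), init_weight(1, 1)
        # one ParameterList per weight; element i belongs to layer i,
        # so forward can use self.U_z[layer] instead of string keys
        self.U_z = nn.ParameterList([init_weight(M, num_inputs) for _ in range(L)])
        self.W_z = nn.ParameterList([init_weight(M, M) for _ in range(L)])
        self.b_z = nn.ParameterList([init_weight(M, 1) for _ in range(L)])
        # ...and the same three lists again for the G, R and H gates
```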
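For the second question, this is roughly what the list-based forward pass looks like (a minimal sketch assuming the same `ParameterDict`s as in the class above; the intermediate Z/G/R/H values are kept as plain local tensors because later layers never reuse them):

```python
def forward(self, x):
    xT = torch.transpose(x, 1, 2)
    # S is an ordinary Python list, so every entry is a fresh tensor and
    # nothing is written in-place into a pre-allocated buffer
    S = [self.phi(torch.matmul(self.W_1, xT) + self.bias_1)]
    for layer in range(self.L):
        Z = self.phi(torch.matmul(self.dict_Z["U_z{k}".format(k=layer)], xT)
                     + torch.matmul(self.dict_Z["W_z{k}".format(k=layer)], S[layer])
                     + self.dict_Z["b_z{k}".format(k=layer)])
        G = self.phi(torch.matmul(self.dict_G["U_g{k}".format(k=layer)], xT)
                     + torch.matmul(self.dict_G["W_g{k}".format(k=layer)], S[0])
                     + self.dict_G["b_g{k}".format(k=layer)])
        R = self.phi(torch.matmul(self.dict_R["U_r{k}".format(k=layer)], xT)
                     + torch.matmul(self.dict_R["W_r{k}".format(k=layer)], S[layer])
                     + self.dict_R["b_r{k}".format(k=layer)])
        H = self.phi(torch.matmul(self.dict_H["U_h{k}".format(k=layer)], xT)
                     + torch.matmul(self.dict_H["W_h{k}".format(k=layer)], torch.mul(S[layer], R[layer]))
                     + self.dict_H["b_h{k}".format(k=layer)])
        # append creates a new graph node instead of mutating S in-place
        S.append(torch.mul((1 - G), H) + torch.mul(Z, S[layer]))
    return torch.matmul(self.W, S[self.L]) + self.bias
```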
Thank you in advance!