Hi! I’m implementing a class with nn.Module, GRUCell, normalisation and dropout. I’m mainly wondering whether the way I implemented the GRUCell forward pass is correct and whether autograd will properly propagate the gradients through it. I loop over every time step of the input, pass that step through the GRU cell, and then feed the resulting hidden state together with the next time step back into the cell, continuing until the sequence finishes.
Here is a snippet for clarity.
Many thanks!
import torch
import torch.nn as nn

class myGRU(nn.Module):
    def __init__(self, input_size, hidden_size, t_samps, dropout=0.0, bias=True):
        super(myGRU, self).__init__()
        self.GRUcell = torch.nn.GRUCell(input_size, hidden_size, bias=bias)
        # Normalisation (LocalResponseNorm over the input channels)
        self.Norm = nn.LocalResponseNorm(input_size)
        # Dropout
        self.Dropout = nn.Dropout(p=dropout)
        # Others
        self.hids = hidden_size
    def forward(self, x, h0=None):
        # x [batch_size, input_size, length]
        device = next(self.parameters()).device
        x = x.to(device)
        # Initialise output sequence
        out_seq = torch.zeros(x.shape[0], self.hids, x.shape[-1], device=device)
        # Dropout units
        x = self.Dropout(x)
        # Normalise
        x = self.Norm(x)
        h_out = h0
        for i in range(x.shape[-1]):  # for all elements in the sequence
            # GRUCell initialises the hidden state to zeros when h_out is None
            h_out = self.GRUcell(x[:, :, i], h_out)  # h_out [B, hidden_size]
            if h_out.requires_grad:
                h_out.register_hook(lambda grad: grad.clamp(-10.0, 10.0))
            out_seq[:, :, i] = h_out  # Store in initialised memory. Is the graph properly defined?
        return out_seq
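
For completeness, here is a minimal sketch of how I sanity-check that gradients actually reach the GRUCell parameters; the batch size, sizes and dummy sum-loss below are just made-up test values, not part of the real model:

# Gradient sanity check with made-up shapes
batch_size, input_size, hidden_size, length = 4, 8, 16, 10

model = myGRU(input_size, hidden_size, t_samps=length, dropout=0.1)
x = torch.randn(batch_size, input_size, length)

out = model(x)     # [batch_size, hidden_size, length]
loss = out.sum()   # dummy scalar loss
loss.backward()

# If the unrolled graph is connected, the GRUCell weights should receive gradients
print(model.GRUcell.weight_ih.grad is not None)   # expect True
print(model.GRUcell.weight_hh.grad.abs().sum())   # expect a non-zero value

If both prints come back non-zero I take it the unrolled graph is connected end to end, but I would still appreciate confirmation that this is the right way to check.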