Hello PyTorch developers,
I was solving Exercise 4 from the book Dive into Deep Learning, which goes as follows:
What happens if you implement only parts of a GRU, e.g., with only a reset gate or only an update gate?
Here is the implementation from the book, which works:
def gru(inputs, state, params):
    W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        Z = torch.sigmoid((X @ W_xz) + (H @ W_hz) + b_z)
        R = torch.sigmoid((X @ W_xr) + (H @ W_hr) + b_r)
        H_tilda = torch.tanh((X @ W_xh) + ((R * H) @ W_hh) + b_h)
        H = Z * H + (1 - Z) * H_tilda
        Y = H @ W_hq + b_q
        outputs.append(Y)
    return torch.cat(outputs, dim=0), (H,)
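For reference, these are the GRU update equations that the code above implements (writing $\odot$ for elementwise multiplication):

$$
\begin{aligned}
Z_t &= \sigma(X_t W_{xz} + H_{t-1} W_{hz} + b_z) \\
R_t &= \sigma(X_t W_{xr} + H_{t-1} W_{hr} + b_r) \\
\tilde{H}_t &= \tanh(X_t W_{xh} + (R_t \odot H_{t-1}) W_{hh} + b_h) \\
H_t &= Z_t \odot H_{t-1} + (1 - Z_t) \odot \tilde{H}_t \\
Y_t &= H_t W_{hq} + b_q
\end{aligned}
$$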
When I train it, everything works as expected:
vocab_size, num_hiddens, device = len(vocab), 256, d2l.try_gpu()
num_epochs, lr = 500, 1
model = d2l.RNNModelScratch(len(vocab), num_hiddens, device, get_params,
                            init_gru_state, gru)
d2l.train_ch8(model, train_iter, vocab, lr, num_epochs, device)
Now, here’s my implementation, with only the update gate:
def gru(inputs, state, params):
    W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        Z = torch.sigmoid((X @ W_xz) + (H @ W_hz) + b_z)
        #R = torch.sigmoid((X @ W_xr) + (H @ W_hr) + b_r)
        #H_tilda = torch.tanh((X @ W_xh) + ((R * H) @ W_hh) + b_h)
        H_tilda = torch.tanh((X @ W_xh) + (H @ W_hh) + b_h)
        H = Z * H + (1 - Z) * H_tilda
        Y = H @ W_hq + b_q
        outputs.append(Y)
    return torch.cat(outputs, dim=0), (H,)
However, when I try to train it, I get the following error:
vocab_size, num_hiddens, device = len(vocab), 256, d2l.try_gpu()
num_epochs, lr = 500, 1
model = d2l.RNNModelScratch(len(vocab), num_hiddens, device, get_params,
                            init_gru_state, gru)
d2l.train_ch8(model, train_iter, vocab, lr, num_epochs, device)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/tmp/ipykernel_56388/2704541672.py in <module>
3 model = d2l.RNNModelScratch(len(vocab), num_hiddens, device, get_params,
4 init_gru_state, gru)
----> 5 d2l.train_ch8(model, train_iter, vocab, lr, num_epochs, device)
~/anaconda3/envs/d2l/lib/python3.8/site-packages/d2l/torch.py in train_ch8(net, train_iter, vocab, lr, num_epochs, device, use_random_iter)
799 # Train and predict
800 for epoch in range(num_epochs):
--> 801 ppl, speed = train_epoch_ch8(net, train_iter, loss, updater, device,
802 use_random_iter)
803 if (epoch + 1) % 10 == 0:
~/anaconda3/envs/d2l/lib/python3.8/site-packages/d2l/torch.py in train_epoch_ch8(net, train_iter, loss, updater, device, use_random_iter)
777 else:
778 l.backward()
--> 779 grad_clipping(net, 1)
780 # Since the `mean` function has been invoked
781 updater(batch_size=1)
~/anaconda3/envs/d2l/lib/python3.8/site-packages/d2l/torch.py in grad_clipping(net, theta)
741 else:
742 params = net.params
--> 743 norm = torch.sqrt(sum(torch.sum((p.grad**2)) for p in params))
744 if norm > theta:
745 for param in params:
~/anaconda3/envs/d2l/lib/python3.8/site-packages/d2l/torch.py in <genexpr>(.0)
741 else:
742 params = net.params
--> 743 norm = torch.sqrt(sum(torch.sum((p.grad**2)) for p in params))
744 if norm > theta:
745 for param in params:
TypeError: unsupported operand type(s) for ** or pow(): 'NoneType' and 'int'
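For what it's worth, I can reproduce the same TypeError with a standalone snippet: a leaf tensor that never participates in the forward computation still has grad equal to None after backward(), which then breaks the p.grad**2 expression inside grad_clipping. (This is only my attempt at a minimal repro, not necessarily the same root cause; the used/unused names are just placeholders.)

import torch

used = torch.randn(3, requires_grad=True)
unused = torch.randn(3, requires_grad=True)  # never used below, like W_xr, W_hr, b_r
loss = (used ** 2).sum()
loss.backward()
print(used.grad)    # a tensor
print(unused.grad)  # None
# The same expression d2l.grad_clipping evaluates:
norm = torch.sqrt(sum(torch.sum(p.grad ** 2) for p in (used, unused)))
# TypeError: unsupported operand type(s) for ** or pow(): 'NoneType' and 'int'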
Does anyone see what is going on here? I'm baffled by the error and can't figure out what it is telling me about this particular code.
Thank you in advance!