I’m still trying to wrap my head around PyTorch’s Autograd engine. I wanted to implement a toy network architecture but keep getting the same error:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
Looking at my code, I can’t figure out, what exactly causes the error. It would be great, if someone could give me a hint. Here is my code:
import time
import torch
import random
def test_toy_network():
torch.autograd.set_detect_anomaly(True)
# Parameters
n_samples = 2000
lr = 1e-4
n_inputs = 2
n_outputs = 2
sigma = 0.1
n_steps = 1000
# Fake data
x_data = torch.rand(size=(n_samples, n_inputs))
y_data = torch.rand(size=(n_samples, n_outputs))
matrix_height = 3
matrix_width = 1
# Trainable parameters
w_in = torch.normal(mean=0.0, std=sigma, size=(matrix_height, n_inputs), requires_grad=True)
b_in = torch.normal(mean=0.0, std=sigma, size=(matrix_height,), requires_grad=True)
w = [[torch.normal(mean=0.0, std=sigma, size=(1,), requires_grad=True)
for _ in range(matrix_width)] for _ in range(matrix_height)]
w_out = torch.normal(mean=0.0, std=sigma, size=(n_outputs, matrix_height), requires_grad=True)
b_out = torch.normal(mean=0.0, std=sigma, size=(n_outputs,), requires_grad=True)
# Placeholder
a = [[torch.zeros(size=(1,)) for _ in range(matrix_width + 1)] for _ in range(matrix_height)]
a_tmp = torch.zeros(size=(matrix_height,))
h = torch.nn.Sigmoid()
for n in range(n_steps):
t0 = time.time()
# Draw data points
rand_idx = random.randint(0, n_samples - 1)
x = x_data[rand_idx]
y = y_data[rand_idx]
# Feedforward
a_in = h(w_in.matmul(x) + b_in)
for i in range(matrix_height):
a[i][0] = a_in[i]
a[0][1] = h(a[0][0] * w[0][0] + a[1][0] * w[1][0])
a[1][1] = h(a[0][0] * w[0][0] + a[1][0] * w[1][0] + a[2][0] * w[2][0])
a[2][1] = h(a[1][0] * w[1][0] + a[2][0] * w[2][0]) # <--- Error
for i in range(matrix_height):
a_tmp[i] = a[i][1]
a_out = torch.sigmoid(w_out.matmul(a_tmp) + b_out)
loss = torch.nn.MSELoss(reduction="mean")(a_out, y)
# Backpropagation
loss.backward(retain_graph=True)
# Gradient descent
with torch.no_grad():
w_in.sub_(lr * w_in.grad)
b_in.sub_(lr * b_in.grad)
w_out.sub_(lr * w_out.grad)
b_out.sub_(lr * b_out.grad)
w[0][0].sub_(lr * w[0][0].grad) # <--- Error
w[1][0].sub_(lr * w[1][0].grad) # <--- Error
w[2][0].sub_(lr * w[2][0].grad) # <--- Error
t1 = time.time()
if n % 100 == 0:
print(f"n {n} loss {loss.item()} time {(t1-t0)}")
if __name__ == "__main__":
test_toy_network()