Hi,
config: torch 1.10.2, CUDA 11.3, Python 3.8.10
after one iteration turning on torch.autograd.set_detect_anomaly(True)
I get the following error:
[W python_anomaly_mode.cpp:104] Warning: Error detected in MulBackward0. Traceback of forward call that caused the error:
File “/home/ddd/work/ASR/NeuroPhysModel/reproduceProblem.py”, line 75, in
net = Network(center_frequency).to(device)
File “/home/ddd/work/ASR/NeuroPhysModel/reproduceProblem.py”, line 33, in init
self.G__filters = A * T_filter ** (L - 1) * torch.exp(
(function _print_stack)
Traceback (most recent call last):
File “/home/ddd/work/ASR/NeuroPhysModel/reproduceProblem.py”, line 90, in
loss.backward()
File “/home/ddd/envPy3.8Sl/lib/python3.8/site-packages/torch/_tensor.py”, line 307, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File “/home/ddd/envPy3.8Sl/lib/python3.8/site-packages/torch/autograd/init.py”, line 154, in backward
Variable._execution_engine.run_backward(
RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.
I tried several ways of resetting the gradients using net.zero_grad()
or optimizer.zero_grad()
, but nothing. I assume this is due to the graph being unconnected with just leafs (params)? To my understanding, it should be connected via the mathematical operation on these params at Network.__init__
method and by the cumulative operations in the forward pass.
Here is a stand-alone reproducible code:
import argparse
import torch
import torch.nn.functional as F
import torch.nn as nn
from radam import radam
torch.autograd.set_detect_anomaly(True)
class Network(torch.nn.Module):
def __init__(self, center_frequency):
super(Network, self).__init__()
## filters paramters
self.nr_batch = args.b
self.nr_channels = center_frequency.nelement()
self.resolution_freq = 32
self.normalized_temporal_length = 16
self.filter_length = 511
self.center_frequency = center_frequency.to(device)
self.base_freq = self.resolution_freq * self.center_frequency[0]
T_filter = torch.stack([torch.arange(1. / (self.resolution_freq * f),
self.normalized_temporal_length / (f),
1. / (self.resolution_freq * f), device=device)[
:(self.filter_length)] for f in self.center_frequency])
CF = self.center_frequency.unsqueeze(1).repeat(1, self.filter_length)
# self.resample_up = torchaudio.transforms.Resample(self.base_freq, self.SR)
## Trainable params
self.a = nn.Parameter(torch.rand((self.nr_channels, 1), device=device), requires_grad=True)
self.l = nn.Parameter(5 * torch.rand((self.nr_channels, 1), device=device), requires_grad=True)
# make the above 2D preparing for filter computation on
A = self.a.repeat(1, self.filter_length)
L = self.l.repeat(1, self.filter_length)
## filter tensors and vars
self.G__filters = A * T_filter ** (L - 1) * torch.exp(
-2 * torch.pi * (24.7 + 0.108 * CF) * T_filter)
## ****uncomment this two lines to get the behaviour needed and comment the line below
# self.Rep_GF = torch.zeros(len(self.center_frequency), len(self.center_frequency), self.filter_length, device = device)
# self.Rep_GF[range(self.nr_channels), range(self.nr_channels), :] = self.G__filters.flip(1) #accounting for convolution as opposed to crosscorr
## ****comment this when uncommenting the two lines above
self.Rep_GF = self.G__filters.flip(1).unsqueeze(0).repeat(self.nr_batch, 1, 1)
def forward(self, input):
res = F.conv1d(input, self.Rep_GF, padding=0)[:, :, :self.base_freq]
return res.sum(1), res
def mse_loss(x, target, reduction='mean', order=2):
if reduction == 'sum':
return ((x - target) ** order).sum() ** (1. / order)
else:
return ((x - target) ** order).mean() ** (1. / order)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-gpu', type=int, default=[0], help='which gpu(s) to use', nargs='+')
parser.add_argument('-b', type=int, default=64, help='batch size for dataloader')
parser.add_argument('-lr', type=float, default=0.01, help='initial learning rate')
parser.add_argument('-seed', type=int, default=None, help='random seed of the experiment')
parser.add_argument('-epoch', type=int, default=5000, help='number of epochs to run')
parser.add_argument('-step', type=int, default=[1000], help='milestones for learning rate scheduler', nargs='+')
args = parser.parse_args()
if args.seed is not None:
torch.manual_seed(args.seed)
print('Using GPUs {}'.format(args.gpu))
device = torch.device('cuda:{}'.format(args.gpu[0]))
### Dummy data
center_frequency = torch.tensor([125, 250, 500], device=device)
input = (torch.randn((args.b, center_frequency.nelement(), 1000), device = device)>1).float() #stores a bunch of binary events as input
target = torch.randn((args.b, 1000), device=device) # stores target signals as targets
net = Network(center_frequency).to(device)
module = net
# Define optimizer module.
optimizer = radam.RAdam(net.parameters(), lr=args.lr, weight_decay=1e-5)
for epoch in range(args.epoch):
for i in range(args.b):
# net.zero_grad()
net.train()
audio_out, _ = net.forward(input)
loss = mse_loss(audio_out, target[:,:490], reduction='mean', order=2)
optimizer.zero_grad()
loss.backward()
optimizer.step()
optimizer.zero_grad()
print(i)