Hi. I’m using the following 1D ResNet. I can train it in non-distributed mode without any errors, but when I switch to DistributedDataParallel mode I get a RuntimeError saying that one of the variables needed for gradient computation has been modified by an inplace operation.
import torch
import torch.nn as nn
import torch.nn.functional as F

class MyConv1dPadSame(nn.Module):
"""
extend nn.Conv1d to support SAME padding
"""
def __init__(self, in_channels, out_channels, kernel_size, stride, groups=1):
super(MyConv1dPadSame, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.stride = stride
self.groups = groups
self.conv = torch.nn.Conv1d(
in_channels=self.in_channels,
out_channels=self.out_channels,
kernel_size=self.kernel_size,
stride=self.stride,
groups=self.groups)
def forward(self, x):
net = x
# compute pad shape
in_dim = net.shape[-1]
out_dim = (in_dim + self.stride - 1) // self.stride
p = max(0, (out_dim - 1) * self.stride + self.kernel_size - in_dim)
pad_left = p // 2
pad_right = p - pad_left
net = F.pad(net, (pad_left, pad_right), "constant", 0)
net = self.conv(net)
return net
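# A worked example of the SAME-padding arithmetic above (numbers are just
# for illustration): with in_dim=100, stride=2, kernel_size=16 we get
#   out_dim  = (100 + 2 - 1) // 2 = 50
#   p        = max(0, (50 - 1) * 2 + 16 - 100) = 14
#   pad_left = 7, pad_right = 7
# and the Conv1d on the padded length 114 returns (114 - 16) // 2 + 1 = 50
# time steps, i.e. ceil(in_dim / stride), which is the intended SAME behaviour.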
class MyMaxPool1dPadSame(nn.Module):
"""
extend nn.MaxPool1d to support SAME padding
"""
def __init__(self, kernel_size):
super(MyMaxPool1dPadSame, self).__init__()
self.kernel_size = kernel_size
self.stride = 1
self.max_pool = torch.nn.MaxPool1d(kernel_size=self.kernel_size)
def forward(self, x):
net = x
# compute pad shape
in_dim = net.shape[-1]
out_dim = (in_dim + self.stride - 1) // self.stride
p = max(0, (out_dim - 1) * self.stride + self.kernel_size - in_dim)
pad_left = p // 2
pad_right = p - pad_left
net = F.pad(net, (pad_left, pad_right), "constant", 0)
net = self.max_pool(net)
return net
class BasicBlock(nn.Module):
"""
ResNet Basic Block
"""
def __init__(self, in_channels, out_channels, kernel_size, stride, groups, downsample, use_bn, use_do, is_first_block=False):
super(BasicBlock, self).__init__()
self.in_channels = in_channels
self.kernel_size = kernel_size
self.out_channels = out_channels
self.stride = stride
self.groups = groups
self.downsample = downsample
if self.downsample:
self.stride = stride
else:
self.stride = 1
self.is_first_block = is_first_block
self.use_bn = use_bn
self.use_do = use_do
# the first conv
self.bn1 = nn.BatchNorm1d(in_channels)
self.relu1 = nn.ReLU(inplace=True)
self.do1 = nn.Dropout(p=0.5)
self.conv1 = MyConv1dPadSame(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=self.stride,
groups=self.groups)
# the second conv
self.bn2 = nn.BatchNorm1d(out_channels)
self.relu2 = nn.ReLU(inplace=True)
self.do2 = nn.Dropout(p=0.5)
self.conv2 = MyConv1dPadSame(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=1,
groups=self.groups)
self.max_pool = MyMaxPool1dPadSame(kernel_size=self.stride)
def forward(self, x):
identity = x
# the first conv
out = x
if not self.is_first_block:
if self.use_bn:
out = self.bn1(out)
out = self.relu1(out)
if self.use_do:
out = self.do1(out)
out = self.conv1(out)
# the second conv
if self.use_bn:
out = self.bn2(out)
out = self.relu2(out)
if self.use_do:
out = self.do2(out)
out = self.conv2(out)
# if downsample, also downsample identity
if self.downsample:
identity = self.max_pool(identity)
# if expand channel, also pad zeros to identity
if self.out_channels != self.in_channels:
identity = torch.transpose(identity,-1,-2)
ch1 = (self.out_channels-self.in_channels)//2
ch2 = self.out_channels-self.in_channels-ch1
identity = F.pad(identity, (ch1, ch2), "constant", 0)
identity = torch.transpose(identity,-1,-2)
# shortcut
out = out + identity
return out
class ResNet1D(nn.Module):
"""
Input:
X: (n_samples, n_channel, n_length)
Y: (n_samples)
Output:
    out: (n_samples, n_classes)
Parameters:
    in_channels: dim of input, the same as n_channel
    base_filters: number of filters in the first few conv layers; doubled every increasefilter_gap blocks
    kernel_size: width of the kernel
    stride: stride of the kernel
    groups: set larger than 1 for a ResNeXt-style model
    n_block: number of blocks
    n_classes: number of classes
"""
def __init__(self, in_channels, base_filters, kernel_size, stride, groups, n_block, n_classes, downsample_gap=2, increasefilter_gap=4, use_bn=True, use_do=True, verbose=False):
super(ResNet1D, self).__init__()
self.verbose = verbose
self.n_block = n_block
self.kernel_size = kernel_size
self.stride = stride
self.groups = groups
self.use_bn = use_bn
self.use_do = use_do
self.downsample_gap = downsample_gap # 2 for base model
self.increasefilter_gap = increasefilter_gap # 4 for base model
# first block
self.first_block_conv = MyConv1dPadSame(in_channels=in_channels, out_channels=base_filters, kernel_size=self.kernel_size, stride=1)
self.first_block_bn = nn.BatchNorm1d(base_filters)
self.first_block_relu = nn.ReLU()
out_channels = base_filters
# residual blocks
self.basicblock_list = nn.ModuleList()
for i_block in range(self.n_block):
# is_first_block
if i_block == 0:
is_first_block = True
else:
is_first_block = False
# downsample at every self.downsample_gap blocks
if i_block % self.downsample_gap == 1:
downsample = True
else:
downsample = False
# in_channels and out_channels
if is_first_block:
in_channels = base_filters
out_channels = in_channels
else:
# increase filters at every self.increasefilter_gap blocks
in_channels = int(base_filters*2**((i_block-1)//self.increasefilter_gap))
if (i_block % self.increasefilter_gap == 0) and (i_block != 0):
out_channels = in_channels * 2
else:
out_channels = in_channels
tmp_block = BasicBlock(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=self.kernel_size,
stride = self.stride,
groups = self.groups,
downsample=downsample,
use_bn = self.use_bn,
use_do = self.use_do,
is_first_block=is_first_block)
self.basicblock_list.append(tmp_block)
# final prediction
self.final_bn = nn.BatchNorm1d(out_channels)
self.final_relu = nn.ReLU(inplace=True)
# self.do = nn.Dropout(p=0.5)
self.dense = nn.Linear(out_channels, n_classes)
# self.softmax = nn.Softmax(dim=1)
def forward(self, x):
out = x
# first conv
if self.verbose:
print('input shape', out.shape)
out = self.first_block_conv(out)
if self.verbose:
print('after first conv', out.shape)
if self.use_bn:
out = self.first_block_bn(out)
out = self.first_block_relu(out)
# residual blocks, every block has two conv
for i_block in range(self.n_block):
net = self.basicblock_list[i_block]
if self.verbose:
print('i_block: {0}, in_channels: {1}, out_channels: {2}, downsample: {3}'.format(i_block, net.in_channels, net.out_channels, net.downsample))
out = net(out)
if self.verbose:
print(out.shape)
# final prediction
if self.use_bn:
out = self.final_bn(out)
out = self.final_relu(out)
out = torch.mean(out, -1)
if self.verbose:
print('final pooling', out.shape)
# out = self.do(out)
out = self.dense(out)
if self.verbose:
print('dense', out.shape)
# out = self.softmax(out)
if self.verbose:
print('softmax', out.shape)
return out
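For reference, the non-distributed training that works for me is structurally equivalent to this minimal single-process sanity check (the hyperparameters and the cross-entropy loss here are placeholders, not my real settings):

import torch
import torch.nn as nn

# placeholder hyperparameters, only to illustrate the single-process case
model = ResNet1D(in_channels=1, base_filters=64, kernel_size=16, stride=2,
                 groups=1, n_block=8, n_classes=10).cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

x = torch.randn(8, 1, 1000).cuda()    # (n_samples, n_channel, n_length)
y = torch.randint(0, 10, (8,)).cuda()

logits = model(x)                     # (n_samples, n_classes)
loss = criterion(logits, y)
optimizer.zero_grad()
loss.backward()                       # no inplace error in this mode
optimizer.step()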
Here is the error traceback (run with autograd anomaly detection enabled):
[W python_anomaly_mode.cpp:104] Warning: Error detected in CudnnBatchNormBackward. Traceback of forward call that caused the error:
File "<string>", line 1, in <module>
File "/usr/lib64/python3.6/multiprocessing/spawn.py", line 105, in spawn_main
exitcode = _main(fd)
File "/usr/lib64/python3.6/multiprocessing/spawn.py", line 118, in _main
return self._bootstrap()
File "/usr/lib64/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib64/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/alto/nima/torch-env/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
fn(i, *args)
File "/alto/nima/textAnomaly/train_encoder_dd.py", line 168, in train
h1 = h_net(x1_rep)
File "/alto/nima/torch-env/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/alto/nima/torch-env/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 619, in forward
output = self.module(*inputs[0], **kwargs[0])
File "/alto/nima/torch-env/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/alto/nima/textAnomaly/resent1D.py", line 275, in forward
out = self.final_bn(out)
File "/alto/nima/torch-env/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/alto/nima/torch-env/lib/python3.6/site-packages/torch/nn/modules/batchnorm.py", line 136, in forward
self.weight, self.bias, bn_training, exponential_average_factor, self.eps)
File "/alto/nima/torch-env/lib/python3.6/site-packages/torch/nn/functional.py", line 2058, in batch_norm
training, momentum, eps, torch.backends.cudnn.enabled
(function _print_stack)
[The same anomaly-detection warning is printed again by the other spawned processes; the duplicate tracebacks and tqdm progress lines are omitted here.]
Traceback (most recent call last):
File "train_encoder_dd.py", line 210, in <module>
mp.spawn(train, nprocs=args.num_gpus, args=(args,))
File "/alto/nima/torch-env/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/alto/nima/torch-env/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
while not context.join():
File "/alto/nima/torch-env/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 118, in join
raise Exception(msg)
Exception:
-- Process 2 terminated with the following error:
Traceback (most recent call last):
File "/alto/nima/torch-env/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
fn(i, *args)
File "/alto/nima/textAnomaly/train_encoder_dd.py", line 174, in train
loss.backward()
File "/alto/nima/torch-env/lib/python3.6/site-packages/torch/tensor.py", line 221, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/alto/nima/torch-env/lib/python3.6/site-packages/torch/autograd/__init__.py", line 132, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [1024]] is at version 4; expected version 3 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
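For context, the distributed setup follows the usual mp.spawn + DistributedDataParallel pattern; a stripped-down sketch that matches the structure of the traceback is below (the data, loss, and hyperparameters are placeholders, and my real train_encoder_dd.py does more than this):

import os
import torch
import torch.nn as nn
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel

def train(gpu, num_gpus):
    # one spawned process per GPU
    dist.init_process_group(backend='nccl', init_method='env://',
                            world_size=num_gpus, rank=gpu)
    torch.cuda.set_device(gpu)

    # placeholder hyperparameters; ResNet1D is the model defined above
    h_net = ResNet1D(in_channels=1, base_filters=64, kernel_size=16, stride=2,
                     groups=1, n_block=8, n_classes=10).cuda(gpu)
    h_net = DistributedDataParallel(h_net, device_ids=[gpu])
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(h_net.parameters(), lr=1e-3)

    for _ in range(10):                            # synthetic batches
        x1_rep = torch.randn(8, 1, 1000).cuda(gpu)
        target = torch.randint(0, 10, (8,)).cuda(gpu)
        h1 = h_net(x1_rep)        # forward call shown at line 168 of the traceback
        loss = criterion(h1, target)
        optimizer.zero_grad()
        loss.backward()           # line 174: where the RuntimeError is raised
        optimizer.step()

if __name__ == '__main__':
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    num_gpus = torch.cuda.device_count()
    mp.spawn(train, nprocs=num_gpus, args=(num_gpus,))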