Hi there,
I ran the code below on a machine with RTX A6000 GPUs, using either 2 or 4 GPUs, and the CE loss becomes NaN after just a few iterations. When I checked, the learnable parameters themselves become NaN almost immediately after the first backpropagation. I then ran the same code on other GPUs (TITAN RTX, TITAN V, and Tesla V100 32 GB) and it works fine on all of them; only the RTX A6000 produces NaNs. I am wondering whether this is a PyTorch bug specific to the A6000. Could you please re-run my code on an A6000 to check whether it is a PyTorch bug or not?
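For reference, the NaN parameters can be confirmed with a small check right after optimizer.step(); this is just a sketch and the helper name is only illustrative:

import torch

def report_nonfinite_params(model, step):
    # Illustrative helper: print every parameter that already contains NaN/Inf.
    for name, param in model.named_parameters():
        if not torch.isfinite(param).all():
            print('step %d: parameter %s contains NaN/Inf' % (step, name))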
The command I used to run the code is:
python -m torch.distributed.launch --master_port=6396 --nproc_per_node=2 debug_train_dist.py --ngpu 2 --reduction 8 -lr 0.001 -epoch 80 -nb_worker 8 -bs 50
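In case the exact environment matters, it can be dumped on each machine with the standard utility:

python -m torch.utils.collect_env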
The project code below should let you reproduce the issue. By the way, my PyTorch version on the RTX A6000 is 1.10.0.dev20210831; on the other GPUs I tested with PyTorch 1.6.0.
import numpy as np
from random import randrange
import torch
import torch.nn as nn
from torch.utils import data
from tqdm import tqdm
import argparse
import json


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-bs', type=int, default=100)
    parser.add_argument('-lr', type=float, default=0.001)
    parser.add_argument('-epoch', type=int, default=80)
    parser.add_argument('-nb_worker', type=int, default=8)
    parser.add_argument('-seed', type=int, default=1234)
    parser.add_argument('-model_params', type=json.loads, default=
        '{"first_conv":3, "in_channels":1, "filts":[128, [128,128], [128,256], [256,256]],'
        '"blocks":[2,4], "nb_fc_att_node":[1], "nb_fc_node":1024, "gru_node":1024, "nb_gru_layer":1}')
    parser.add_argument('--local_rank', type=int, default=0)
    parser.add_argument('--ngpu', type=int, default=2)
    parser.add_argument('--reduction', type=int, help='reduction rate')
    args = parser.parse_args()
    return args


def keras_lr_decay(step, decay=0.0001):
    return 1. / (1. + decay * step)
class Residual_block_imgs(nn.Module):
    def __init__(self, nb_filts, shift, reduction=8, first=False):
        super(Residual_block_imgs, self).__init__()
        self.shift = shift
        self.lrelu = nn.LeakyReLU()
        self.lrelu_keras = nn.LeakyReLU(negative_slope=0.3)
        self.conv1 = nn.Conv1d(in_channels=nb_filts[0],
                               out_channels=nb_filts[1],
                               kernel_size=3,
                               padding=1,
                               stride=1)
        self.bn2 = nn.BatchNorm1d(num_features=nb_filts[1])
        self.conv2 = nn.Conv1d(in_channels=nb_filts[1],
                               out_channels=nb_filts[1],
                               padding=1,
                               kernel_size=3,
                               stride=1)
        if nb_filts[0] != nb_filts[1]:
            self.downsample = True
            self.conv_downsample = nn.Conv1d(in_channels=nb_filts[0],
                                             out_channels=nb_filts[1],
                                             padding=0,
                                             kernel_size=1,
                                             stride=1)
        else:
            self.downsample = False
        self.mp = nn.MaxPool1d(3)
        channel = nb_filts[0]
        self.avg_pool_t1 = nn.AdaptiveAvgPool1d(1)
        self.avg_pool_t2 = nn.AdaptiveAvgPool1d(1)
        self.down_t1 = nn.Sequential(
            nn.Linear(channel, channel // reduction, bias=False),
            nn.ReLU(inplace=True)
        )
        self.up_diff = nn.Sequential(
            nn.Linear(channel // reduction, channel, bias=False),
            nn.Sigmoid()
        )
        self.down_t2 = nn.Sequential(
            nn.Linear(channel, channel // reduction, bias=False),
            nn.ReLU(inplace=True)
        )

    def forward(self, x):
        identity = x
        x = self._middle(x)
        out = self.conv1(x)
        out = self.bn2(out)
        out = self.lrelu_keras(out)
        out = self.conv2(out)
        if self.downsample:
            identity = self.conv_downsample(identity)
        out += identity
        out = self.mp(out)
        return out

    def _middle(self, x):
        b, c, _ = x.size()
        length = x[:, :, self.shift:].shape[-1]
        x_t2 = x[:, :, self.shift:]
        x_t1 = x[:, :, :length]
        b_t2, c_t2, _ = x_t2.size()
        y_t2 = self.avg_pool_t2(x_t2).view(b_t2, c_t2)
        b_t1, c_t1, _ = x_t1.size()
        y_t1 = self.avg_pool_t1(x_t1).view(b_t1, c_t1)
        y_t2 = self.down_t2(y_t2)
        y_t1 = self.down_t1(y_t1)
        y = y_t2 - y_t1
        y = self.up_diff(y).view(b, c, 1)
        x = x * y.expand_as(x)
        return x
class Model_imgs(nn.Module):
    def __init__(self, d_args, reduction=8):
        super(Model_imgs, self).__init__()
        self.reduction = reduction
        self.first_conv = nn.Conv1d(
            in_channels=d_args['in_channels'],    # 1
            out_channels=d_args['filts'][0],      # 128
            kernel_size=d_args['first_conv'],     # 3
            stride=d_args['first_conv'])          # 3
        self.first_bn = nn.BatchNorm1d(
            num_features=d_args['filts'][0])      # 128
        self.lrelu_keras = nn.LeakyReLU(
            negative_slope=0.3)
        self.block0 = self._make_layer(
            nb_blocks=d_args['blocks'][0],        # 2
            nb_filts=d_args['filts'][1],          # [128, 128]
            shifts=[2667, 889],
            first=True)
        self.block1 = self._make_layer(
            nb_blocks=d_args['blocks'][1],        # 4
            nb_filts=d_args['filts'][2],          # [128, 256]
            shifts=[296, 99, 33, 11])
        self.bn_before_gru = nn.BatchNorm1d(
            num_features=d_args['filts'][2][-1])  # 256
        self.gru = nn.GRU(
            input_size=d_args['filts'][2][-1],    # 256
            hidden_size=d_args['gru_node'],       # 1024
            num_layers=d_args['nb_gru_layer'],    # 1
            batch_first=True)
        self.fc1_gru = nn.Linear(
            in_features=d_args['gru_node'],       # 1024
            out_features=d_args['nb_fc_node'])    # 1024
        self.fc2_gru = nn.Linear(
            in_features=d_args['nb_fc_node'],     # 1024
            out_features=d_args['nb_classes'],    # 6112
            bias=True)

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.first_conv(x)
        x = self.first_bn(x)
        x = self.lrelu_keras(x)
        x = self.block0(x)
        x = self.block1(x)
        x = self.bn_before_gru(x)
        x = self.lrelu_keras(x)
        x = x.permute(0, 2, 1)
        self.gru.flatten_parameters()
        x, _ = self.gru(x)
        x = x[:, -1, :]
        code = self.fc1_gru(x)
        code_norm = code.norm(p=2, dim=1, keepdim=True) / 10.
        code = torch.div(code, code_norm)
        out = self.fc2_gru(code)
        return out

    def _make_layer(self, nb_blocks, nb_filts, shifts, first=False):
        layers = []
        for i in range(nb_blocks):
            first = first if i == 0 else False
            shift = shifts[i]
            layers.append(Residual_block_imgs(nb_filts=nb_filts, shift=shift, reduction=self.reduction, first=first))
            if i == 0: nb_filts[0] = nb_filts[1]
        return nn.Sequential(*layers)
class Dataset_imgs(data.Dataset):
    def __init__(self):
        a = 1

    def __len__(self):
        return 10000

    def __getitem__(self, index):
        y = randrange(6112)
        X = np.random.random_sample((59049,)).astype(np.float32)
        return X, y
def train_model(model, db_gen, optimizer, epoch, args, device, lr_scheduler, criterion):
    model.train()
    if args.local_rank == 0:
        pbar = tqdm(total=len(db_gen))
    for idx_ct, (m_batch, m_label) in enumerate(db_gen):
        m_batch, m_label = m_batch.to(device), m_label.to(device)
        output = model(m_batch)
        loss = criterion(output, m_label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if args.local_rank == 0:
            pbar.set_description('epoch: %d, cce:%.3f' % (epoch, loss))
            pbar.update(1)
        lr_scheduler.step()
    if args.local_rank == 0:
        pbar.close()
def main():
    args = get_args()
    args.model_params['nb_classes'] = 6112
    world_size = args.ngpu
    torch.distributed.init_process_group(
        'nccl',
        init_method='env://',
        world_size=world_size,
        rank=args.local_rank,
    )
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    cuda = torch.cuda.is_available()
    device = torch.device('cuda' if cuda else 'cpu')
    trainset = Dataset_imgs()
    sampler_devset = torch.utils.data.distributed.DistributedSampler(
        trainset,
        num_replicas=args.ngpu,
        rank=args.local_rank)
    trainset_gen = data.DataLoader(trainset,
                                   batch_size=args.bs,
                                   drop_last=True,
                                   pin_memory=True,
                                   sampler=sampler_devset,
                                   num_workers=args.nb_worker)
    torch.cuda.set_device(args.local_rank)
    model = Model_imgs(args.model_params, args.reduction)
    model.cuda()
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = torch.nn.parallel.DistributedDataParallel(
        model,
        device_ids=[args.local_rank],
        output_device=args.local_rank,
        find_unused_parameters=False)
    criterion = nn.CrossEntropyLoss()
    params = [
        {
            'params': [
                param for name, param in model.named_parameters()
                if 'bn' not in name
            ]
        },
        {
            'params': [
                param for name, param in model.named_parameters()
                if 'bn' in name
            ],
            'weight_decay': 0
        },
    ]
    optimizer = torch.optim.Adam(params,
                                 lr=args.lr,
                                 weight_decay=0.0001,
                                 amsgrad=True)
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda step: keras_lr_decay(step))

    # Train
    for epoch in range(args.epoch):
        sampler_devset.set_epoch(epoch)
        if args.local_rank == 0:
            print('training epoch:', epoch + 1)
        train_model(model=model,
                    db_gen=trainset_gen,
                    args=args,
                    optimizer=optimizer,
                    lr_scheduler=lr_scheduler,
                    criterion=criterion,
                    device=device,
                    epoch=epoch)


if __name__ == '__main__':
    main()
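If it helps with debugging on your side, the point where the NaN first appears can also be narrowed down with autograd's anomaly detection plus a gradient check before optimizer.step(). A rough sketch (debugging only, not part of the repro script above):

# Enable once at startup: backward raises as soon as an op produces NaN/Inf.
torch.autograd.set_detect_anomaly(True)

# Inside train_model, between loss.backward() and optimizer.step():
for name, param in model.named_parameters():
    if param.grad is not None and not torch.isfinite(param.grad).all():
        print('NaN/Inf gradient in %s at iteration %d' % (name, idx_ct))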
If you need any other information, please do not hesitate to leave a message. Thanks in advance!