Recently I ran into a problem when using torch.nn.DataParallel.
First I define a model:
import torch
import torch.nn as nn
from torchvision import models

class ft_net(nn.Module):
    def __init__(self, class_num):
        super(ft_net, self).__init__()
        model_ft = models.resnet50(pretrained=True)
        # avg pooling to global pooling
        model_ft.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        # drop the original fc layer so the backbone outputs the 2048-d pooled feature
        model_ft.fc = nn.Identity()
        self.backbone = model_ft
        self.classifier = ClassBlock(2048, class_num)  # ClassBlock is defined elsewhere in my code

    def forward(self, x):
        feature = self.backbone(x)
        x = self.classifier(feature)
        return x, feature
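For reference, this is how I call the model on a single GPU (a minimal sketch; the class count and input size are just placeholders for my dataset, and it assumes ClassBlock is defined as in my full code):

model = ft_net(class_num=751)                # placeholder number of classes
model = model.cuda()
images = torch.randn(8, 3, 256, 128).cuda()  # placeholder batch of images
logits, feature = model(images)              # forward returns logits and the 2048-d feature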
Now I want to set two different learning rates for the backbone and the classifier module. I also want to use multiple GPUs, so I need to wrap the model with DataParallel. What should I do? Which of the two options below is correct?
# option 1: create the optimizer first, then wrap the model
optimizer = torch.optim.SGD([{'params': model.backbone.parameters(), 'lr': 1e-2},
                             {'params': model.classifier.parameters(), 'lr': 1e-1}],
                            lr=1e-1, weight_decay=1e-5, momentum=0.9)
model = torch.nn.DataParallel(model.cuda(), device_ids=[0, 1])
# option 2: wrap the model first, then reach the submodules through .module
model = torch.nn.DataParallel(model.cuda(), device_ids=[0, 1])
optimizer = torch.optim.SGD([{'params': model.module.backbone.parameters(), 'lr': 1e-2},
                             {'params': model.module.classifier.parameters(), 'lr': 1e-1}],
                            lr=1e-1, weight_decay=1e-5, momentum=0.9)
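From what I understand, DataParallel keeps the original network under .module, and Module.cuda() moves parameters in place, so both options should hand the optimizer the same Parameter objects. Here is a small check I used to convince myself (a sketch, assuming the ft_net definition above and two visible GPUs):

model = ft_net(class_num=751).cuda()         # placeholder class count
params_before = list(model.backbone.parameters())
model = torch.nn.DataParallel(model, device_ids=[0, 1])
params_after = list(model.module.backbone.parameters())
# object identity check: True means both options reference the same tensors
print(all(a is b for a, b in zip(params_before, params_after)))

Is this reasoning correct, or is there a subtlety with DataParallel that makes one option wrong?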