Change BN to GN in efficientnet

mobassir94 · June 27, 2020, 6:02pm

Following the post "Change the BN to GN in resnet", I was trying to use group normalization for EfficientNet.
my model code is :

# Output size of the final linear layer built by the model below.
out_dim = 5
# Name of the EfficientNet variant to build; also the key into `pretrained_model`.
enet_type = "efficientnet-b0"

# Maps a backbone name to the local path of its checkpoint file.
pretrained_model = {"efficientnet-b0": "../input/efficientnet-pytorch/efficientnet-b0-08094119.pth"}

    
class enetv2(nn.Module):
    """EfficientNet backbone followed by a separate linear output layer."""

    def __init__(self, backbone, out_dim):
        """Build the backbone, load its checkpoint and attach a new output layer.

        Args:
            backbone: Backbone name; passed to ``EfficientNet.from_name`` and
                used as the key into ``pretrained_model``.
            out_dim: Number of output features of the new linear layer.
        """
        super().__init__()
        network = enet.EfficientNet.from_name(backbone)
        weights = torch.load(pretrained_model[backbone])
        network.load_state_dict(weights)
        self.enet = network

        # Read the feature width before the backbone's own `_fc` is replaced.
        feature_dim = network._fc.in_features
        self.myfc = nn.Linear(feature_dim, out_dim)
        # With `_fc` turned into a pass-through, the backbone's `_fc` stage
        # no longer transforms its input.
        self.enet._fc = nn.Identity()

    def extract(self, x):
        """Run ``x`` through the backbone only."""
        features = self.enet(x)
        return features

    def forward(self, x):
        """Run the backbone and then the linear output layer on ``x``."""
        return self.myfc(self.extract(x))
# Build the network and move it to the target device in one step.
model = enetv2(enet_type, out_dim=out_dim).to(device)

if i try model.enet._bn0
then it gives me this output :
BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)

but with this code :

for name, module in model.named_modules():  # `name` is a dotted path here, e.g. 'enet._bn0' (see the error below)
    if isinstance(module, nn.BatchNorm2d):
        # Get current bn layer
        bn = getattr(model, name)  # NOTE(review): raises AttributeError — getattr() does not resolve a dotted name on `model`
        # Create new gn layer
        gn = nn.GroupNorm(1, bn.num_features)  # a single group spanning all of the layer's channels
        # Assign gn
        print('Swapping {} with {}'.format(bn, gn))
        setattr(model, name, gn)  # NOTE(review): uses the same dotted `name` as the failing getattr() above

print(model)

i get this error :

AttributeError Traceback (most recent call last)
in
2 if isinstance(module, nn.BatchNorm2d):
3 # Get current bn layer
----> 4 bn = getattr(model, name)
5 # Create new gn layer
6 gn = nn.GroupNorm(1, bn.num_features)

/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in __getattr__(self, name)
592 return modules[name]
593 raise AttributeError("'{}' object has no attribute '{}'".format(
--> 594 type(self).__name__, name))
595
596 def __setattr__(self, name, value):

AttributeError: 'enetv2' object has no attribute 'enet._bn0'

now how do i replace BN layers with GN layers in my model?

ptrblck · June 28, 2020, 10:05am

Seems to be already solved.

mobassir94 · June 29, 2020, 4:33pm

@ptrblck I am getting NaN with that now. I previously reported that it was working because I saw training begin, but today, after the 1st and 2nd epochs, I can see the loss becoming NaN, and after the second epoch I get this error: ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

same kernel worked fine for batchnorm
i just changed to groupnorm and started getting that error,here is the code i used to change batchnorm with groupnorm :

# Replace the BatchNorm layers inside every block of `model.enet._blocks`
# with a single-group GroupNorm that has the same channel count.
# NOTE(review): only `model.enet._blocks` is visited; BatchNorm layers held
# directly on `model.enet` (such as `model.enet._bn0`) are not replaced here.
for block in model.enet._blocks:
    # `_bn0` is apparently missing on some blocks (the original guarded this
    # lookup), so only that specific failure is tolerated; any other error
    # now surfaces instead of being silently swallowed.
    try:
        gn0 = block._bn0.num_features
    except AttributeError:
        print("BN layer Not  found!!")
    else:
        # Report "found" only after the lookup has actually succeeded.
        print("...........found BN layer  so Replacing Bn0 with GN.........")
        block._bn0 = nn.GroupNorm(1, num_channels=gn0)
        print(block._bn0)

    # `_bn1` and `_bn2` are replaced unconditionally, as in the original.
    gn1 = block._bn1.num_features
    gn2 = block._bn2.num_features

    block._bn1 = nn.GroupNorm(1, num_channels=gn1)
    block._bn2 = nn.GroupNorm(1, num_channels=gn2)

    print(block._bn1)
    print(block._bn2)

ptrblck · June 30, 2020, 5:45am

Add torch.autograd.detect_anomaly(True) at the beginning of your script and post the stack trace here, please.

mobassir94 · June 30, 2020, 6:21am

you mean to add that line of code at 1st cell of my notebook and train again?

mobassir94 · June 30, 2020, 7:17am

when i try torch.autograd.detect_anomaly(True)

TypeError Traceback (most recent call last)
in

TypeError: __init__() takes 1 positional argument but 2 were given

torch.autograd.set_detect_anomaly() works so i am trying it

mobassir94 · June 30, 2020, 10:15am

full traceback :


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<timed exec> in <module>

<ipython-input-17-b768044886f5> in val_epoch(loader, get_output)
     49     acc = (PREDS == TARGETS).mean() * 100.
     50 
---> 51     qwk = cohen_kappa_score(PREDS, TARGETS, weights='quadratic')
     52     qwk_k = cohen_kappa_score(PREDS[df_valid['data_provider'] == 'karolinska'], df_valid[df_valid['data_provider'] == 'karolinska'].isup_grade.values, weights='quadratic')
     53     qwk_r = cohen_kappa_score(PREDS[df_valid['data_provider'] == 'radboud'], df_valid[df_valid['data_provider'] == 'radboud'].isup_grade.values, weights='quadratic')

/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py in cohen_kappa_score(y1, y2, labels, weights, sample_weight)
    583     """
    584     confusion = confusion_matrix(y1, y2, labels=labels,
--> 585                                  sample_weight=sample_weight)
    586     n_classes = confusion.shape[0]
    587     sum0 = np.sum(confusion, axis=0)

/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py in confusion_matrix(y_true, y_pred, labels, sample_weight, normalize)
    266 
    267     """
--> 268     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    269     if y_type not in ("binary", "multiclass"):
    270         raise ValueError("%s is not supported" % y_type)

/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py in _check_targets(y_true, y_pred)
     79     """
     80     check_consistent_length(y_true, y_pred)
---> 81     type_true = type_of_target(y_true)
     82     type_pred = type_of_target(y_pred)
     83 

/opt/conda/lib/python3.7/site-packages/sklearn/utils/multiclass.py in type_of_target(y)
    282     if y.dtype.kind == 'f' and np.any(y != y.astype(int)):
    283         # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
--> 284         _assert_all_finite(y)
    285         return 'continuous' + suffix
    286 

/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan, msg_dtype)
     58                     msg_err.format
     59                     (type_err,
---> 60                      msg_dtype if msg_dtype is not None else X.dtype)
     61             )
     62     # for object dtype data, we only check for NaNs (GH-13254)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

mobassir94 · June 30, 2020, 10:15am

train and val loop

%%time
# Turn on autograd anomaly detection for the whole run. The bare call
# `torch.autograd.detect_anomaly()` only builds a context-manager object and
# enables nothing unless it is entered with `with`.
torch.autograd.set_detect_anomaly(True)
qwk_max = 0.
best_file = f'{kernel_type}_best_fold{fold}.pth'
for epoch in range(1, n_epochs+1):
    print(time.ctime(), 'Epoch:', epoch)
    scheduler.step(epoch-1)

    train_loss = train_epoch(train_loader, optimizer)
    val_loss, acc, qwk = val_epoch(valid_loader)

    # One summary line per epoch; the losses are averaged with np.mean for display.
    content = time.ctime() + ' ' + f'Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {np.mean(train_loss):.5f}, val loss: {np.mean(val_loss):.5f}, acc: {(acc):.5f}, qwk: {(qwk):.5f}'
    print(content)
    # Append the same line to the run log.
    with open(f'log_{kernel_type}.txt', 'a') as appender:
        appender.write(content + '\n')

    # Keep the weights from the epoch with the highest validation qwk so far.
    if qwk > qwk_max:
        print('score2 ({:.6f} --> {:.6f}).  Saving model ...'.format(qwk_max, qwk))
        torch.save(model.state_dict(), best_file)
        qwk_max = qwk

# Save the weights from the final epoch as well, regardless of its score.
torch.save(model.state_dict(), os.path.join(f'{kernel_type}_final_fold{fold}.pth'))

ptrblck · July 1, 2020, 12:35am

Yes, sorry torch.autograd.set_detect_anomaly(True) should be used.
The previous error seems to be raised by sklearn and it seems the output of your model is already containing NaNs.

mobassir94 · July 1, 2020, 6:47am

@ptrblck can you please tell me why my model is outputting NaN?
If I use the same model without the group normalization code, I don't get any NaN; the only thing I changed in my code is BN to GN and nothing else! Why is my model outputting NaN after switching to group normalization?

mobassir94 · July 1, 2020, 6:49am

also if you think this implementation is wrong,then can you please give me correct code for changing bn to gn? :

# Replace the BatchNorm layers inside every block of `model.enet._blocks`
# with a single-group GroupNorm that has the same channel count.
# NOTE(review): only `model.enet._blocks` is visited; BatchNorm layers held
# directly on `model.enet` (such as `model.enet._bn0`) are not replaced here.
for block in model.enet._blocks:
    # `_bn0` is apparently missing on some blocks (the original guarded this
    # lookup), so only that specific failure is tolerated; any other error
    # now surfaces instead of being silently swallowed.
    try:
        gn0 = block._bn0.num_features
    except AttributeError:
        print("BN layer Not  found!!")
    else:
        # Report "found" only after the lookup has actually succeeded.
        print("...........found BN layer  so Replacing Bn0 with GN.........")
        block._bn0 = nn.GroupNorm(1, num_channels=gn0)
        print(block._bn0)

    # `_bn1` and `_bn2` are replaced unconditionally, as in the original.
    gn1 = block._bn1.num_features
    gn2 = block._bn2.num_features

    block._bn1 = nn.GroupNorm(1, num_channels=gn1)
    block._bn2 = nn.GroupNorm(1, num_channels=gn2)

    print(block._bn1)
    print(block._bn2)

Keunhoi_An · July 1, 2020, 7:07am

I have solved NaN issues before by fixing a learning rate that was too high, or by adding a small epsilon value that had been forgotten. Please check these.

mobassir94 · July 1, 2020, 10:07am

My lr is 3e-5 for Adam; isn't that reasonable? Also, why should I try epsilon when batchnorm worked well without it? Also, during training I can see everything going well, but during testing I get NaN.

SAI_VENKATESH_CHILUK · March 13, 2022, 12:34am

Can you please send me the whole code of How did you convert BN to GN for efficientNet. Thanks in advance