Change BN to GN in efficientnet

Following the post "Change the BN to GN in resnet", I was trying to use group normalization in EfficientNet.
My model code is:

# Size of the final linear layer's output (passed to enetv2 as `out_dim`).
out_dim = 5
# Backbone name: used as the key into `pretrained_model` and handed to
# `EfficientNet.from_name` inside enetv2.
enet_type = 'efficientnet-b0'

# Maps a backbone name to the path of the state dict loaded into it.
pretrained_model = {
    'efficientnet-b0': '../input/efficientnet-pytorch/efficientnet-b0-08094119.pth'
}

    
class enetv2(nn.Module):
    """EfficientNet backbone followed by a separate linear head.

    The backbone's own `_fc` layer is replaced by `nn.Identity`, so the
    backbone returns whatever used to be fed into `_fc`; `myfc` then maps
    that to `out_dim` outputs.
    """

    def __init__(self, backbone, out_dim):
        """Build the backbone named `backbone` and load its weights.

        Args:
            backbone: Backbone name; also the key into `pretrained_model`.
            out_dim: Number of outputs produced by the `myfc` head.
        """
        super().__init__()
        self.enet = enet.EfficientNet.from_name(backbone)
        weights = torch.load(pretrained_model[backbone])
        self.enet.load_state_dict(weights)

        # Read the head's input width before the original `_fc` is discarded.
        feature_dim = self.enet._fc.in_features
        self.myfc = nn.Linear(feature_dim, out_dim)
        self.enet._fc = nn.Identity()

    def extract(self, x):
        """Return the backbone output for `x` (its `_fc` is an identity)."""
        features = self.enet(x)
        return features

    def forward(self, x):
        """Run the backbone, then the `myfc` head, and return the result."""
        return self.myfc(self.extract(x))
# Build the model from the configured backbone name and output size.
model = enetv2(enet_type, out_dim=out_dim)
# Move the model to `device` (defined elsewhere in the notebook).
model = model.to(device)

if i try model.enet._bn0
then it gives me this output :
BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)

but with this code :

# Attempt to replace every BatchNorm2d in the model with a single-group GroupNorm.
# NOTE: `named_modules()` yields dotted paths for nested modules (e.g. "enet._bn0"),
# but `getattr`/`setattr` on `model` only resolve direct attribute names, which is
# what triggers the AttributeError quoted below.
for name, module in model.named_modules():
    if isinstance(module, nn.BatchNorm2d):
        # Get current bn layer
        bn = getattr(model, name)
        # Create new gn layer
        gn = nn.GroupNorm(1, bn.num_features)
        # Assign gn
        print('Swapping {} with {}'.format(bn, gn))
        setattr(model, name, gn)

print(model)

i get this error :

AttributeError Traceback (most recent call last)
in
2 if isinstance(module, nn.BatchNorm2d):
3 # Get current bn layer
----> 4 bn = getattr(model, name)
5 # Create new gn layer
6 gn = nn.GroupNorm(1, bn.num_features)

/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in __getattr__(self, name)
592 return modules[name]
593 raise AttributeError("'{}' object has no attribute '{}'".format(
--> 594 type(self).__name__, name))
595
596 def __setattr__(self, name, value):

AttributeError: 'enetv2' object has no attribute 'enet._bn0'

now how do i replace BN layers with GN layers in my model?

Seems to be already solved.

@ptrblck I am getting NaN with that now. Previously I reported that it was working because I saw training begin, but today, after the 1st and 2nd epochs, I can see the loss becoming NaN, and after the second epoch I get this error: ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

The same kernel worked fine with batch norm.
I only changed it to group norm and started getting that error. Here is the code I used to replace batch norm with group norm:

# Replace the BatchNorm layers of every entry in `model.enet._blocks` with a
# single-group GroupNorm of the same channel count. Only `_bn0`, `_bn1` and
# `_bn2` of each block are touched; norm layers elsewhere in the model are
# left as they are.
for block in model.enet._blocks:
    # The original code guarded the `_bn0` lookup, presumably because not every
    # block defines it. Look it up explicitly instead of using a bare `except:`,
    # so that unrelated errors are no longer swallowed and the "found" message
    # is only printed when a BatchNorm layer is really there.
    bn0 = getattr(block, "_bn0", None)
    if isinstance(bn0, nn.BatchNorm2d):
        print("...........found BN layer  so Replacing Bn0 with GN.........")
        block._bn0 = nn.GroupNorm(1, num_channels=bn0.num_features)
        print(block._bn0)
    else:
        print("BN layer Not  found!!")

    # `_bn1` and `_bn2` are accessed unconditionally, as in the original.
    gn1 = block._bn1.num_features
    gn2 = block._bn2.num_features

    block._bn1 = nn.GroupNorm(1, num_channels=gn1)
    block._bn2 = nn.GroupNorm(1, num_channels=gn2)

    print(block._bn1)
    print(block._bn2)
    

Add torch.autograd.detect_anomaly(True) at the beginning of your script and post the stack trace here, please.

you mean to add that line of code at 1st cell of my notebook and train again?

when i try torch.autograd.detect_anomaly(True)

TypeError Traceback (most recent call last)
in

TypeError: __init__() takes 1 positional argument but 2 were given

torch.autograd.set_detect_anomaly() works so i am trying it

full traceback :


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<timed exec> in <module>

<ipython-input-17-b768044886f5> in val_epoch(loader, get_output)
     49     acc = (PREDS == TARGETS).mean() * 100.
     50 
---> 51     qwk = cohen_kappa_score(PREDS, TARGETS, weights='quadratic')
     52     qwk_k = cohen_kappa_score(PREDS[df_valid['data_provider'] == 'karolinska'], df_valid[df_valid['data_provider'] == 'karolinska'].isup_grade.values, weights='quadratic')
     53     qwk_r = cohen_kappa_score(PREDS[df_valid['data_provider'] == 'radboud'], df_valid[df_valid['data_provider'] == 'radboud'].isup_grade.values, weights='quadratic')

/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py in cohen_kappa_score(y1, y2, labels, weights, sample_weight)
    583     """
    584     confusion = confusion_matrix(y1, y2, labels=labels,
--> 585                                  sample_weight=sample_weight)
    586     n_classes = confusion.shape[0]
    587     sum0 = np.sum(confusion, axis=0)

/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py in confusion_matrix(y_true, y_pred, labels, sample_weight, normalize)
    266 
    267     """
--> 268     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    269     if y_type not in ("binary", "multiclass"):
    270         raise ValueError("%s is not supported" % y_type)

/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py in _check_targets(y_true, y_pred)
     79     """
     80     check_consistent_length(y_true, y_pred)
---> 81     type_true = type_of_target(y_true)
     82     type_pred = type_of_target(y_pred)
     83 

/opt/conda/lib/python3.7/site-packages/sklearn/utils/multiclass.py in type_of_target(y)
    282     if y.dtype.kind == 'f' and np.any(y != y.astype(int)):
    283         # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
--> 284         _assert_all_finite(y)
    285         return 'continuous' + suffix
    286 

/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan, msg_dtype)
     58                     msg_err.format
     59                     (type_err,
---> 60                      msg_dtype if msg_dtype is not None else X.dtype)
     61             )
     62     # for object dtype data, we only check for NaNs (GH-13254)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

train and val loop

%%time
# Turn on autograd anomaly detection for the whole run. Calling
# `torch.autograd.detect_anomaly()` as a bare statement only constructs a
# context manager that is never entered, so it would have no effect.
torch.autograd.set_detect_anomaly(True)
qwk_max = 0.
best_file = f'{kernel_type}_best_fold{fold}.pth'
for epoch in range(1, n_epochs+1):
    print(time.ctime(), 'Epoch:', epoch)
    scheduler.step(epoch-1)

    train_loss = train_epoch(train_loader, optimizer)
    val_loss, acc, qwk = val_epoch(valid_loader)

    # One log line per epoch, echoed to stdout and appended to the log file.
    content = time.ctime() + ' ' + f'Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {np.mean(train_loss):.5f}, val loss: {np.mean(val_loss):.5f}, acc: {(acc):.5f}, qwk: {(qwk):.5f}'
    print(content)
    with open(f'log_{kernel_type}.txt', 'a') as appender:
        appender.write(content + '\n')

    # Keep a checkpoint of the weights with the best validation qwk so far.
    if qwk > qwk_max:
        print('score2 ({:.6f} --> {:.6f}).  Saving model ...'.format(qwk_max, qwk))
        torch.save(model.state_dict(), best_file)
        qwk_max = qwk

# Always save the weights from the last epoch as well.
torch.save(model.state_dict(), f'{kernel_type}_final_fold{fold}.pth')

Yes, sorry torch.autograd.set_detect_anomaly(True) should be used.
The previous error seems to be raised by sklearn and it seems the output of your model is already containing NaNs.

@ptrblck can you please tell me why my model is outputting NaN?
If I use the same model without the group normalization code, I don't get any NaN. The only thing I changed in my code is BN to GN, and nothing else! Why is my model outputting NaN after switching to group normalization?

Also, if you think this implementation is wrong, can you please give me the correct code for changing BN to GN?:

# Replace the BatchNorm layers of every entry in `model.enet._blocks` with a
# single-group GroupNorm of the same channel count. Only `_bn0`, `_bn1` and
# `_bn2` of each block are touched; norm layers elsewhere in the model are
# left as they are.
for block in model.enet._blocks:
    # The original code guarded the `_bn0` lookup, presumably because not every
    # block defines it. Look it up explicitly instead of using a bare `except:`,
    # so that unrelated errors are no longer swallowed and the "found" message
    # is only printed when a BatchNorm layer is really there.
    bn0 = getattr(block, "_bn0", None)
    if isinstance(bn0, nn.BatchNorm2d):
        print("...........found BN layer  so Replacing Bn0 with GN.........")
        block._bn0 = nn.GroupNorm(1, num_channels=bn0.num_features)
        print(block._bn0)
    else:
        print("BN layer Not  found!!")

    # `_bn1` and `_bn2` are accessed unconditionally, as in the original.
    gn1 = block._bn1.num_features
    gn2 = block._bn2.num_features

    block._bn1 = nn.GroupNorm(1, num_channels=gn1)
    block._bn2 = nn.GroupNorm(1, num_channels=gn2)

    print(block._bn1)
    print(block._bn2)

I've solved NaN issues before by lowering a learning rate that was too high, or by adding a small epsilon value that had been forgotten. So please check both of these.

My lr is 3e-5 for Adam; isn't that a normal value? Also, why should I try an epsilon when batch norm worked well without one? Also, during training I can see everything going fine, but during testing I get NaN.