How to apply L1 and L2 regularization to a ResNet and overcome overfitting

import os
import time
import copy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import datasets, models, transforms

data_transforms = {
    'downsampled': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.ColorJitter(brightness=0.5),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'd_valid': transforms.Compose([
        transforms.Resize(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

data_dir = 'down_sample'
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['downsampled', 'd_valid']}

dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=16,
                                              shuffle=False, num_workers=4)
               for x in ['downsampled', 'd_valid']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['downsampled', 'd_valid']}
class_names = image_datasets['downsampled'].classes

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Let's train the model
def train_model(model, criterion, optimizer, scheduler, num_epochs=5):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['downsampled', 'd_valid']:
            if phase == 'downsampled':
                scheduler.step()
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history only if in train
                with torch.set_grad_enabled(phase == 'downsampled'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'downsampled':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'd_valid' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

model_1 = models.resnext50_32x4d(pretrained=True)
num_ftrs = model_1.fc.in_features
model_1.fc = nn.Linear(num_ftrs, 13)

model_1 = model_1.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized

optimizer_ft = optim.Adam(model_1.parameters(), lr=0.001, weight_decay=0.4)

# Decay LR by a factor of 0.1 every 7 epochs

exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
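
On the actual L1/L2 part of the question: the weight_decay argument passed to optim.Adam above is already an L2-style penalty (applied inside the optimizer update rather than added to the loss), while L1 has no built-in switch in PyTorch and is usually added to the loss by hand. Below is a minimal sketch of how the training step inside train_model could apply both penalties explicitly; it is not part of the original code, and l1_lambda / l2_lambda are made-up coefficients that would need to be tuned.

# Sketch only: explicit L1/L2 penalties added to the loss in the training phase.
# l1_lambda / l2_lambda are hypothetical coefficients, to be tuned on the validation set.
l1_lambda = 1e-5
l2_lambda = 1e-4

with torch.set_grad_enabled(phase == 'downsampled'):
    outputs = model(inputs)
    _, preds = torch.max(outputs, 1)
    loss = criterion(outputs, labels)

    if phase == 'downsampled':
        # L1: sum of absolute values of all trainable weights
        l1_penalty = sum(p.abs().sum() for p in model.parameters())
        # L2: sum of squared weights (redundant if weight_decay is already set)
        l2_penalty = sum(p.pow(2).sum() for p in model.parameters())
        loss = loss + l1_lambda * l1_penalty + l2_lambda * l2_penalty

        loss.backward()
        optimizer.step()

If only L2 is wanted, keeping weight_decay in the optimizer and dropping the explicit l2_penalty term is the simpler route.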

I don't know if this will solve your problem, but your scheduler is quite aggressive: decaying the learning rate by a factor of 0.1 every 7 epochs does not seem that efficient. Adam is already an adaptive optimizer, so maybe you should just remove the scheduler.

Apart from that, can you provide the graphs of both your training and validation losses? I also suggest you change the structure of your code a little, to something like:

for epoch in range(num_epochs):
    model.train()
    for data in training_data:
        train something
    model.eval()
    with torch.no_grad():
        for data in validation_data:
            validate something

It should be useful if you change it like that, and if you can provide the graphs we can see what happens; a slightly fuller sketch of the same restructure follows below.
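
For reference, here is one possible shape for that restructured loop, with per-epoch losses collected so the requested curves can be plotted afterwards. This is only a sketch, not the poster's code: train_loader, valid_loader, and history are placeholder names (they would correspond to dataloaders['downsampled'], dataloaders['d_valid'], and a plain dict for logging).

# Sketch of the restructured training/validation loop with loss logging.
history = {'train_loss': [], 'valid_loss': []}

for epoch in range(num_epochs):
    # Training phase
    model.train()
    train_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * inputs.size(0)

    # Validation phase: no gradient tracking needed
    model.eval()
    valid_loss = 0.0
    with torch.no_grad():
        for inputs, labels in valid_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            loss = criterion(model(inputs), labels)
            valid_loss += loss.item() * inputs.size(0)

    history['train_loss'].append(train_loss / len(train_loader.dataset))
    history['valid_loss'].append(valid_loss / len(valid_loader.dataset))

Plotting history['train_loss'] against history['valid_loss'] (e.g. with matplotlib) makes it easy to see where the two curves diverge, which is where overfitting starts.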

PS: sorry for the late answer

This is what my training and validation loss curves look like; I guess something is wrong in my code.