Multitask model with pretrained weights has extremely low accuracy


I'm currently working with this paper: Compare the performance of the models in art classification

It uses the following dataset: Multitask Painting Categorization by Deep Multibranch Neural Network | Imaging and Vision Laboratory

I have 3 tasks with 100k samples; the classes are 1508 artists, 125 styles and 41 genres.

My program for the ResNet multitask implementation is as follows:

# Per-channel mean and standard deviation handed to transforms.Normalize below.
mean = np.array([0.485, 0.456, 0.406])
std = np.array( [0.229, 0.224, 0.225])

# Device configuration: first CUDA device when available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Preprocessing pipelines keyed by dataset split.
# NOTE(review): this listing looks truncated - the Compose lists and the dict are
# never closed, and no tensor-conversion step is visible. Confirm against the
# full script before relying on it.
data_transforms = {
    'train': transforms.Compose([
        transforms.Normalize(mean, std)
    'test': transforms.Compose([
        # NOTE(review): Normalize appears twice in this pipeline, once before Resize;
        # presumably an artefact of the truncated listing - verify.
        transforms.Normalize(mean, std)
        transforms.Resize((224, 224)),
        transforms.Normalize(mean, std)

class Resnet18_multiTaskNet(nn.Module):
    # ResNet-18 backbone (all children except the last one) followed by three
    # parallel linear heads, one each for the artist, style and genre tasks.
    def __init__(self, pretrained=True, frozen_feature_layers = False):
        # pretrained: forwarded to models.resnet18.
        # frozen_feature_layers: stored in self.is_frozen and tested below.
        # NOTE(review): no super().__init__() call is visible in this listing;
        # nn.Module needs it before submodules are assigned - presumably lost in the paste.
        self.resnet18 = models.resnet18(pretrained=pretrained)
        # Backbone: every child of the ResNet except the last one (its classifier).
        # NOTE(review): .cuda() is hard-coded here although a `device` object is
        # configured at module level - confirm this is intended.
        self.features = nn.Sequential(*list(self.resnet18.children())[:-1]).cuda()
        self.is_frozen = frozen_feature_layers

        # NOTE(review): this branch has no body in the listing; presumably it
        # freezes the backbone (see _set_freeze_) - verify against the full script.
        if frozen_feature_layers:

        # now lets add our new layers 
        in_features = self.resnet18.fc.in_features
        #self.fc0 = nn.Linear(in_features, 512)
        #self.bn_pu = nn.BatchNorm1d(in_features, eps = 1e-5)
        # three new heads, one per task (artist, style, genre); output sizes are
        # read from the externally defined class_length mapping
        self.fc_artist = nn.Linear(in_features, class_length ['artist']) 
        self.fc_style = nn.Linear(in_features, class_length ['style'])
        self.fc_genre = nn.Linear(in_features, class_length ['genre'])
        # Re-initialise the head weights from N(0, 0.01); biases keep their default init.
        torch.nn.init.normal_(self.fc_artist .weight, mean=0.0, std=0.01)
        torch.nn.init.normal_(self.fc_style.weight, mean=0.0, std=0.01)
        torch.nn.init.normal_(self.fc_genre.weight, mean=0.0, std=0.01)

    def forward(self, input_imgs):
        # Run the backbone, flatten its output and apply the three heads.
        # Returns the tuple (artist, style, genre) of head outputs.
        output = self.features(input_imgs)
        #output = output.view(input_imgs.size(0), -1)
        #output = self.bn_pu(F.relu(self.fc0(output)))
        # since color is multi label we should use sigmoid
        # but since we want a numerical stable one, we use
        # nn.BCEWithLogitsloss, as a loss which itself applies sigmoid
        # and thus accepts logits. so we wont use sigmoid here for that matter
        # its much stabler than sigmoid+BCE
        # NOTE(review): the colour/BCE remark above looks left over from another
        # model - there is no colour head here; the heads return raw linear outputs.
        output = torch.flatten(output, 1)
        prd_arist = self.fc_artist(output)
        prd_style = self.fc_style(output)
        prd_genre = self.fc_genre(output)

        return prd_arist, prd_style, prd_genre
    def _set_freeze_(self, status):
        # Set requires_grad to `status` on every parameter of the backbone.
        for n,p in self.features.named_parameters():
            p.requires_grad = status
        # for m in self.features.children():
        #     for p in m.parameters():
        #         p.requires_grad=status    

    # NOTE(review): the two methods below have no body in this listing; presumably
    # they call _set_freeze_ with False / True respectively - verify.
    def freeze_feature_layers(self):

    def unfreeze_feature_layers(self):

class customdataset(Dataset):
    # Dataset backed by a CSV of annotations: column 0 is used as the image path
    # relative to root_dir, columns 1-3 as the artist, style and genre labels.
    def __init__(self, csv_file, root_dir, transform, n =None):
        # csv_file: path given to pd.read_csv; n: optional row limit (nrows).
        # NOTE(review): an `else:` seems to be missing between the two read_csv
        # calls; as listed, the second call always overwrites the first.
        if n == None:
            df = pd.read_csv(csv_file)
            df = pd.read_csv(csv_file, nrows=n)
        # Sorted unique values of each label column, keyed by column name.
        unsorted_labels = {x: df[x].unique() for x in ['artist','style','genre']}
        self.labels = {x: np.sort(unsorted_labels[x]) for x in ['artist','style','genre']}
        self.annotations = df
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        # Number of annotation rows.
        return len(self.annotations)

    def __getitem__(self, index):
        img_path = os.path.join(self.root_dir, self.annotations.iloc[index, 0])
        # NOTE(review): the image-loading expression is garbled in this listing;
        # presumably the file at img_path is opened and converted to RGB - verify.
        image ='RGB')

        image_artist = self.annotations.iloc[index,1]
        image_style = self.annotations.iloc[index,2]
        image_genre = self.annotations.iloc[index,3]

        #test = le.transform(list(le.classes_))
        # adapt the encoder fit to the unique values of the df column
        # NOTE(review): the next three lines are garbled (the fit calls appear to
        # be fused into the neighbouring statements). A LabelEncoder is created on
        # every __getitem__ call; presumably it is fitted on self.labels[...] per
        # task before each transform - verify against the full script.
        le = preprocessing.LabelEncoder()['style'])
        image_style = le.transform([image_style])['artist'])
        image_artist = le.transform([image_artist])['genre'])
        image_genre = le.transform([image_genre])
        if self.transform:
            #image =  np.array(image)
            #size = image.shape
            #if(size[2] == 1):
                #image = gray2rgb(image)
            image = self.transform(image)
        # NOTE(review): no return statement is visible in this listing; the training
        # loop unpacks (inputs, image_artist, image_style, image_genre) per batch.
def train_model(model, criterion, optimizer, scheduler, num_epochs):
    # Train/validate `model` for `num_epochs` epochs on three tasks (artist, style,
    # genre) and return the model.
    #   criterion: indexable with one loss callable per task (indices 0, 1, 2).
    #   optimizer / scheduler: only read for learning-rate printing in the visible code.
    # Relies on names defined outside this function: dataloaders, dataset_sizes,
    # f1_metric, n_total_steps.
    # NOTE(review): this listing is truncated in several places (marked below);
    # confirm each gap against the full script.
    since = time.time()

    # Snapshot of the weights belonging to the best validation genre accuracy so far.
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    # NOTE(review): `fields` is not used anywhere in the visible code.
    fields = ['artist', 'style', 'genre']
    #accuracies = [0.0]*len(fields)

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        # NOTE(review): recent PyTorch documents scheduler.get_last_lr() as the way
        # to read the current learning rate - confirm which version is in use.
        print('\nLearning rate at this epoch is: %0.9f' % scheduler.get_lr()[0])
        # NOTE(review): the '%.9f' placeholder is never applied here; the format
        # string and the learning rate are passed as separate print arguments.
        print('\nLearning rate at this epoch is: %.9f ', optimizer.param_groups[0]['lr'], '\n') 
        print('-' * 10)
        # NOTE(review): despite its name, `lrs` collects per-batch artist accuracy.
        lrs = []

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            # NOTE(review): an `else:` appears to be missing; as listed, model.eval()
            # runs right after model.train() in the training phase.
            if phase == 'train':
                model.train()  # Set model to training mode
                model.eval()   # Set model to evaluate mode

            # running_loss: summed total loss (weighted by batch size) for the phase.
            # running_loss2: same quantity, reset after every printed step.
            # running_loss3/4/5: artist / style / genre losses weighted by batch size.
            running_loss = 0.0
            running_loss2 = 0.0
            running_loss3 = 0.0
            running_loss4 = 0.0
            running_loss5 = 0.0

            running_corrects_artist = 0
            running_corrects_style = 0
            running_corrects_genre = 0

            # Iterate over data.
            i = 0
            for inputs,image_artist,image_style,image_genre in dataloaders[phase]:
                i = i + 1
                # NOTE(review): the right-hand sides are missing in this listing and
                # there is no line for image_artist; presumably each tensor is moved
                # to the compute device - verify.
                inputs =
                image_style =
                image_genre =

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)

                    # Per task: predicted class = argmax over dim 1; labels are
                    # squeezed and cast to long before being given to the criterion.
                    _, preds_artist = torch.max(outputs[0], 1)
                    image_artist=  torch.squeeze(image_artist).long()
                    loss_artist = criterion[0](outputs[0], image_artist)

                    _, preds_style = torch.max(outputs[1], 1)
                    image_style=  torch.squeeze(image_style).long()
                    loss_style = criterion[1](outputs[1], image_style)

                    _, preds_genre = torch.max(outputs[2], 1)
                    image_genre=  torch.squeeze(image_genre).long()
                    loss_genre = criterion[2](outputs[2], image_genre)

                    # Unweighted sum of the three task losses.
                    loss_final = loss_artist + loss_style + loss_genre 

                    # backward + optimize only if in training phase
                    # NOTE(review): the body of this branch is missing in the listing;
                    # no zero_grad / backward / optimizer step is visible.
                    if phase == 'train':
                        # zero the parameter gradients
                # statistics
                running_loss += loss_final.item() * inputs.size(0)
                running_loss2 += loss_final.item() * inputs.size(0)
                # NOTE(review): unlike the two lines above, these accumulate the loss
                # tensors themselves (no .item()); in the training phase that
                # presumably keeps each batch's autograd graph alive - verify.
                running_loss3 += loss_artist * inputs.size(0)
                running_loss4 += loss_style * inputs.size(0)
                running_loss5 += loss_genre * inputs.size(0)

                #Accuarcy corrects
                # NOTE(review): each update adds the batch MEAN accuracy (a value in
                # [0, 1]), but the epoch accuracy below divides the sum by
                # dataset_sizes[phase]. If that is the number of samples (as the
                # batch-size-weighted loss normalisation suggests), the reported
                # accuracy is too small by roughly the batch size. Summing correct
                # predictions instead of averaging would match the divisor.
                running_corrects_artist += torch.mean((preds_artist.view(*image_artist.shape) == image_artist).float())
                running_corrects_style += torch.mean((preds_style.view(*image_style.shape) == image_style).float())
                running_corrects_genre += torch.mean((preds_genre.view(*image_genre.shape) == image_genre).float())

                # Per-batch accuracies, recomputed only for printing.
                running_corrects_artis_2 = torch.mean((preds_artist.view(*image_artist.shape) == image_artist).float())
                print(f'\n running_corrects_artis_2: {running_corrects_artis_2}',)

                running_corrects_style_2 = torch.mean((preds_style.view(*image_style.shape) == image_style).float())
                print(f' running_corrects_style_2: {running_corrects_style_2}',)

                running_corrects_genre_2 = torch.mean((preds_genre.view(*image_genre.shape) == image_genre).float())
                print(f' running_corrects_genre_2: {running_corrects_genre_2}  \n',)

                # NOTE(review): these three calls are cut off after the first
                # argument; presumably the matching label tensor follows - verify.
                current_f1_artist = f1_metric['artist'](preds_artist,
                current_f1_style = f1_metric['style'](preds_style,
                current_f1_genre = f1_metric['genre'](preds_genre,

                lrs.append(torch.mean((preds_artist.view(*image_artist.shape) == image_artist).float()).cpu())
                # NOTE(review): this branch has no body in the listing.
                if(i) % 10 == 0:

                # NOTE(review): (i+1) % 1 is always 0, so this prints on every step.
                # The divisor 80 below is hard-coded; presumably the batch size - verify.
                if (i+1) % 1 == 0:
                    print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss_final.item():.4f} ,Running Loss: {running_loss / dataset_sizes[phase]}, Running Loss for Batch: {running_loss2/ 80} ')
                    #print(f'stylepred: {preds_style} , styleimage: {}')
                    print(f'running_corrects_artist: {running_corrects_artist} , running_corrects_style: {running_corrects_style}, running_corrects_genre: {running_corrects_genre}')
                    print(f'loss_final.item(): {loss_final.item()} ')
                    print(f'loss_artist: {loss_artist.item()} , loss_style: {loss_style.item()}, loss_gemre: {loss_genre.item()} ')
                    print(f'loss_artist  * inputs.size(0): {loss_artist.item()  * inputs.size(0)} , loss_style: {loss_style.item()  * inputs.size(0)}, loss_gemre: {loss_genre.item()  * inputs.size(0)} ')

                    running_loss2 = 0.0

            # NOTE(review): this branch has no body in the listing; presumably the
            # scheduler is stepped here once per training epoch - verify.
            if phase == 'train':

            # Epoch-level losses: batch-size-weighted sums divided by dataset_sizes[phase].
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_loss_artist = running_loss3  / dataset_sizes[phase]
            epoch_loss_style = running_loss4  / dataset_sizes[phase]
            epoch_loss_genre = running_loss5  / dataset_sizes[phase]

            # NOTE(review): see the accumulation note above - the numerators are sums
            # of per-batch means, so these ratios are not per-sample accuracies.
            epoch_acc_artist = running_corrects_artist.double() / dataset_sizes[phase]
            epoch_acc_style = running_corrects_style.double() / dataset_sizes[phase]
            epoch_acc_genre = running_corrects_genre.double() / dataset_sizes[phase]


            print('{} artist_Acc: {:.4f} '
                  'style_acc: {:.4f}  '
                  'genre_acc: {:.4f}  '.format(
                phase, epoch_acc_artist ,epoch_acc_style,epoch_acc_genre))
            # NOTE(review): this call is cut off; the format string expects phase,
            # total loss and the three per-task losses.
            print('{} total loss: {:.4f} artist loss: {:.4f} style loss: {:.4f} genre loss: {:.4f}'.format(phase,epoch_loss,epoch_loss_artist ,

            # deep copy the model
            # The best checkpoint is selected on validation genre accuracy only.
            if phase == 'val' and epoch_acc_genre  > best_acc:
                best_acc = epoch_acc_genre 
                best_model_wts = copy.deepcopy(model.state_dict())


    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    # These are the F1 values of the last processed batch only.
    print(f'Artist: {current_f1_artist}')
    print(f'Style: {current_f1_style}')
    print(f'Genre: {current_f1_genre}')

    # load best model weights
    # NOTE(review): no load_state_dict(best_model_wts) call is visible, so as listed
    # the model is returned with its final-epoch weights, not the best ones.
    return model

I currently train everything on 3 machines; the VGG model reaches only 30%, and so does the transformer. So I must be making big mistakes, but I have no idea where, because in the journal they reach about 60-80% accuracy with the same data. The models start at around 0% in each category and need several epochs to get better; for example, they need 20 epochs to get from 0 to about 8%.

From what I could gather from the studies, they used a learning rate of 0.2 and a cosine learning-rate scheduler with SGD. They trained for 160 epochs, and I currently need about 1 week for 160 epochs.

I think my model class may be the problem.

        torch.nn.init.normal_(self.fc_artist .weight, mean=0.0, std=0.01)
        torch.nn.init.normal_(self.fc_style.weight, mean=0.0, std=0.01)
        torch.nn.init.normal_(self.fc_genre.weight, mean=0.0, std=0.01)

This looks very suspicious, why do you need a custom initialization with very small std? I would check if the default initialization at least brings faster convergence.

I would also investigate if different learning rate decay schedules achieve different results.

1 Like

Greetings, I tested a lot of things:

I trained the models separately, and it seems they work without multitask.

I also took your advice and did not use a custom initialization. It works well, but the problem with multitask is still there:

I think it's because of the loss function.

Currently I just add the losses together and backpropagate the sum, but that doesn't seem to work. Is there a good tutorial or a recommended way to build the loss for multitask pretrained models?

I integrated the Multilossfunction from this thread:

But it still does not work. I can train them separately, but together they don't work (one task's accuracy rises while the other stays low).

I use cross-entropy for both.

# Build the two-head network and move it to the configured device, then create
# one cross-entropy criterion per task head.
model = Resnet50_multiTaskNet()
model = model.to(device)
criterion = [nn.CrossEntropyLoss() for _ in range(2)]

def loss_fn1(x, cls):
    """Return twice the value of ``criterion[0](x, cls)``."""
    first_task_loss = criterion[0](x, cls)
    return 2 * first_task_loss
def loss_fn2(x, cls):
    """Return twice the value of ``criterion[1](x, cls)``."""
    second_task_loss = criterion[1](x, cls)
    return 2 * second_task_loss

# Wrap the model and both task losses in the loss-weighting module, move it to
# the configured device, and let Adam optimise all of its parameters.
mtl = MultiTaskLoss(
    model=model,
    loss_fn=[loss_fn1, loss_fn2],
    eta=[1.0, 1.0],
)
mtl = mtl.to(device)

optimizer = optim.Adam(mtl.parameters())

class Resnet50_multiTaskNet(nn.Module):
    """ResNet-50 with IMAGENET1K_V2 weights, a frozen backbone and two linear heads.

    The backbone is evaluated up to and including its average-pooling layer;
    the flattened result feeds one linear layer for the artist task and one
    for the style task.
    """

    def __init__(self):
        super().__init__()
        backbone = models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
        # Disable gradients on every backbone parameter.
        backbone.requires_grad_(False)
        self.model = backbone

        # Both heads take the 2048-wide pooled feature vector as input.
        self.fc_artist = nn.Linear(2048, class_length['artist']).to(device)
        self.fc_style = nn.Linear(2048, class_length['style']).to(device)

    def forward(self, x):
        """Return the pair ``(artist_output, style_output)`` for input ``x``."""
        net = self.model
        stages = (
            net.conv1, net.bn1, net.relu, net.maxpool,
            net.layer1, net.layer2, net.layer3, net.layer4,
            net.avgpool,
        )
        for stage in stages:
            x = stage(x)

        # Collapse everything after the batch dimension into one feature axis.
        pooled = x.view(x.size(0), -1)
        return self.fc_artist(pooled), self.fc_style(pooled)
class MultiTaskLoss(nn.Module):
    """Combine per-task losses using learnable per-task weights ``eta``.

    Each task loss ``L_i`` contributes ``L_i * exp(-eta_i) + eta_i`` to the
    total. ``eta`` is registered as a parameter, so it is returned by
    ``parameters()`` together with the wrapped model's parameters.

    Args:
        model: Callable returning one output per task.
        loss_fn: Sequence of callables ``loss(output, target)``, one per task.
        eta: Initial values of the per-task weights, one per task.
    """

    def __init__(self, model, loss_fn, eta) -> None:
        super(MultiTaskLoss, self).__init__()
        self.model = model
        self.loss_fn = loss_fn
        self.eta = nn.Parameter(torch.Tensor(eta))

    def forward(self, input, targets) -> Tuple[list, torch.Tensor, object]:
        """Run the model and compute the individual and combined losses.

        Args:
            input: Batch passed unchanged to the wrapped model.
            targets: One target per task, in the same order as ``loss_fn``.

        Returns:
            A 3-tuple ``(losses, total, outputs)``: the list of per-task
            losses, their weighted sum, and the raw model outputs.

        Raises:
            ValueError: If the numbers of model outputs, targets, loss
                functions and ``eta`` entries are not all equal.
        """
        outputs = self.model(input)
        targets = tuple(targets)

        # zip() would silently drop surplus tasks and a short eta would
        # silently broadcast, leaving a head untrained without any error.
        n_outputs, n_targets = len(outputs), len(targets)
        n_losses, n_eta = len(self.loss_fn), self.eta.numel()
        if not (n_outputs == n_targets == n_losses == n_eta):
            raise ValueError(
                "task count mismatch: "
                f"{n_outputs} outputs, {n_targets} targets, "
                f"{n_losses} loss functions, {n_eta} eta values"
            )

        loss = [l(o, y) for l, o, y in zip(self.loss_fn, outputs, targets)]
        total_loss = torch.stack(loss) * torch.exp(-self.eta) + self.eta
        return loss, total_loss.sum(), outputs  # omit 1/2

Does anyone have an idea why?

I had the idea to run backprop twice while freezing everything except the fc layers: first I train the fc for task 1 while task 2 and the rest are frozen, and then the fc for task 2 while task 1 and the rest are frozen.