Develop multitask model based on vision transformer

Hi I used hugging face vision transofrmer to build my multitask model but encountered errors like:

TypeError: linear(): argument ‘input’ (position 1) must be Tensor, not ImageClassifierOutput

The code for the architecture looks like the following:

class multi_output_model(torch.nn.Module):

def __init__(self,categories,regression_out,frozen_layer=False):
    super(multi_output_model,self).__init__()
    
    #vit = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224-in21k')
    self.vit = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224-in21k')
    #self.features = torch.nn.ModuleList(vit.children())[:-1]
    #self.features = torch.nn.Sequential(*self.features)
    #in_features = vit.classifier.in_features
    in_features = self.vit.config.hidden_size
    if frozen_layer:
        self.freeze_feature_layers()
    self.classification = torch.nn.Linear(in_features,categories)
    self.regression = torch.nn.Linear(in_features,regression_out)

        
def forward(self,input_imgs):
    #output = self.features(input_imgs)
    output = self.vit(input_imgs)
    #output = output.logits.view(input_imgs.size(0), -1)
    output_classification = self.classification(output)
    output_regression = self.regression(output)
    output_regression =torch.reshape(output_regression,[output_regression.size()[0],17,100])
    #return [output_regression,output_classification]
    return SequenceClassifierOutput(
        logits=[output_regression,output_classification],
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

def freeze_feature_layers(self):
    
    for n,p in self.features.named_parameters():
        p.requires_grad = False

Can any body help me with such problem?
Looking forward to hearing from you!

It looks like you’re trying to run your code on a CPU when your model is on a GPU. Make sure to put your model on the CPU before trying to run it on the CPU.