TypeError: forward() takes 2 positional arguments but 3 were given

import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class EasyQAEarlyFusionNetwork(nn.Module):

    def __init__(self, hyperparms=None):
        super().__init__()
        self.dropout = nn.Dropout(0.3)
        self.vision_projection = nn.Linear(2048, 768)   # image features -> shared 768-d space
        self.text_projection = nn.Linear(768, 768)      # shared projection for question and fact embeddings
        self.fc1 = nn.Linear(1536, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.classifier = nn.Linear(256, 17)
        W = torch.Tensor(768, 768)
        self.W = nn.Parameter(W)
        self.relu_f = nn.ReLU()
        # initialize weight matrices
        nn.init.kaiming_uniform_(self.W, a=math.sqrt(5))

        self.attention = nn.Linear(768, 768)
    def forward(self, image_emb, text_emb, fact_emb):

        # L2-normalize each modality, then project into the shared 768-d space
        x1 = F.normalize(image_emb, p=2, dim=1)
        Xv = self.relu_f(self.vision_projection(x1))

        x2 = F.normalize(text_emb, p=2, dim=1)
        Xt = self.relu_f(self.text_projection(x2))

        x3 = F.normalize(fact_emb, p=2, dim=1)
        Xf = self.relu_f(self.text_projection(x3))

        attention_scores = self.attention(Xt, Xf)  # <-- the TypeError is raised here
        attention_weights = F.softmax(attention_scores.squeeze(), dim=1).unsqueeze(2)

        # Apply attention weights to fact embeddings
        attended_facts = torch.sum(attention_weights * Xf.unsqueeze(0), dim=1)

        Xvt = Xv * Xt
        Xvt = self.relu_f(torch.mm(Xvt, self.W.t()))

        Xvtf = torch.cat([Xvt,attended_facts], dim=1)

        Xvtf = self.fc1(Xvtf)
        Xvtf = self.bn1(Xvtf)
        Xvtf = self.dropout(Xvtf)
        Xvtf = self.classifier(Xvtf)

        return Xvtf

Could you post a code snippet showing how you are calling into the model as well as the full stacktrace, please?


It seems you are getting the TypeError because the self.attention layer expects a single input, but you are passing two inputs (Xt and Xf) to it in this line:

attention_scores = self.attention(Xt, Xf)

nn.Linear's forward() only takes a single input tensor, so calling self.attention(Xt, Xf) passes one argument too many, which is exactly the "forward() takes 2 positional arguments but 3 were given" error.
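One way to keep the per-fact attention while only ever handing nn.Linear a single tensor is to project the text embedding and score it against each fact with a matrix product. A minimal sketch, assuming text_emb is (batch, 768) and fact_emb is (num_facts, 768), which is what your existing unsqueeze/sum logic suggests:

        # inside forward(), replacing the failing call
        attention_scores = torch.matmul(self.attention(Xt), Xf.t())             # (batch, num_facts)
        attention_weights = F.softmax(attention_scores, dim=1).unsqueeze(2)     # (batch, num_facts, 1)
        attended_facts = torch.sum(attention_weights * Xf.unsqueeze(0), dim=1)  # (batch, 768)

Here self.attention acts as a learned bilinear weight between the text and fact spaces, and the softmax over dim=1 normalizes the scores across the facts.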


Fixed, thanks for the help!