class EasyQAEarlyFusionNetwork(nn.Module):
def __init__(self, hyperparms=None):
super(EasyQAEarlyFusionNetwork, self).__init__()
self.dropout = nn.Dropout(0.3)
self.vision_projection = nn.Linear(2048, 768)
self.text_projection = nn.Linear(768, 768) # here
self.fc1 = nn.Linear(1536, 256)
self.bn1 = nn.BatchNorm1d(256)
self.classifier = nn.Linear(256, 17)
W = torch.Tensor(768, 768)
self.W = nn.Parameter(W)
self.relu_f = nn.ReLU()
# initialize weight matrices
nn.init.kaiming_uniform_(self.W, a=math.sqrt(5))
self.attention = nn.Linear(768, 768)
def forward(self, image_emb, text_emb, fact_emb):
x1 = image_emb
x1 = torch.nn.functional.normalize(x1, p=2, dim=1)
Xv = self.relu_f(self.vision_projection(x1))
x2 = text_emb
x2 = torch.nn.functional.normalize(x2, p=2, dim=1)
Xt = self.relu_f(self.text_projection(x2))
x3 = fact_emb
x3 = torch.nn.functional.normalize(x3, p=2, dim=1)
Xf = self.relu_f(self.text_projection(x3))
attention_scores = self.attention(Xt, Xf)
attention_weights = F.softmax(attention_scores.squeeze(), dim=1).unsqueeze(2)
# Apply attention weights to fact embeddings
attended_facts = torch.sum(attention_weights * Xf.unsqueeze(0), dim=1)
Xvt = Xv * Xt
Xvt = self.relu_f(torch.mm(Xvt, self.W.t()))
Xvtf = torch.cat([Xvt,attended_facts], dim=1)
Xvtf = self.fc1(Xvtf)
Xvtf = self.bn1(Xvtf)
Xvtf = self.dropout(Xvtf)
Xvtf = self.classifier(Xvtf)
return Xvtf
Could you post a code snippet showing how you are calling into the model as well as the full stacktrace, please?
1 Like
It seems you are getting a TypeError because the self.attention
layer is expecting only a single input, but you are passing two inputs (Xt and Xf) to it in this line:
attention_scores = self.attention(Xt, Xf)
The Linear layer only takes a single input
1 Like
Fixed, thanks for the help!