# Attention pooling - trying the toy addition problem

Hello,

I am trying to solve the addition problem using a form of attention mechanism, "additive attention", as outlined at d2l.ai. The addition problem consists of two number sequences of equal length: the second sequence is all zeros except at two indices, where it contains ones, and the target is the sum of the values from the first sequence at the positions where the second sequence has ones. This is essentially a regression problem. My network consists of an attention pooling layer, a fully-connected layer, and an output layer.
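
For concreteness, a single sample looks roughly like this (the numbers below are made up purely for illustration):

```python
import numpy as np

values  = np.array([0.4, 0.9, 0.1, 0.7, 0.2, 0.5])  # first sequence: the numbers
markers = np.array([0.0, 1.0, 0.0, 0.0, 1.0, 0.0])  # second sequence: ones mark which numbers to add
target  = np.sum(values * markers)                   # 0.9 + 0.2 = 1.1
```
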
Can you please go over the model I have set up below? I specifically need help with passing inputs of the correct dimensions to the fully connected layer fc1. Any help is greatly appreciated.

```python
from typing import Tuple
import torch.nn.functional as F
import numpy as np
import torch
from torch import nn, Tensor

def generate_dataset(N, seq_len, high):
    # Build N (sequence, target) pairs; the target is the sum of the
    # values at two randomly chosen positions in each sequence.
    X_num = np.random.uniform(low=0, high=high, size=(N, seq_len, 1))
    Y = np.ones((N, 1))
    for i in range(N):
        # Default uniform distribution on position sampling
        positions = np.random.choice(seq_len, size=2, replace=False)
        Y[i, 0] = np.sum(X_num[i, positions])
    return X_num, Y

def generate_sample(sequence_length):
    # One sample with two channels: column 1 holds the values,
    # column 0 is the 0/1 marker for which values to add.
    output = np.random.uniform(-1, 1, (sequence_length, 2))  # 50x2
    output[:, 0] = 0.
    random_indices = np.random.choice(sequence_length, size=2, replace=False)
    output[random_indices, 0] = 1
    y = (output[:, 0] * output[:, 1]).sum(keepdims=True)  # sum of the two marked values
    return output, y

class AdditiveAttention(nn.Module):

    def __init__(self, hidden_dim: int) -> None:
        super().__init__()
        self.query_proj = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.key_proj = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.bias = nn.Parameter(torch.rand(hidden_dim).uniform_(-0.1, 0.1))
        self.score_proj = nn.Linear(hidden_dim, 1)

    def forward(self, query: Tensor, key: Tensor, value: Tensor) -> Tuple[Tensor, Tensor]:
        # score: (batch, seq_len) after squeezing the last dimension
        score = self.score_proj(torch.tanh(self.key_proj(key) + self.query_proj(query) + self.bias)).squeeze(-1)
        attn = F.softmax(score, dim=-1)
        # context: (batch, 1, hidden_dim) = attention-weighted sum of the values
        context = torch.bmm(attn.unsqueeze(1), value)
        return context, attn

### this is where I am having trouble ###
class Model(nn.Module):

    def __init__(self, input_size=2, hidden_size=64):
        super().__init__()
        self.attention = AdditiveAttention(hidden_size)
        # not sure what arguments to pass to the fully connected layer
        self.fc1 = nn.Linear(hidden_size * seq_size, vocab_size)  # converting n rows to 1
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.attention(x).view(1, -1)
        x = self.fc1(x)
        log_probs = F.log_softmax(x, dim=1)
        return log_probs

learning_rate = 0.001
loss_function = nn.NLLLoss()  # negative log likelihood

model = Model(2, 64)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

for i in range(20):