Attention pooling - Trying toy addition problem


I am trying to solve the addition problem using a form of attention mechanism “additive attention” as outlined at The problem addition problem consists of 2 number sequences of equal length, one sequence contains all zeros except at 2 indices, where it contains 1 and the solution to the problem is adding the numbers from first sequence at locations where second sequence has ones. This is essentially a regression problem. My network consists of attention pooling layer, fully-connected layer and output layer.
Can you please go over the model I have setup below? I specifically need help with passing the correct dimensions of inputs to the fully connected layer fc1. Any help is greatly appreciated.

from typing import Tuple
import torch.nn.functional as F
import numpy as np
import torch
from torch import nn, Tensor

def adding_problem_generator(N, seq_len=50, high=1):

   X_num = np.random.uniform(low=0, high=high, size=(N, seq_len, 1))
   X_mask = np.zeros((N, seq_len, 1))
   Y = np.ones((N, 1))
   for i in range(N):
      # Default uniform distribution on position sampling
      positions = np.random.choice(seq_len, size=2, replace=False)
      X_mask[i, positions] = 1
      Y[i, 0] = np.sum(X_num[i, positions])
      X = np.append(X_num, X_mask, axis=2)
      return X, Y

def addition_problem(sequence_length=50):

      output = np.random.uniform(-1, 1, (sequence_length, 2)) # 50x2
      output[:, 0] = 0.
      random_indices = np.random.choice(sequence_length, size=2, replace=False)
      output[random_indices, [0, 0]] = 1
      x = output
      y = output[:, 0]*output[:, 1]
      return output, (output[:, 0]*output[:, 1]).sum(keepdims=True)

class AdditiveAttention(nn.Module):

    def __init__(self, hidden_dim: int) -> None:

      super(AdditiveAttention, self).__init__()
      self.query_proj = nn.Linear(hidden_dim, hidden_dim, bias=False)
      self.key_proj = nn.Linear(hidden_dim, hidden_dim, bias=False)
      self.bias = nn.Parameter(torch.rand(hidden_dim).uniform_(-0.1, 0.1))
      self.score_proj = nn.Linear(hidden_dim, 1)

   def forward(self, query: Tensor, key: Tensor, value: Tensor) -> Tuple[Tensor,

    score = self.score_proj(torch.tanh(self.key_proj(key) + self.query_proj(query) +
    attn = F.softmax(score, dim=-1)
    context = torch.bmm(attn.unsqueeze(1), value)
    return context, attn

 ### this is where I am having trouble ###
class Model(nn.Module):

   def __init__(self, input_size=2, hidden_size=64):
     self.attention = AdditiveAttention(hidden_dim=hidden_size)
     # not sure what arguments to pass to fully connected layer
     self.fc1 = nn.Linear(hidden_size * seq_size, vocab_size)  # converting n rows to 1
     self.softmax = nn.Softmax(dim=1)

  def forward(self, x):

    x = self.attention(x).view(1, -1)
    x = self.fc1(x)
    log_probs = F.log_softmax(x, dim=1)
    return log_probs

x_train, y_train = adding_problem_generator(10000)
x_test, y_test = adding_problem_generator(1000)

learning_rate = 0.001
loss_function = nn.NLLLoss()  # negative log likelihood

model = Model(2, 64)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

for i in range(20):

  x,y = addition_problem()
  total_loss = 0

  loss = torch.pow(model(torch.from_numpy(x)) - torch.from_numpy(y), 2)

  my_loss = loss.cpu().item()