I’m trying to implement the 1D self-attention illustrated in this paper.
Specifically, I’m focusing on the following picture:
Below you can find my attempt:
import torch
import torch.nn as nn

# INPUT shape: (B, n_channels, height, width)
class Self_Attention1D(nn.Module):
    def __init__(self, in_channels=1, out_channels=3):
        super().__init__()
        # 1x1 convolutions project the channels pointwise at every spatial location
        self.pointwise_conv1 = nn.Conv2d(in_channels=in_channels,
                                         out_channels=out_channels,
                                         kernel_size=1)
        self.pointwise_conv2 = nn.Conv2d(in_channels=out_channels,
                                         out_channels=in_channels,
                                         kernel_size=1)
        self.phi = MLP(in_size=out_channels, out_size=32)
        self.psi = MLP(in_size=out_channels, out_size=32)
        self.gamma = MLP(in_size=32, out_size=out_channels)

    def forward(self, x):
        x = self.pointwise_conv1(x)                  # (B, out_channels, H, W)
        # move channels to the last dim so the MLPs act per spatial location
        phi = self.phi(x.transpose(1, 3))            # (B, W, H, 32)
        psi = self.psi(x.transpose(1, 3))            # (B, W, H, 32)
        delta = phi - psi                            # each entry is only compared with itself
        gamma = self.gamma(delta).transpose(3, 1)    # (B, out_channels, H, W)
        out = self.pointwise_conv2(torch.mul(gamma, x))
        return out
class MLP(nn.Module):
    def __init__(self, in_size, out_size):
        super().__init__()
        self.in_size = in_size
        self.out_size = out_size
        self.layers = nn.Sequential(
            nn.Linear(in_size, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, out_size))

    def forward(self, x):
        out = self.layers(x)
        return out
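For what it’s worth, this is how I am exercising the module, just a dummy batch to check that the shapes line up (the sizes are arbitrary):

if __name__ == "__main__":
    # dummy input: batch of 2 single-channel 8x16 maps
    x = torch.randn(2, 1, 8, 16)
    attention = Self_Attention1D(in_channels=1, out_channels=3)
    out = attention(x)
    print(out.shape)  # torch.Size([2, 1, 8, 16])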
What is not clear to me is how to implement the boxed operation so that the delta function takes each single entry together with each of its neighbours, one pair at a time, as the picture suggests.
I think my implementation is definitely ignoring the locality of the operation.
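To make the question more concrete, below is my current guess at what a local version could look like. It is only a sketch under my own assumptions, not something taken from the paper: I assume a (B, channels, length) layout, a centred window of k neighbours around each position i, and that the boxed operation is something like delta(x_i, x_j) = phi(x_i) - psi(x_j) for every neighbour j of i. It reuses the MLP class from above, and the class name LocalPairwiseAttention1D is just mine.

import torch.nn as nn
import torch.nn.functional as F

class LocalPairwiseAttention1D(nn.Module):
    # Hypothetical sketch: each position i attends to a window of k
    # neighbours j, with per-neighbour weight gamma(phi(x_i) - psi(x_j)).
    def __init__(self, channels=3, hidden=32, k=3):
        super().__init__()
        assert k % 2 == 1, "odd window size so it can be centred on i"
        self.k = k
        self.phi = MLP(in_size=channels, out_size=hidden)
        self.psi = MLP(in_size=channels, out_size=hidden)
        self.gamma = MLP(in_size=hidden, out_size=channels)

    def forward(self, x):
        # x: (B, C, L) -> (B, L, C) so the MLPs act on the channel dim
        x = x.transpose(1, 2)
        pad = self.k // 2
        # pad the sequence dim and gather, for every i, its k neighbours j
        x_pad = F.pad(x, (0, 0, pad, pad))             # (B, L + 2*pad, C)
        neighbours = x_pad.unfold(1, self.k, 1)        # (B, L, C, k)
        neighbours = neighbours.permute(0, 1, 3, 2)    # (B, L, k, C)

        # delta(x_i, x_j) = phi(x_i) - psi(x_j), one neighbour at a time
        phi = self.phi(x).unsqueeze(2)                 # (B, L, 1, hidden)
        psi = self.psi(neighbours)                     # (B, L, k, hidden)
        delta = phi - psi                              # (B, L, k, hidden)

        # per-neighbour weights, then aggregate over the window
        weights = self.gamma(delta)                    # (B, L, k, C)
        out = (weights * neighbours).sum(dim=2)        # (B, L, C)
        return out.transpose(1, 2)                     # back to (B, C, L)

The zero padding at the borders is also an arbitrary choice on my side; the picture does not say what happens at the edges.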