Issues implementing 1D self-attention

I’m trying to implement the 1D self-attention illustrated in this paper

Specifically, I’m focusing on the following picture:

Below you can find my attempt:

import torch.nn as nn
import torch

#INPUT shape ((B), n_channels, height, width)

class Self_Attention1D(nn.Module):
    """Pointwise gating block for 4-D feature maps.

    The input is lifted to ``out_channels`` with a 1x1 convolution, a gate is
    computed as ``gamma(phi(x) - psi(x))`` on the channel vector of every
    spatial position, the gate multiplies the lifted features element-wise,
    and a second 1x1 convolution projects back to ``in_channels``.

    NOTE(review): ``phi`` and ``psi`` are both evaluated on the *same*
    position, so ``delta`` never compares an entry with its neighbours; no
    locality/neighbourhood is involved in this implementation.

    Args:
        in_channels: Number of channels of the input (and of the output).
        out_channels: Number of channels used internally after the first
            pointwise convolution.
    """

    def __init__(self, in_channels=1, out_channels=3):
        # nn.Module must be initialised before any submodule is assigned,
        # otherwise attribute assignment of a module raises an error.
        super().__init__()

        # The input is 4-D (B, C, H, W), so a pointwise convolution is a
        # Conv2d with a 1x1 kernel (Conv1d expects a single length dimension).
        self.pointwise_conv1 = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
        )
        self.pointwise_conv2 = nn.Conv2d(
            in_channels=out_channels,
            out_channels=in_channels,
            kernel_size=1,
        )
        # MLP is expected to act on the last dimension of its input.
        self.phi = MLP(in_size=out_channels, out_size=32)
        self.psi = MLP(in_size=out_channels, out_size=32)
        self.gamma = MLP(in_size=32, out_size=out_channels)

    def forward(self, x):
        """Apply the gating block.

        Args:
            x: Tensor of shape ``(B, in_channels, height, width)``.

        Returns:
            Tensor of shape ``(B, in_channels, height, width)``.
        """
        x = self.pointwise_conv1(x)
        # Move channels to the last axis so the MLPs operate per position:
        # (B, C, H, W) -> (B, W, H, C).
        channels_last = x.transpose(1, 3)
        phi = self.phi(channels_last)
        psi = self.psi(channels_last)
        delta = phi - psi
        # Back to channels-first so the gate lines up with ``x``.
        gamma = self.gamma(delta).transpose(3, 1)
        out = self.pointwise_conv2(torch.mul(gamma, x))
        return out

class MLP(nn.Module):
    """Small feed-forward network applied to the last dimension of its input.

    Args:
        in_size: Size of the last dimension of the input.
        out_size: Size of the last dimension of the output.
    """

    def __init__(self, in_size, out_size):
        # Required before registering submodules on an nn.Module.
        super().__init__()

        self.in_size = in_size
        self.out_size = out_size
        # NOTE(review): the original layer stack was cut off after the first
        # Linear layer; a ReLU and an output projection to ``out_size`` are
        # assumed here so the module is usable -- adjust depth/activation to
        # match the intended architecture.
        self.layers = nn.Sequential(
            nn.Linear(in_size, 64),
            nn.ReLU(),
            nn.Linear(64, out_size),
        )

    def forward(self, x):
        """Return ``self.layers`` applied to ``x``."""
        out = self.layers(x)
        return out

What is not clear to me is how to implement the boxed operation so that the delta function takes each single entry together with one of its neighbours at a time, as the picture indicates it should.

I think my implementation is definitely ignoring the locality of the operation.