I have a transformer-like model that produces the same output for any input. I traced the constant value back to the bias of a linear layer whose inputs were all zeros, and I tracked that zeroing down to the layer normalization inside the position-wise feed-forward network:
import torch.nn as nn
import torch.nn.functional as F

class PositionWiseFeedForward(nn.Module):
    def __init__(self, spectral: SpectralWrapper, d_in, d_hid, d_out, dropout=0.1):
        super().__init__()
        # 1x1 convolutions act as position-wise linear layers;
        # SpectralWrapper comes from elsewhere in my project
        self.layer1 = spectral.wrap(nn.Conv1d(d_in, d_hid, 1))
        self.layer2 = spectral.wrap(nn.Conv1d(d_hid, d_out, 1))
        self.res_layer = spectral.wrap(nn.Conv1d(d_in, d_out, 1))
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_out)

    def forward(self, x):
        # (batch, seq, features) -> (batch, features, seq) for Conv1d
        x = x.transpose(1, 2)
        output = self.layer2(F.relu(self.layer1(x)))
        output = output.transpose(1, 2)
        output = self.dropout(output)
        # projected residual connection
        residual = self.res_layer(x)
        residual = residual.transpose(1, 2)
        output = output + residual
        # normalize over the last (feature) dimension of size d_out
        output = self.layer_norm(output)
        return output
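For reference, nn.LayerNorm normalizes each position independently over the last dimension: it subtracts the mean and divides by the standard deviation of the d_out features (plus eps), then applies a learned scale and shift. A quick standalone check on dummy data (not my model) confirms this:

import torch

x = torch.rand(1, 6, 3)
ln = torch.nn.LayerNorm(3)
mean = x.mean(dim=-1, keepdim=True)
var = x.var(dim=-1, unbiased=False, keepdim=True)
# at default initialization the learned scale is 1 and the shift is 0
manual = (x - mean) / torch.sqrt(var + ln.eps)
print(torch.allclose(ln(x), manual, atol=1e-6))  # True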
I’ve added some hooks:
import torch

from src.models import TransformerEncoder

activation = {}

def get_activation(name):
    def hook(model, input, output):
        # store the hooked module's (input, output) pair for later inspection
        activation[name] = (input[0].detach(), output.detach())
    return hook

model = TransformerEncoder((6, 3))
# hook the layer norm inside the second encoder block's feed-forward network
model.enc_block2.ff.layer_norm.register_forward_hook(get_activation('x'))

a = torch.rand(1, 6, 3)
b = model(a)
print("inputs:", activation["x"][0])
print("outputs:", activation["x"][1])
which printed:
inputs:
tensor([[[ 0.1303883195],
[ 1.3866363764],
[ 0.5093938112],
[-1.3120404482],
[-0.4072887599],
[ 1.0759189129]]])
outputs:
tensor([[[0.],
[0.],
[0.],
[0.],
[0.],
[0.]]])
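Note that the input captured by the hook has a trailing feature dimension of size 1. A quick standalone check (independent of my model) shows that LayerNorm over a size-1 dimension maps any input to zeros at default initialization, since each element equals its own mean:

import torch

ln = torch.nn.LayerNorm(1)  # normalized dimension of size 1
print(ln(torch.rand(1, 6, 1)))  # all zeros: x - mean(x) == 0 elementwise

This matches the output above, so the question may come down to why the feature dimension reaching this norm is 1.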
It's also worth mentioning that this happens only in the second encoder block; the first one outputs nonzero values.