I have this function that stacks features together before sending them to an encoder-decoder (I am new to Transformers):
import torch

def stack_features(features, something):
    # stack each feature list into one tensor
    stacked = [torch.stack(ft, dim=0) for ft in features]
    # maximum number of rows among the tensors
    max_rows = max(t.size(1) for t in stacked)
    # padding to make sizes compatible
    padded_data = [
        torch.nn.functional.pad(t, (0, 0, 0, max_rows - t.size(1)))
        for t in stacked
    ]
    return torch.stack(padded_data, dim=0)
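To give an idea of the input, the features are nested lists of tensors, roughly like this (the contents are random and the shapes below are only an illustration):

features = [
    [torch.randn(132) for _ in range(8)],
    [torch.randn(132) for _ in range(8)],
]
out = stack_features(features, None)  # second argument is unused here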
It outputs a tensor of shape (2, 8, 132).
However, I get this error:
signjoey\transformer_layers.py", line 674, in forward
o = self.feed_forward(self.dropout(h2) + h1)
RuntimeError: The size of tensor a (16) must match the size of tensor b (8) at non-singleton dimension 1
Upon checking, the h1 and h2 inside this forward have different shapes: h1 is [2, 8, 512] and h2 is [2, 16, 512].
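The failing line is just the element-wise addition, and the mismatch is easy to reproduce in isolation:

import torch

h1 = torch.randn(2, 8, 512)
h2 = torch.randn(2, 16, 512)
h2 + h1  # RuntimeError: The size of tensor a (16) must match the size of
         # tensor b (8) at non-singleton dimension 1

This is the decoder layer where it happens: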
class TransformerDecoderLayer(nn.Module):
    """
    Transformer decoder layer.
    Consists of self-attention, source-attention, and feed-forward.
    """

    def __init__(
        self, size: int = 0, ff_size: int = 0, num_heads: int = 0, dropout: float = 0.1
    ):
        """
        Represents a single Transformer decoder layer.
        It attends to the source representation and the previous decoder states.
        :param size: model dimensionality
        :param ff_size: size of the feed-forward intermediate layer
        :param num_heads: number of heads
        :param dropout: dropout to apply to input
        """
        super(TransformerDecoderLayer, self).__init__()
        self.size = size

        self.trg_trg_att = MultiHeadedAttention(num_heads, size, dropout=dropout)
        self.src_trg_att = MultiHeadedAttention(num_heads, size, dropout=dropout)

        self.feed_forward = PositionwiseFeedForward(
            input_size=size, ff_size=ff_size, dropout=dropout
        )

        self.x_layer_norm = nn.LayerNorm(size, eps=1e-6)
        self.dec_layer_norm = nn.LayerNorm(size, eps=1e-6)

        self.dropout = nn.Dropout(dropout)

    # pylint: disable=arguments-differ
    def forward(
        self,
        x: Tensor = None,
        memory: Tensor = None,
        src_mask: Tensor = None,
        trg_mask: Tensor = None,
    ) -> Tensor:
        """
        Forward pass of a single Transformer decoder layer.
        :param x: inputs
        :param memory: source representations
        :param src_mask: source mask
        :param trg_mask: target mask (so as to not condition on future steps)
        :return: output tensor
        """
        # decoder/target self-attention
        x_norm = self.x_layer_norm(x)
        h1 = self.trg_trg_att(x_norm, x_norm, x_norm, mask=trg_mask)
        h1 = self.dropout(h1) + x

        # source-target attention
        h1_norm = self.dec_layer_norm(h1)
        h2 = self.src_trg_att(memory, memory, h1_norm, mask=src_mask)

        # final position-wise feed-forward layer
        o = self.feed_forward(self.dropout(h2) + h1)

        return o
The original code for stack_features is this:

def stack_features(features, something):
    return torch.stack([torch.stack(ft, dim=0) for ft in features], dim=0)
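If every feature were, say, a list of 16 tensors of length 132 (made-up shapes, just to see what the double stack does), the original would give the kind of shape I am aiming for:

features = [
    [torch.randn(132) for _ in range(16)],
    [torch.randn(132) for _ in range(16)],
]
out = torch.stack([torch.stack(ft, dim=0) for ft in features], dim=0)
print(out.shape)  # torch.Size([2, 16, 132])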
I edited it to the version above because my features have different sizes in dim=1. Now I notice that the original code applies the stacking twice. I would like to ask for insight on how to modify my edited stack_features so that it also applies the stacking twice, i.e. so that the tensor that currently has shape (2, 8, 132) comes out as (2, 16, 132).
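Obviously I can force the target shape by just repeating along dim=1, but I doubt that is what the original double stacking is meant to do:

outx = stack_features(features, None)  # torch.Size([2, 8, 132])
outx = outx.repeat(1, 2, 1)            # torch.Size([2, 16, 132]), probably not the right fix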
Thank you in advance