-            x = self.norm2(x + self._ff_block(x))
-
-        return x
-
-    # self-attention block
-    def _sa_block(self, x: Tensor,
-                  attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor]) -> Tensor:
-        x = self.self_attn(x, x, x,
-                           attn_mask=attn_mask,
-                           key_padding_mask=key_padding_mask,
-                           need_weights=False)[0]
-        return self.dropout1(x)
-
-    # feed forward block
-    def _ff_block(self, x: Tensor) -> Tensor:
-        x = self.linear2(self.dropout(self.activation(self.linear1(x))))
-        return self.dropout2(x)
-
-
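
For reference, a minimal usage sketch of nn.TransformerEncoderLayer, whose _sa_block and _ff_block helpers appear above. This is illustrative code only, not part of the diff, and the shape comments assume batch_first=True:

import torch
import torch.nn as nn

# Illustrative only, not part of this change: run one encoder layer, which
# applies the self-attention block and then the feed-forward block shown above.
layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, dim_feedforward=2048,
                                   dropout=0.1, batch_first=True)
src = torch.randn(2, 10, 512)   # (batch, seq, d_model) because batch_first=True
out = layer(src)                # same shape as the input
print(out.shape)                # torch.Size([2, 10, 512])
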
-class TransformerDecoderLayer(Module):
-    r"""TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network.