stacked_self_attention
StackedSelfAttentionDecoderNet#
class StackedSelfAttentionDecoderNet(DecoderNet):
| def __init__(
| self,
| decoding_dim: int,
| target_embedding_dim: int,
| feedforward_hidden_dim: int,
| num_layers: int,
| num_attention_heads: int,
| use_positional_encoding: bool = True,
| positional_encoding_max_steps: int = 5000,
| dropout_prob: float = 0.1,
| residual_dropout_prob: float = 0.2,
| attention_dropout_prob: float = 0.1
| ) -> None
A stacked self-attention decoder implementation.
Parameters

- decoding_dim : int
    Defines the dimensionality of output vectors.
- target_embedding_dim : int
    Defines the dimensionality of input target embeddings. Since this model takes its output on a previous step as input to the following step, this is also the input dimensionality.
- feedforward_hidden_dim : int
    The middle dimension of the FeedForward network. The input and output dimensions are fixed to ensure sizes match up for the self-attention layers.
- num_layers : int
    The number of stacked self attention -> feedforward -> layer normalisation blocks.
- num_attention_heads : int
    The number of attention heads to use per layer.
- use_positional_encoding : bool, optional (default = True)
    Whether to add sinusoidal frequencies to the input tensor. This is strongly recommended, as without this feature, the self attention layers have no idea of absolute or relative position (as they are just computing pairwise similarity between vectors of elements), which can be important features for many tasks.
- positional_encoding_max_steps : int, optional (default = 5000)
    The maximum number of positions for which sinusoidal positional encodings are computed.
- dropout_prob : float, optional (default = 0.1)
    The dropout probability for the feedforward network.
- residual_dropout_prob : float, optional (default = 0.2)
    The dropout probability for the residual connections.
- attention_dropout_prob : float, optional (default = 0.1)
    The dropout probability for the attention distributions in each attention layer.
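The sinusoidal encoding referred to by `use_positional_encoding` can be sketched in plain Python. This is a minimal illustration of the standard formula from "Attention Is All You Need", not AllenNLP's actual implementation; the function name is hypothetical:

```python
import math

def sinusoidal_positional_encoding(max_steps: int, dim: int) -> list:
    """Build a (max_steps x dim) table of sinusoidal position encodings.

    Even indices get sin, odd indices get cos, with wavelengths forming
    a geometric progression up to 10000 * 2 * pi.
    """
    table = []
    for pos in range(max_steps):
        row = []
        for i in range(dim):
            # Each sin/cos pair shares one frequency, indexed by i // 2.
            angle = pos / (10000 ** (2 * (i // 2) / dim))
            row.append(math.sin(angle) if i % 2 == 0 else math.cos(angle))
        table.append(row)
    return table

pe = sinusoidal_positional_encoding(max_steps=4, dim=6)
# At position 0, every sin term is 0.0 and every cos term is 1.0.
```

In the decoder these rows are added element-wise to the target embeddings, which is why each unique position gets a distinct signature the attention layers can exploit.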
init_decoder_state#
class StackedSelfAttentionDecoderNet(DecoderNet):
| ...
| @overrides
| def init_decoder_state(
| self,
| encoder_out: Dict[str, torch.LongTensor]
| ) -> Dict[str, torch.Tensor]
forward#
class StackedSelfAttentionDecoderNet(DecoderNet):
| ...
| @overrides
| def forward(
| self,
| previous_state: Dict[str, torch.Tensor],
| encoder_outputs: torch.Tensor,
| source_mask: torch.BoolTensor,
| previous_steps_predictions: torch.Tensor,
| previous_steps_mask: Optional[torch.BoolTensor] = None
| ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]
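Inside `forward`, previously predicted targets must only attend to earlier positions. A transformer decoder typically enforces this with a lower-triangular "subsequent positions" mask; here is an illustrative pure-Python sketch (not the AllenNLP code):

```python
def subsequent_mask(size: int) -> list:
    """Boolean mask where entry [i][j] is True iff position i may attend
    to position j, i.e. j <= i (no peeking at future targets)."""
    return [[j <= i for j in range(size)] for i in range(size)]

mask = subsequent_mask(3)
# [[True, False, False],
#  [True, True,  False],
#  [True, True,  True ]]
```

This mask is combined with `previous_steps_mask` (padding) so that attention scores to disallowed positions are zeroed out.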
Decoder#
class Decoder(nn.Module):
| def __init__(self, layer: nn.Module, num_layers: int) -> None
Transformer N-layer decoder with masking. Code taken from http://nlp.seas.harvard.edu/2018/04/03/attention.html
forward#
class Decoder(nn.Module):
| ...
| @overrides
| def forward(
| self,
| x: torch.Tensor,
| memory: torch.Tensor,
| src_mask: torch.BoolTensor,
| tgt_mask: torch.BoolTensor
| ) -> torch.Tensor
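Following the Annotated Transformer pattern, `Decoder.forward` simply threads the input through each of its `num_layers` cloned layers and then applies a final layer norm. A stripped-down sketch with plain callables standing in for `nn.Module`s (names here are illustrative, not the real class):

```python
from typing import Callable, List

def run_decoder(x, memory, src_mask, tgt_mask,
                layers: List[Callable], norm: Callable):
    """Apply each decoder layer in sequence, then normalise the result."""
    for layer in layers:
        x = layer(x, memory, src_mask, tgt_mask)
    return norm(x)

# Toy usage: each "layer" adds 1; the "norm" halves the value.
add_one = lambda x, m, sm, tm: x + 1
halve = lambda x: x / 2
out = run_decoder(0, None, None, None, layers=[add_one] * 3, norm=halve)
```

The real module clones a single `DecoderLayer` `num_layers` times, so every layer has identical structure but independent parameters.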
DecoderLayer#
class DecoderLayer(nn.Module):
| def __init__(
| self,
| size: int,
| self_attn: MultiHeadedAttention,
| src_attn: MultiHeadedAttention,
| feed_forward: F,
| dropout: float
| ) -> None
A single layer of a transformer decoder. Code taken from http://nlp.seas.harvard.edu/2018/04/03/attention.html
forward#
class DecoderLayer(nn.Module):
| ...
| def forward(
| self,
| x: torch.Tensor,
| memory: torch.Tensor,
| src_mask: torch.BoolTensor,
| tgt_mask: torch.BoolTensor
| ) -> torch.Tensor
Follow Figure 1 (right) for connections.
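Figure 1 (right) of "Attention Is All You Need" shows three sublayers per decoder layer: masked self-attention over the target, source attention over the encoder memory, and a position-wise feed-forward network, each wrapped in a residual connection. A minimal sketch of that wiring (illustrative only; the real sublayers also apply layer norm and dropout):

```python
def decoder_layer(x, memory, src_mask, tgt_mask,
                  self_attn, src_attn, feed_forward):
    """One decoder layer: each sublayer's output is added back to its
    input (residual connection), following Figure 1 (right)."""
    x = x + self_attn(x, x, x, tgt_mask)            # masked self-attention
    x = x + src_attn(x, memory, memory, src_mask)   # attend over encoder memory
    return x + feed_forward(x)                      # position-wise feed-forward

# Toy usage with scalar stand-ins for tensors:
out = decoder_layer(
    1.0, 10.0, None, None,
    self_attn=lambda q, k, v, m: q * 0,    # contributes nothing here
    src_attn=lambda q, k, v, m: k * 0.1,   # reads 10% of "memory"
    feed_forward=lambda h: h + 0.5,
)
```

Note the query/key/value calling convention: self-attention uses `x` for all three, while source attention queries with `x` but keys and values come from the encoder `memory`.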