Skip to content

stacked_self_attention

allennlp_models.generation.modules.decoder_nets.stacked_self_attention

[SOURCE]


StackedSelfAttentionDecoderNet#

@DecoderNet.register("stacked_self_attention")
class StackedSelfAttentionDecoderNet(DecoderNet):
 | def __init__(
 |     self,
 |     decoding_dim: int,
 |     target_embedding_dim: int,
 |     feedforward_hidden_dim: int,
 |     num_layers: int,
 |     num_attention_heads: int,
 |     use_positional_encoding: bool = True,
 |     positional_encoding_max_steps: int = 5000,
 |     dropout_prob: float = 0.1,
 |     residual_dropout_prob: float = 0.2,
 |     attention_dropout_prob: float = 0.1
 | ) -> None

A Stacked self-attention decoder implementation.

Parameters

  • decoding_dim : int
    Defines dimensionality of output vectors.
  • target_embedding_dim : int
    Defines the dimensionality of the input target embeddings. Since this model takes its output from the previous step as the input to the following step, this is also the input dimensionality.
  • feedforward_hidden_dim : int
    The middle dimension of the FeedForward network. The input and output dimensions are fixed to ensure sizes match up for the self attention layers.
  • num_layers : int
    The number of stacked self-attention -> feedforward -> layer normalisation blocks.
  • num_attention_heads : int
    The number of attention heads to use per layer.
  • use_positional_encoding : bool, optional (default = True)
    Whether to add sinusoidal frequencies to the input tensor. This is strongly recommended, as without this feature, the self attention layers have no idea of absolute or relative position (as they are just computing pairwise similarity between vectors of elements), which can be important features for many tasks.
  • dropout_prob : float, optional (default = 0.1)
    The dropout probability for the feedforward network.
  • residual_dropout_prob : float, optional (default = 0.2)
    The dropout probability for the residual connections.
  • attention_dropout_prob : float, optional (default = 0.1)
    The dropout probability for the attention distributions in each attention layer.

init_decoder_state#

class StackedSelfAttentionDecoderNet(DecoderNet):
 | ...
 | def init_decoder_state(
 |     self,
 |     encoder_out: Dict[str, torch.LongTensor]
 | ) -> Dict[str, torch.Tensor]

forward#

class StackedSelfAttentionDecoderNet(DecoderNet):
 | ...
 | def forward(
 |     self,
 |     previous_state: Dict[str, torch.Tensor],
 |     encoder_outputs: torch.Tensor,
 |     source_mask: torch.BoolTensor,
 |     previous_steps_predictions: torch.Tensor,
 |     previous_steps_mask: Optional[torch.BoolTensor] = None
 | ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]

Decoder#

class Decoder(nn.Module):
 | def __init__(self, layer: nn.Module, num_layers: int) -> None

Transformer N layer decoder with masking. Code taken from http://nlp.seas.harvard.edu/2018/04/03/attention.html

forward#

class Decoder(nn.Module):
 | ...
 | def forward(
 |     self,
 |     x: torch.Tensor,
 |     memory: torch.Tensor,
 |     src_mask: torch.BoolTensor,
 |     tgt_mask: torch.BoolTensor
 | ) -> torch.Tensor

DecoderLayer#

class DecoderLayer(nn.Module):
 | def __init__(
 |     self,
 |     size: int,
 |     self_attn: MultiHeadedAttention,
 |     src_attn: MultiHeadedAttention,
 |     feed_forward: F,
 |     dropout: float
 | ) -> None

A single layer of transformer decoder. Code taken from http://nlp.seas.harvard.edu/2018/04/03/attention.html

forward#

class DecoderLayer(nn.Module):
 | ...
 | def forward(
 |     self,
 |     x: torch.Tensor,
 |     memory: torch.Tensor,
 |     src_mask: torch.BoolTensor,
 |     tgt_mask: torch.BoolTensor
 | ) -> torch.Tensor

Follow Figure 1 (right) of "Attention Is All You Need" (Vaswani et al., 2017) for connections.