transformer_layer
allennlp.modules.transformer.transformer_layer
AttentionLayer¶
class AttentionLayer(TransformerModule, FromParams):
| def __init__(
| self,
| hidden_size: int,
| num_attention_heads: int,
| attention_dropout: float = 0.0,
| hidden_dropout: float = 0.0,
| is_cross_attention: bool = False,
| is_decoder: bool = False
| )
This module wraps the self-attention with the output layer, similar to the architecture in BERT. Details are in the paper: BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding, Devlin et al., 2019
Parameters¶
- hidden_size : int
- num_attention_heads : int
- attention_dropout : float, optional (default = 0.0)
  Dropout probability for the SelfAttention layer.
- hidden_dropout : float, optional (default = 0.0)
  Dropout probability for the OutputLayer.
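A minimal construction sketch. The import path follows the module path at the top of this page; the sizes are illustrative assumptions, not requirements.

```python
from allennlp.modules.transformer.transformer_layer import AttentionLayer

# Illustrative BERT-base-like sizes; hidden_size should be divisible by
# num_attention_heads.
attention = AttentionLayer(
    hidden_size=768,
    num_attention_heads=12,
    attention_dropout=0.1,
    hidden_dropout=0.1,
)
```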
forward¶
class AttentionLayer(TransformerModule, FromParams):
| ...
| def forward(
| self,
| input_tensor: torch.Tensor,
| attention_mask: torch.BoolTensor,
| head_mask: Optional[torch.Tensor] = None,
| encoder_hidden_states: Optional[torch.Tensor] = None,
| encoder_attention_mask: Optional[torch.BoolTensor] = None,
| output_attentions: bool = False
| )
Parameters¶
- input_tensor : torch.Tensor
  Shape batch_size x seq_len x hidden_dim
- attention_mask : torch.BoolTensor, optional
  Shape batch_size x seq_len
- head_mask : torch.BoolTensor, optional
- output_attentions : bool
  Whether to also return the attention probabilities, default = False
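A sketch of a forward pass using the shapes documented above. The tensor sizes are assumptions for illustration; the exact contents of the return value are not described on this page.

```python
import torch
from allennlp.modules.transformer.transformer_layer import AttentionLayer

attention = AttentionLayer(hidden_size=768, num_attention_heads=12)

batch_size, seq_len = 2, 16
input_tensor = torch.randn(batch_size, seq_len, 768)                 # batch_size x seq_len x hidden_dim
attention_mask = torch.ones(batch_size, seq_len, dtype=torch.bool)   # batch_size x seq_len

# With output_attentions=True the attention probabilities are returned
# alongside the attended hidden states.
output = attention(input_tensor, attention_mask, output_attentions=True)
```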
TransformerLayerOutput¶
@dataclass
class TransformerLayerOutput
Encapsulates the outputs of the TransformerLayer module.
hidden_states¶
class TransformerLayerOutput:
| ...
| hidden_states: FloatT = None
self_attention_probs¶
class TransformerLayerOutput:
| ...
| self_attention_probs: Optional[FloatT] = None
cross_attention_probs¶
class TransformerLayerOutput:
| ...
| cross_attention_probs: Optional[FloatT] = None
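A small sketch of the fields. Here the dataclass is constructed directly purely for illustration; in practice it is produced by TransformerLayer.forward (see below), and the two attention-probability fields are only populated when attentions are requested.

```python
import torch
from allennlp.modules.transformer.transformer_layer import TransformerLayerOutput

layer_output = TransformerLayerOutput(hidden_states=torch.randn(2, 16, 768))

new_hidden_states = layer_output.hidden_states         # batch_size x seq_len x hidden_dim
self_attn_probs = layer_output.self_attention_probs    # Optional; None here
cross_attn_probs = layer_output.cross_attention_probs  # Optional; None unless cross-attention is used
```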
TransformerLayer¶
class TransformerLayer(TransformerModule, FromParams):
| def __init__(
| self,
| hidden_size: int,
| intermediate_size: int,
| num_attention_heads: int,
| attention_dropout: float = 0.0,
| hidden_dropout: float = 0.0,
| activation: Union[str, torch.nn.Module] = "relu",
| add_cross_attention: bool = False
| )
This module is a single transformer layer, mapping to BertLayer in the BERT architecture.
Details are in the paper:
BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding, Devlin et al., 2019
Parameters¶
- hidden_size : int
- intermediate_size : int
- num_attention_heads : int
- attention_dropout : float, optional (default = 0.0)
  Dropout probability for the SelfAttention layer.
- hidden_dropout : float, optional (default = 0.0)
  Dropout probability for the OutputLayer.
- activation : Union[str, torch.nn.Module]
- add_cross_attention : bool, optional (default = False)
  If True, an extra AttentionLayer is added for cross-attention. This is helpful when using the layer in a decoder, as in the sketch below.
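A minimal construction sketch; the sizes and activation are illustrative assumptions (BERT-base-like), not requirements.

```python
from allennlp.modules.transformer.transformer_layer import TransformerLayer

# Encoder-style layer.
layer = TransformerLayer(
    hidden_size=768,
    intermediate_size=3072,
    num_attention_heads=12,
    attention_dropout=0.1,
    hidden_dropout=0.1,
    activation="relu",
)

# Decoder-style layer: add_cross_attention=True adds an extra AttentionLayer
# for attending over encoder hidden states.
decoder_layer = TransformerLayer(
    hidden_size=768,
    intermediate_size=3072,
    num_attention_heads=12,
    add_cross_attention=True,
)

output_dim = layer.get_output_dim()  # int
```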
get_output_dim¶
class TransformerLayer(TransformerModule, FromParams):
| ...
| def get_output_dim(self) -> int
forward¶
class TransformerLayer(TransformerModule, FromParams):
| ...
| def forward(
| self,
| hidden_states: torch.Tensor,
| attention_mask: torch.Tensor,
| head_mask: Optional[torch.Tensor] = None,
| encoder_hidden_states: Optional[torch.Tensor] = None,
| encoder_attention_mask: Optional[torch.Tensor] = None,
| output_attentions: bool = False
| ) -> TransformerLayerOutput
Parameters¶
- hidden_states : torch.Tensor
  Shape batch_size x seq_len x hidden_dim
- attention_mask : torch.BoolTensor, optional
  Shape batch_size x seq_len
- head_mask : torch.BoolTensor, optional
- encoder_hidden_states : torch.Tensor, optional
- encoder_attention_mask : torch.Tensor, optional
- output_attentions : bool
  Whether to also return the attention probabilities, default = False
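A sketch of a full forward pass, tying together the construction above and the TransformerLayerOutput fields. Tensor sizes are assumptions for illustration, and the mask shape follows the documentation above.

```python
import torch
from allennlp.modules.transformer.transformer_layer import TransformerLayer

layer = TransformerLayer(hidden_size=768, intermediate_size=3072, num_attention_heads=12)

batch_size, seq_len = 2, 16
hidden_states = torch.randn(batch_size, seq_len, 768)                # batch_size x seq_len x hidden_dim
attention_mask = torch.ones(batch_size, seq_len, dtype=torch.bool)   # batch_size x seq_len

layer_output = layer(hidden_states, attention_mask, output_attentions=True)

new_hidden_states = layer_output.hidden_states
# Expected to be populated because output_attentions=True was passed.
self_attention_probs = layer_output.self_attention_probs
```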