transformer_layer
allennlp.modules.transformer.transformer_layer
AttentionLayer¶
class AttentionLayer(TransformerModule, FromParams):
| def __init__(
| self,
| hidden_size: int,
| num_attention_heads: int,
| attention_dropout: float = 0.0,
| hidden_dropout: float = 0.0,
| is_cross_attention: bool = False,
| is_decoder: bool = False
| )
This module wraps the self-attention with the output layer, similar to the architecture in BERT. Details are in the paper: BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding, Devlin et al., 2019
Parameters¶
- hidden_size : int
- num_attention_heads : int
- attention_dropout : float, optional (default = 0.0)
  Dropout probability for the SelfAttention layer.
- hidden_dropout : float, optional (default = 0.0)
  Dropout probability for the OutputLayer.
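A minimal construction sketch. The import path follows the module path at the top of this page; the sizes are illustrative assumptions, not requirements.

```python
from allennlp.modules.transformer.transformer_layer import AttentionLayer

# Illustrative BERT-base-like sizes; hidden_size should be divisible by
# num_attention_heads.
attention = AttentionLayer(
    hidden_size=768,
    num_attention_heads=12,
    attention_dropout=0.1,
    hidden_dropout=0.1,
)
```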
forward¶
class AttentionLayer(TransformerModule, FromParams):
| ...
| def forward(
| self,
| input_tensor: torch.Tensor,
| attention_mask: torch.BoolTensor,
| head_mask: Optional[torch.Tensor] = None,
| encoder_hidden_states: Optional[torch.Tensor] = None,
| encoder_attention_mask: Optional[torch.BoolTensor] = None,
| output_attentions: bool = False
| )
Parameters¶
- input_tensor : torch.Tensor
  Shape batch_size x seq_len x hidden_dim
- attention_mask : torch.BoolTensor, optional
  Shape batch_size x seq_len
- head_mask : torch.BoolTensor, optional
- output_attentions : bool
  Whether to also return the attention probabilities, default = False
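A sketch of a forward pass using the shapes documented above. The tensor sizes are assumptions for illustration; the exact contents of the return value are not described on this page.

```python
import torch
from allennlp.modules.transformer.transformer_layer import AttentionLayer

attention = AttentionLayer(hidden_size=768, num_attention_heads=12)

batch_size, seq_len = 2, 16
input_tensor = torch.randn(batch_size, seq_len, 768)                 # batch_size x seq_len x hidden_dim
attention_mask = torch.ones(batch_size, seq_len, dtype=torch.bool)   # batch_size x seq_len

# With output_attentions=True the attention probabilities are returned
# alongside the attended hidden states.
output = attention(input_tensor, attention_mask, output_attentions=True)
```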
TransformerLayerOutput¶
@dataclass
class TransformerLayerOutput
Encapsulates the outputs of the TransformerLayer module.
hidden_states¶
class TransformerLayerOutput:
| ...
| hidden_states: FloatT = None
self_attention_probs¶
class TransformerLayerOutput:
| ...
| self_attention_probs: Optional[FloatT] = None
cross_attention_probs¶
class TransformerLayerOutput:
| ...
| cross_attention_probs: Optional[FloatT] = None
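A small sketch of the fields. Here the dataclass is constructed directly purely for illustration; in practice it is produced by TransformerLayer.forward (see below), and the two attention-probability fields are only populated when attentions are requested.

```python
import torch
from allennlp.modules.transformer.transformer_layer import TransformerLayerOutput

layer_output = TransformerLayerOutput(hidden_states=torch.randn(2, 16, 768))

new_hidden_states = layer_output.hidden_states         # batch_size x seq_len x hidden_dim
self_attn_probs = layer_output.self_attention_probs    # Optional; None here
cross_attn_probs = layer_output.cross_attention_probs  # Optional; None unless cross-attention is used
```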
TransformerLayer¶
class TransformerLayer(TransformerModule, FromParams):
| def __init__(
| self,
| hidden_size: int,
| intermediate_size: int,
| num_attention_heads: int,
| attention_dropout: float = 0.0,
| hidden_dropout: float = 0.0,
| activation: Union[str, torch.nn.Module] = "relu",
| add_cross_attention: bool = False
| )
This module is a single transformer layer, mapping to BertLayer in the BERT architecture.
Details are in the paper:
BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding, Devlin et al., 2019
Parameters¶
- hidden_size : int
- intermediate_size : int
- num_attention_heads : int
- attention_dropout : float, optional (default = 0.0)
  Dropout probability for the SelfAttention layer.
- hidden_dropout : float, optional (default = 0.0)
  Dropout probability for the OutputLayer.
- activation : Union[str, torch.nn.Module]
- add_cross_attention : bool, optional (default = False)
  If True, an extra AttentionLayer is added for cross-attention. This is helpful when using the layer in a decoder, as in the sketch below.
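A minimal construction sketch; the sizes and activation are illustrative assumptions (BERT-base-like), not requirements.

```python
from allennlp.modules.transformer.transformer_layer import TransformerLayer

# Encoder-style layer.
layer = TransformerLayer(
    hidden_size=768,
    intermediate_size=3072,
    num_attention_heads=12,
    attention_dropout=0.1,
    hidden_dropout=0.1,
    activation="relu",
)

# Decoder-style layer: add_cross_attention=True adds an extra AttentionLayer
# for attending over encoder hidden states.
decoder_layer = TransformerLayer(
    hidden_size=768,
    intermediate_size=3072,
    num_attention_heads=12,
    add_cross_attention=True,
)

output_dim = layer.get_output_dim()  # int
```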
get_output_dim¶
class TransformerLayer(TransformerModule, FromParams):
| ...
| def get_output_dim(self) -> int
forward¶
class TransformerLayer(TransformerModule, FromParams):
| ...
| def forward(
| self,
| hidden_states: torch.Tensor,
| attention_mask: torch.Tensor,
| head_mask: Optional[torch.Tensor] = None,
| encoder_hidden_states: Optional[torch.Tensor] = None,
| encoder_attention_mask: Optional[torch.Tensor] = None,
| output_attentions: bool = False
| ) -> TransformerLayerOutput
Parameters¶
- hidden_states : torch.Tensor
  Shape batch_size x seq_len x hidden_dim
- attention_mask : torch.BoolTensor, optional
  Shape batch_size x seq_len
- head_mask : torch.BoolTensor, optional
- encoder_hidden_states : torch.Tensor, optional
- encoder_attention_mask : torch.Tensor, optional
- output_attentions : bool
  Whether to also return the attention probabilities, default = False
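A sketch of a full forward pass, tying together the construction above and the TransformerLayerOutput fields. Tensor sizes are assumptions for illustration, and the mask shape follows the documentation above.

```python
import torch
from allennlp.modules.transformer.transformer_layer import TransformerLayer

layer = TransformerLayer(hidden_size=768, intermediate_size=3072, num_attention_heads=12)

batch_size, seq_len = 2, 16
hidden_states = torch.randn(batch_size, seq_len, 768)                # batch_size x seq_len x hidden_dim
attention_mask = torch.ones(batch_size, seq_len, dtype=torch.bool)   # batch_size x seq_len

layer_output = layer(hidden_states, attention_mask, output_attentions=True)

new_hidden_states = layer_output.hidden_states
# Expected to be populated because output_attentions=True was passed.
self_attention_probs = layer_output.self_attention_probs
```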