transformer_stack
allennlp.modules.transformer.transformer_stack
TransformerStack¶
class TransformerStack(TransformerModule, FromParams):
| def __init__(
| self,
| num_hidden_layers: int,
| layer: Optional[TransformerLayer] = None,
| hidden_size: Optional[int] = None,
| intermediate_size: Optional[int] = None,
| num_attention_heads: int = 8,
| attention_dropout: float = 0.1,
| hidden_dropout: float = 0.1,
| activation: Union[str, torch.nn.Module] = "relu",
| add_cross_attention: bool = False
| )
This module is the basic transformer stack. Details in the paper: BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding, Devlin et al, 2019
Parameters¶
- num_hidden_layers :
int
- layer :
TransformerLayer
, optional - hidden_size :
int
, optional
This needs to be provided if no `layer`
argument is passed. - intermediate_size :
int
, optional
This needs to be provided if no `layer`
argument is passed. - num_attention_heads :
int
- attention_dropout :
float
, optional (default = `0.1`
)
Dropout probability for the `SelfAttention`
layer. - hidden_dropout :
float
, optional (default = `0.1`
)
Dropout probability for the `OutputLayer`
. - activation :
Union[str, torch.nn.Module]
, optional (default ="relu"
) - add_cross_attention :
bool
, optional (default =False
)
If True, the `TransformerLayer`
modules will have cross attention modules as well. This is helpful when using the `TransformerStack`
as a decoder.
forward¶
class TransformerStack(TransformerModule, FromParams):
| ...
| def forward(
| self,
| hidden_states: torch.Tensor,
| attention_mask: Optional[torch.Tensor] = None,
| head_mask: Optional[torch.Tensor] = None,
| encoder_hidden_states: Optional[torch.Tensor] = None,
| encoder_attention_mask: Optional[torch.Tensor] = None,
| output_attentions: bool = False,
| output_hidden_states: bool = False
| )
hidden_states : torch.Tensor
Shape batch_size x seq_len x hidden_dim
attention_mask : torch.BoolTensor
, optional
Shape batch_size x seq_len
head_mask : torch.BoolTensor
, optional
output_attentions : bool
Whether to also return the attention probabilities, default = False
output_hidden_states : bool
Whether to return the hidden_states for all layers, default = False
from_pretrained_module¶
class TransformerStack(TransformerModule, FromParams):
| ...
| @classmethod
| def from_pretrained_module(
| cls,
| pretrained_module: Union[str, torch.nn.Module],
| num_hidden_layers: Optional[Union[int, range]] = None,
| source="huggingface",
| mapping: Optional[Dict[str, str]] = None,
| **kwargs
| )