t5
allennlp.modules.transformer.t5
An implementation of T5, adapted from HuggingFace.
T5LayerNorm¶
class T5LayerNorm(TransformerModule, FromParams):
| def __init__(self, hidden_size: int = 512, eps: float = 1e-6)
T5-style layer norm does not have bias and does not subtract the mean.
forward¶
class T5LayerNorm(TransformerModule, FromParams):
| ...
| def forward(self, hidden_states) -> FloatT
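The computation is a root-mean-square normalization with a learned scale. A minimal stand-alone sketch (the real module's dtype handling is omitted here):

import torch

def t5_layer_norm(hidden_states: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Scale by the RMS of the activations; no bias, no mean subtraction.
    variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
    return weight * (hidden_states * torch.rsqrt(variance + eps))

x = torch.randn(2, 7, 512)
normed = t5_layer_norm(x, torch.ones(512))  # weight has shape (hidden_size,)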
T5FeedForwardProjection¶
class T5FeedForwardProjection(TransformerModule, Registrable)
forward¶
class T5FeedForwardProjection(TransformerModule, Registrable):
| ...
| def forward(self, hidden_states) -> FloatT
T5DenseReluDense¶
@T5FeedForwardProjection.register("relu")
class T5DenseReluDense(TransformerModule, FromParams):
| def __init__(
| self,
| hidden_size: int = 512,
| ff_size: int = 2048,
| dropout: float = 0.1
| )
forward¶
class T5DenseReluDense(TransformerModule, FromParams):
| ...
| def forward(self, hidden_states) -> FloatT
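A rough sketch of what the "relu" projection computes, assuming weight names wi and wo for the up- and down-projections (the real module may name them differently); sizes follow the defaults in the signature above:

import torch
import torch.nn as nn

class DenseReluDenseSketch(nn.Module):
    # Project up to ff_size, apply ReLU and dropout, project back to hidden_size.
    def __init__(self, hidden_size: int = 512, ff_size: int = 2048, dropout: float = 0.1):
        super().__init__()
        self.wi = nn.Linear(hidden_size, ff_size, bias=False)
        self.wo = nn.Linear(ff_size, hidden_size, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.wo(self.dropout(torch.relu(self.wi(hidden_states))))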
T5DenseGatedGeluDense¶
@T5FeedForwardProjection.register("gated-gelu")
class T5DenseGatedGeluDense(TransformerModule, FromParams):
| def __init__(
| self,
| hidden_size: int = 512,
| ff_size: int = 2048,
| dropout: float = 0.1
| )
forward¶
class T5DenseGatedGeluDense(TransformerModule, FromParams):
| ...
| def forward(self, hidden_states) -> FloatT
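A rough sketch of the "gated-gelu" variant: a GELU-activated projection gates a second linear projection element-wise before the down-projection. The weight names and the exact GELU approximation are assumptions; shapes and defaults follow the signature above.

import torch
import torch.nn as nn
import torch.nn.functional as F

class DenseGatedGeluDenseSketch(nn.Module):
    def __init__(self, hidden_size: int = 512, ff_size: int = 2048, dropout: float = 0.1):
        super().__init__()
        self.wi_0 = nn.Linear(hidden_size, ff_size, bias=False)  # gate branch
        self.wi_1 = nn.Linear(hidden_size, ff_size, bias=False)  # linear branch
        self.wo = nn.Linear(ff_size, hidden_size, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        gated = F.gelu(self.wi_0(hidden_states)) * self.wi_1(hidden_states)
        return self.wo(self.dropout(gated))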
T5LayerFF¶
class T5LayerFF(TransformerModule, FromParams):
| def __init__(
| self,
| ff_proj: Optional[T5FeedForwardProjection] = None,
| layer_norm: Optional[T5LayerNorm] = None,
| dropout: float = 0.1
| )
forward¶
class T5LayerFF(TransformerModule, FromParams):
| ...
| def forward(self, hidden_states) -> FloatT
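T5LayerFF wraps one of the projections above in a pre-norm residual connection. A sketch of the forward computation, with ff_proj, layer_norm, and dropout standing in for the constructor arguments:

def layer_ff_sketch(hidden_states, ff_proj, layer_norm, dropout):
    # Pre-norm residual: normalize, project, then add the result back to the input.
    forwarded = ff_proj(layer_norm(hidden_states))
    return hidden_states + dropout(forwarded)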
T5LayerSelfAttentionOutput¶
class T5LayerSelfAttentionOutput(NamedTuple)
hidden_states¶
class T5LayerSelfAttentionOutput(NamedTuple):
| ...
| hidden_states: FloatT = None
attn_key_value_state¶
class T5LayerSelfAttentionOutput(NamedTuple):
| ...
| attn_key_value_state: Optional[Tuple[FloatT, FloatT]] = None
attn_position_bias¶
class T5LayerSelfAttentionOutput(NamedTuple):
| ...
| attn_position_bias: FloatT = None
attn_weights¶
class T5LayerSelfAttentionOutput(NamedTuple):
| ...
| attn_weights: Optional[FloatT] = None
T5LayerSelfAttention¶
class T5LayerSelfAttention(TransformerModule, FromParams):
| def __init__(
| self,
| self_attention: Optional[T5Attention] = None,
| layer_norm: Optional[T5LayerNorm] = None,
| dropout: float = 0.1,
| has_relative_attention_bias: bool = False
| )
hidden_size¶
class T5LayerSelfAttention(TransformerModule, FromParams):
| ...
| @property
| def hidden_size(self) -> int
forward¶
class T5LayerSelfAttention(TransformerModule, FromParams):
| ...
| def forward(
| self,
| hidden_states: FloatT,
| attention_mask: Optional[torch.BoolTensor] = None,
| position_bias: Optional[torch.Tensor] = None,
| layer_head_mask: Optional[torch.BoolTensor] = None,
| past_key_value: Optional[Tuple[FloatT]] = None,
| use_cache: bool = False,
| output_attentions: bool = False
| ) -> T5LayerSelfAttentionOutput
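A usage sketch, assuming the optional constructor arguments fall back to a default T5Attention and T5LayerNorm as the signature suggests:

import torch
from allennlp.modules.transformer.t5 import T5LayerSelfAttention

layer = T5LayerSelfAttention(has_relative_attention_bias=True)
hidden_states = torch.randn(2, 7, layer.hidden_size)

output = layer(hidden_states)
new_hidden = output.hidden_states          # same shape as the input
position_bias = output.attn_position_bias  # can be reused by later blocks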
T5LayerCrossAttentionOutput¶
class T5LayerCrossAttentionOutput(NamedTuple)
hidden_states¶
class T5LayerCrossAttentionOutput(NamedTuple):
| ...
| hidden_states: FloatT = None
attn_key_value_state¶
class T5LayerCrossAttentionOutput(NamedTuple):
| ...
| attn_key_value_state: Optional[Tuple[FloatT, FloatT]] = None
attn_position_bias¶
class T5LayerCrossAttentionOutput(NamedTuple):
| ...
| attn_position_bias: FloatT = None
attn_weights¶
class T5LayerCrossAttentionOutput(NamedTuple):
| ...
| attn_weights: Optional[FloatT] = None
T5LayerCrossAttention¶
class T5LayerCrossAttention(TransformerModule, FromParams):
| def __init__(
| self,
| enc_dec_attention: Optional[T5Attention] = None,
| layer_norm: Optional[T5LayerNorm] = None,
| dropout: float = 0.1
| )
forward¶
class T5LayerCrossAttention(TransformerModule, FromParams):
| ...
| def forward(
| self,
| hidden_states: FloatT,
| key_value_states: Optional[FloatT],
| attention_mask: Optional[torch.BoolTensor] = None,
| position_bias: Optional[FloatT] = None,
| layer_head_mask: Optional[torch.BoolTensor] = None,
| past_key_value: Optional[Tuple[Tuple[FloatT]]] = None,
| use_cache: bool = False,
| query_length: int = None,
| output_attentions: bool = False
| ) -> T5LayerCrossAttentionOutput
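A usage sketch of cross-attention: decoder hidden states attend over encoder states passed as key_value_states. The hidden size of 512 is an assumption that matches the default T5 sizes; other dimensions are illustrative.

import torch
from allennlp.modules.transformer.t5 import T5LayerCrossAttention

layer = T5LayerCrossAttention()
decoder_states = torch.randn(2, 5, 512)
encoder_states = torch.randn(2, 9, 512)

output = layer(decoder_states, key_value_states=encoder_states)
attended = output.hidden_states  # (2, 5, 512)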
KeyValueStates¶
KeyValueStates = Union[
    Tuple[FloatT, FloatT],  # without cross attention
    Tuple[FloatT, FloatT, FloatT, FloatT],  # with cross attention
]
T5BlockOutput¶
class T5BlockOutput(NamedTuple)
hidden_states¶
class T5BlockOutput(NamedTuple):
| ...
| hidden_states: FloatT = None
present_key_value_states¶
class T5BlockOutput(NamedTuple):
| ...
| present_key_value_states: Optional[KeyValueStates] = None
self_attn_weights¶
class T5BlockOutput(NamedTuple):
| ...
| self_attn_weights: Optional[FloatT] = None
self_attn_position_bias¶
class T5BlockOutput(NamedTuple):
| ...
| self_attn_position_bias: Optional[FloatT] = None
cross_attn_weights¶
class T5BlockOutput(NamedTuple):
| ...
| cross_attn_weights: Optional[FloatT] = None
cross_attn_position_bias¶
class T5BlockOutput(NamedTuple):
| ...
| cross_attn_position_bias: Optional[FloatT] = None
T5Block¶
class T5Block(TransformerModule, FromParams):
| def __init__(
| self,
| attention: Optional[T5LayerSelfAttention] = None,
| cross_attention: Optional[T5LayerCrossAttention] = None,
| ff: Optional[T5LayerFF] = None
| )
hidden_size¶
class T5Block(TransformerModule, FromParams):
| ...
| @property
| def hidden_size(self) -> int
forward¶
class T5Block(TransformerModule, FromParams):
| ...
| def forward(
| self,
| hidden_states: FloatT,
| attention_mask: Optional[torch.BoolTensor] = None,
| position_bias: Optional[FloatT] = None,
| encoder_hidden_states: Optional[FloatT] = None,
| encoder_attention_mask: Optional[torch.BoolTensor] = None,
| encoder_decoder_position_bias: Optional[FloatT] = None,
| layer_head_mask: Optional[torch.BoolTensor] = None,
| encoder_layer_head_mask: Optional[torch.BoolTensor] = None,
| past_key_value: Optional[KeyValueStates] = None,
| use_cache: bool = False,
| output_attentions: bool = False
| ) -> T5BlockOutput
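A construction sketch: an encoder block holds self-attention and a feed-forward layer, while a decoder block adds cross-attention between them. This assumes the optional arguments of the sub-layers fall back to sensible defaults, as their signatures above suggest.

from allennlp.modules.transformer.t5 import (
    T5Block, T5LayerSelfAttention, T5LayerCrossAttention, T5LayerFF,
)

encoder_block = T5Block(
    attention=T5LayerSelfAttention(has_relative_attention_bias=True),
    cross_attention=None,  # encoder blocks have no cross-attention
    ff=T5LayerFF(),
)

decoder_block = T5Block(
    attention=T5LayerSelfAttention(has_relative_attention_bias=True),
    cross_attention=T5LayerCrossAttention(),
    ff=T5LayerFF(),
)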
T5StackOutput¶
class T5StackOutput(NamedTuple)
last_hidden_state¶
class T5StackOutput(NamedTuple):
| ...
| last_hidden_state: FloatT = None
past_key_values¶
class T5StackOutput(NamedTuple):
| ...
| past_key_values: Optional[List[KeyValueStates]] = None
all_hidden_states¶
class T5StackOutput(NamedTuple):
| ...
| all_hidden_states: Optional[List[FloatT]] = None
attentions¶
class T5StackOutput(NamedTuple):
| ...
| attentions: Optional[List[FloatT]] = None
cross_attentions¶
class T5StackOutput(NamedTuple):
| ...
| cross_attentions: Optional[List[FloatT]] = None
T5Stack¶
class T5Stack(TransformerModule, FromParams):
| def __init__(
| self,
| token_embeddings: nn.Embedding,
| blocks: List[T5Block],
| final_layer_norm: Optional[T5LayerNorm] = None,
| dropout: float = 0.1
| )
num_blocks¶
class T5Stack(TransformerModule, FromParams):
| ...
| @property
| def num_blocks(self) -> int
hidden_size¶
class T5Stack(TransformerModule, FromParams):
| ...
| @property
| def hidden_size(self) -> int
get_head_mask¶
class T5Stack(TransformerModule, FromParams):
| ...
| @staticmethod
| def get_head_mask(
| head_mask: Optional[torch.BoolTensor],
| num_hidden_layers: int
| ) -> BoolT
resize_token_embeddings¶
class T5Stack(TransformerModule, FromParams):
| ...
| def resize_token_embeddings(
| self,
| new_size: int,
| *, init_fn: Callable = torch.nn.init.normal_
| ) -> None
forward¶
class T5Stack(TransformerModule, FromParams):
| ...
| def forward(
| self,
| input_ids: Optional[torch.IntTensor] = None,
| attention_mask: Optional[torch.BoolTensor] = None,
| encoder_hidden_states: Optional[FloatT] = None,
| encoder_attention_mask: Optional[torch.BoolTensor] = None,
| inputs_embeds: Optional[FloatT] = None,
| head_mask: Optional[torch.BoolTensor] = None,
| encoder_head_mask: Optional[torch.BoolTensor] = None,
| past_key_values: Optional[KeyValueStates] = None,
| use_cache: bool = False,
| output_attentions: bool = False,
| output_all_hidden_states: bool = False
| ) -> T5StackOutput
T5EncoderStack¶
class T5EncoderStack(T5Stack, FromParams):
| def __init__(
| self,
| token_embeddings: nn.Embedding,
| blocks: List[T5Block],
| final_layer_norm: Optional[T5LayerNorm] = None,
| dropout: float = 0.1
| )
basic_encoder¶
class T5EncoderStack(T5Stack, FromParams):
| ...
| @classmethod
| def basic_encoder(
| cls,
| token_embeddings: nn.Embedding,
| num_blocks: int = 6,
| block_self_attention: Lazy[T5Attention] = Lazy(T5Attention),
| final_layer_norm: Optional[T5LayerNorm] = None,
| block_ff: Lazy[T5LayerFF] = Lazy(T5LayerFF),
| dropout: float = 0.1,
| ddp_accelerator: Optional[DdpAccelerator] = None,
| checkpoint_wrapper: Optional[CheckpointWrapper] = None
| ) -> "T5EncoderStack"
T5DecoderStack¶
class T5DecoderStack(T5Stack, FromParams):
| def __init__(
| self,
| token_embeddings: nn.Embedding,
| blocks: List[T5Block],
| final_layer_norm: Optional[T5LayerNorm] = None,
| dropout: float = 0.1
| )
basic_decoder¶
class T5DecoderStack(T5Stack, FromParams):
| ...
| @classmethod
| def basic_decoder(
| cls,
| token_embeddings: nn.Embedding,
| num_blocks: int = 6,
| block_self_attention: Lazy[T5Attention] = Lazy(T5Attention),
| block_cross_attention: Lazy[T5Attention] = Lazy(T5Attention),
| final_layer_norm: Optional[T5LayerNorm] = None,
| block_ff: Lazy[T5LayerFF] = Lazy(T5LayerFF),
| dropout: float = 0.1,
| ddp_accelerator: Optional[DdpAccelerator] = None,
| checkpoint_wrapper: Optional[CheckpointWrapper] = None
| ) -> "T5DecoderStack"
T5Output¶
class T5Output(NamedTuple)
Defines the output from the T5 model.
encoder_last_hidden_state¶
class T5Output(NamedTuple):
| ...
| encoder_last_hidden_state: FloatT = None
Final hidden states from the encoder.
Shape: (batch_size, input_length, hidden_dim)
encoder_all_hidden_states¶
class T5Output(NamedTuple):
| ...
| encoder_all_hidden_states: Optional[List[FloatT]] = None
All hidden states from the encoder.
Shape (each): (batch_size, input_length, hidden_dim)
decoder_last_hidden_state¶
class T5Output(NamedTuple):
| ...
| decoder_last_hidden_state: Optional[FloatT] = None
Final hidden states from the decoder. Only present when labels is given.
Shape: (batch_size, target_length, hidden_dim)
decoder_all_hidden_states¶
class T5Output(NamedTuple):
| ...
| decoder_all_hidden_states: Optional[List[FloatT]] = None
All hidden states from the decoder. Only present when labels is given and output_all_hidden_states is True.
Shape (each): (batch_size, target_length, hidden_dim)
encoder_attentions¶
class T5Output(NamedTuple):
| ...
| encoder_attentions: Optional[List[FloatT]] = None
Attention values from the encoder. Only present when output_attentions is True.
decoder_attentions¶
class T5Output(NamedTuple):
| ...
| decoder_attentions: Optional[List[FloatT]] = None
Attention values from the decoder. Only present when labels is given and output_attentions is True.
cross_attentions¶
class T5Output(NamedTuple):
| ...
| cross_attentions: Optional[List[FloatT]] = None
Cross-attention values from the decoder. Only present when labels is given and output_attentions is True.
loss¶
class T5Output(NamedTuple):
| ...
| loss: Optional[FloatT] = None
The loss calculated with respect to labels.
logits¶
class T5Output(NamedTuple):
| ...
| logits: Optional[FloatT] = None
The logits that are used to calculate the loss with respect to labels.
predictions¶
class T5Output(NamedTuple):
| ...
| predictions: Optional[IntT] = None
Predicted token IDs from beam search.
Shape: (batch_size, beam_size, max_decoding_steps)
predicted_log_probs¶
class T5Output(NamedTuple):
| ...
| predicted_log_probs: Optional[FloatT] = None
Log probabilities corresponding to predictions.
Shape: (batch_size, beam_size)
T5¶
class T5(TransformerModule, Registrable):
| def __init__(
| self,
| token_embeddings: Optional[nn.Embedding] = None,
| encoder: Lazy[T5EncoderStack] = Lazy(T5EncoderStack.basic_encoder),
| decoder: Lazy[T5DecoderStack] = Lazy(T5DecoderStack.basic_decoder),
| decoder_start_token_id: int = 0,
| pad_token_id: int = 0,
| eos_token_id: int = 1,
| vocab_size: int = 32128,
| model_dim: int = 512,
| output_attentions: bool = False,
| output_all_hidden_states: bool = False,
| beam_search: Lazy[BeamSearch] = Lazy(BeamSearch, beam_size=3, max_steps=100),
| ddp_accelerator: Optional[DdpAccelerator] = None,
| checkpoint_wrapper: Optional[CheckpointWrapper] = None,
| tie_word_embeddings: bool = True
| )
default_implementation¶
class T5(TransformerModule, Registrable):
| ...
| default_implementation = "default"
resize_token_embeddings¶
class T5(TransformerModule, Registrable):
| ...
| def resize_token_embeddings(
| self,
| new_size: int,
| *, init_fn: Callable = torch.nn.init.normal_
| ) -> None
Resizes the token embeddings in the model.
This takes care of the token embeddings for the encoder, the decoder, and the LM head.
new_size : int
The new size of the token embeddings.
init_fn : Callable
The function to use to initialize new embeddings. This function will be called with a single argument, the tensor to initialize, and it is expected to initialize the tensor in place. Many of the functions in torch.nn.init are suitable.
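A usage sketch, assuming a model built with the defaults from the constructor above (a 32128-token vocabulary, so 32200 grows it):

import torch
from allennlp.modules.transformer.t5 import T5

model = T5()
model.resize_token_embeddings(32200, init_fn=torch.nn.init.normal_)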
forward¶
class T5(TransformerModule, Registrable):
| ...
| def forward(
| self,
| input_ids: IntT,
| attention_mask: Optional[BoolT] = None,
| labels: Optional[IntT] = None,
| decoder_attention_mask: Optional[BoolT] = None
| ) -> T5Output
Run forward pass of the model.
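A sketch of a training-style call with the defaults from the constructor above. With labels supplied, the returned T5Output carries loss and logits; the documented fields suggest that without labels the model instead runs beam search and fills in predictions and predicted_log_probs.

import torch
from allennlp.modules.transformer.t5 import T5

model = T5()  # randomly initialized, default sizes
input_ids = torch.randint(0, 32128, (2, 11))
attention_mask = torch.ones(2, 11, dtype=torch.bool)
labels = torch.randint(0, 32128, (2, 5))

output = model(input_ids, attention_mask=attention_mask, labels=labels)
loss, logits = output.loss, output.logits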
take_search_step¶
class T5(TransformerModule, Registrable):
| ...
| def take_search_step(
| self,
| last_predictions: torch.Tensor,
| state: Dict[str, torch.Tensor],
| step: int
| ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]
Take a step during beam search.
This function is what gets passed to the BeamSearch.search method. It takes the predictions from the last timestep and the current state, and outputs the log probabilities assigned to tokens for the next timestep, as well as the updated state.