t5

allennlp.modules.transformer.t5

An implementation of T5, adapted from HuggingFace.

T5LayerNorm

class T5LayerNorm(TransformerModule,  FromParams):
 | def __init__(self, hidden_size: int = 512, eps: float = 1e-6)

T5-style layer norm does not have bias and does not subtract the mean.
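For reference, here is a minimal sketch of the computation (a scale-only root-mean-square normalization); the function below is illustrative, not the AllenNLP implementation itself:

import torch

def t5_layer_norm(hidden_states: torch.Tensor,
                  weight: torch.Tensor,
                  eps: float = 1e-6) -> torch.Tensor:
    # Variance is taken without subtracting the mean (RMS norm).
    variance = hidden_states.pow(2).mean(-1, keepdim=True)
    # Rescale and apply the learned per-dimension weight; there is no bias term.
    return weight * (hidden_states * torch.rsqrt(variance + eps))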

forward

class T5LayerNorm(TransformerModule,  FromParams):
 | ...
 | def forward(self, hidden_states) -> FloatT

T5FeedForwardProjection

class T5FeedForwardProjection(TransformerModule,  Registrable)

forward

class T5FeedForwardProjection(TransformerModule,  Registrable):
 | ...
 | def forward(self, hidden_states) -> FloatT

T5DenseReluDense

@T5FeedForwardProjection.register("relu")
class T5DenseReluDense(TransformerModule,  FromParams):
 | def __init__(
 |     self,
 |     hidden_size: int = 512,
 |     ff_size: int = 2048,
 |     dropout: float = 0.1
 | )

forward

class T5DenseReluDense(TransformerModule,  FromParams):
 | ...
 | def forward(self, hidden_states) -> FloatT

T5DenseGatedGeluDense

@T5FeedForwardProjection.register("gated-gelu")
class T5DenseGatedGeluDense(TransformerModule,  FromParams):
 | def __init__(
 |     self,
 |     hidden_size: int = 512,
 |     ff_size: int = 2048,
 |     dropout: float = 0.1
 | )

forward

class T5DenseGatedGeluDense(TransformerModule,  FromParams):
 | ...
 | def forward(self, hidden_states) -> FloatT
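A rough sketch of the gated-GELU projection this block computes: a GELU-activated branch gates a parallel linear branch before the output projection. The layer names (wi_0, wi_1, wo) are illustrative assumptions, not necessarily the module's actual attribute names:

import torch
import torch.nn as nn
import torch.nn.functional as F

class GatedGeluFeedForward(nn.Module):
    # Illustrative stand-in for T5DenseGatedGeluDense.
    def __init__(self, hidden_size: int = 512, ff_size: int = 2048, dropout: float = 0.1):
        super().__init__()
        self.wi_0 = nn.Linear(hidden_size, ff_size, bias=False)  # gating branch
        self.wi_1 = nn.Linear(hidden_size, ff_size, bias=False)  # linear branch
        self.wo = nn.Linear(ff_size, hidden_size, bias=False)    # output projection
        self.dropout = nn.Dropout(dropout)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        gated = F.gelu(self.wi_0(hidden_states)) * self.wi_1(hidden_states)
        return self.wo(self.dropout(gated))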

T5LayerFF

class T5LayerFF(TransformerModule,  FromParams):
 | def __init__(
 |     self,
 |     ff_proj: Optional[T5FeedForwardProjection] = None,
 |     layer_norm: Optional[T5LayerNorm] = None,
 |     dropout: float = 0.1
 | )

forward

class T5LayerFF(TransformerModule,  FromParams):
 | ...
 | def forward(self, hidden_states) -> FloatT

T5LayerSelfAttentionOutput

class T5LayerSelfAttentionOutput(NamedTuple)

hidden_states

class T5LayerSelfAttentionOutput(NamedTuple):
 | ...
 | hidden_states: FloatT = None

attn_key_value_state

class T5LayerSelfAttentionOutput(NamedTuple):
 | ...
 | attn_key_value_state: Optional[Tuple[FloatT, FloatT]] = None

attn_position_bias

class T5LayerSelfAttentionOutput(NamedTuple):
 | ...
 | attn_position_bias: FloatT = None

attn_weights

class T5LayerSelfAttentionOutput(NamedTuple):
 | ...
 | attn_weights: Optional[FloatT] = None

T5LayerSelfAttention

class T5LayerSelfAttention(TransformerModule,  FromParams):
 | def __init__(
 |     self,
 |     self_attention: Optional[T5Attention] = None,
 |     layer_norm: Optional[T5LayerNorm] = None,
 |     dropout: float = 0.1,
 |     has_relative_attention_bias: bool = False
 | )

hidden_size

class T5LayerSelfAttention(TransformerModule,  FromParams):
 | ...
 | @property
 | def hidden_size(self) -> int

forward

class T5LayerSelfAttention(TransformerModule,  FromParams):
 | ...
 | def forward(
 |     self,
 |     hidden_states: FloatT,
 |     attention_mask: Optional[torch.BoolTensor] = None,
 |     position_bias: Optional[torch.Tensor] = None,
 |     layer_head_mask: Optional[torch.BoolTensor] = None,
 |     past_key_value: Optional[Tuple[FloatT]] = None,
 |     use_cache: bool = False,
 |     output_attentions: bool = False
 | ) -> T5LayerSelfAttentionOutput

T5LayerCrossAttentionOutput

class T5LayerCrossAttentionOutput(NamedTuple)

hidden_states

class T5LayerCrossAttentionOutput(NamedTuple):
 | ...
 | hidden_states: FloatT = None

attn_key_value_state

class T5LayerCrossAttentionOutput(NamedTuple):
 | ...
 | attn_key_value_state: Optional[Tuple[FloatT, FloatT]] = None

attn_position_bias

class T5LayerCrossAttentionOutput(NamedTuple):
 | ...
 | attn_position_bias: FloatT = None

attn_weights

class T5LayerCrossAttentionOutput(NamedTuple):
 | ...
 | attn_weights: Optional[FloatT] = None

T5LayerCrossAttention

class T5LayerCrossAttention(TransformerModule,  FromParams):
 | def __init__(
 |     self,
 |     enc_dec_attention: Optional[T5Attention] = None,
 |     layer_norm: Optional[T5LayerNorm] = None,
 |     dropout: float = 0.1
 | )

forward

class T5LayerCrossAttention(TransformerModule,  FromParams):
 | ...
 | def forward(
 |     self,
 |     hidden_states: FloatT,
 |     key_value_states: Optional[FloatT],
 |     attention_mask: Optional[torch.BoolTensor] = None,
 |     position_bias: Optional[FloatT] = None,
 |     layer_head_mask: Optional[torch.BoolTensor] = None,
 |     past_key_value: Optional[Tuple[Tuple[FloatT]]] = None,
 |     use_cache: bool = False,
 |     query_length: int = None,
 |     output_attentions: bool = False
 | ) -> T5LayerCrossAttentionOutput

KeyValueStates

KeyValueStates = Union[
    Tuple[FloatT, FloatT],  # without cross attention
    Tuple[FloatT, FloatT, FloatT, FloatT],  # with cross attention
]

T5BlockOutput

class T5BlockOutput(NamedTuple)

hidden_states

class T5BlockOutput(NamedTuple):
 | ...
 | hidden_states: FloatT = None

present_key_value_states

class T5BlockOutput(NamedTuple):
 | ...
 | present_key_value_states: Optional[KeyValueStates] = None

self_attn_weights

class T5BlockOutput(NamedTuple):
 | ...
 | self_attn_weights: Optional[FloatT] = None

self_attn_position_bias

class T5BlockOutput(NamedTuple):
 | ...
 | self_attn_position_bias: Optional[FloatT] = None

cross_attn_weights

class T5BlockOutput(NamedTuple):
 | ...
 | cross_attn_weights: Optional[FloatT] = None

cross_attn_position_bias

class T5BlockOutput(NamedTuple):
 | ...
 | cross_attn_position_bias: Optional[FloatT] = None

T5Block

class T5Block(TransformerModule,  FromParams):
 | def __init__(
 |     self,
 |     attention: Optional[T5LayerSelfAttention] = None,
 |     cross_attention: Optional[T5LayerCrossAttention] = None,
 |     ff: Optional[T5LayerFF] = None
 | )

hidden_size

class T5Block(TransformerModule,  FromParams):
 | ...
 | @property
 | def hidden_size(self) -> int

forward

class T5Block(TransformerModule,  FromParams):
 | ...
 | def forward(
 |     self,
 |     hidden_states: FloatT,
 |     attention_mask: Optional[torch.BoolTensor] = None,
 |     position_bias: Optional[FloatT] = None,
 |     encoder_hidden_states: Optional[FloatT] = None,
 |     encoder_attention_mask: Optional[torch.BoolTensor] = None,
 |     encoder_decoder_position_bias: Optional[FloatT] = None,
 |     layer_head_mask: Optional[torch.BoolTensor] = None,
 |     encoder_layer_head_mask: Optional[torch.BoolTensor] = None,
 |     past_key_value: Optional[KeyValueStates] = None,
 |     use_cache: bool = False,
 |     output_attentions: bool = False
 | ) -> T5BlockOutput

T5StackOutput

class T5StackOutput(NamedTuple)

last_hidden_state

class T5StackOutput(NamedTuple):
 | ...
 | last_hidden_state: FloatT = None

past_key_values

class T5StackOutput(NamedTuple):
 | ...
 | past_key_values: Optional[List[KeyValueStates]] = None

all_hidden_states

class T5StackOutput(NamedTuple):
 | ...
 | all_hidden_states: Optional[List[FloatT]] = None

attentions

class T5StackOutput(NamedTuple):
 | ...
 | attentions: Optional[List[FloatT]] = None

cross_attentions

class T5StackOutput(NamedTuple):
 | ...
 | cross_attentions: Optional[List[FloatT]] = None

T5Stack

class T5Stack(TransformerModule,  FromParams):
 | def __init__(
 |     self,
 |     token_embeddings: nn.Embedding,
 |     blocks: List[T5Block],
 |     final_layer_norm: Optional[T5LayerNorm] = None,
 |     dropout: float = 0.1
 | )

num_blocks

class T5Stack(TransformerModule,  FromParams):
 | ...
 | @property
 | def num_blocks(self) -> int

hidden_size

class T5Stack(TransformerModule,  FromParams):
 | ...
 | @property
 | def hidden_size(self) -> int

get_head_mask

class T5Stack(TransformerModule,  FromParams):
 | ...
 | @staticmethod
 | def get_head_mask(
 |     head_mask: Optional[torch.BoolTensor],
 |     num_hidden_layers: int
 | ) -> BoolT

resize_token_embeddings

class T5Stack(TransformerModule,  FromParams):
 | ...
 | def resize_token_embeddings(
 |     self,
 |     new_size: int,
 |     *, init_fn: Callable = torch.nn.init.normal_
 | ) -> None

forward

class T5Stack(TransformerModule,  FromParams):
 | ...
 | def forward(
 |     self,
 |     input_ids: Optional[torch.IntTensor] = None,
 |     attention_mask: Optional[torch.BoolTensor] = None,
 |     encoder_hidden_states: Optional[FloatT] = None,
 |     encoder_attention_mask: Optional[torch.BoolTensor] = None,
 |     inputs_embeds: Optional[FloatT] = None,
 |     head_mask: Optional[torch.BoolTensor] = None,
 |     encoder_head_mask: Optional[torch.BoolTensor] = None,
 |     past_key_values: Optional[KeyValueStates] = None,
 |     use_cache: bool = False,
 |     output_attentions: bool = False,
 |     output_all_hidden_states: bool = False
 | ) -> T5StackOutput

T5EncoderStack

class T5EncoderStack(T5Stack,  FromParams):
 | def __init__(
 |     self,
 |     token_embeddings: nn.Embedding,
 |     blocks: List[T5Block],
 |     final_layer_norm: Optional[T5LayerNorm] = None,
 |     dropout: float = 0.1
 | )

basic_encoder

class T5EncoderStack(T5Stack,  FromParams):
 | ...
 | @classmethod
 | def basic_encoder(
 |     cls,
 |     token_embeddings: nn.Embedding,
 |     num_blocks: int = 6,
 |     block_self_attention: Lazy[T5Attention] = Lazy(T5Attention),
 |     final_layer_norm: Optional[T5LayerNorm] = None,
 |     block_ff: Lazy[T5LayerFF] = Lazy(T5LayerFF),
 |     dropout: float = 0.1,
 |     ddp_accelerator: Optional[DdpAccelerator] = None,
 |     checkpoint_wrapper: Optional[CheckpointWrapper] = None
 | ) -> "T5EncoderStack"

T5DecoderStack

class T5DecoderStack(T5Stack,  FromParams):
 | def __init__(
 |     self,
 |     token_embeddings: nn.Embedding,
 |     blocks: List[T5Block],
 |     final_layer_norm: Optional[T5LayerNorm] = None,
 |     dropout: float = 0.1
 | )

basic_decoder

class T5DecoderStack(T5Stack,  FromParams):
 | ...
 | @classmethod
 | def basic_decoder(
 |     cls,
 |     token_embeddings: nn.Embedding,
 |     num_blocks: int = 6,
 |     block_self_attention: Lazy[T5Attention] = Lazy(T5Attention),
 |     block_cross_attention: Lazy[T5Attention] = Lazy(T5Attention),
 |     final_layer_norm: Optional[T5LayerNorm] = None,
 |     block_ff: Lazy[T5LayerFF] = Lazy(T5LayerFF),
 |     dropout: float = 0.1,
 |     ddp_accelerator: Optional[DdpAccelerator] = None,
 |     checkpoint_wrapper: Optional[CheckpointWrapper] = None
 | ) -> "T5DecoderStack"

T5Output

class T5Output(NamedTuple)

Defines the output from the T5 model.

encoder_last_hidden_state

class T5Output(NamedTuple):
 | ...
 | encoder_last_hidden_state: FloatT = None

Final hidden states from the encoder.

Shape: (batch_size, input_length, hidden_dim)

encoder_all_hidden_states

class T5Output(NamedTuple):
 | ...
 | encoder_all_hidden_states: Optional[List[FloatT]] = None

All hidden states from the encoder.

Shape (each): (batch_size, input_length, hidden_dim)

decoder_last_hidden_state

class T5Output(NamedTuple):
 | ...
 | decoder_last_hidden_state: Optional[FloatT] = None

Final hidden states from the decoder. Only present when labels is given.

Shape: (batch_size, target_length, hidden_dim)

decoder_all_hidden_states

class T5Output(NamedTuple):
 | ...
 | decoder_all_hidden_states: Optional[List[FloatT]] = None

All hidden states from the decoder. Only present when labels is given and output_all_hidden_states is True.

Shape (each): (batch_size, target_length, hidden_dim)

encoder_attentions

class T5Output(NamedTuple):
 | ...
 | encoder_attentions: Optional[List[FloatT]] = None

Attention values from the encoder. Only present when output_attentions is True.

decoder_attentions

class T5Output(NamedTuple):
 | ...
 | decoder_attentions: Optional[List[FloatT]] = None

Attention values from the decoder. Only present when labels is given and output_attentions is True.

cross_attentions

class T5Output(NamedTuple):
 | ...
 | cross_attentions: Optional[List[FloatT]] = None

Cross-attention values from the decoder. Only present when labels is given and output_attentions is True.

loss

class T5Output(NamedTuple):
 | ...
 | loss: Optional[FloatT] = None

The loss calculated with respect to labels.

logits

class T5Output(NamedTuple):
 | ...
 | logits: Optional[FloatT] = None

The logits that are used to calculate the loss with respect to labels.

predictions

class T5Output(NamedTuple):
 | ...
 | predictions: Optional[IntT] = None

Predicted token IDs from beam search.

Shape: (batch_size, beam_size, max_decoding_steps).

predicted_log_probs

class T5Output(NamedTuple):
 | ...
 | predicted_log_probs: Optional[FloatT] = None

Log probabilities corresponding to predictions.

Shape: (batch_size, beam_size).

T5

class T5(TransformerModule,  Registrable):
 | def __init__(
 |     self,
 |     token_embeddings: Optional[nn.Embedding] = None,
 |     encoder: Lazy[T5EncoderStack] = Lazy(T5EncoderStack.basic_encoder),
 |     decoder: Lazy[T5DecoderStack] = Lazy(T5DecoderStack.basic_decoder),
 |     decoder_start_token_id: int = 0,
 |     pad_token_id: int = 0,
 |     eos_token_id: int = 1,
 |     vocab_size: int = 32128,
 |     model_dim: int = 512,
 |     output_attentions: bool = False,
 |     output_all_hidden_states: bool = False,
 |     beam_search: Lazy[BeamSearch] = Lazy(BeamSearch, beam_size=3, max_steps=100),
 |     ddp_accelerator: Optional[DdpAccelerator] = None,
 |     checkpoint_wrapper: Optional[CheckpointWrapper] = None,
 |     tie_word_embeddings: bool = True
 | )

default_implementation

class T5(TransformerModule,  Registrable):
 | ...
 | default_implementation = "default"

resize_token_embeddings

class T5(TransformerModule,  Registrable):
 | ...
 | def resize_token_embeddings(
 |     self,
 |     new_size: int,
 |     *, init_fn: Callable = torch.nn.init.normal_
 | ) -> None

Resizes the token embeddings in the model.

This takes care of the token embeddings for the encoder, the decoder, and the LM head.

Parameters

new_size : int
    The new size of the token embeddings.
init_fn : Callable
    The function to use to initialize new embeddings. This function will be called with a single argument, the tensor to initialize, and it is expected to initialize the tensor in place. Many of the functions from torch.nn.init fit.
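A short usage sketch, assuming the default constructor arguments of T5 (shown above) build a small, randomly initialized model:

import torch
from allennlp.modules.transformer.t5 import T5

model = T5()  # vocab_size=32128, model_dim=512 per the defaults shown above

# Grow the vocabulary; new embedding entries are initialized in place by
# init_fn (torch.nn.init.normal_ is the documented default).
model.resize_token_embeddings(32228, init_fn=torch.nn.init.normal_)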

forward

class T5(TransformerModule,  Registrable):
 | ...
 | def forward(
 |     self,
 |     input_ids: IntT,
 |     attention_mask: Optional[BoolT] = None,
 |     labels: Optional[IntT] = None,
 |     decoder_attention_mask: Optional[BoolT] = None
 | ) -> T5Output

Run a forward pass of the model.
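A minimal training-style call with illustrative tensors (the loss is computed only when labels is given):

import torch
from allennlp.modules.transformer.t5 import T5

model = T5()  # small default configuration, randomly initialized

input_ids = torch.randint(0, 32128, (2, 11))
labels = torch.randint(0, 32128, (2, 5))

output = model(input_ids, labels=labels)  # returns a T5Output
# output.loss and output.logits are populated because labels was given; without
# labels, the forward pass is expected to fall back to beam-search decoding and
# fill output.predictions and output.predicted_log_probs instead.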

take_search_step

class T5(TransformerModule,  Registrable):
 | ...
 | def take_search_step(
 |     self,
 |     last_predictions: torch.Tensor,
 |     state: Dict[str, torch.Tensor],
 |     step: int
 | ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]

Take a step during beam search.

This function is what gets passed to the BeamSearch.search method. It takes predictions from the last timestep and the current state and outputs the log probabilities assigned to tokens for the next timestep, as well as the updated state.
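To illustrate the contract rather than the internals of take_search_step, here is a toy step function of the same shape passed to AllenNLP's BeamSearch.search; the uniform distribution and the 10-token vocabulary are made up for the example:

import math
import torch
from allennlp.nn.beam_search import BeamSearch

def toy_step(last_predictions: torch.Tensor, state: dict, step: int):
    # Return log probabilities over the vocabulary for the next timestep plus
    # the (here unchanged) state. A real step function such as T5.take_search_step
    # would run the decoder and update cached key/value state instead.
    batch_size = last_predictions.size(0)
    log_probs = torch.full((batch_size, 10), math.log(1.0 / 10))
    return log_probs, state

beam_search = BeamSearch(end_index=1, max_steps=5, beam_size=3)
start_predictions = torch.zeros(2, dtype=torch.long)  # batch of 2 sequences
predictions, log_probs = beam_search.search(start_predictions, {}, toy_step)
# predictions: (batch_size, beam_size, num_decoding_steps)
# log_probs:   (batch_size, beam_size)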