
t5

allennlp.modules.transformer.t5



Adapted from HuggingFace.

FloatT

FloatT = Union[torch.FloatTensor]

IntT

IntT = Union[torch.IntTensor]

BoolT

BoolT = Union[torch.BoolTensor]

T5LayerNorm

class T5LayerNorm(TransformerModule,  FromParams):
 | def __init__(self, hidden_size: int = 512, eps: float = 1e-6)

T5-style layer norm does not have bias and does not subtract the mean.

forward

class T5LayerNorm(TransformerModule,  FromParams):
 | ...
 | def forward(self, hidden_states) -> FloatT

Layer norm should always be calculated in float32.
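
As a rough illustration, the forward pass amounts to an RMS-style normalization. The sketch below is not the library code, just the pattern the class implements (names are illustrative):

    import torch

    def t5_layer_norm(hidden_states: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
        # Compute the variance in float32, per the note above.
        states_fp32 = hidden_states.to(torch.float32)
        variance = states_fp32.pow(2).mean(-1, keepdim=True)
        normed = states_fp32 * torch.rsqrt(variance + eps)
        # No bias, and the mean is never subtracted (RMS-style norm).
        return weight * normed.to(hidden_states.dtype)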

T5FeedForwardProjection

class T5FeedForwardProjection(TransformerModule,  Registrable)

forward

class T5FeedForwardProjection(TransformerModule,  Registrable):
 | ...
 | def forward(self, hidden_states) -> FloatT

T5DenseReluDense

@T5FeedForwardProjection.register("relu")
class T5DenseReluDense(TransformerModule,  FromParams):
 | def __init__(
 |     self,
 |     hidden_size: int = 512,
 |     ff_size: int = 2048,
 |     dropout: float = 0.1
 | )

forward

class T5DenseReluDense(TransformerModule,  FromParams):
 | ...
 | def forward(self, hidden_states) -> FloatT

T5DenseGatedGeluDense

@T5FeedForwardProjection.register("gated-gelu")
class T5DenseGatedGeluDense(TransformerModule,  FromParams):
 | def __init__(
 |     self,
 |     hidden_size: int = 512,
 |     ff_size: int = 2048,
 |     dropout: float = 0.1
 | )

forward

class T5DenseGatedGeluDense(TransformerModule,  FromParams):
 | ...
 | def forward(self, hidden_states) -> FloatT
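
The two registered projections differ only in how the inner activation is produced. Below is a minimal sketch of the usual pattern, assuming the same two-projection gating as the HuggingFace code this module was adapted from (dropout omitted, weight names illustrative):

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    hidden_size, ff_size = 512, 2048
    x = torch.randn(2, 7, hidden_size)

    # "relu": one inner projection, ReLU, then project back down.
    wi = nn.Linear(hidden_size, ff_size, bias=False)
    wo = nn.Linear(ff_size, hidden_size, bias=False)
    relu_out = wo(F.relu(wi(x)))

    # "gated-gelu": a GELU-activated projection, gated elementwise by a second projection.
    wi_0 = nn.Linear(hidden_size, ff_size, bias=False)
    wi_1 = nn.Linear(hidden_size, ff_size, bias=False)
    gated_out = wo(F.gelu(wi_0(x)) * wi_1(x))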

T5LayerFF

class T5LayerFF(TransformerModule,  FromParams):
 | def __init__(
 |     self,
 |     ff_proj: Optional[T5FeedForwardProjection] = None,
 |     layer_norm: Optional[T5LayerNorm] = None,
 |     dropout: float = 0.1
 | )

forward

class T5LayerFF(TransformerModule,  FromParams):
 | ...
 | def forward(self, hidden_states) -> FloatT
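
T5LayerFF wraps one of the projections above in the standard pre-norm residual pattern. A minimal sketch of that composition (assumed to mirror the HuggingFace layout; not the exact library code):

    def t5_layer_ff(hidden_states, layer_norm, ff_proj, dropout):
        # Pre-norm: normalize, project, then add the result back to the residual stream.
        forwarded = ff_proj(layer_norm(hidden_states))
        return hidden_states + dropout(forwarded)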

T5AttentionOutput

@dataclass
class T5AttentionOutput

hidden_states

class T5AttentionOutput:
 | ...
 | hidden_states: FloatT = None

key_value_state

class T5AttentionOutput:
 | ...
 | key_value_state: Optional[Tuple[FloatT, FloatT]] = None

position_bias

class T5AttentionOutput:
 | ...
 | position_bias: FloatT = None

attn_weights

class T5AttentionOutput:
 | ...
 | attn_weights: Optional[FloatT] = None

T5Attention

class T5Attention(TransformerModule,  FromParams):
 | def __init__(
 |     self,
 |     is_decoder: bool = False,
 |     hidden_size: int = 512,
 |     key_value_proj_dim: int = 64,
 |     num_heads: int = 8,
 |     has_relative_attention_bias: bool = False,
 |     relative_attention_num_buckets: int = 32,
 |     dropout: float = 0.1
 | )

compute_bias

class T5Attention(TransformerModule,  FromParams):
 | ...
 | def compute_bias(self, query_length: int, key_length: int) -> FloatT

Compute binned relative position bias.
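
The bias is produced by bucketing the relative distance between each query and key position: nearby offsets get their own buckets, larger offsets are grouped logarithmically up to a maximum distance, and each bucket indexes a learned per-head embedding that is added to the attention scores. A simplified, hedged sketch of the bucketing (bidirectional case only, constants illustrative):

    import math
    import torch

    def relative_position_bucket(relative_position: torch.Tensor,
                                 num_buckets: int = 32,
                                 max_distance: int = 128) -> torch.Tensor:
        # Bidirectional: half the buckets for one sign of the offset, half for the other.
        num_buckets //= 2
        bucket = (relative_position > 0).long() * num_buckets
        rel = relative_position.abs()

        # Exact buckets for small distances, log-spaced buckets beyond that.
        max_exact = num_buckets // 2
        is_small = rel < max_exact
        rel_if_large = max_exact + (
            torch.log(rel.float().clamp(min=1.0) / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).long()
        rel_if_large = torch.min(rel_if_large, torch.full_like(rel_if_large, num_buckets - 1))
        return bucket + torch.where(is_small, rel, rel_if_large)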

forward

class T5Attention(TransformerModule,  FromParams):
 | ...
 | def forward(
 |     self,
 |     hidden_states: torch.Tensor,
 |     mask: Optional[torch.BoolTensor] = None,
 |     key_value_states: Optional[FloatT] = None,
 |     position_bias: Optional[FloatT] = None,
 |     past_key_value: Optional[Tuple[FloatT, FloatT]] = None,
 |     layer_head_mask: Optional[BoolT] = None,
 |     query_length: Optional[int] = None,
 |     use_cache: bool = False,
 |     output_attentions: bool = False
 | ) -> T5AttentionOutput

Self-attention (if key_value_states is None) or attention over the source sentence (provided by key_value_states).
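
In other words, the same module serves both roles: omit key_value_states for self-attention, or pass the encoder output as key_value_states for cross-attention. A hedged usage sketch with assumed shapes (not a verified snippet):

    import torch
    from allennlp.modules.transformer.t5 import T5Attention

    attention = T5Attention(hidden_size=512, key_value_proj_dim=64, num_heads=8)

    decoder_states = torch.randn(2, 10, 512)  # (batch_size, target_length, hidden_size)
    encoder_states = torch.randn(2, 15, 512)  # (batch_size, source_length, hidden_size)

    # Self-attention: keys and values come from hidden_states itself.
    self_output = attention(decoder_states)

    # Cross-attention: keys and values come from the encoder output.
    cross_output = attention(decoder_states, key_value_states=encoder_states)
    # Both return a T5AttentionOutput whose hidden_states keeps the query's length.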

T5LayerSelfAttentionOutput

@dataclass
class T5LayerSelfAttentionOutput

hidden_states

class T5LayerSelfAttentionOutput:
 | ...
 | hidden_states: FloatT = None

attn_key_value_state

class T5LayerSelfAttentionOutput:
 | ...
 | attn_key_value_state: Optional[Tuple[FloatT, FloatT]] = None

attn_position_bias

class T5LayerSelfAttentionOutput:
 | ...
 | attn_position_bias: FloatT = None

attn_weights

class T5LayerSelfAttentionOutput:
 | ...
 | attn_weights: Optional[FloatT] = None

T5LayerSelfAttention

class T5LayerSelfAttention(TransformerModule,  FromParams):
 | def __init__(
 |     self,
 |     self_attention: Optional[T5Attention] = None,
 |     layer_norm: Optional[T5LayerNorm] = None,
 |     dropout: float = 0.1
 | )

forward

class T5LayerSelfAttention(TransformerModule,  FromParams):
 | ...
 | def forward(
 |     self,
 |     hidden_states: FloatT,
 |     attention_mask: Optional[torch.BoolTensor] = None,
 |     position_bias: Optional[torch.Tensor] = None,
 |     layer_head_mask: Optional[torch.BoolTensor] = None,
 |     past_key_value: Optional[Tuple[FloatT]] = None,
 |     use_cache: bool = False,
 |     output_attentions: bool = False
 | ) -> T5LayerSelfAttentionOutput

T5LayerCrossAttentionOutput

@dataclass
class T5LayerCrossAttentionOutput

hidden_states

class T5LayerCrossAttentionOutput:
 | ...
 | hidden_states: FloatT = None

attn_key_value_state

class T5LayerCrossAttentionOutput:
 | ...
 | attn_key_value_state: Optional[Tuple[FloatT, FloatT]] = None

attn_position_bias

class T5LayerCrossAttentionOutput:
 | ...
 | attn_position_bias: FloatT = None

attn_weights

class T5LayerCrossAttentionOutput:
 | ...
 | attn_weights: Optional[FloatT] = None

T5LayerCrossAttention

class T5LayerCrossAttention(TransformerModule,  FromParams):
 | def __init__(
 |     self,
 |     enc_dec_attention: Optional[T5Attention] = None,
 |     layer_norm: Optional[T5LayerNorm] = None,
 |     dropout: float = 0.1
 | )

forward

class T5LayerCrossAttention(TransformerModule,  FromParams):
 | ...
 | def forward(
 |     self,
 |     hidden_states: FloatT,
 |     key_value_states: Optional[FloatT],
 |     attention_mask: Optional[torch.BoolTensor] = None,
 |     position_bias: Optional[FloatT] = None,
 |     layer_head_mask: Optional[torch.BoolTensor] = None,
 |     past_key_value: Optional[Tuple[Tuple[FloatT]]] = None,
 |     use_cache: bool = False,
 |     query_length: int = None,
 |     output_attentions: bool = False
 | ) -> T5LayerCrossAttentionOutput

KeyValueStates

KeyValueStates = Union[
    Tuple[FloatT, FloatT],  # without cross attention
    Tuple[FloatT, FloatT, FloatT, FloatT],  # with cross attention
]

T5BlockOutput

@dataclass
class T5BlockOutput

hidden_states

class T5BlockOutput:
 | ...
 | hidden_states: FloatT = None

present_key_value_states

class T5BlockOutput:
 | ...
 | present_key_value_states: Optional[KeyValueStates] = None

self_attn_weights

class T5BlockOutput:
 | ...
 | self_attn_weights: Optional[FloatT] = None

self_attn_position_bias

class T5BlockOutput:
 | ...
 | self_attn_position_bias: Optional[FloatT] = None

cross_attn_weights

class T5BlockOutput:
 | ...
 | cross_attn_weights: Optional[FloatT] = None

cross_attn_position_bias

class T5BlockOutput:
 | ...
 | cross_attn_position_bias: Optional[FloatT] = None

T5Block

class T5Block(TransformerModule,  FromParams):
 | def __init__(
 |     self,
 |     attention: Optional[T5LayerSelfAttention] = None,
 |     cross_attention: Optional[T5LayerCrossAttention] = None,
 |     ff: Optional[T5LayerFF] = None
 | )

hidden_size

class T5Block(TransformerModule,  FromParams):
 | ...
 | @property
 | def hidden_size(self) -> int

forward

class T5Block(TransformerModule,  FromParams):
 | ...
 | def forward(
 |     self,
 |     hidden_states: FloatT,
 |     attention_mask: Optional[torch.BoolTensor] = None,
 |     position_bias: Optional[FloatT] = None,
 |     encoder_hidden_states: Optional[FloatT] = None,
 |     encoder_attention_mask: Optional[torch.BoolTensor] = None,
 |     encoder_decoder_position_bias: Optional[FloatT] = None,
 |     layer_head_mask: Optional[torch.BoolTensor] = None,
 |     encoder_layer_head_mask: Optional[torch.BoolTensor] = None,
 |     past_key_value: Optional[KeyValueStates] = None,
 |     use_cache: bool = False,
 |     output_attentions: bool = False
 | ) -> T5BlockOutput

T5StackOutput

@dataclass
class T5StackOutput

last_hidden_state

class T5StackOutput:
 | ...
 | last_hidden_state: FloatT = None

past_key_values

class T5StackOutput:
 | ...
 | past_key_values: Optional[List[KeyValueStates]] = None

all_hidden_states

class T5StackOutput:
 | ...
 | all_hidden_states: Optional[List[FloatT]] = None

attentions

class T5StackOutput:
 | ...
 | attentions: Optional[List[FloatT]] = None

cross_attentions

class T5StackOutput:
 | ...
 | cross_attentions: Optional[List[FloatT]] = None

T5Stack

class T5Stack(TransformerModule,  FromParams):
 | def __init__(
 |     self,
 |     token_embeddings: nn.Embedding,
 |     blocks: List[T5Block],
 |     final_layer_norm: Optional[T5LayerNorm] = None,
 |     dropout: float = 0.1
 | )

num_blocks

class T5Stack(TransformerModule,  FromParams):
 | ...
 | @property
 | def num_blocks(self) -> int

hidden_size

class T5Stack(TransformerModule,  FromParams):
 | ...
 | @property
 | def hidden_size(self) -> int

get_head_mask

class T5Stack(TransformerModule,  FromParams):
 | ...
 | @staticmethod
 | def get_head_mask(
 |     head_mask: Optional[torch.BoolTensor],
 |     num_hidden_layers: int
 | ) -> BoolT

forward

class T5Stack(TransformerModule,  FromParams):
 | ...
 | def forward(
 |     self,
 |     input_ids: Optional[torch.IntTensor] = None,
 |     attention_mask: Optional[torch.BoolTensor] = None,
 |     encoder_hidden_states: Optional[FloatT] = None,
 |     encoder_attention_mask: Optional[torch.BoolTensor] = None,
 |     inputs_embeds: Optional[FloatT] = None,
 |     head_mask: Optional[torch.BoolTensor] = None,
 |     encoder_head_mask: Optional[torch.BoolTensor] = None,
 |     past_key_values: Optional[KeyValueStates] = None,
 |     use_cache: bool = False,
 |     output_attentions: bool = False,
 |     output_all_hidden_states: bool = False
 | ) -> T5StackOutput

T5EncoderStack

class T5EncoderStack(T5Stack,  FromParams):
 | def __init__(
 |     self,
 |     token_embeddings: nn.Embedding,
 |     blocks: List[T5Block],
 |     final_layer_norm: Optional[T5LayerNorm] = None,
 |     dropout: float = 0.1
 | )

basic_encoder

class T5EncoderStack(T5Stack,  FromParams):
 | ...
 | @classmethod
 | def basic_encoder(
 |     cls,
 |     token_embeddings: nn.Embedding,
 |     num_blocks: int = 6,
 |     block_self_attention: Lazy[T5Attention] = Lazy(T5Attention),
 |     final_layer_norm: Optional[T5LayerNorm] = None,
 |     block_ff: Lazy[T5LayerFF] = Lazy(T5LayerFF),
 |     dropout: float = 0.1
 | ) -> "T5EncoderStack"

T5DecoderStack

class T5DecoderStack(T5Stack,  FromParams):
 | def __init__(
 |     self,
 |     token_embeddings: nn.Embedding,
 |     blocks: List[T5Block],
 |     final_layer_norm: Optional[T5LayerNorm] = None,
 |     dropout: float = 0.1
 | )

basic_decoder

class T5DecoderStack(T5Stack,  FromParams):
 | ...
 | @classmethod
 | def basic_decoder(
 |     cls,
 |     token_embeddings: nn.Embedding,
 |     num_blocks: int = 6,
 |     block_self_attention: Lazy[T5Attention] = Lazy(T5Attention),
 |     block_cross_attention: Lazy[T5Attention] = Lazy(T5Attention),
 |     final_layer_norm: Optional[T5LayerNorm] = None,
 |     block_ff: Lazy[T5LayerFF] = Lazy(T5LayerFF),
 |     dropout: float = 0.1
 | ) -> "T5DecoderStack"

T5Output

@dataclass
class T5Output

Defines the output from the T5 model.

encoder_last_hidden_state

class T5Output:
 | ...
 | encoder_last_hidden_state: FloatT = None

Final hidden states from the encoder.

Shape: (batch_size, input_length, hidden_dim)

encoder_all_hidden_states

class T5Output:
 | ...
 | encoder_all_hidden_states: Optional[List[FloatT]] = None

All hidden states from the encoder.

Shape (each): (batch_size, input_length, hidden_dim)

decoder_last_hidden_state

class T5Output:
 | ...
 | decoder_last_hidden_state: Optional[FloatT] = None

Final hidden states from the decoder. Only present when labels is given.

Shape: (batch_size, target_length, hidden_dim)

decoder_all_hidden_states

class T5Output:
 | ...
 | decoder_all_hidden_states: Optional[List[FloatT]] = None

All hidden states from the decoder. Only present when labels is given and output_all_hidden_states is True.

Shape (each): (batch_size, target_length, hidden_dim)

encoder_attentions

class T5Output:
 | ...
 | encoder_attentions: Optional[List[FloatT]] = None

Attention values from the encoder. Only present when output_attentions is True.

decoder_attentions

class T5Output:
 | ...
 | decoder_attentions: Optional[List[FloatT]] = None

Attention values from the decoder. Only present when labels is given and output_attentions is True.

cross_attentions

class T5Output:
 | ...
 | cross_attentions: Optional[List[FloatT]] = None

Cross-attention values from the decoder. Only present when labels is given and output_attentions is True.

loss

class T5Output:
 | ...
 | loss: Optional[FloatT] = None

The loss calculated with respect to labels.

logits

class T5Output:
 | ...
 | logits: Optional[FloatT] = None

The logits that are used to calculate the loss with respect to labels.

predictions

class T5Output:
 | ...
 | predictions: Optional[IntT] = None

Predicted token IDs from beam search.

Shape: (batch_size, beam_size, max_decoding_steps).

predicted_log_probs

class T5Output:
 | ...
 | predicted_log_probs: Optional[FloatT] = None

Log probabilities corresponding to predictions.

Shape: (batch_size, beam_size).

T5

class T5(TransformerModule,  Registrable):
 | def __init__(
 |     self,
 |     token_embeddings: Optional[nn.Embedding] = None,
 |     encoder: Lazy[T5EncoderStack] = Lazy(T5EncoderStack),
 |     decoder: Lazy[T5DecoderStack] = Lazy(T5DecoderStack),
 |     decoder_start_token_id: int = 0,
 |     pad_token_id: int = 0,
 |     eos_token_id: int = 1,
 |     vocab_size: int = 32128,
 |     model_dim: int = 512,
 |     output_attentions: bool = False,
 |     output_all_hidden_states: bool = False,
 |     beam_size: int = 3,
 |     max_decoding_steps: int = 100
 | )

default_implementation

class T5(TransformerModule,  Registrable):
 | ...
 | default_implementation = "default"

forward

class T5(TransformerModule,  Registrable):
 | ...
 | def forward(
 |     self,
 |     input_ids: IntT,
 |     attention_mask: Optional[BoolT] = None,
 |     labels: Optional[IntT] = None,
 |     decoder_attention_mask: Optional[BoolT] = None
 | ) -> T5Output

Run forward pass of the model.
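
A hedged end-to-end sketch, assuming the constructor defaults assemble a small randomly initialized encoder/decoder (if they do not in your version, pass the encoder and decoder explicitly; loading pretrained weights is not shown here):

    import torch
    from allennlp.modules.transformer.t5 import T5

    model = T5()  # assumption: defaults build a small from-scratch model
    model.eval()

    input_ids = torch.randint(0, 32128, (2, 13))
    labels = torch.randint(0, 32128, (2, 7))

    # With labels: the output carries loss and logits for training.
    output = model(input_ids, labels=labels)
    print(output.loss, output.logits.shape)

    # Without labels: beam search runs and the output carries predictions.
    output = model(input_ids)
    print(output.predictions.shape)  # (batch_size, beam_size, up to max_decoding_steps)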

take_search_step

class T5(TransformerModule,  Registrable):
 | ...
 | def take_search_step(
 |     self,
 |     last_predictions: torch.Tensor,
 |     state: Dict[str, torch.Tensor],
 |     step: int
 | ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]

Take a step during beam search.

This function is what gets passed to the BeamSearch.search method. It takes predictions from the last timestep and the current state and outputs the log probabilities assigned to tokens for the next timestep, as well as the updated state.
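
This is the generic step-function contract expected by allennlp.nn.beam_search.BeamSearch. A toy sketch of how such a step function is consumed (the toy step just returns uniform log probabilities; the real take_search_step runs the decoder and updates the cached key/value state):

    import math
    import torch
    from allennlp.nn.beam_search import BeamSearch

    vocab_size = 10

    def toy_step(last_predictions, state, step):
        # Uniform log probabilities over the vocabulary for every item in the beam.
        log_probs = torch.full((last_predictions.size(0), vocab_size), -math.log(vocab_size))
        return log_probs, state

    beam_search = BeamSearch(end_index=1, max_steps=5, beam_size=3)
    start_predictions = torch.zeros(2, dtype=torch.long)
    predictions, log_probs = beam_search.search(start_predictions, {}, toy_step)
    print(predictions.shape)  # (batch_size, beam_size, num_decoding_steps)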