bidirectional_lm_transformer
The BidirectionalTransformerEncoder from Calypso. This is essentially the transformer from The Annotated Transformer (https://nlp.seas.harvard.edu/2018/04/03/attention.html), so credit goes to them.
This code should be considered "private" in that we have several transformer implementations and may end up deleting this one. If you use it, consider yourself warned.
attention#
def attention(
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
mask: torch.BoolTensor = None,
dropout: Callable = None
) -> Tuple[torch.Tensor, torch.Tensor]
Compute 'Scaled Dot Product Attention': softmax(QK^T / sqrt(d_k)) V.
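A minimal sketch of that computation, assuming the convention that True in the mask marks positions which may be attended to (the function name and masking details here are illustrative, not the module's exact code):

```python
import math
from typing import Callable, Optional, Tuple

import torch

def attention_sketch(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    mask: Optional[torch.BoolTensor] = None,
    dropout: Optional[Callable] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    d_k = query.size(-1)
    # Attention scores, scaled by sqrt(d_k) to keep softmax gradients stable.
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Masked-out positions get a very negative score, i.e. ~zero weight.
        scores = scores.masked_fill(~mask, -1e9)
    p_attn = scores.softmax(dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    # Return both the attended values and the attention distribution.
    return torch.matmul(p_attn, value), p_attn
```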
subsequent_mask#
def subsequent_mask(
size: int,
device: str = "cpu"
) -> torch.BoolTensor
Mask out subsequent positions.
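A minimal sketch of such a mask built with torch.tril; the helper's exact dtype and shape handling may differ:

```python
import torch

def subsequent_mask_sketch(size: int, device: str = "cpu") -> torch.BoolTensor:
    # True at (i, j) iff position i may attend to position j,
    # i.e. the lower triangle including the diagonal.
    return torch.tril(torch.ones(size, size, dtype=torch.bool, device=device))

print(subsequent_mask_sketch(3))
# tensor([[ True, False, False],
#         [ True,  True, False],
#         [ True,  True,  True]])
```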
PositionalEncoding#
class PositionalEncoding(torch.nn.Module, Registrable):
| def __init__(self, input_dim: int, max_len: int = 5000) -> None
Implement the sinusoidal positional encoding function.
forward#
class PositionalEncoding(torch.nn.Module, Registrable):
| ...
| def forward(self, x: torch.Tensor) -> torch.Tensor
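A minimal sketch of the sinusoidal table that forward adds to its input, PE(pos, 2i) = sin(pos / 10000^(2i/d)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d)), assuming an even input_dim (names are illustrative):

```python
import math

import torch

def sinusoidal_table(max_len: int, input_dim: int) -> torch.Tensor:
    position = torch.arange(max_len).unsqueeze(1).float()
    # 1 / 10000^(2i / input_dim) for each even index 2i.
    div_term = torch.exp(
        torch.arange(0, input_dim, 2).float() * -(math.log(10000.0) / input_dim)
    )
    pe = torch.zeros(max_len, input_dim)
    pe[:, 0::2] = torch.sin(position * div_term)  # even dimensions
    pe[:, 1::2] = torch.cos(position * div_term)  # odd dimensions
    return pe

x = torch.randn(2, 10, 512)        # (batch, timesteps, input_dim)
x = x + sinusoidal_table(10, 512)  # broadcast over the batch dimension
```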
PositionwiseFeedForward#
class PositionwiseFeedForward(torch.nn.Module):
| def __init__(
| self,
| input_dim: int,
| ff_dim: int,
| dropout: float = 0.1
| ) -> None
Implements the position-wise feed-forward equation: FFN(x) = max(0, x W1 + b1) W2 + b2.
forward#
class PositionwiseFeedForward(torch.nn.Module):
| ...
| def forward(self, x: torch.Tensor) -> torch.Tensor
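A minimal sketch of the two-layer block described by that equation, with dropout between the ReLU and the second projection (illustrative, not the module's exact code):

```python
import torch

class FeedForwardSketch(torch.nn.Module):
    def __init__(self, input_dim: int, ff_dim: int, dropout: float = 0.1) -> None:
        super().__init__()
        self.w_1 = torch.nn.Linear(input_dim, ff_dim)
        self.w_2 = torch.nn.Linear(ff_dim, input_dim)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Applied identically and independently at every position.
        return self.w_2(self.dropout(torch.relu(self.w_1(x))))
```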
TransformerEncoder#
class TransformerEncoder(torch.nn.Module):
| def __init__(
| self,
| layer: torch.nn.Module,
| num_layers: int,
| return_all_layers: bool = False
| ) -> None
The core encoder is a stack of N identical layers.
forward#
class TransformerEncoder(torch.nn.Module):
| ...
| def forward(self, x, mask)
Pass the input (and mask) through each layer in turn.
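A hedged sketch of the stack: num_layers deep copies of the layer applied in sequence, optionally collecting every layer's output (the real class may also apply a final layer norm):

```python
import copy

import torch

class EncoderStackSketch(torch.nn.Module):
    def __init__(
        self, layer: torch.nn.Module, num_layers: int, return_all_layers: bool = False
    ) -> None:
        super().__init__()
        # Deep copies, so each layer gets independent parameters.
        self.layers = torch.nn.ModuleList(
            copy.deepcopy(layer) for _ in range(num_layers)
        )
        self.return_all_layers = return_all_layers

    def forward(self, x, mask):
        outputs = []
        for layer in self.layers:
            x = layer(x, mask)
            outputs.append(x)
        return outputs if self.return_all_layers else x
```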
SublayerConnection#
class SublayerConnection(torch.nn.Module):
| def __init__(self, size: int, dropout: float) -> None
A residual connection followed by layer normalization. Note that, for code simplicity, the norm is applied first rather than last.
forward#
class SublayerConnection(torch.nn.Module):
| ...
| def forward(
| self,
| x: torch.Tensor,
| sublayer: Callable[[torch.Tensor], torch.Tensor]
| ) -> torch.Tensor
Apply a residual connection to any sublayer of the same size.
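A minimal sketch of this pre-norm residual wrapper (names are illustrative):

```python
from typing import Callable

import torch

class SublayerConnectionSketch(torch.nn.Module):
    def __init__(self, size: int, dropout: float) -> None:
        super().__init__()
        self.norm = torch.nn.LayerNorm(size)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(
        self, x: torch.Tensor, sublayer: Callable[[torch.Tensor], torch.Tensor]
    ) -> torch.Tensor:
        # "Norm first": the sublayer sees the normalized input, and the
        # raw input is added back as the residual.
        return x + self.dropout(sublayer(self.norm(x)))
```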
EncoderLayer#
class EncoderLayer(torch.nn.Module):
| def __init__(
| self,
| size: int,
| self_attn: torch.nn.Module,
| feed_forward: torch.nn.Module,
| dropout: float
| ) -> None
An encoder layer is made up of self-attention and a feed-forward block (both defined in this module).
forward#
class EncoderLayer(torch.nn.Module):
| ...
| def forward(
| self,
| x: torch.Tensor,
| mask: torch.BoolTensor
| ) -> torch.Tensor
Follow Figure 1 (left) of 'Attention Is All You Need' for the connections.
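A hedged sketch of the data flow inside one layer, assuming two SublayerConnection-style wrappers as in the Annotated Transformer:

```python
# x: (batch, timesteps, size); sublayers: a pair of residual wrappers.
def encoder_layer_forward(x, mask, self_attn, feed_forward, sublayers):
    # First sublayer: multi-headed self-attention (query = key = value = x).
    x = sublayers[0](x, lambda y: self_attn(y, y, y, mask))
    # Second sublayer: the position-wise feed-forward network.
    return sublayers[1](x, feed_forward)
```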
MultiHeadedAttention#
class MultiHeadedAttention(torch.nn.Module):
| def __init__(
| self,
| num_heads: int,
| input_dim: int,
| dropout: float = 0.1
| ) -> None
forward#
class MultiHeadedAttention(torch.nn.Module):
| ...
| def forward(
| self,
| query: torch.Tensor,
| key: torch.Tensor,
| value: torch.Tensor,
| mask: torch.BoolTensor = None
| ) -> torch.Tensor
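A hedged usage sketch in self-attention mode; the (batch_size, timesteps, timesteps) mask shape assumes the convention of get_attention_masks below, and the split into num_heads heads of size input_dim // num_heads happens internally:

```python
import torch

attn = MultiHeadedAttention(num_heads=8, input_dim=512)
x = torch.randn(2, 7, 512)                    # (batch, timesteps, input_dim)
mask = torch.ones(2, 7, 7, dtype=torch.bool)  # (batch, timesteps, timesteps)
out = attn(x, x, x, mask)                     # self-attention: (2, 7, 512)
```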
make_model#
def make_model(
num_layers: int = 6,
input_size: int = 512,
hidden_size: int = 2048,
heads: int = 8,
dropout: float = 0.1,
return_all_layers: bool = False
) -> TransformerEncoder
Helper: Construct a model from hyperparameters.
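A usage sketch with the documented defaults spelled out:

```python
encoder = make_model(
    num_layers=6,      # depth of the encoder stack
    input_size=512,    # model / embedding dimension
    hidden_size=2048,  # feed-forward inner dimension
    heads=8,           # attention heads per layer
    dropout=0.1,
)
```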
BidirectionalLanguageModelTransformer#
class BidirectionalLanguageModelTransformer(Seq2SeqEncoder):
| def __init__(
| self,
| input_dim: int,
| hidden_dim: int,
| num_layers: int,
| dropout: float = 0.1,
| input_dropout: float = None,
| return_all_layers: bool = False
| ) -> None
get_attention_masks#
class BidirectionalLanguageModelTransformer(Seq2SeqEncoder):
| ...
| def get_attention_masks(
| self,
| mask: torch.BoolTensor
| ) -> Tuple[torch.Tensor, torch.Tensor]
Returns two masks of shape (batch_size, timesteps, timesteps) representing:
1) the non-padded elements, and
2) the elements of the sequence which are permitted to be involved in attention at a given timestep.
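A hedged sketch of how such directional masks can be derived from a padding mask: combine it with a triangular "subsequent positions" mask, using the lower triangle for the forward direction and its transpose for the backward direction (the method's exact construction may differ):

```python
import torch

padding_mask = torch.tensor([[True, True, True, False]])  # (batch, timesteps)
timesteps = padding_mask.size(1)

lower = torch.tril(torch.ones(timesteps, timesteps, dtype=torch.bool))
# Broadcast padding over the query dimension -> (batch, timesteps, timesteps).
forward_mask = lower.unsqueeze(0) & padding_mask.unsqueeze(1)
backward_mask = lower.transpose(0, 1).unsqueeze(0) & padding_mask.unsqueeze(1)
```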
forward#
class BidirectionalLanguageModelTransformer(Seq2SeqEncoder):
| ...
| def forward(
| self,
| token_embeddings: torch.Tensor,
| mask: torch.BoolTensor
| ) -> torch.Tensor
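A hedged end-to-end usage sketch as a Seq2SeqEncoder. The shapes assume the forward and backward representations are concatenated, so the output dimension is twice input_dim:

```python
import torch

encoder = BidirectionalLanguageModelTransformer(
    input_dim=512, hidden_dim=2048, num_layers=6
)
token_embeddings = torch.randn(2, 7, 512)     # (batch, timesteps, input_dim)
mask = torch.ones(2, 7, dtype=torch.bool)     # padding mask
contextual = encoder(token_embeddings, mask)  # (2, 7, 1024)
```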
get_regularization_penalty#
class BidirectionalLanguageModelTransformer(Seq2SeqEncoder):
| ...
| def get_regularization_penalty(self)
get_input_dim#
class BidirectionalLanguageModelTransformer(Seq2SeqEncoder):
| ...
| def get_input_dim(self) -> int
get_output_dim#
class BidirectionalLanguageModelTransformer(Seq2SeqEncoder):
| ...
| def get_output_dim(self) -> int
is_bidirectional#
class BidirectionalLanguageModelTransformer(Seq2SeqEncoder):
| ...
| def is_bidirectional(self) -> bool