bidirectional_lm_transformer
The BidirectionalTransformerEncoder from Calypso. This is essentially the transformer from The Annotated Transformer (https://nlp.seas.harvard.edu/2018/04/03/attention.html), so credit goes to them.
This code should be considered "private" in that we have several transformer implementations and may end up deleting this one. If you use it, consider yourself warned.
attention#
def attention(
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
mask: torch.BoolTensor = None,
dropout: Callable = None
) -> Tuple[torch.Tensor, torch.Tensor]
Compute 'Scaled Dot Product Attention': softmax(QK^T / sqrt(d_k)) V.
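A minimal sketch of that computation, assuming the convention that True in the mask marks positions which may be attended to (the function name and masking details here are illustrative, not the module's exact code):

```python
import math
from typing import Callable, Optional, Tuple

import torch

def attention_sketch(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    mask: Optional[torch.BoolTensor] = None,
    dropout: Optional[Callable] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    d_k = query.size(-1)
    # Attention scores, scaled by sqrt(d_k) to keep softmax gradients stable.
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Masked-out positions get a very negative score, i.e. ~zero weight.
        scores = scores.masked_fill(~mask, -1e9)
    p_attn = scores.softmax(dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    # Return both the attended values and the attention distribution.
    return torch.matmul(p_attn, value), p_attn
```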
subsequent_mask#
def subsequent_mask(
size: int,
device: str = "cpu"
) -> torch.BoolTensor
Mask out subsequent positions.
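A minimal sketch of such a mask built with torch.tril; the helper's exact dtype and shape handling may differ:

```python
import torch

def subsequent_mask_sketch(size: int, device: str = "cpu") -> torch.BoolTensor:
    # True at (i, j) iff position i may attend to position j,
    # i.e. the lower triangle including the diagonal.
    return torch.tril(torch.ones(size, size, dtype=torch.bool, device=device))

print(subsequent_mask_sketch(3))
# tensor([[ True, False, False],
#         [ True,  True, False],
#         [ True,  True,  True]])
```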
PositionalEncoding#
class PositionalEncoding(torch.nn.Module, Registrable):
| def __init__(self, input_dim: int, max_len: int = 5000) -> None
Implement the sinusoidal positional encoding function.
forward#
class PositionalEncoding(torch.nn.Module, Registrable):
| ...
| def forward(self, x: torch.Tensor) -> torch.Tensor
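A minimal sketch of the sinusoidal table that forward adds to its input, PE(pos, 2i) = sin(pos / 10000^(2i/d)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d)), assuming an even input_dim (names are illustrative):

```python
import math

import torch

def sinusoidal_table(max_len: int, input_dim: int) -> torch.Tensor:
    position = torch.arange(max_len).unsqueeze(1).float()
    # 1 / 10000^(2i / input_dim) for each even index 2i.
    div_term = torch.exp(
        torch.arange(0, input_dim, 2).float() * -(math.log(10000.0) / input_dim)
    )
    pe = torch.zeros(max_len, input_dim)
    pe[:, 0::2] = torch.sin(position * div_term)  # even dimensions
    pe[:, 1::2] = torch.cos(position * div_term)  # odd dimensions
    return pe

x = torch.randn(2, 10, 512)        # (batch, timesteps, input_dim)
x = x + sinusoidal_table(10, 512)  # broadcast over the batch dimension
```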
PositionwiseFeedForward#
class PositionwiseFeedForward(torch.nn.Module):
| def __init__(
| self,
| input_dim: int,
| ff_dim: int,
| dropout: float = 0.1
| ) -> None
Implements the position-wise feed-forward equation: FFN(x) = max(0, x W1 + b1) W2 + b2.
forward#
class PositionwiseFeedForward(torch.nn.Module):
| ...
| def forward(self, x: torch.Tensor) -> torch.Tensor
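A minimal sketch of the two-layer block described by that equation, with dropout between the ReLU and the second projection (illustrative, not the module's exact code):

```python
import torch

class FeedForwardSketch(torch.nn.Module):
    def __init__(self, input_dim: int, ff_dim: int, dropout: float = 0.1) -> None:
        super().__init__()
        self.w_1 = torch.nn.Linear(input_dim, ff_dim)
        self.w_2 = torch.nn.Linear(ff_dim, input_dim)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Applied identically and independently at every position.
        return self.w_2(self.dropout(torch.relu(self.w_1(x))))
```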
TransformerEncoder#
class TransformerEncoder(torch.nn.Module):
| def __init__(
| self,
| layer: torch.nn.Module,
| num_layers: int,
| return_all_layers: bool = False
| ) -> None
The core encoder is a stack of N identical layers.
forward#
class TransformerEncoder(torch.nn.Module):
| ...
| def forward(self, x, mask)
Pass the input (and mask) through each layer in turn.
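A hedged sketch of the stack: num_layers deep copies of the layer applied in sequence, optionally collecting every layer's output (the real class may also apply a final layer norm):

```python
import copy

import torch

class EncoderStackSketch(torch.nn.Module):
    def __init__(
        self, layer: torch.nn.Module, num_layers: int, return_all_layers: bool = False
    ) -> None:
        super().__init__()
        # Deep copies, so each layer gets independent parameters.
        self.layers = torch.nn.ModuleList(
            copy.deepcopy(layer) for _ in range(num_layers)
        )
        self.return_all_layers = return_all_layers

    def forward(self, x, mask):
        outputs = []
        for layer in self.layers:
            x = layer(x, mask)
            outputs.append(x)
        return outputs if self.return_all_layers else x
```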
SublayerConnection#
class SublayerConnection(torch.nn.Module):
| def __init__(self, size: int, dropout: float) -> None
A residual connection followed by layer normalization. Note that, for code simplicity, the norm is applied first rather than last.
forward#
class SublayerConnection(torch.nn.Module):
| ...
| def forward(
| self,
| x: torch.Tensor,
| sublayer: Callable[[torch.Tensor], torch.Tensor]
| ) -> torch.Tensor
Apply a residual connection to any sublayer of the same size.
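A minimal sketch of this pre-norm residual wrapper (names are illustrative):

```python
from typing import Callable

import torch

class SublayerConnectionSketch(torch.nn.Module):
    def __init__(self, size: int, dropout: float) -> None:
        super().__init__()
        self.norm = torch.nn.LayerNorm(size)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(
        self, x: torch.Tensor, sublayer: Callable[[torch.Tensor], torch.Tensor]
    ) -> torch.Tensor:
        # "Norm first": the sublayer sees the normalized input, and the
        # raw input is added back as the residual.
        return x + self.dropout(sublayer(self.norm(x)))
```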
EncoderLayer#
class EncoderLayer(torch.nn.Module):
| def __init__(
| self,
| size: int,
| self_attn: torch.nn.Module,
| feed_forward: torch.nn.Module,
| dropout: float
| ) -> None
An encoder layer is made up of self-attention and a feed-forward block (both defined in this module).
forward#
class EncoderLayer(torch.nn.Module):
| ...
| def forward(
| self,
| x: torch.Tensor,
| mask: torch.BoolTensor
| ) -> torch.Tensor
Follow Figure 1 (left) of 'Attention Is All You Need' for the connections.
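A hedged sketch of the data flow inside one layer, assuming two SublayerConnection-style wrappers as in the Annotated Transformer:

```python
# x: (batch, timesteps, size); sublayers: a pair of residual wrappers.
def encoder_layer_forward(x, mask, self_attn, feed_forward, sublayers):
    # First sublayer: multi-headed self-attention (query = key = value = x).
    x = sublayers[0](x, lambda y: self_attn(y, y, y, mask))
    # Second sublayer: the position-wise feed-forward network.
    return sublayers[1](x, feed_forward)
```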
MultiHeadedAttention#
class MultiHeadedAttention(torch.nn.Module):
| def __init__(
| self,
| num_heads: int,
| input_dim: int,
| dropout: float = 0.1
| ) -> None
forward#
class MultiHeadedAttention(torch.nn.Module):
| ...
| def forward(
| self,
| query: torch.Tensor,
| key: torch.Tensor,
| value: torch.Tensor,
| mask: torch.BoolTensor = None
| ) -> torch.Tensor
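A hedged usage sketch in self-attention mode; the (batch_size, timesteps, timesteps) mask shape assumes the convention of get_attention_masks below, and the split into num_heads heads of size input_dim // num_heads happens internally:

```python
import torch

attn = MultiHeadedAttention(num_heads=8, input_dim=512)
x = torch.randn(2, 7, 512)                    # (batch, timesteps, input_dim)
mask = torch.ones(2, 7, 7, dtype=torch.bool)  # (batch, timesteps, timesteps)
out = attn(x, x, x, mask)                     # self-attention: (2, 7, 512)
```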
make_model#
def make_model(
num_layers: int = 6,
input_size: int = 512,
hidden_size: int = 2048,
heads: int = 8,
dropout: float = 0.1,
return_all_layers: bool = False
) -> TransformerEncoder
Helper: Construct a model from hyperparameters.
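A usage sketch with the documented defaults spelled out:

```python
encoder = make_model(
    num_layers=6,      # depth of the encoder stack
    input_size=512,    # model / embedding dimension
    hidden_size=2048,  # feed-forward inner dimension
    heads=8,           # attention heads per layer
    dropout=0.1,
)
```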
BidirectionalLanguageModelTransformer#
class BidirectionalLanguageModelTransformer(Seq2SeqEncoder):
| def __init__(
| self,
| input_dim: int,
| hidden_dim: int,
| num_layers: int,
| dropout: float = 0.1,
| input_dropout: float = None,
| return_all_layers: bool = False
| ) -> None
get_attention_masks#
class BidirectionalLanguageModelTransformer(Seq2SeqEncoder):
| ...
| def get_attention_masks(
| self,
| mask: torch.BoolTensor
| ) -> Tuple[torch.Tensor, torch.Tensor]
Returns two masks of shape (batch_size, timesteps, timesteps) representing:
1) the non-padded elements, and
2) the elements of the sequence which are permitted to be involved in attention at a given timestep.
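A hedged sketch of how such directional masks can be derived from a padding mask: combine it with a triangular "subsequent positions" mask, using the lower triangle for the forward direction and its transpose for the backward direction (the method's exact construction may differ):

```python
import torch

padding_mask = torch.tensor([[True, True, True, False]])  # (batch, timesteps)
timesteps = padding_mask.size(1)

lower = torch.tril(torch.ones(timesteps, timesteps, dtype=torch.bool))
# Broadcast padding over the query dimension -> (batch, timesteps, timesteps).
forward_mask = lower.unsqueeze(0) & padding_mask.unsqueeze(1)
backward_mask = lower.transpose(0, 1).unsqueeze(0) & padding_mask.unsqueeze(1)
```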
forward#
class BidirectionalLanguageModelTransformer(Seq2SeqEncoder):
| ...
| def forward(
| self,
| token_embeddings: torch.Tensor,
| mask: torch.BoolTensor
| ) -> torch.Tensor
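A hedged end-to-end usage sketch as a Seq2SeqEncoder. The shapes assume the forward and backward representations are concatenated, so the output dimension is twice input_dim:

```python
import torch

encoder = BidirectionalLanguageModelTransformer(
    input_dim=512, hidden_dim=2048, num_layers=6
)
token_embeddings = torch.randn(2, 7, 512)     # (batch, timesteps, input_dim)
mask = torch.ones(2, 7, dtype=torch.bool)     # padding mask
contextual = encoder(token_embeddings, mask)  # (2, 7, 1024)
```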
get_regularization_penalty#
class BidirectionalLanguageModelTransformer(Seq2SeqEncoder):
| ...
| def get_regularization_penalty(self)
get_input_dim#
class BidirectionalLanguageModelTransformer(Seq2SeqEncoder):
| ...
| def get_input_dim(self) -> int
get_output_dim#
class BidirectionalLanguageModelTransformer(Seq2SeqEncoder):
| ...
| def get_output_dim(self) -> int
is_bidirectional#
class BidirectionalLanguageModelTransformer(Seq2SeqEncoder):
| ...
| def is_bidirectional(self) -> bool