
bimodal_encoder

allennlp.modules.transformer.bimodal_encoder



BiModalEncoder

class BiModalEncoder(TransformerModule, FromParams):
 | def __init__(
 |     self,
 |     num_hidden_layers1: int = 12,
 |     num_hidden_layers2: int = 12,
 |     hidden_size1: int = 1024,
 |     hidden_size2: int = 1024,
 |     combined_hidden_size: int = 1024,
 |     intermediate_size1: int = 1024,
 |     intermediate_size2: int = 1024,
 |     num_attention_heads1: int = 8,
 |     num_attention_heads2: int = 8,
 |     combined_num_attention_heads: int = 8,
 |     attention_dropout1: float = 0.1,
 |     hidden_dropout1: float = 0.1,
 |     attention_dropout2: float = 0.1,
 |     hidden_dropout2: float = 0.1,
 |     activation: str = "relu",
 |     biattention_id1: Optional[List[int]] = None,
 |     biattention_id2: Optional[List[int]] = None,
 |     fixed_layer1: int = 0,
 |     fixed_layer2: int = 0,
 |     fast_mode: bool = False,
 |     with_coattention: bool = True,
 |     in_batch_pairs: bool = False
 | )

This module encodes two modalities separately and performs bi-directional attention between them using a connection layer. It is based on the modified BertEncoder from the paper ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks.

Parameters

  • num_hidden_layers1 : int, optional (default = 12)
    Number of hidden layers in the transformer block for the first modality.
  • num_hidden_layers2 : int, optional (default = 12)
    Number of hidden layers in the transformer block for the second modality.
  • hidden_size1 : int, optional (default = 1024)
    Hidden size of the transformer block for the first modality.
  • hidden_size2 : int, optional (default = 1024)
    Hidden size of the transformer block for the second modality.
  • combined_hidden_size : int, optional (default = 1024)
    Hidden size for the connection layer.
  • intermediate_size1 : int, optional (default = 1024)
    Size of the intermediate (feedforward) layer in the transformer block for the first modality.
  • intermediate_size2 : int, optional (default = 1024)
    Size of the intermediate (feedforward) layer in the transformer block for the second modality.
  • num_attention_heads1 : int, optional (default = 8)
    Number of attention heads for the first modality.
  • num_attention_heads2 : int, optional (default = 8)
    Number of attention heads for the second modality.
  • combined_num_attention_heads : int, optional (default = 8)
    Number of attention heads in the connection layer.
  • attention_dropout1 : float, optional (default = 0.1)
    Dropout probability for the attention probabilities of the first modality.
  • hidden_dropout1 : float, optional (default = 0.1)
    Dropout probability for the hidden layers of the first modality.
  • attention_dropout2 : float, optional (default = 0.1)
    Dropout probability for the attention probabilities of the second modality.
  • hidden_dropout2 : float, optional (default = 0.1)
    Dropout probability for the hidden layers of the second modality.
  • activation : str, optional (default = "relu")
    Activation function used in the feedforward layers.
  • biattention_id1 : List[int], optional (default = [1])
    Indices of the layers in the first modality's stack at which bi-directional attention is applied.
  • biattention_id2 : List[int], optional (default = [1])
    Indices of the layers in the second modality's stack at which bi-directional attention is applied.
  • fixed_layer1 : int, optional (default = 0)
  • fixed_layer2 : int, optional (default = 0)
  • fast_mode : bool, optional (default = False)
  • with_coattention : bool, optional (default = True)
  • in_batch_pairs : bool, optional (default = False)
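
For illustration, here is a minimal construction sketch in Python. The sizes are small, arbitrary values chosen for the example (not the defaults of any pretrained ViLBERT model); the only constraint assumed is that each hidden size is divisible by its number of attention heads.

from allennlp.modules.transformer.bimodal_encoder import BiModalEncoder

# A small two-stream configuration; in ViLBERT one stream carries vision
# features and the other text features. All sizes here are illustrative.
encoder = BiModalEncoder(
    num_hidden_layers1=2,
    num_hidden_layers2=2,
    hidden_size1=64,
    hidden_size2=32,
    combined_hidden_size=64,
    intermediate_size1=128,
    intermediate_size2=64,
    num_attention_heads1=4,
    num_attention_heads2=4,
    combined_num_attention_heads=4,
    biattention_id1=[1],
    biattention_id2=[1],
)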

forward

class BiModalEncoder(TransformerModule, FromParams):
 | ...
 | def forward(
 |     self,
 |     embedding1,
 |     embedding2,
 |     attention_mask1,
 |     attention_mask2,
 |     co_attention_mask=None,
 |     output_all_encoded_layers=True
 | )