allennlp.modules.transformer.bimodal_encoder
BiModalEncoder¶
class BiModalEncoder(TransformerModule, FromParams):
| def __init__(
| self,
| num_hidden_layers1: int = 12,
| num_hidden_layers2: int = 12,
| hidden_size1: int = 1024,
| hidden_size2: int = 1024,
| combined_hidden_size: int = 1024,
| intermediate_size1: int = 1024,
| intermediate_size2: int = 1024,
| num_attention_heads1: int = 8,
| num_attention_heads2: int = 8,
| combined_num_attention_heads: int = 8,
| attention_dropout1: float = 0.1,
| hidden_dropout1: float = 0.1,
| attention_dropout2: float = 0.1,
| hidden_dropout2: float = 0.1,
| activation: str = "relu",
| biattention_id1: Optional[List[int]] = None,
| biattention_id2: Optional[List[int]] = None,
| fixed_layer1: int = 0,
| fixed_layer2: int = 0,
| fast_mode: bool = False,
| with_coattention: bool = True,
| in_batch_pairs: bool = False
| )
This module encodes two modalities separately and performs bi-directional attention between them using a connection layer. It is based on the modified BertEncoder from the paper "ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks".
Parameters¶
- num_hidden_layers1 : int, optional (default = 12)
  Number of hidden layers in the transformer block for the first modality.
- num_hidden_layers2 : int, optional (default = 12)
  Number of hidden layers in the transformer block for the second modality.
- hidden_size1 : int, optional (default = 1024)
  Hidden size of the transformer block for the first modality.
- hidden_size2 : int, optional (default = 1024)
  Hidden size of the transformer block for the second modality.
- combined_hidden_size : int, optional (default = 1024)
  Hidden size for the connection layer.
- intermediate_size1 : int, optional (default = 1024)
  Intermediate size of the transformer block for the first modality.
- intermediate_size2 : int, optional (default = 1024)
  Intermediate size of the transformer block for the second modality.
- num_attention_heads1 : int, optional (default = 8)
  Number of attention heads in the transformer block for the first modality.
- num_attention_heads2 : int, optional (default = 8)
  Number of attention heads in the transformer block for the second modality.
- combined_num_attention_heads : int, optional (default = 8)
  Number of attention heads in the connection layer.
- attention_dropout1 : float, optional (default = 0.1)
  Dropout probability for the attention probabilities of the first modality.
- hidden_dropout1 : float, optional (default = 0.1)
  Dropout probability for the hidden states of the first modality.
- attention_dropout2 : float, optional (default = 0.1)
  Dropout probability for the attention probabilities of the second modality.
- hidden_dropout2 : float, optional (default = 0.1)
  Dropout probability for the hidden states of the second modality.
- activation : str, optional (default = "relu")
  Activation function used in the transformer layers.
- biattention_id1 : List[int], optional (default = [1])
  Indices of the layers in the first modality's stack at which bi-attention is applied.
- biattention_id2 : List[int], optional (default = [1])
  Indices of the layers in the second modality's stack at which bi-attention is applied.
- fixed_layer1 : int, optional (default = 0)
- fixed_layer2 : int, optional (default = 0)
- fast_mode : bool, optional (default = False)
- with_coattention : bool, optional (default = True)
- in_batch_pairs : bool, optional (default = False)
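A minimal construction sketch based on the signature above. The sizes here are illustrative (much smaller than the ViLBERT-style defaults), not a recommended configuration; any mutually consistent combination of sizes and head counts should work.

```python
from allennlp.modules.transformer.bimodal_encoder import BiModalEncoder

# Small, illustrative configuration: two layers per modality, with one
# bi-attention connection after layer index 1 in each stream.
encoder = BiModalEncoder(
    num_hidden_layers1=2,            # transformer layers for the first modality (e.g. text)
    num_hidden_layers2=2,            # transformer layers for the second modality (e.g. vision)
    hidden_size1=64,
    hidden_size2=32,
    combined_hidden_size=64,         # hidden size of the connection (co-attention) layer
    intermediate_size1=128,
    intermediate_size2=64,
    num_attention_heads1=4,
    num_attention_heads2=4,
    combined_num_attention_heads=4,
    biattention_id1=[1],             # layer indices at which the two streams interact
    biattention_id2=[1],
)
```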
forward¶
class BiModalEncoder(TransformerModule, FromParams):
| ...
| def forward(
| self,
| embedding1,
| embedding2,
| attention_mask1,
| attention_mask2,
| co_attention_mask=None,
| output_all_encoded_layers=True
| )
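A hedged usage sketch for `forward`. The tensor shapes and mask format below are assumptions: embeddings are taken to be `(batch_size, seq_len, hidden_size)` per modality, and the masks are 0/1 padding masks of shape `(batch_size, seq_len)`. Depending on the installed AllenNLP version, `forward` may instead expect pre-extended additive attention masks, so check the implementation you are running against.

```python
import torch

from allennlp.modules.transformer.bimodal_encoder import BiModalEncoder

# Same illustrative configuration as in the construction sketch above.
encoder = BiModalEncoder(
    num_hidden_layers1=2, num_hidden_layers2=2,
    hidden_size1=64, hidden_size2=32,
    combined_hidden_size=64,
    intermediate_size1=128, intermediate_size2=64,
    num_attention_heads1=4, num_attention_heads2=4,
    combined_num_attention_heads=4,
    biattention_id1=[1], biattention_id2=[1],
)

batch_size, len1, len2 = 2, 10, 7
embedding1 = torch.randn(batch_size, len1, 64)   # first-modality embeddings (e.g. token embeddings)
embedding2 = torch.randn(batch_size, len2, 32)   # second-modality embeddings (e.g. image regions)
attention_mask1 = torch.ones(batch_size, len1)   # assumed format: 1 = attend, 0 = padding
attention_mask2 = torch.ones(batch_size, len2)

outputs = encoder(
    embedding1,
    embedding2,
    attention_mask1,
    attention_mask2,
    output_all_encoded_layers=True,  # return every layer's output, not just the final one
)
# `outputs` holds the encoded layers for each modality; the exact packaging of
# the per-layer outputs varies by version, so inspect it before indexing.
```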