vision_text_model

allennlp_models.vision.models.vision_text_model

VisionTextModel#

@Model.register("vision_model")
class VisionTextModel(Model):
 | def __init__(
 |     self,
 |     vocab: Vocabulary,
 |     text_embeddings: TransformerEmbeddings,
 |     image_embeddings: ImageFeatureEmbeddings,
 |     encoder: BiModalEncoder,
 |     pooled_output_dim: int,
 |     fusion_method: str = "sum",
 |     dropout: float = 0.1,
 |     label_namespace: str = "labels",
 |     is_multilabel: bool = False,
 |     *,
 |     ignore_text: bool = False,
 |     ignore_image: bool = False
 | ) -> None

VisionTextModel takes a single text input and a single image input and produces some output. Example tasks include visual question answering and visual entailment.

Parameters

  • vocab : Vocabulary
  • text_embeddings : TransformerEmbeddings
  • image_embeddings : ImageFeatureEmbeddings
  • encoder : BiModalEncoder
  • pooled_output_dim : int
  • fusion_method : str, optional (default = "sum")
  • dropout : float, optional (default = 0.1)
  • label_namespace : str, optional (default = "labels")
  • is_multilabel : bool, optional (default = False)
    Whether the output classification is multilabel, i.e., whether an instance can have multiple correct labels.
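
A minimal construction sketch follows. The submodule constructors shown (TransformerEmbeddings, ImageFeatureEmbeddings, BiModalEncoder) accept many more options, and the argument names and sizes used here are illustrative assumptions rather than prescribed defaults; in practice the encoder's hidden sizes must line up with the embedding sizes.

```python
from allennlp.data import Vocabulary
from allennlp.modules.transformer import (
    BiModalEncoder,
    ImageFeatureEmbeddings,
    TransformerEmbeddings,
)
from allennlp_models.vision.models.vision_text_model import VisionTextModel

vocab = Vocabulary()
model = VisionTextModel(
    vocab=vocab,
    # Text-side embeddings; the sizes assume a BERT-base style vocabulary.
    text_embeddings=TransformerEmbeddings(vocab_size=30522, embedding_size=768),
    # Projects per-region box features into the encoder's image hidden space.
    image_embeddings=ImageFeatureEmbeddings(feature_size=1024, embedding_size=1024),
    # Cross-modal encoder; real configs set its sizes to match the embeddings.
    encoder=BiModalEncoder(),
    pooled_output_dim=1024,
    fusion_method="sum",   # combine the pooled text and image vectors by addition
    is_multilabel=False,   # one correct label per instance
)
```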

from_huggingface_model_name#

class VisionTextModel(Model):
 | ...
 | @classmethod
 | def from_huggingface_model_name(
 |     cls,
 |     vocab: Vocabulary,
 |     model_name: str,
 |     image_feature_dim: int,
 |     image_num_hidden_layers: int,
 |     image_hidden_size: int,
 |     image_num_attention_heads: int,
 |     combined_hidden_size: int,
 |     combined_num_attention_heads: int,
 |     pooled_output_dim: int,
 |     image_intermediate_size: int,
 |     image_attention_dropout: float,
 |     image_hidden_dropout: float,
 |     image_biattention_id: List[int],
 |     text_biattention_id: List[int],
 |     text_fixed_layer: int,
 |     image_fixed_layer: int,
 |     pooled_dropout: float = 0.1,
 |     fusion_method: str = "sum",
 |     *,
 |     ignore_text: bool = False,
 |     ignore_image: bool = False
 | )
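
A hedged invocation sketch, using the parameter names documented above; the concrete values (a BERT backbone, 1024-dimensional region features, the co-attention layer ids) are illustrative assumptions in the spirit of a ViLBERT-style configuration, not required settings.

```python
from allennlp.data import Vocabulary
from allennlp_models.vision.models.vision_text_model import VisionTextModel

model = VisionTextModel.from_huggingface_model_name(
    vocab=Vocabulary(),
    model_name="bert-base-uncased",  # text side initialized from this checkpoint
    image_feature_dim=1024,          # size of the precomputed region features
    image_num_hidden_layers=6,
    image_hidden_size=1024,
    image_num_attention_heads=8,
    combined_hidden_size=1024,
    combined_num_attention_heads=8,
    pooled_output_dim=1024,
    image_intermediate_size=1024,
    image_attention_dropout=0.1,
    image_hidden_dropout=0.1,
    # Which image/text layers participate in co-attention (assumed values).
    image_biattention_id=[0, 1, 2, 3, 4, 5],
    text_biattention_id=[6, 7, 8, 9, 10, 11],
    text_fixed_layer=0,
    image_fixed_layer=0,
)
```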

forward#

class VisionTextModel(Model):
 | ...
 | def forward(
 |     self,
 |     box_features: torch.Tensor,
 |     box_coordinates: torch.Tensor,
 |     box_mask: torch.Tensor,
 |     text: TextFieldTensors,
 |     labels: Optional[torch.Tensor] = None,
 |     label_weights: Optional[torch.Tensor] = None
 | ) -> Dict[str, torch.Tensor]

Parameters

  • box_features : Tensor
    Shape: (batch_size, num_boxes, feature_size)

  • box_coordinates : Tensor
    Shape: (batch_size, num_boxes, 4)

  • box_mask : Tensor
    A boolean (or 0/1) tensor of shape (batch_size, num_boxes) indicating which boxes are real rather than padding.

  • text : TextFieldTensors

  • labels : Optional[Tensor]

  • label_weights : Optional[Tensor]
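
A call sketch with dummy tensors, assuming a constructed model whose image side expects 1024-dimensional box features; `text` stands in for the TextFieldTensors dict produced by a pretrained-transformer token indexer.

```python
import torch

batch_size, num_boxes, feature_size = 2, 36, 1024
outputs = model(
    box_features=torch.randn(batch_size, num_boxes, feature_size),
    box_coordinates=torch.rand(batch_size, num_boxes, 4),
    box_mask=torch.ones(batch_size, num_boxes, dtype=torch.bool),
    text=text,                    # TextFieldTensors from the dataset reader
    labels=torch.tensor([3, 7]),  # optional; supplying gold labels yields a loss
)
```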

get_metrics#

class VisionTextModel(Model):
 | ...
 | def get_metrics(self, reset: bool = False) -> Dict[str, float]
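
As with any AllenNLP Model, this returns the metric values accumulated since the last reset, and reset=True clears the accumulators (the trainer typically does this at epoch boundaries). The exact keys depend on the configuration, e.g. single-label versus multilabel.

```python
metrics = model.get_metrics(reset=True)  # e.g. {"accuracy": 0.62} in a single-label setup
```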

make_output_human_readable#

class VisionTextModel(Model):
 | ...
 | def make_output_human_readable(
 |     self,
 |     output_dict: Dict[str, torch.Tensor]
 | ) -> Dict[str, torch.Tensor]
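
Following the standard AllenNLP Model API, this post-processes the dict returned by forward into a human-readable form. A sketch continuing the forward example above; that predicted label indices are decoded through label_namespace is an assumption here.

```python
readable = model.make_output_human_readable(outputs)
# Augments the output dict with human-readable entries, e.g. string labels
# looked up in the "labels" vocabulary namespace.
```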