vision_text_model
allennlp_models.vision.models.vision_text_model
VisionTextModel#
@Model.register("vision_model")
class VisionTextModel(Model):
| def __init__(
| self,
| vocab: Vocabulary,
| text_embeddings: TransformerEmbeddings,
| image_embeddings: ImageFeatureEmbeddings,
| encoder: BiModalEncoder,
| pooled_output_dim: int,
| fusion_method: str = "sum",
| dropout: float = 0.1,
| label_namespace: str = "labels",
| is_multilabel: bool = False,
| *,
| ignore_text: bool = False,
| ignore_image: bool = False
| ) -> None
VisionTextModel
takes as input a single text input and a single image input
to produce some output. Example tasks include visual question-answering, visual
entailment, etc.
Parameters¶
- vocab :
Vocabulary
- text_embeddings :
TransformerEmbeddings
- image_embeddings :
ImageFeatureEmbeddings
- encoder :
BiModalEncoder
- pooled_output_dim :
int
- fusion_method :
str
, optional (default ="sum"
) - dropout :
float
, optional (default =0.1
) - label_namespace :
str
, optional (default ="labels"
) - is_multilabel :
bool
, optional (default =False
)
Whether the output classification is multilabel. (i.e., can have multiple correct answers)
from_huggingface_model_name#
class VisionTextModel(Model):
| ...
| @classmethod
| def from_huggingface_model_name(
| cls,
| vocab: Vocabulary,
| model_name: str,
| image_feature_dim: int,
| image_num_hidden_layers: int,
| image_hidden_size: int,
| image_num_attention_heads: int,
| combined_hidden_size: int,
| combined_num_attention_heads: int,
| pooled_output_dim: int,
| image_intermediate_size: int,
| image_attention_dropout: float,
| image_hidden_dropout: float,
| image_biattention_id: List[int],
| text_biattention_id: List[int],
| text_fixed_layer: int,
| image_fixed_layer: int,
| pooled_dropout: float = 0.1,
| fusion_method: str = "sum",
| *,
| ignore_text: bool = False,
| ignore_image: bool = False
| )
forward#
class VisionTextModel(Model):
| ...
| def forward(
| self,
| box_features: torch.Tensor,
| box_coordinates: torch.Tensor,
| box_mask: torch.Tensor,
| text: TextFieldTensors,
| labels: Optional[torch.Tensor] = None,
| label_weights: Optional[torch.Tensor] = None
| ) -> Dict[str, torch.Tensor]
Parameters¶
-
box_features :
Tensor
Shape: (batch_size, num_boxes, feature_size)
-
box_coordinates :
Tensor
Shape: (batch_size, num_boxes, 4)
-
box_mask :
Tensor
A boolean (0/1) tensor of shape (batch_size, num_boxes).
-
text :
TextFieldTensors
-
labels :
Optional[Tensor]
-
label_weights :
Optional[Tensor]
get_metrics#
class VisionTextModel(Model):
| ...
| def get_metrics(self, reset: bool = False) -> Dict[str, float]
make_output_human_readable#
class VisionTextModel(Model):
| ...
| def make_output_human_readable(
| self,
| output_dict: Dict[str, torch.Tensor]
| ) -> Dict[str, torch.Tensor]