Skip to content




def get_score(count: int) -> float


class VQAv2Reader(VisionReader):
 | def __init__(
 |     self,
 |     image_dir: Optional[Union[str, PathLike]] = None,
 |     *,
 |     image_loader: Optional[ImageLoader] = None,
 |     image_featurizer: Optional[Lazy[GridEmbedder]] = None,
 |     region_detector: Optional[Lazy[RegionDetector]] = None,
 |     answer_vocab: Optional[Union[Vocabulary, str]] = None,
 |     feature_cache_dir: Optional[Union[str, PathLike]] = None,
 |     tokenizer: Optional[Tokenizer] = None,
 |     token_indexers: Optional[Dict[str, TokenIndexer]] = None,
 |     cuda_device: Optional[Union[int, torch.device]] = None,
 |     max_instances: Optional[int] = None,
 |     image_processing_batch_size: int = 8,
 |     multiple_answers_per_question: bool = True,
 |     write_to_cache: bool = True
 | ) -> None

Parameters

image_dir: `str`

Path to directory containing `png` image files.

image_loader: `ImageLoader`

The image loader component used to load the images.

image_featurizer: `Lazy[GridEmbedder]`

The backbone image processor (like a ResNet), whose output will be passed to the region detector for finding object boxes in the image.

region_detector: `Lazy[RegionDetector]`

For pulling out regions of the image (both coordinates and features) that will be used by downstream models.

answer_vocab: `Union[Vocabulary, str]`, optional

The vocabulary to use for answers. The reader will look into the "answers" namespace in the vocabulary to find possible answers. If this is given, the reader only outputs instances with answers contained in this vocab. If this is not given, the reader outputs all instances with all answers. If this is a URL or filename, we will download a previously saved vocabulary from there.

feature_cache_dir: `Union[str, PathLike]`, optional

An optional directory to cache the featurized images in. Featurizing images takes a long time, and many images are duplicated, so we highly recommend using this cache.

tokenizer: `Tokenizer`, optional

The `Tokenizer` to use to tokenize the text. By default, this uses the tokenizer for "bert-base-uncased".

token_indexers: `Dict[str, TokenIndexer]`, optional

The `TokenIndexer` to use. By default, this uses the indexer for "bert-base-uncased".

cuda_device: `Union[int, torch.device]`, optional

Either a torch device or a GPU number. This is the GPU we'll use to featurize the images.

max_instances: `int`, optional

For debugging, you can use this parameter to limit the number of instances the reader returns.

image_processing_batch_size: `int`

The number of images to process at one time while featurizing. Default is 8.

multiple_answers_per_question: `bool`

VQA questions have multiple answers. By default, we use all of them, and give more points to the more common answer. But VQA also has a special answer, the so-called "multiple choice answer". If this is set to `False`, we only use that answer.


class VQAv2Reader(VisionReader):
 | ...
 | def text_to_instance(
 |     self,
 |     question: str,
 |     image: Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]],
 |     answer_counts: Optional[MutableMapping[str, int]] = None,
 |     *,
 |     use_cache: bool = True
 | ) -> Optional[Instance]


class VQAv2Reader(VisionReader):
 | ...
 | def apply_token_indexers(self, instance: Instance) -> None