vision_reader

allennlp_models.vision.dataset_readers.vision_reader

VisionReader#

class VisionReader(DatasetReader):
 | def __init__(
 |     self,
 |     image_dir: Optional[Union[str, PathLike]],
 |     *,
 |     image_loader: Optional[ImageLoader] = None,
 |     image_featurizer: Optional[Lazy[GridEmbedder]] = None,
 |     region_detector: Optional[Lazy[RegionDetector]] = None,
 |     feature_cache_dir: Optional[Union[str, PathLike]] = None,
 |     tokenizer: Optional[Tokenizer] = None,
 |     token_indexers: Optional[Dict[str, TokenIndexer]] = None,
 |     cuda_device: Optional[Union[int, torch.device]] = None,
 |     max_instances: Optional[int] = None,
 |     image_processing_batch_size: int = 8,
 |     write_to_cache: bool = True,
 |     manual_distributed_sharding: bool = True,
 |     manual_multiprocess_sharding: bool = True
 | ) -> None

Base class for dataset readers for vision tasks.

If you don't specify image_loader, image_featurizer, and region_detector, the reader assumes it can get all featurized images from the cache.

If you don't specify feature_cache_dir, the reader will featurize all images using the featurization components, and use an internal in-memory cache to catch duplicate images.

If you specify neither the featurization components (image_loader, image_featurizer, and region_detector) nor feature_cache_dir, the reader will not produce featurized images at all.

Parameters¶

image_dir : Optional[Union[str, PathLike]]
Path to directory containing image files. The structure of the directory doesn't matter. We find images by finding filenames that match *[image_id].jpg.
image_loader : ImageLoader, optional
The image loading component.
image_featurizer : Lazy[GridEmbedder], optional
The backbone image processor (like a ResNet), whose output will be passed to the region detector for finding object boxes in the image.
region_detector : Lazy[RegionDetector], optional
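For pulling out regions of the image (both coordinates and features) that will be used by downstream models.
feature_cache_dir : Union[str, PathLike], optional
Directory of the cache for featurized images. If this is not specified, the reader featurizes all images itself and uses an internal in-memory cache to catch duplicate images.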
tokenizer : Tokenizer, optional
The Tokenizer to use to tokenize the text. By default, this uses the tokenizer for "bert-base-uncased".
token_indexers : Dict[str, TokenIndexer], optional
The TokenIndexer to use. By default, this uses the indexer for "bert-base-uncased".
cuda_device : Union[int, torch.device], optional
Either a torch device or a GPU number. This is the GPU we'll use to featurize the images.
max_instances : int, optional
For debugging, you can use this parameter to limit the number of instances the reader returns.
image_processing_batch_size : int, optional (default = 8)
The number of images to process at one time while featurizing.
write_to_cache : bool, optional (default = True)
Allows the reader to write to the cache. Disabling this is useful if you don't want to accidentally overwrite a cache you already have, or if you don't have write access to the cache you're using.

image_featurizer#

class VisionReader(DatasetReader):
 | ...
 | @property
 | def image_featurizer(self) -> Optional[GridEmbedder]

region_detector#

class VisionReader(DatasetReader):
 | ...
 | @property
 | def region_detector(self) -> Optional[RegionDetector]