Skip to content

dataloader

[ allennlp.data.dataloader ]


TensorDict#

TensorDict = Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]

allennlp_collate#

def allennlp_collate(instances: List[Instance]) -> TensorDict

DataLoader Objects#

class DataLoader(Registrable,  data.DataLoader):
 | def __init__(
 |     self,
 |     dataset: data.Dataset,
 |     batch_size: int = 1,
 |     shuffle: bool = False,
 |     sampler: Sampler = None,
 |     batch_sampler: BatchSampler = None,
 |     num_workers: int = 0,
 |     collate_fn=allennlp_collate,
 |     pin_memory: bool = False,
 |     drop_last: bool = False,
 |     timeout: int = 0,
 |     worker_init_fn=None,
 |     multiprocessing_context: str = None,
 |     batches_per_epoch: int = None
 | )

A registrable version of the PyTorch DataLoader. Firstly, this class exists so that we can construct a DataLoader from a configuration file and have a different default collate_fn. You can use this class directly in Python code, but it is identical to using the PyTorch DataLoader with AllenNLP's custom collate function:

from torch.utils.data import DataLoader

from allennlp.data.dataloader import allennlp_collate
# Construct a dataloader directly for a dataset which contains allennlp
# Instances which have _already_ been indexed.
my_loader = DataLoader(dataset, batch_size=32, collate_fn=allennlp_collate)

Secondly, this class adds a batches_per_epoch parameter which, if given, determines the number of batches after which an epoch ends. If this is None, then an epoch is set to be one full pass through your data. You might use this if you have a very large dataset and want more frequent checkpoints and evaluations on validation data, for instance.

In a typical AllenNLP configuration file, the dataset parameter does not get an entry under the "data_loader", it gets constructed separately.

from_partial_objects#

 | @classmethod
 | def from_partial_objects(
 |     cls,
 |     dataset: data.Dataset,
 |     batch_size: int = 1,
 |     shuffle: bool = False,
 |     sampler: Lazy[Sampler] = None,
 |     batch_sampler: Lazy[BatchSampler] = None,
 |     num_workers: int = 0,
 |     pin_memory: bool = False,
 |     drop_last: bool = False,
 |     timeout: int = 0,
 |     worker_init_fn=None,
 |     multiprocessing_context: str = None,
 |     batches_per_epoch: int = None
 | ) -> "DataLoader"

DataLoader.default_implementation#

DataLoader.default_implementation = "default"