# allennlp.data.dataloader
## TensorDict

```python
TensorDict = Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]
```
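For illustration, a made-up batch of this shape (not output from the library): a nested entry is what a `TextField` typically produces, while a `LabelField` yields a flat tensor:

```python
import torch
from allennlp.data.dataloader import TensorDict

# Hypothetical batch of two instances: the nested dict is the shape
# produced by a TextField, the flat tensor by a LabelField.
batch: TensorDict = {
    "tokens": {"tokens": torch.tensor([[2, 5, 9], [4, 1, 0]])},
    "label": torch.tensor([0, 1]),
}
```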
## allennlp_collate

```python
def allennlp_collate(instances: List[Instance]) -> TensorDict
```
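As a minimal sketch of what this function does (the field names and toy data are hypothetical), it pads and batches a list of already-indexed `Instance`s into a single `TensorDict`:

```python
from allennlp.data import Instance, Vocabulary
from allennlp.data.dataloader import allennlp_collate
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

# Two toy instances with hypothetical field names.
indexers = {"tokens": SingleIdTokenIndexer()}
instances = [
    Instance({"text": TextField([Token("hello"), Token("world")], indexers),
              "label": LabelField("pos")}),
    Instance({"text": TextField([Token("goodbye")], indexers),
              "label": LabelField("neg")}),
]

# Instances must be indexed before they can be collated into tensors.
vocab = Vocabulary.from_instances(instances)
for instance in instances:
    instance.index_fields(vocab)

batch = allennlp_collate(instances)  # a TensorDict, padded to a common length
```

This is the same `collate_fn` that the `DataLoader` below installs by default.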
## DataLoader Objects

```python
class DataLoader(Registrable, data.DataLoader):
    def __init__(
        self,
        dataset: data.Dataset,
        batch_size: int = 1,
        shuffle: bool = False,
        sampler: Sampler = None,
        batch_sampler: BatchSampler = None,
        num_workers: int = 0,
        collate_fn=allennlp_collate,
        pin_memory: bool = False,
        drop_last: bool = False,
        timeout: int = 0,
        worker_init_fn=None,
        multiprocessing_context: str = None,
        batches_per_epoch: int = None,
    )
```
A registrable version of the PyTorch [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader).

Firstly, this class exists so that we can construct a `DataLoader` from a configuration file and give it a different default `collate_fn`. You can use this class directly in Python code, but it is identical to using the PyTorch `DataLoader` with AllenNLP's custom collate function:
```python
from torch.utils.data import DataLoader

from allennlp.data.dataloader import allennlp_collate

# Construct a DataLoader directly for a dataset of AllenNLP Instances
# which have _already_ been indexed.
my_loader = DataLoader(dataset, batch_size=32, collate_fn=allennlp_collate)
```
Secondly, this class adds a `batches_per_epoch` parameter which, if given, determines the number of batches after which an epoch ends. If this is `None`, an epoch is one full pass through your data. You might use this if you have a very large dataset and want more frequent checkpoints and evaluations on validation data, for instance.
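For example, a rough sketch (`my_dataset` and the sizes are made up): with `batches_per_epoch=500`, the trainer sees an "epoch" every 500 batches instead of once per full pass:

```python
from allennlp.data.dataloader import DataLoader

# Hypothetical: `my_dataset` holds ~100,000 already-indexed Instances.
loader = DataLoader(my_dataset, batch_size=32, shuffle=True, batches_per_epoch=500)

# Iterating the loader now stops after 500 batches and resumes where it
# left off on the next "epoch", rather than cycling through all
# ~3,125 batches of the full dataset each time.
for batch in loader:
    ...
```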
In a typical AllenNLP configuration file, the `dataset` parameter does not get an entry under `"data_loader"`; it gets constructed separately.
### from_partial_objects

```python
@classmethod
def from_partial_objects(
    cls,
    dataset: data.Dataset,
    batch_size: int = 1,
    shuffle: bool = False,
    sampler: Lazy[Sampler] = None,
    batch_sampler: Lazy[BatchSampler] = None,
    num_workers: int = 0,
    pin_memory: bool = False,
    drop_last: bool = False,
    timeout: int = 0,
    worker_init_fn=None,
    multiprocessing_context: str = None,
    batches_per_epoch: int = None,
) -> "DataLoader"
```
### DataLoader.default_implementation

```python
DataLoader.default_implementation = "default"
```