Skip to content

data_collator

allennlp.data.data_loaders.data_collator

[SOURCE]


allennlp_collate

def allennlp_collate(instances: List[Instance]) -> TensorDict

This is the default function used to turn a list of Instances into a TensorDict batch.

DataCollator

class DataCollator(Registrable)

This class is similar to DataCollator in Transformers. It allows dynamic operations to be performed on the tensors of different batches, because this method runs before each epoch to convert a List[Instance] into a TensorDict.

default_implementation

class DataCollator(Registrable):
 | ...
 | default_implementation = "allennlp"

__call__

class DataCollator(Registrable):
 | ...
 | def __call__(self, instances: List[Instance]) -> TensorDict

DefaultDataCollator

@DataCollator.register("allennlp")
class DefaultDataCollator(DataCollator)

__call__

class DefaultDataCollator(DataCollator):
 | ...
 | def __call__(self, instances: List[Instance]) -> TensorDict

LanguageModelingDataCollator

@DataCollator.register("language_model")
class LanguageModelingDataCollator(DataCollator):
 | def __init__(
 |     self,
 |     model_name: str,
 |     mlm: bool = True,
 |     mlm_probability: float = 0.15,
 |     filed_name: str = "source",
 |     namespace: str = "tokens"
 | )

Registered as a DataCollator with the name "language_model". Used for language modeling.

__call__

class LanguageModelingDataCollator(DataCollator):
 | ...
 | def __call__(self, instances: List[Instance]) -> TensorDict

process_tokens

class LanguageModelingDataCollator(DataCollator):
 | ...
 | def process_tokens(self, tensor_dicts: TensorDict) -> TensorDict