simple_language_modeling
allennlp_models.lm.dataset_readers.simple_language_modeling
SimpleLanguageModelingDatasetReader#
@DatasetReader.register("simple_language_modeling")
class SimpleLanguageModelingDatasetReader(DatasetReader):
| def __init__(
| self,
| tokenizer: Tokenizer = None,
| token_indexers: Dict[str, TokenIndexer] = None,
| max_sequence_length: int = None,
| start_tokens: List[str] = None,
| end_tokens: List[str] = None,
| **kwargs
| ) -> None
Reads sentences, one per line, for language modeling. This does not handle arbitrarily formatted text with sentences spanning multiple lines.
Parameters¶
- tokenizer :
Tokenizer, optional
Tokenizer to use to split the input sentences into words or other kinds of tokens. Defaults to `SpacyTokenizer()`.
- token_indexers :
Dict[str, TokenIndexer], optional
Indexers used to define input token representations. Defaults to `{"tokens": SingleIdTokenIndexer()}`.
- max_sequence_length :
int, optional
If specified, sentences with more than this number of tokens will be dropped.
- start_tokens :
List[str], optional (default = `None`)
These are prepended to the tokens provided to the `TextField`.
- end_tokens :
List[str], optional (default = `None`)
These are appended to the tokens provided to the `TextField`.
text_to_instance#
class SimpleLanguageModelingDatasetReader(DatasetReader):
| ...
| def text_to_instance(self, sentence: str) -> Instance