simple_language_modeling
SimpleLanguageModelingDatasetReader#
class SimpleLanguageModelingDatasetReader(DatasetReader):
| def __init__(
| self,
| tokenizer: Tokenizer = None,
| token_indexers: Dict[str, TokenIndexer] = None,
| max_sequence_length: int = None,
| start_tokens: List[str] = None,
| end_tokens: List[str] = None,
| **kwargs
| ) -> None
Reads sentences, one per line, for language modeling. This does not handle arbitrarily formatted text with sentences spanning multiple lines.
Parameters

- tokenizer : `Tokenizer`, optional
    Tokenizer to use to split the input sentences into words or other kinds of tokens. Defaults to `SpacyTokenizer()`.
- token_indexers : `Dict[str, TokenIndexer]`, optional
    Indexers used to define input token representations. Defaults to `{"tokens": SingleIdTokenIndexer()}`.
- max_sequence_length : `int`, optional
    If specified, sentences with more than this number of tokens will be dropped.
- start_tokens : `List[str]`, optional (default = `None`)
    These are prepended to the tokens provided to the `TextField`.
- end_tokens : `List[str]`, optional (default = `None`)
    These are appended to the tokens provided to the `TextField`.
text_to_instance#
class SimpleLanguageModelingDatasetReader(DatasetReader):
| ...
| @overrides
| def text_to_instance(self, sentence: str) -> Instance