token_characters_indexer
allennlp.data.token_indexers.token_characters_indexer
TokenCharactersIndexer
@TokenIndexer.register("characters")
class TokenCharactersIndexer(TokenIndexer):
| def __init__(
| self,
| namespace: str = "token_characters",
| character_tokenizer: CharacterTokenizer = CharacterTokenizer(),
| start_tokens: List[str] = None,
| end_tokens: List[str] = None,
| min_padding_length: int = 0,
| token_min_padding_length: int = 0
| ) -> None
This `TokenIndexer` represents tokens as lists of character indices.

Registered as a `TokenIndexer` with name "characters".
Parameters

- namespace : `str`, optional (default = `"token_characters"`)
  We will use this namespace in the `Vocabulary` to map the characters in each token to indices.
- character_tokenizer : `CharacterTokenizer`, optional (default = `CharacterTokenizer()`)
  We use a `CharacterTokenizer` to handle splitting tokens into characters, as it has options for byte encoding and other things. The default here is to instantiate a `CharacterTokenizer` with its default parameters, which uses unicode characters and retains casing.
- start_tokens : `List[str]`, optional (default = `None`)
  These are prepended to the tokens provided to `tokens_to_indices`.
- end_tokens : `List[str]`, optional (default = `None`)
  These are appended to the tokens provided to `tokens_to_indices`.
- min_padding_length : `int`, optional (default = `0`)
  We use this value as the minimum length of padding. Usually used with `CnnEncoder`; its value should be set to the maximum of the encoder's `ngram_filter_sizes`.
- token_min_padding_length : `int`, optional (default = `0`)
  See `TokenIndexer`.
count_vocab_items
class TokenCharactersIndexer(TokenIndexer):
| ...
| @overrides
| def count_vocab_items(
| self,
| token: Token,
| counter: Dict[str, Dict[str, int]]
| )
tokens_to_indices
class TokenCharactersIndexer(TokenIndexer):
| ...
| @overrides
| def tokens_to_indices(
| self,
| tokens: List[Token],
| vocabulary: Vocabulary
| ) -> Dict[str, List[List[int]]]
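Continuing the running example, a sketch that builds a toy `Vocabulary` by hand; the exact ids depend on the vocabulary, so the values in the comment are illustrative only.

```python
from allennlp.data import Vocabulary
from allennlp.data.tokenizers import Token

# Build a toy vocabulary containing just the characters we need.
vocab = Vocabulary()
for ch in "cat":
    vocab.add_token_to_namespace(ch, namespace="token_characters")

indexed = indexer.tokens_to_indices([Token("cat")], vocab)
# One list of character ids per token, e.g.:
# {"token_characters": [[2, 3, 4]]}
```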
get_padding_lengths
class TokenCharactersIndexer(TokenIndexer):
| ...
| @overrides
| def get_padding_lengths(
| self,
| indexed_tokens: IndexedTokenList
| ) -> Dict[str, int]
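Continuing the sketch, `get_padding_lengths` reports how much padding the indexed tokens need. The `num_token_characters` key and the effect of `min_padding_length` shown here are assumptions about the return value, not documented output.

```python
lengths = indexer.get_padding_lengths(indexed)
# With min_padding_length=5, the single 3-character token still pads to 5:
# e.g. {"token_characters": 1, "num_token_characters": 5}
```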
as_padded_tensor_dict
class TokenCharactersIndexer(TokenIndexer):
| ...
| @overrides
| def as_padded_tensor_dict(
| self,
| tokens: IndexedTokenList,
| padding_lengths: Dict[str, int]
| ) -> Dict[str, torch.Tensor]
Pad the tokens.
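A sketch of padding the running example; the shape in the comment follows from the lengths above, assuming zero-padding on the right.

```python
tensor_dict = indexer.as_padded_tensor_dict(indexed, lengths)
print(tensor_dict["token_characters"].shape)
# Assumed: torch.Size([1, 5]) -- one token, padded to five characters
```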
get_empty_token_list
class TokenCharactersIndexer(TokenIndexer):
| ...
| @overrides
| def get_empty_token_list(self) -> IndexedTokenList
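Presumably this returns the empty counterpart of the `tokens_to_indices` output, used when batching fields that contain no tokens; the key shown is an assumption matching the indexer's namespace.

```python
empty = indexer.get_empty_token_list()
# Assumed: {"token_characters": []} -- the empty analogue of
# the tokens_to_indices output above.
```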