Skip to content

token_characters_indexer

allennlp.data.token_indexers.token_characters_indexer

[SOURCE]


TokenCharactersIndexer

@TokenIndexer.register("characters")
class TokenCharactersIndexer(TokenIndexer):
 | def __init__(
 |     self,
 |     namespace: str = "token_characters",
 |     character_tokenizer: CharacterTokenizer = CharacterTokenizer(),
 |     start_tokens: List[str] = None,
 |     end_tokens: List[str] = None,
 |     min_padding_length: int = 0,
 |     token_min_padding_length: int = 0
 | ) -> None

This TokenIndexer represents tokens as lists of character indices.

Registered as a TokenIndexer with name "characters".

Parameters

  • namespace : str, optional (default = token_characters)
    We will use this namespace in the Vocabulary to map the characters in each token to indices.
  • character_tokenizer : CharacterTokenizer, optional (default = CharacterTokenizer())
    We use a CharacterTokenizer to handle splitting tokens into characters, because it provides options for byte encoding, among other things. By default, a CharacterTokenizer is instantiated with its default parameters; that tokenizer uses Unicode characters and retains casing.
  • start_tokens : List[str], optional (default = None)
    These are prepended to the tokens provided to tokens_to_indices.
  • end_tokens : List[str], optional (default = None)
    These are appended to the tokens provided to tokens_to_indices.
  • min_padding_length : int, optional (default = 0)
    We use this value as the minimum length of padding. It is usually used together with CnnEncoder, in which case it should be set to the maximum value of the encoder's ngram_filter_sizes.
  • token_min_padding_length : int, optional (default = 0)
    See TokenIndexer.

count_vocab_items

class TokenCharactersIndexer(TokenIndexer):
 | ...
 | def count_vocab_items(
 |     self,
 |     token: Token,
 |     counter: Dict[str, Dict[str, int]]
 | )

tokens_to_indices

class TokenCharactersIndexer(TokenIndexer):
 | ...
 | def tokens_to_indices(
 |     self,
 |     tokens: List[Token],
 |     vocabulary: Vocabulary
 | ) -> Dict[str, List[List[int]]]

get_padding_lengths

class TokenCharactersIndexer(TokenIndexer):
 | ...
 | def get_padding_lengths(
 |     self,
 |     indexed_tokens: IndexedTokenList
 | ) -> Dict[str, int]

as_padded_tensor_dict

class TokenCharactersIndexer(TokenIndexer):
 | ...
 | def as_padded_tensor_dict(
 |     self,
 |     tokens: IndexedTokenList,
 |     padding_lengths: Dict[str, int]
 | ) -> Dict[str, torch.Tensor]

Pad the tokens.

get_empty_token_list

class TokenCharactersIndexer(TokenIndexer):
 | ...
 | def get_empty_token_list(self) -> IndexedTokenList