elmo_indexer
[ allennlp.data.token_indexers.elmo_indexer ]
ELMoCharacterMapper Objects#
class ELMoCharacterMapper():
| def __init__(self, tokens_to_add: Dict[str, int] = None) -> None
Maps individual tokens to sequences of character ids, compatible with ELMo. To be consistent with previously trained models, we include it here as a special case of the existing character indexers.
Optional additional special tokens with designated character ids can be added via tokens_to_add.
max_word_length#
max_word_length = 50
beginning_of_sentence_character#
beginning_of_sentence_character = 256
end_of_sentence_character#
end_of_sentence_character = 257
beginning_of_word_character#
beginning_of_word_character = 258
end_of_word_character#
end_of_word_character = 259
padding_character#
padding_character = 260
beginning_of_sentence_characters#
beginning_of_sentence_characters = _make_bos_eos(
    beginning_of_sentence_character,
    padding_character,
    beginning_of_word_character,
    end_of_word_character,
    max_word_length,
)
end_of_sentence_characters#
end_of_sentence_characters = _make_bos_eos(
    end_of_sentence_character,
    padding_character,
    beginning_of_word_character,
    end_of_word_character,
    max_word_length,
)
bos_token#
bos_token = "<S>"
eos_token#
eos_token = "</S>"
convert_word_to_char_ids#
| def convert_word_to_char_ids(self, word: str) -> List[int]
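For illustration, a minimal sketch of the mapping. Every id is the corresponding constant (or utf-8 byte) shifted by +1, reserving 0 as the mask/padding value; treat that shift as an assumption if you are on a different version of this module.

```python
from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper

mapper = ELMoCharacterMapper()
char_ids = mapper.convert_word_to_char_ids("hi")

# Every word becomes exactly max_word_length (50) character ids.
assert len(char_ids) == ELMoCharacterMapper.max_word_length
# Ids are the constants above shifted by +1, reserving 0 for masking.
assert char_ids[0] == ELMoCharacterMapper.beginning_of_word_character + 1
assert char_ids[1] == ord("h") + 1 and char_ids[2] == ord("i") + 1
assert char_ids[3] == ELMoCharacterMapper.end_of_word_character + 1
```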
ELMoTokenCharactersIndexer Objects#
class ELMoTokenCharactersIndexer(TokenIndexer):
| def __init__(
| self,
| namespace: str = "elmo_characters",
| tokens_to_add: Dict[str, int] = None,
| token_min_padding_length: int = 0
| ) -> None
Convert a token to an array of character ids to compute ELMo representations.
Registered as a TokenIndexer with name "elmo_characters".
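Because of that registration, the class can also be resolved by name through AllenNLP's registry, which is how configuration files pick it up; a minimal sketch:

```python
from allennlp.data.token_indexers import TokenIndexer

# Resolve the registered subclass by name, as a config entry
# {"type": "elmo_characters"} would, then instantiate it with defaults.
indexer = TokenIndexer.by_name("elmo_characters")()
```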
Parameters

- namespace : str, optional (default = "elmo_characters")
- tokens_to_add : Dict[str, int], optional (default = None)
  If not None, provides a mapping of special tokens to character ids. When using pre-trained models, each character id must be less than 261, and we recommend using otherwise unused ids (e.g. 1-32).
- token_min_padding_length : int, optional (default = 0)
  See TokenIndexer.
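For example, a sketch of constructing the indexer with an extra special token; "<REL>" and its id 1 are hypothetical choices, picked from the unused-id range recommended above:

```python
from allennlp.data.token_indexers import ELMoTokenCharactersIndexer

indexer = ELMoTokenCharactersIndexer(
    namespace="elmo_characters",
    tokens_to_add={"<REL>": 1},  # hypothetical special token -> unused character id (< 261)
    token_min_padding_length=0,
)
```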
count_vocab_items#
| @overrides
| def count_vocab_items(
| self,
| token: Token,
| counter: Dict[str, Dict[str, int]]
| )
get_empty_token_list#
| @overrides
| def get_empty_token_list(self) -> IndexedTokenList
tokens_to_indices#
| @overrides
| def tokens_to_indices(
| self,
| tokens: List[Token],
| vocabulary: Vocabulary
| ) -> Dict[str, List[List[int]]]
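A minimal sketch of the indexing step. The "elmo_tokens" output key follows this module's source and is an assumption for other versions; the vocabulary argument is unused here because the character ids are fixed:

```python
from allennlp.data import Token, Vocabulary
from allennlp.data.token_indexers import ELMoTokenCharactersIndexer

indexer = ELMoTokenCharactersIndexer()
output = indexer.tokens_to_indices([Token("Hello"), Token("world")], Vocabulary())

# One 50-long list of character ids per input token.
assert len(output["elmo_tokens"]) == 2
assert all(len(ids) == 50 for ids in output["elmo_tokens"])
```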
as_padded_tensor_dict#
| @overrides
| def as_padded_tensor_dict(
| self,
| tokens: IndexedTokenList,
| padding_lengths: Dict[str, int]
| ) -> Dict[str, torch.Tensor]
Overriding this method only because we need a different padding token than the default.
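A sketch of that padding behavior, again assuming the "elmo_tokens" key: each padding slot becomes an all-zero 50-long character-id list rather than a single scalar pad id:

```python
from allennlp.data import Token, Vocabulary
from allennlp.data.token_indexers import ELMoTokenCharactersIndexer

indexer = ELMoTokenCharactersIndexer()
indexed = indexer.tokens_to_indices([Token("hi"), Token("there")], Vocabulary())

padding_lengths = indexer.get_padding_lengths(indexed)
padding_lengths["elmo_tokens"] = 4  # pad as if the longest sentence in the batch had 4 tokens

tensors = indexer.as_padded_tensor_dict(indexed, padding_lengths)
print(tensors["elmo_tokens"].shape)  # torch.Size([4, 50]); the last two rows are all zeros
```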