elmo_indexer
allennlp.data.token_indexers.elmo_indexer
ELMoCharacterMapper#
class ELMoCharacterMapper:
| def __init__(self, tokens_to_add: Dict[str, int] = None) -> None
Maps individual tokens to sequences of character ids, compatible with ELMo. To be consistent with previously trained models, we include it here as a special case of existing character indexers.
Optional additional special tokens with designated character ids can be added via tokens_to_add.
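A minimal construction sketch (the import path follows the module path above; the special token name and id here are illustrative, not part of the API):

```python
from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper

# Map a hypothetical special token "<REL>" to the otherwise unused character
# id 1; every other word is encoded from its UTF-8 bytes.
mapper = ELMoCharacterMapper(tokens_to_add={"<REL>": 1})
```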
max_word_length#
class ELMoCharacterMapper:
| ...
| max_word_length = 50
beginning_of_sentence_character#
class ELMoCharacterMapper:
| ...
| beginning_of_sentence_character = 256
end_of_sentence_character#
class ELMoCharacterMapper:
| ...
| end_of_sentence_character = 257
beginning_of_word_character#
class ELMoCharacterMapper:
| ...
| beginning_of_word_character = 258
end_of_word_character#
class ELMoCharacterMapper:
| ...
| end_of_word_character = 259
padding_character#
class ELMoCharacterMapper:
| ...
| padding_character = 260
beginning_of_sentence_characters#
class ELMoCharacterMapper:
| ...
| beginning_of_sentence_characters = _make_bos_eos(
|     beginning_of_sentence_character,
|     padding_character,
|     beginning_of_word_character,
|     end_of_word_character,
|     max_word_length,
| )
end_of_sentence_characters#
class ELMoCharacterMapper:
| ...
| end_of_sentence_characters = _make_bos_eos(
|     end_of_sentence_character,
|     padding_character,
|     beginning_of_word_character,
|     end_of_word_character,
|     max_word_length,
| )
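For reference, _make_bos_eos is a private helper in this module. The assertions below sketch the shape of its output, inferred from the constants above: a max_word_length list framed by the word-boundary markers, with the sentence marker in position 1 and padding everywhere else.

```python
from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper

bos = ELMoCharacterMapper.beginning_of_sentence_characters
assert len(bos) == ELMoCharacterMapper.max_word_length                # 50
assert bos[0] == ELMoCharacterMapper.beginning_of_word_character      # 258
assert bos[1] == ELMoCharacterMapper.beginning_of_sentence_character  # 256
assert bos[2] == ELMoCharacterMapper.end_of_word_character            # 259
assert all(c == ELMoCharacterMapper.padding_character for c in bos[3:])
```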
bos_token#
class ELMoCharacterMapper:
| ...
| bos_token = "<S>"
eos_token#
class ELMoCharacterMapper:
| ...
| eos_token = "</S>"
convert_word_to_char_ids#
class ELMoCharacterMapper:
| ...
| def convert_word_to_char_ids(self, word: str) -> List[int]
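A usage sketch; the ids in the comments assume the library's convention of shifting every character id by +1 so that 0 stays free as a padding value:

```python
from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper

mapper = ELMoCharacterMapper()
char_ids = mapper.convert_word_to_char_ids("hi")

assert len(char_ids) == ELMoCharacterMapper.max_word_length  # always 50
# char_ids[0] is beginning_of_word_character + 1 (259),
# char_ids[1:3] are the UTF-8 bytes of "hi" shifted by +1 (105, 106),
# char_ids[3] is end_of_word_character + 1 (260),
# and the remaining entries are padding_character + 1 (261).
```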
ELMoTokenCharactersIndexer#
@TokenIndexer.register("elmo_characters")
class ELMoTokenCharactersIndexer(TokenIndexer):
| def __init__(
| self,
| namespace: str = "elmo_characters",
| tokens_to_add: Dict[str, int] = None,
| token_min_padding_length: int = 0
| ) -> None
Convert a token to an array of character ids to compute ELMo representations.
Registered as a TokenIndexer with name "elmo_characters".
Parameters

- namespace : str, optional (default = "elmo_characters")
- tokens_to_add : Dict[str, int], optional (default = None)
  If not None, provides a mapping of special tokens to character ids. When using
  pre-trained models, each character id must be less than 261, and we recommend
  using unused ids (e.g. 1-32).
- token_min_padding_length : int, optional (default = 0)
  See TokenIndexer.
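A construction sketch; since the class is registered as "elmo_characters", a configuration file can select it with {"type": "elmo_characters"}:

```python
from allennlp.data.token_indexers import ELMoTokenCharactersIndexer

# All arguments are optional; these are the documented defaults.
indexer = ELMoTokenCharactersIndexer(
    namespace="elmo_characters",
    tokens_to_add=None,
    token_min_padding_length=0,
)
```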
count_vocab_items#
class ELMoTokenCharactersIndexer(TokenIndexer):
| ...
| @overrides
| def count_vocab_items(
| self,
| token: Token,
| counter: Dict[str, Dict[str, int]]
| )
get_empty_token_list#
class ELMoTokenCharactersIndexer(TokenIndexer):
| ...
| @overrides
| def get_empty_token_list(self) -> IndexedTokenList
tokens_to_indices#
class ELMoTokenCharactersIndexer(TokenIndexer):
| ...
| @overrides
| def tokens_to_indices(
| self,
| tokens: List[Token],
| vocabulary: Vocabulary
| ) -> Dict[str, List[List[int]]]
See also: https://github.com/allenai/allennlp/blob/master/allennlp/data/token_indexers/wordpiece_indexer.py#L113
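A sketch of calling this method; the output key is assumed here to be "elmo_tokens", and the vocabulary argument goes unused because character ids are computed directly from the token text:

```python
from allennlp.data.token_indexers import ELMoTokenCharactersIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary

indexer = ELMoTokenCharactersIndexer()
indexed = indexer.tokens_to_indices([Token("hello"), Token("world")], Vocabulary())
# indexed["elmo_tokens"] (key assumed): two lists of 50 character ids each.
```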
as_padded_tensor_dict#
class ELMoTokenCharactersIndexer(TokenIndexer):
| ...
| @overrides
| def as_padded_tensor_dict(
| self,
| tokens: IndexedTokenList,
| padding_lengths: Dict[str, int]
| ) -> Dict[str, torch.Tensor]
Overriding this method only because we need a different padding token than the default.
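A padding sketch continuing the example above; the padding-length key is assumed to match the output key, and a fully padded row is all zeros because character ids are shifted by +1:

```python
padded = indexer.as_padded_tensor_dict(indexed, padding_lengths={"elmo_tokens": 4})
# padded["elmo_tokens"] (key assumed) is a LongTensor of shape (4, 50). The
# last two rows are the all-zero padding token, i.e. the "different padding
# token than the default" that motivates this override.
```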