elmo_indexer

allennlp.data.token_indexers.elmo_indexer


ELMoCharacterMapper Objects

class ELMoCharacterMapper():
 | def __init__(self, tokens_to_add: Dict[str, int] = None) -> None

Maps individual tokens to sequences of character ids, compatible with ELMo. To be consistent with previously trained models, we include it here as a special case of the existing character indexers.

Optional additional special tokens with designated character ids can be added via tokens_to_add.
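
As a quick illustration, a minimal usage sketch (the "<SPECIAL>" token and its id are hypothetical, chosen from the unused low id range):

    from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper

    # Default mapper, using the standard ELMo character encoding.
    mapper = ELMoCharacterMapper()

    # Mapper with a hypothetical extra special token assigned to an
    # otherwise unused character id ("<SPECIAL>" and 1 are illustrative only).
    custom_mapper = ELMoCharacterMapper(tokens_to_add={"<SPECIAL>": 1})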

max_word_length

max_word_length = 50

beginning_of_sentence_character

beginning_of_sentence_character = 256

end_of_sentence_character

end_of_sentence_character = 257

beginning_of_word_character

beginning_of_word_character = 258

end_of_word_character

end_of_word_character = 259

padding_character

padding_character = 260

beginning_of_sentence_characters

beginning_of_sentence_characters = _make_bos_eos(
        beginning_of_sentence_character,
        padding_character,
        beginning_of_word_character,
        end_of_word_character,
        max_word_length,
)

end_of_sentence_characters

end_of_sentence_characters = _make_bos_eos(
        end_of_sentence_character,
        padding_character,
        beginning_of_word_character,
        end_of_word_character,
        max_word_length,
)

bos_token

bos_token = "<S>"

eos_token

eos_token = "</S>"

convert_word_to_char_ids

 | def convert_word_to_char_ids(self, word: str) -> List[int]
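
A short worked sketch of the encoding, assuming the standard AllenNLP layout (begin-of-word marker, the word's UTF-8 bytes, end-of-word marker, then padding to max_word_length) and the +1 shift the mapper applies so that id 0 stays free for padding:

    from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper

    mapper = ELMoCharacterMapper()
    char_ids = mapper.convert_word_to_char_ids("hi")

    # The word is laid out as: begin-of-word, its UTF-8 bytes, end-of-word,
    # then padding out to max_word_length, for 50 entries in total.
    assert len(char_ids) == ELMoCharacterMapper.max_word_length
    # All returned ids are shifted by +1 relative to the class constants,
    # so begin-of-word (258) appears as 259 and 'h' (byte 104) as 105.
    assert char_ids[0] == ELMoCharacterMapper.beginning_of_word_character + 1
    assert char_ids[1] == ord("h") + 1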

ELMoTokenCharactersIndexer Objects

class ELMoTokenCharactersIndexer(TokenIndexer):
 | def __init__(
 |     self,
 |     namespace: str = "elmo_characters",
 |     tokens_to_add: Dict[str, int] = None,
 |     token_min_padding_length: int = 0
 | ) -> None

Convert a token to an array of character ids to compute ELMo representations.

Registered as a TokenIndexer with name "elmo_characters".

Parameters

  • namespace : str, optional (default = "elmo_characters")
  • tokens_to_add : Dict[str, int], optional (default = None)
    If not None, provides a mapping of special tokens to character ids. When using pre-trained models, the character ids must be less than 261, and we recommend using unused ids (e.g. 1-32).
  • token_min_padding_length : int, optional (default = 0)
    See TokenIndexer.
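
For illustration, a minimal construction sketch based on the signature above (the "<REL>" and "<OBJ>" tokens and their ids are hypothetical):

    from allennlp.data.token_indexers import ELMoTokenCharactersIndexer

    # Default indexer, registered under the name "elmo_characters".
    indexer = ELMoTokenCharactersIndexer()

    # With hypothetical special tokens ("<REL>"/"<OBJ>" are illustrative only)
    # mapped onto unused character ids below 261.
    custom_indexer = ELMoTokenCharactersIndexer(tokens_to_add={"<REL>": 1, "<OBJ>": 2})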

count_vocab_items

 | @overrides
 | def count_vocab_items(
 |     self,
 |     token: Token,
 |     counter: Dict[str, Dict[str, int]]
 | )

get_empty_token_list

 | @overrides
 | def get_empty_token_list(self) -> IndexedTokenList

tokens_to_indices

 | @overrides
 | def tokens_to_indices(
 |     self,
 |     tokens: List[Token],
 |     vocabulary: Vocabulary
 | ) -> Dict[str, List[List[int]]]

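A usage sketch, assuming the output is keyed by "elmo_tokens" as in allennlp 1.x:

    from allennlp.data import Token, Vocabulary
    from allennlp.data.token_indexers import ELMoTokenCharactersIndexer

    indexer = ELMoTokenCharactersIndexer()
    vocab = Vocabulary()  # not consulted: character ids need no learned vocabulary

    indexed = indexer.tokens_to_indices([Token("The"), Token("cat")], vocab)
    # One 50-element character-id list per input token.
    assert len(indexed["elmo_tokens"]) == 2
    assert all(len(ids) == 50 for ids in indexed["elmo_tokens"])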

as_padded_tensor_dict

 | @overrides
 | def as_padded_tensor_dict(
 |     self,
 |     tokens: IndexedTokenList,
 |     padding_lengths: Dict[str, int]
 | ) -> Dict[str, torch.Tensor]

Overriding this method only because we need a different padding token than the default.
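
A sketch of that difference, again assuming the "elmo_tokens" key: each padded position becomes a full max_word_length row of zeros rather than a scalar 0:

    import torch
    from allennlp.data import Token, Vocabulary
    from allennlp.data.token_indexers import ELMoTokenCharactersIndexer

    indexer = ELMoTokenCharactersIndexer()
    indexed = indexer.tokens_to_indices([Token("The"), Token("cat")], Vocabulary())

    # Pad the 2-token sequence out to 4 tokens.
    padded = indexer.as_padded_tensor_dict(indexed, padding_lengths={"elmo_tokens": 4})
    assert padded["elmo_tokens"].shape == (4, 50)
    # Each padded position is a full 50-element row of zeros, not a scalar 0;
    # this is the "different padding token" referred to above.
    assert torch.equal(padded["elmo_tokens"][2], torch.zeros(50, dtype=torch.long))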