elmo_indexer
allennlp.data.token_indexers.elmo_indexer
ELMoCharacterMapper#
class ELMoCharacterMapper:
| def __init__(self, tokens_to_add: Dict[str, int] = None) -> None
Maps individual tokens to sequences of character ids, compatible with ELMo. To be consistent with previously trained models, we include it here as a special case of existing character indexers.
Optional additional special tokens with designated character ids can be added via tokens_to_add.
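A minimal construction sketch (the import path follows the module path above; the special token name and id here are illustrative, not part of the API):

```python
from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper

# Map a hypothetical special token "<REL>" to the otherwise unused character
# id 1; every other word is encoded from its UTF-8 bytes.
mapper = ELMoCharacterMapper(tokens_to_add={"<REL>": 1})
```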
max_word_length#
class ELMoCharacterMapper:
| ...
| max_word_length = 50
beginning_of_sentence_character#
class ELMoCharacterMapper:
| ...
| beginning_of_sentence_character = 256
end_of_sentence_character#
class ELMoCharacterMapper:
| ...
| end_of_sentence_character = 257
beginning_of_word_character#
class ELMoCharacterMapper:
| ...
| beginning_of_word_character = 258
end_of_word_character#
class ELMoCharacterMapper:
| ...
| end_of_word_character = 259
padding_character#
class ELMoCharacterMapper:
| ...
| padding_character = 260
beginning_of_sentence_characters#
class ELMoCharacterMapper:
| ...
| beginning_of_sentence_characters = _make_bos_eos(
|     beginning_of_sentence_character,
|     padding_character,
|     beginning_of_word_character,
|     end_of_word_character,
|     max_word_length,
| )
end_of_sentence_characters#
class ELMoCharacterMapper:
| ...
| end_of_sentence_characters = _make_bos_eos(
|     end_of_sentence_character,
|     padding_character,
|     beginning_of_word_character,
|     end_of_word_character,
|     max_word_length,
| )
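For reference, _make_bos_eos is a private helper in this module. The assertions below sketch the shape of its output, inferred from the constants above: a max_word_length list framed by the word-boundary markers, with the sentence marker in position 1 and padding everywhere else.

```python
from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper

bos = ELMoCharacterMapper.beginning_of_sentence_characters
assert len(bos) == ELMoCharacterMapper.max_word_length                # 50
assert bos[0] == ELMoCharacterMapper.beginning_of_word_character      # 258
assert bos[1] == ELMoCharacterMapper.beginning_of_sentence_character  # 256
assert bos[2] == ELMoCharacterMapper.end_of_word_character            # 259
assert all(c == ELMoCharacterMapper.padding_character for c in bos[3:])
```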
bos_token#
class ELMoCharacterMapper:
| ...
| bos_token = "<S>"
eos_token#
class ELMoCharacterMapper:
| ...
| eos_token = "</S>"
convert_word_to_char_ids#
class ELMoCharacterMapper:
| ...
| def convert_word_to_char_ids(self, word: str) -> List[int]
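A usage sketch; the ids in the comments assume the library's convention of shifting every character id by +1 so that 0 stays free as a padding value:

```python
from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper

mapper = ELMoCharacterMapper()
char_ids = mapper.convert_word_to_char_ids("hi")

assert len(char_ids) == ELMoCharacterMapper.max_word_length  # always 50
# char_ids[0] is beginning_of_word_character + 1 (259),
# char_ids[1:3] are the UTF-8 bytes of "hi" shifted by +1 (105, 106),
# char_ids[3] is end_of_word_character + 1 (260),
# and the remaining entries are padding_character + 1 (261).
```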
ELMoTokenCharactersIndexer#
@TokenIndexer.register("elmo_characters")
class ELMoTokenCharactersIndexer(TokenIndexer):
| def __init__(
| self,
| namespace: str = "elmo_characters",
| tokens_to_add: Dict[str, int] = None,
| token_min_padding_length: int = 0
| ) -> None
Convert a token to an array of character ids to compute ELMo representations.
Registered as a TokenIndexer with name "elmo_characters".
Parameters

- namespace : str, optional (default = "elmo_characters")
- tokens_to_add : Dict[str, int], optional (default = None)
  If not None, provides a mapping of special tokens to character ids. When using
  pre-trained models, each character id must be less than 261, and we recommend
  using unused ids (e.g. 1-32).
- token_min_padding_length : int, optional (default = 0)
  See TokenIndexer.
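A construction sketch; since the class is registered as "elmo_characters", a configuration file can select it with {"type": "elmo_characters"}:

```python
from allennlp.data.token_indexers import ELMoTokenCharactersIndexer

# All arguments are optional; these are the documented defaults.
indexer = ELMoTokenCharactersIndexer(
    namespace="elmo_characters",
    tokens_to_add=None,
    token_min_padding_length=0,
)
```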
count_vocab_items#
class ELMoTokenCharactersIndexer(TokenIndexer):
| ...
| @overrides
| def count_vocab_items(
| self,
| token: Token,
| counter: Dict[str, Dict[str, int]]
| )
get_empty_token_list#
class ELMoTokenCharactersIndexer(TokenIndexer):
| ...
| @overrides
| def get_empty_token_list(self) -> IndexedTokenList
tokens_to_indices#
class ELMoTokenCharactersIndexer(TokenIndexer):
| ...
| @overrides
| def tokens_to_indices(
| self,
| tokens: List[Token],
| vocabulary: Vocabulary
| ) -> Dict[str, List[List[int]]]
See also: https://github.com/allenai/allennlp/blob/master/allennlp/data/token_indexers/wordpiece_indexer.py#L113
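A sketch of calling this method; the output key is assumed here to be "elmo_tokens", and the vocabulary argument goes unused because character ids are computed directly from the token text:

```python
from allennlp.data.token_indexers import ELMoTokenCharactersIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary

indexer = ELMoTokenCharactersIndexer()
indexed = indexer.tokens_to_indices([Token("hello"), Token("world")], Vocabulary())
# indexed["elmo_tokens"] (key assumed): two lists of 50 character ids each.
```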
as_padded_tensor_dict#
class ELMoTokenCharactersIndexer(TokenIndexer):
| ...
| @overrides
| def as_padded_tensor_dict(
| self,
| tokens: IndexedTokenList,
| padding_lengths: Dict[str, int]
| ) -> Dict[str, torch.Tensor]
Overriding this method only because we need a different padding token than the default.
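A padding sketch continuing the example above; the padding-length key is assumed to match the output key, and a fully padded row is all zeros because character ids are shifted by +1:

```python
padded = indexer.as_padded_tensor_dict(indexed, padding_lengths={"elmo_tokens": 4})
# padded["elmo_tokens"] (key assumed) is a LongTensor of shape (4, 50). The
# last two rows are the all-zero padding token, i.e. the "different padding
# token than the default" that motivates this override.
```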