Skip to content

elmo_indexer

allennlp.data.token_indexers.elmo_indexer

[SOURCE]


ELMoCharacterMapper

class ELMoCharacterMapper:
 | def __init__(self, tokens_to_add: Dict[str, int] = None) -> None

Maps individual tokens to sequences of character ids, compatible with ELMo. To be consistent with previously trained models, we include it here as a special case of existing character indexers.

Optional additional special tokens, each with a designated character id, can be added via tokens_to_add.

max_word_length

class ELMoCharacterMapper:
 | ...
 | max_word_length = 50

beginning_of_sentence_character

class ELMoCharacterMapper:
 | ...
 | beginning_of_sentence_character = 256

end_of_sentence_character

class ELMoCharacterMapper:
 | ...
 | end_of_sentence_character = 257

beginning_of_word_character

class ELMoCharacterMapper:
 | ...
 | beginning_of_word_character = 258

end_of_word_character

class ELMoCharacterMapper:
 | ...
 | end_of_word_character = 259

padding_character

class ELMoCharacterMapper:
 | ...
 | padding_character = 260

beginning_of_sentence_characters

class ELMoCharacterMapper:
 | ...
 | beginning_of_sentence_characters = _make_bos_eos(
        beginning_of_sentence_character,
        padding_character,
        beginning ...

end_of_sentence_characters

class ELMoCharacterMapper:
 | ...
 | end_of_sentence_characters = _make_bos_eos(
        end_of_sentence_character,
        padding_character,
        beginning_of_wo ...

bos_token

class ELMoCharacterMapper:
 | ...
 | bos_token = "<S>"

eos_token

class ELMoCharacterMapper:
 | ...
 | eos_token = "</S>"

convert_word_to_char_ids

class ELMoCharacterMapper:
 | ...
 | def convert_word_to_char_ids(self, word: str) -> List[int]

ELMoTokenCharactersIndexer

@TokenIndexer.register("elmo_characters")
class ELMoTokenCharactersIndexer(TokenIndexer):
 | def __init__(
 |     self,
 |     namespace: str = "elmo_characters",
 |     tokens_to_add: Dict[str, int] = None,
 |     token_min_padding_length: int = 0
 | ) -> None

Convert a token to an array of character ids to compute ELMo representations.

Registered as a TokenIndexer with name "elmo_characters".

Parameters

  • namespace : str, optional (default = elmo_characters)
  • tokens_to_add : Dict[str, int], optional (default = None)
    If not None, provides a mapping of special tokens to character ids. When using pre-trained models, each character id must be less than 261, and we recommend using unused ids (e.g. 1-32).
  • token_min_padding_length : int, optional (default = 0)
    See TokenIndexer.

count_vocab_items

class ELMoTokenCharactersIndexer(TokenIndexer):
 | ...
 | def count_vocab_items(
 |     self,
 |     token: Token,
 |     counter: Dict[str, Dict[str, int]]
 | )

get_empty_token_list

class ELMoTokenCharactersIndexer(TokenIndexer):
 | ...
 | def get_empty_token_list(self) -> IndexedTokenList

tokens_to_indices

class ELMoTokenCharactersIndexer(TokenIndexer):
 | ...
 | def tokens_to_indices(
 |     self,
 |     tokens: List[Token],
 |     vocabulary: Vocabulary
 | ) -> Dict[str, List[List[int]]]

https://github.com/allenai/allennlp/blob/main/allennlp/data/token_indexers/wordpiece_indexer.py#L113

as_padded_tensor_dict

class ELMoTokenCharactersIndexer(TokenIndexer):
 | ...
 | def as_padded_tensor_dict(
 |     self,
 |     tokens: IndexedTokenList,
 |     padding_lengths: Dict[str, int]
 | ) -> Dict[str, torch.Tensor]

Overriding this method only because we need a different padding token than the default.