pretrained_transformer_mismatched_indexer

allennlp.data.token_indexers.pretrained_transformer_mismatched_indexer

PretrainedTransformerMismatchedIndexer#

@TokenIndexer.register("pretrained_transformer_mismatched")
class PretrainedTransformerMismatchedIndexer(TokenIndexer):
 | def __init__(
 |     self,
 |     model_name: str,
 |     namespace: str = "tags",
 |     max_length: int = None,
 |     tokenizer_kwargs: Optional[Dict[str, Any]] = None,
 |     **kwargs
 | ) -> None

Use this indexer when (for whatever reason) you are not using a corresponding PretrainedTransformerTokenizer on your input. We assume that you used a tokenizer that splits strings into words, while the transformer expects wordpieces as input. This indexer splits the words into wordpieces and flattens them out. You should use the corresponding PretrainedTransformerMismatchedEmbedder to embed these wordpieces and then pull out a single vector for each original word.

Registered as a TokenIndexer with name "pretrained_transformer_mismatched".

Parameters

model_name : str
The name of the transformers model to use.
namespace : str, optional (default = "tags")
We will add the tokens in the pytorch_transformer vocabulary to this vocabulary namespace. We use a somewhat confusing default value of tags so that we do not add padding or UNK tokens to this namespace, which would break on loading because we wouldn't find our default OOV token.
max_length : int, optional (default = None)
If positive, split the document into segments of this many tokens (including special tokens) before feeding them into the embedder. The embedder embeds these segments independently and concatenates the results to get the original document representation. Should be set to the same value as the max_length option on the PretrainedTransformerMismatchedEmbedder.
tokenizer_kwargs : Dict[str, Any], optional (default = None)
Dictionary with additional arguments for AutoTokenizer.from_pretrained.

count_vocab_items#

class PretrainedTransformerMismatchedIndexer(TokenIndexer):
 | ...
 | @overrides
 | def count_vocab_items(
 |     self,
 |     token: Token,
 |     counter: Dict[str, Dict[str, int]]
 | )

tokens_to_indices#

class PretrainedTransformerMismatchedIndexer(TokenIndexer):
 | ...
 | @overrides
 | def tokens_to_indices(
 |     self,
 |     tokens: List[Token],
 |     vocabulary: Vocabulary
 | ) -> IndexedTokenList

get_empty_token_list#

class PretrainedTransformerMismatchedIndexer(TokenIndexer):
 | ...
 | @overrides
 | def get_empty_token_list(self) -> IndexedTokenList

as_padded_tensor_dict#

class PretrainedTransformerMismatchedIndexer(TokenIndexer):
 | ...
 | @overrides
 | def as_padded_tensor_dict(
 |     self,
 |     tokens: IndexedTokenList,
 |     padding_lengths: Dict[str, int]
 | ) -> Dict[str, torch.Tensor]