spacy_indexer
allennlp.data.token_indexers.spacy_indexer
SpacyTokenIndexer#
@TokenIndexer.register("spacy")
class SpacyTokenIndexer(TokenIndexer):
| def __init__(
|     self,
|     hidden_dim: int = 96,
|     token_min_padding_length: int = 0
| ) -> None
This SpacyTokenIndexer represents tokens as word vectors from a spacy model. You might want to do this for two main reasons: easier integration with a spacy pipeline and no out-of-vocabulary tokens.
Registered as a TokenIndexer with name "spacy".
Parameters

- hidden_dim : int, optional (default = 96)
  The dimension of the vectors that spacy generates for representing words.
- token_min_padding_length : int, optional (default = 0)
  See TokenIndexer.
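As a quick usage sketch, the indexer is typically paired with a SpacyTokenizer that keeps the underlying spacy tokens, since only those carry word vectors. The sentence below is illustrative, and the snippet assumes the default en_core_web_sm model is installed (its 96-dimensional vectors match the default hidden_dim):

from allennlp.data.token_indexers import SpacyTokenIndexer
from allennlp.data.tokenizers import SpacyTokenizer

# keep_spacy_tokens=True makes the tokenizer return the spacy Token
# objects themselves, which carry the word vectors this indexer reads.
tokenizer = SpacyTokenizer(keep_spacy_tokens=True)
indexer = SpacyTokenIndexer(hidden_dim=96)

tokens = tokenizer.tokenize("This is a sentence.")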
count_vocab_items#
class SpacyTokenIndexer(TokenIndexer):
| ...
| @overrides
| def count_vocab_items(
|     self,
|     token: Token,
|     counter: Dict[str, Dict[str, int]]
| )
We are using spacy to generate embeddings directly for our model, so we don't need to capture the vocab; it is defined by the spacy model we are using instead.
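Since the vocabulary is fixed by the spacy model, the override can simply do nothing. A minimal sketch of such a no-op body, inside the class above (the library's actual source may differ):

@overrides
def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
    # Nothing to count: which tokens can be represented is determined
    # by the spacy model, not by an allennlp Vocabulary.
    pass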
tokens_to_indices#
class SpacyTokenIndexer(TokenIndexer):
| ...
| @overrides
| def tokens_to_indices(
|     self,
|     tokens: List[SpacyToken],
|     vocabulary: Vocabulary
| ) -> Dict[str, List[numpy.ndarray]]
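The original page carries no description for this method. Given the class description, the "indices" it produces are the tokens' word vectors rather than integer ids. Below is a hedged sketch of that logic, inside the class above (not necessarily the library's exact code); note it can only work on genuine spacy tokens, hence the keep_spacy_tokens pairing shown earlier:

@overrides
def tokens_to_indices(
    self, tokens: List[SpacyToken], vocabulary: Vocabulary
) -> Dict[str, List[numpy.ndarray]]:
    # Only genuine spacy tokens expose the .vector attribute we need.
    if not all(isinstance(token, SpacyToken) for token in tokens):
        raise ValueError("The spacy indexer requires SpacyTokens.")
    # Each "index" is a word vector, not an integer id.
    return {"tokens": [token.vector for token in tokens]}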
as_padded_tensor_dict#
class SpacyTokenIndexer(TokenIndexer):
| ...
| @overrides
| def as_padded_tensor_dict(
|     self,
|     tokens: IndexedTokenList,
|     padding_lengths: Dict[str, int]
| ) -> Dict[str, torch.Tensor]
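Putting the pieces together, here is a self-contained sketch, under the same en_core_web_sm assumption as above, that indexes one sentence and pads it into a float tensor:

from allennlp.data import Vocabulary
from allennlp.data.token_indexers import SpacyTokenIndexer
from allennlp.data.tokenizers import SpacyTokenizer

tokenizer = SpacyTokenizer(keep_spacy_tokens=True)
indexer = SpacyTokenIndexer(hidden_dim=96)

tokens = tokenizer.tokenize("A short sentence.")
indexed = indexer.tokens_to_indices(tokens, Vocabulary())

# Convert the list of per-token vectors into a single FloatTensor,
# padding (with zero vectors) up to the requested length.
padding_lengths = indexer.get_padding_lengths(indexed)
tensors = indexer.as_padded_tensor_dict(indexed, padding_lengths)
# tensors["tokens"] has shape (num_tokens, hidden_dim).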