snli

allennlp_models.pair_classification.dataset_readers.snli

SnliReader#

@DatasetReader.register("snli")
class SnliReader(DatasetReader):
 | def __init__(
 |     self,
 |     tokenizer: Optional[Tokenizer] = None,
 |     token_indexers: Dict[str, TokenIndexer] = None,
 |     combine_input_fields: Optional[bool] = None,
 |     **kwargs
 | ) -> None

Reads a file from the Stanford Natural Language Inference (SNLI) dataset. This data is formatted as JSON Lines (jsonl), with one JSON-formatted instance per line. The keys in the data are "gold_label", "sentence1", and "sentence2". We convert these keys into fields named "label", "premise", and "hypothesis", and add a metadata field containing the tokenized strings of the premise and hypothesis.

Registered as a DatasetReader with name "snli".

Parameters

tokenizer : Tokenizer, optional (default = SpacyTokenizer())
We use this Tokenizer for both the premise and the hypothesis. See Tokenizer.
token_indexers : Dict[str, TokenIndexer], optional (default = {"tokens": SingleIdTokenIndexer()})
We similarly use this for both the premise and the hypothesis. See TokenIndexer.
combine_input_fields : bool, optional (default = isinstance(tokenizer, PretrainedTransformerTokenizer))
If False, represent the premise and the hypothesis as separate fields in the instance. If True, tokenize them together using tokenizer.tokenize_sentence_pair() and provide a single tokens field in the instance.

text_to_instance#

class SnliReader(DatasetReader):
 | ...
 | @overrides
 | def text_to_instance(
 |     self,
 |     premise: str,
 |     hypothesis: str,
 |     label: str = None
 | ) -> Instance

apply_token_indexers#

class SnliReader(DatasetReader):
 | ...
 | @overrides
 | def apply_token_indexers(self, instance: Instance) -> Instance