squad

SquadReader#

class SquadReader(DatasetReader):
 | def __init__(
 |     self,
 |     tokenizer: Tokenizer = None,
 |     token_indexers: Dict[str, TokenIndexer] = None,
 |     passage_length_limit: int = None,
 |     question_length_limit: int = None,
 |     skip_invalid_examples: bool = False,
 |     **kwargs
 | ) -> None

Reads a JSON-formatted SQuAD file and returns a Dataset where the Instances have four fields: question (a TextField), passage (another TextField), and span_start and span_end (both IndexFields into the passage TextField). We also add a MetadataField that stores the instance's ID, the original passage text, gold answer strings, and token offsets into the original passage, accessible as metadata['id'], metadata['original_passage'], metadata['answer_texts'] and metadata['token_offsets']. This is so that we can more easily use the official SQuAD evaluation script to get metrics.

We also support limiting the maximum length of both the passage and the question. However, some gold answer spans may exceed the maximum passage length, which would cause an error when creating instances. We simply skip these spans to avoid errors. If all of the gold answer spans of an example are skipped, then during training we skip the example. During validation or testing, since we cannot skip examples, we use the last token as a pseudo gold answer span instead. The computed loss will not be accurate as a result, but this does not affect the answer evaluation, because we keep all of the original gold answer texts.

Parameters

tokenizer : Tokenizer, optional (default = SpacyTokenizer())
We use this Tokenizer for both the question and the passage. See Tokenizer. Default is SpacyTokenizer().
token_indexers : Dict[str, TokenIndexer], optional
We similarly use this for both the question and the passage. See TokenIndexer. Default is {"tokens": SingleIdTokenIndexer()}.
passage_length_limit : int, optional (default = None)
If specified, we will truncate the passage when its length exceeds this limit.
question_length_limit : int, optional (default = None)
If specified, we will truncate the question when its length exceeds this limit.
skip_invalid_examples : bool, optional (default = False)
If True, we will skip invalid examples, i.e. examples for which all gold answer spans were skipped because they exceed the passage length limit.

text_to_instance#

class SquadReader(DatasetReader):
 | ...
 | @overrides
 | def text_to_instance(
 |     self,
 |     question_text: str,
 |     passage_text: str,
 |     char_spans: List[Tuple[int, int]] = None,
 |     answer_texts: List[str] = None,
 |     passage_tokens: List[Token] = None,
 |     additional_metadata: Dict[str, Any] = None
 | ) -> Optional[Instance]