quac
allennlp_models.rc.dataset_readers.quac
QuACReader
@DatasetReader.register("quac")
class QuACReader(DatasetReader):
| def __init__(
| self,
| tokenizer: Tokenizer = None,
| token_indexers: Dict[str, TokenIndexer] = None,
| num_context_answers: int = 0,
| **kwargs
| ) -> None
Reads a JSON-formatted Question Answering in Context (QuAC) data file
and returns a Dataset where the Instances have four fields: question, a ListField;
passage, a TextField; and span_start and span_end, both ListFields composed of
IndexFields into the passage TextField.
Two ListFields composed of LabelFields, yesno_list and followup_list, are also added.
We also add a MetadataField that stores the instance's ID, the original passage text, gold answer strings,
and token offsets into the original passage, accessible as metadata['id'],
metadata['original_passage'], metadata['answer_text_lists'], and metadata['token_offsets'].
Parameters
- tokenizer : Tokenizer, optional (default = SpacyTokenizer())
  We use this Tokenizer for both the question and the passage. See Tokenizer.
  Default is SpacyTokenizer().
- token_indexers : Dict[str, TokenIndexer], optional
  We similarly use this for both the question and the passage. See TokenIndexer.
  Default is {"tokens": SingleIdTokenIndexer()}.
- num_context_answers : int, optional (default = 0)
  How many previous question answers to consider in a context.
text_to_instance
class QuACReader(DatasetReader):
| ...
| @overrides
| def text_to_instance(
| self,
| question_text_list: List[str],
| passage_text: str,
| start_span_list: List[List[int]] = None,
| end_span_list: List[List[int]] = None,
| passage_tokens: List[Token] = None,
| yesno_list: List[int] = None,
| followup_list: List[int] = None,
| additional_metadata: Dict[str, Any] = None
| ) -> Instance
We need to convert character indices in passage_text to token indices in
passage_tokens, as the latter is what we'll actually use for supervision.