cnn_dm
allennlp_models.generation.dataset_readers.cnn_dm
CNNDailyMailDatasetReader#
@DatasetReader.register("cnn_dm")
class CNNDailyMailDatasetReader(DatasetReader):
| def __init__(
| self,
| source_tokenizer: Tokenizer = None,
| target_tokenizer: Tokenizer = None,
| source_token_indexers: Dict[str, TokenIndexer] = None,
| target_token_indexers: Dict[str, TokenIndexer] = None,
| source_max_tokens: Optional[int] = None,
| target_max_tokens: Optional[int] = None,
| source_prefix: Optional[str] = None,
| **kwargs
| ) -> None
Reads the CNN/DailyMail dataset for text summarization.
The output of `read` is a list of `Instance`s with the fields:
    source_tokens : `TextField`
    target_tokens : `TextField`
Parameters
- source_tokenizer : `Tokenizer`, optional
  Tokenizer to use to split the input sequences into words or other kinds of tokens. Defaults to `SpacyTokenizer()`.
- target_tokenizer :
  `Tokenizer`, optional
  Tokenizer to use to split the output sequences (during training) into words or other kinds of tokens. Defaults to `source_tokenizer`.
- source_token_indexers :
  `Dict[str, TokenIndexer]`, optional
  Indexers used to define input (source side) token representations. Defaults to `{"tokens": SingleIdTokenIndexer()}`.
- target_token_indexers :
  `Dict[str, TokenIndexer]`, optional
  Indexers used to define output (target side) token representations. Defaults to `source_token_indexers`.
- source_max_tokens :
  `int`, optional
  Maximum number of tokens in the source sequence.
- target_max_tokens :
  `int`, optional
  Maximum number of tokens in the target sequence.
- source_prefix :
  `str`, optional
  An optional prefix to prepend to source strings. For example, with a T5 model, you want to set the `source_prefix` to `"summarize: "`.
text_to_instance#
class CNNDailyMailDatasetReader(DatasetReader):
| ...
| def text_to_instance(
| self,
| source_sequence: str,
| target_sequence: str = None
| ) -> Instance
apply_token_indexers#
class CNNDailyMailDatasetReader(DatasetReader):
| ...
| def apply_token_indexers(self, instance: Instance) -> None