Skip to content

cnn_dm

allennlp_models.generation.dataset_readers.cnn_dm

[SOURCE]


CNNDailyMailDatasetReader#

@DatasetReader.register("cnn_dm")
class CNNDailyMailDatasetReader(DatasetReader):
 | def __init__(
 |     self,
 |     source_tokenizer: Tokenizer = None,
 |     target_tokenizer: Tokenizer = None,
 |     source_token_indexers: Dict[str, TokenIndexer] = None,
 |     target_token_indexers: Dict[str, TokenIndexer] = None,
 |     source_max_tokens: Optional[int] = None,
 |     target_max_tokens: Optional[int] = None,
 |     source_prefix: Optional[str] = None,
 |     **kwargs
 | ) -> None

Reads the CNN/DailyMail dataset for text summarization.

The output of read is a list of Instance objects, each with the fields source_tokens: TextField and target_tokens: TextField.

Parameters

  • source_tokenizer : Tokenizer, optional
    Tokenizer to use to split the input sequences into words or other kinds of tokens. Defaults to SpacyTokenizer().
  • target_tokenizer : Tokenizer, optional
    Tokenizer to use to split the output sequences (during training) into words or other kinds of tokens. Defaults to source_tokenizer.
  • source_token_indexers : Dict[str, TokenIndexer], optional
    Indexers used to define input (source side) token representations. Defaults to {"tokens": SingleIdTokenIndexer()}.
  • target_token_indexers : Dict[str, TokenIndexer], optional
    Indexers used to define output (target side) token representations. Defaults to source_token_indexers.
  • source_max_tokens : int, optional
    Maximum number of tokens in the source sequence.
  • target_max_tokens : int, optional
    Maximum number of tokens in the target sequence.
  • source_prefix : str, optional
    An optional prefix to prepend to source strings. For example, with a T5 model, set source_prefix to "summarize: ".

text_to_instance#

class CNNDailyMailDatasetReader(DatasetReader):
 | ...
 | def text_to_instance(
 |     self,
 |     source_sequence: str,
 |     target_sequence: str = None
 | ) -> Instance

apply_token_indexers#

class CNNDailyMailDatasetReader(DatasetReader):
 | ...
 | def apply_token_indexers(self, instance: Instance) -> None