seq2seq

allennlp_models.generation.dataset_readers.seq2seq

Seq2SeqDatasetReader#

@DatasetReader.register("seq2seq")
class Seq2SeqDatasetReader(DatasetReader):
 | def __init__(
 |     self,
 |     source_tokenizer: Tokenizer = None,
 |     target_tokenizer: Tokenizer = None,
 |     source_token_indexers: Dict[str, TokenIndexer] = None,
 |     target_token_indexers: Dict[str, TokenIndexer] = None,
 |     source_add_start_token: bool = True,
 |     source_add_end_token: bool = True,
 |     target_add_start_token: bool = True,
 |     target_add_end_token: bool = True,
 |     start_symbol: str = START_SYMBOL,
 |     end_symbol: str = END_SYMBOL,
 |     delimiter: str = "\t",
 |     source_max_tokens: Optional[int] = None,
 |     target_max_tokens: Optional[int] = None,
 |     quoting: int = csv.QUOTE_MINIMAL,
 |     **kwargs
 | ) -> None

Read a tsv file containing paired sequences, and create a dataset suitable for a ComposedSeq2Seq model, or any model with a matching API.

Expected format for each input line: <source_sequence_string>\t<target_sequence_string>

The output of read is a list of Instances with two fields: source_tokens : TextField and target_tokens : TextField.

By default, START_SYMBOL and END_SYMBOL tokens are added to both the source and target sequences; this is controlled by the *_add_start_token and *_add_end_token parameters below.
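For example, reading a small tab-separated file with the default settings (a minimal sketch; the file name and its contents are hypothetical):

from allennlp_models.generation.dataset_readers.seq2seq import Seq2SeqDatasetReader

# parallel.tsv is a hypothetical file with one pair per line, e.g.:
#   guten morgen<TAB>good morning
reader = Seq2SeqDatasetReader()
for instance in reader.read("parallel.tsv"):
    # Each instance holds a source_tokens and a target_tokens TextField.
    print(instance.fields["source_tokens"].tokens)
    print(instance.fields["target_tokens"].tokens)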

Parameters

  • source_tokenizer : Tokenizer, optional
    Tokenizer to use to split the input sequences into words or other kinds of tokens. Defaults to SpacyTokenizer().
  • target_tokenizer : Tokenizer, optional
    Tokenizer to use to split the output sequences (during training) into words or other kinds of tokens. Defaults to source_tokenizer.
  • source_token_indexers : Dict[str, TokenIndexer], optional
    Indexers used to define input (source side) token representations. Defaults to {"tokens": SingleIdTokenIndexer()}.
  • target_token_indexers : Dict[str, TokenIndexer], optional
    Indexers used to define output (target side) token representations. Defaults to source_token_indexers.
  • source_add_start_token : bool, optional (default = True)
    Whether or not to add start_symbol to the beginning of the source sequence.
  • source_add_end_token : bool, optional (default = True)
    Whether or not to add end_symbol to the end of the source sequence.
  • target_add_start_token : bool, optional (default = True)
    Whether or not to add start_symbol to the beginning of the target sequence.
  • target_add_end_token : bool, optional (default = True)
    Whether or not to add end_symbol to the end of the target sequence.
  • start_symbol : str, optional (default = START_SYMBOL)
    The special token to add to the beginning of the source or target sequence if source_add_start_token or target_add_start_token, respectively, is set.
  • end_symbol : str, optional (default = END_SYMBOL)
    The special token to add to the end of the source or target sequence if source_add_end_token or target_add_end_token, respectively, is set.
  • delimiter : str, optional (default = "\t")
    Delimiter to use for the tsv/csv file.
  • source_max_tokens : int, optional (default = None)
    If set, source sequences are truncated to this many tokens after tokenization.
  • target_max_tokens : int, optional (default = None)
    If set, target sequences are truncated to this many tokens after tokenization.
  • quoting : int, optional (default = csv.QUOTE_MINIMAL)
    Quoting to use for the csv reader.
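The reader can also be constructed in code with non-default settings. A sketch using a whitespace tokenizer, separate vocabulary namespaces for source and target, and length limits (the namespace names and limits here are illustrative, not defaults):

from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WhitespaceTokenizer
from allennlp_models.generation.dataset_readers.seq2seq import Seq2SeqDatasetReader

reader = Seq2SeqDatasetReader(
    source_tokenizer=WhitespaceTokenizer(),
    target_tokenizer=WhitespaceTokenizer(),
    source_token_indexers={"tokens": SingleIdTokenIndexer(namespace="source_tokens")},
    target_token_indexers={"tokens": SingleIdTokenIndexer(namespace="target_tokens")},
    source_max_tokens=100,  # truncate source sequences longer than 100 tokens
    target_max_tokens=100,  # truncate target sequences longer than 100 tokens
)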

text_to_instance#

class Seq2SeqDatasetReader(DatasetReader):
 | ...
 | def text_to_instance(
 |     self,
 |     source_string: str,
 |     target_string: str = None
 | ) -> Instance
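When target_string is omitted (as at prediction time), the returned Instance contains only the source_tokens field. A quick sketch with the default reader:

from allennlp_models.generation.dataset_readers.seq2seq import Seq2SeqDatasetReader

reader = Seq2SeqDatasetReader()

instance = reader.text_to_instance("all these sentences", "should get copied")
print(list(instance.fields))  # ['source_tokens', 'target_tokens']

instance = reader.text_to_instance("all these sentences")
print(list(instance.fields))  # ['source_tokens']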

apply_token_indexers#

class Seq2SeqDatasetReader(DatasetReader):
 | ...
 | def apply_token_indexers(self, instance: Instance) -> None
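This hook attaches the reader's token indexers to the instance's TextFields; AllenNLP's data loaders call it on each instance just before indexing, so text_to_instance can build fields without indexers, which keeps instances cheap to create in worker processes. A sketch of the behavior, paraphrasing the source:

def apply_token_indexers(self, instance: Instance) -> None:
    # Attach the configured indexers to the already-built TextFields.
    instance.fields["source_tokens"]._token_indexers = self._source_token_indexers
    # target_tokens is absent when text_to_instance was called without a target.
    if "target_tokens" in instance.fields:
        instance.fields["target_tokens"]._token_indexers = self._target_token_indexers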