seq2seq
allennlp_models.generation.dataset_readers.seq2seq
Seq2SeqDatasetReader#
@DatasetReader.register("seq2seq")
class Seq2SeqDatasetReader(DatasetReader):
| def __init__(
| self,
| source_tokenizer: Tokenizer = None,
| target_tokenizer: Tokenizer = None,
| source_token_indexers: Dict[str, TokenIndexer] = None,
| target_token_indexers: Dict[str, TokenIndexer] = None,
| source_add_start_token: bool = True,
| source_add_end_token: bool = True,
| target_add_start_token: bool = True,
| target_add_end_token: bool = True,
| start_symbol: str = START_SYMBOL,
| end_symbol: str = END_SYMBOL,
| delimiter: str = "\t",
| source_max_tokens: Optional[int] = None,
| target_max_tokens: Optional[int] = None,
| quoting: int = csv.QUOTE_MINIMAL,
| **kwargs
| ) -> None
Read a tsv file containing paired sequences, and create a dataset suitable for a
ComposedSeq2Seq model, or any model with a matching API.
Expected format for each input line: &lt;source_sequence_string&gt;\t&lt;target_sequence_string&gt;
The output of read is a list of `Instance`s with the fields:
source_tokens : TextField and
target_tokens : TextField
START_SYMBOL and END_SYMBOL tokens are added to the source and target sequences.
Parameters¶
- source_tokenizer :
Tokenizer, optional
Tokenizer to use to split the input sequences into words or other kinds of tokens. Defaults to `SpacyTokenizer()`. - target_tokenizer :
Tokenizer, optional
Tokenizer to use to split the output sequences (during training) into words or other kinds of tokens. Defaults to `source_tokenizer`. - source_token_indexers :
Dict[str, TokenIndexer], optional
Indexers used to define input (source side) token representations. Defaults to `{"tokens": SingleIdTokenIndexer()}`. - target_token_indexers :
Dict[str, TokenIndexer], optional
Indexers used to define output (target side) token representations. Defaults to `source_token_indexers`. - source_add_start_token :
bool, optional (default = True)
Whether or not to add `start_symbol` to the beginning of the source sequence. - source_add_end_token :
bool, optional (default = True)
Whether or not to add `end_symbol` to the end of the source sequence. - target_add_start_token :
bool, optional (default = True)
Whether or not to add `start_symbol` to the beginning of the target sequence. - target_add_end_token :
bool, optional (default = True)
Whether or not to add `end_symbol` to the end of the target sequence. - start_symbol :
str, optional (default = START_SYMBOL)
The special token to add to the beginning of the source sequence or the target sequence if `source_add_start_token` or `target_add_start_token` respectively. - end_symbol :
str, optional (default = END_SYMBOL)
The special token to add to the end of the source sequence or the target sequence if `source_add_end_token` or `target_add_end_token` respectively. - delimiter :
str, optional (default ="\t")
Set delimiter for tsv/csv file. - quoting :
int, optional (default = csv.QUOTE_MINIMAL)
Quoting to use for csv reader.
text_to_instance#
class Seq2SeqDatasetReader(DatasetReader):
| ...
| def text_to_instance(
| self,
| source_string: str,
| target_string: str = None
| ) -> Instance
apply_token_indexers#
class Seq2SeqDatasetReader(DatasetReader):
| ...
| def apply_token_indexers(self, instance: Instance) -> None