seq2seq
allennlp_models.generation.dataset_readers.seq2seq
Seq2SeqDatasetReader
@DatasetReader.register("seq2seq")
class Seq2SeqDatasetReader(DatasetReader):
| def __init__(
| self,
| source_tokenizer: Tokenizer = None,
| target_tokenizer: Tokenizer = None,
| source_token_indexers: Dict[str, TokenIndexer] = None,
| target_token_indexers: Dict[str, TokenIndexer] = None,
| source_add_start_token: bool = True,
| source_add_end_token: bool = True,
| target_add_start_token: bool = True,
| target_add_end_token: bool = True,
| start_symbol: str = START_SYMBOL,
| end_symbol: str = END_SYMBOL,
| delimiter: str = "\t",
| source_max_tokens: Optional[int] = None,
| target_max_tokens: Optional[int] = None,
| quoting: int = csv.QUOTE_MINIMAL,
| **kwargs
| ) -> None
Read a tsv file containing paired sequences, and create a dataset suitable for a
ComposedSeq2Seq model, or any model with a matching API.

Expected format for each input line: <source_sequence_string>\t<target_sequence_string>

The output of read is a list of Instances with the fields:

    source_tokens : TextField
    target_tokens : TextField

START_SYMBOL and END_SYMBOL tokens are added to the source and target sequences.
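For example, a two-line input file (with invented contents, and a single tab separating the two columns) might look like:

take me to the store	bring mich zum laden
thank you	danke

Each line produces one Instance pairing the tokenized source sequence with the tokenized target sequence.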
Parameters

- source_tokenizer : Tokenizer, optional
  Tokenizer to use to split the input sequences into words or other kinds of tokens. Defaults to SpacyTokenizer().
- target_tokenizer : Tokenizer, optional
  Tokenizer to use to split the output sequences (during training) into words or other kinds of tokens. Defaults to source_tokenizer.
- source_token_indexers : Dict[str, TokenIndexer], optional
  Indexers used to define input (source side) token representations. Defaults to {"tokens": SingleIdTokenIndexer()}.
- target_token_indexers : Dict[str, TokenIndexer], optional
  Indexers used to define output (target side) token representations. Defaults to source_token_indexers.
- source_add_start_token : bool, optional (default = True)
  Whether or not to add start_symbol to the beginning of the source sequence.
- source_add_end_token : bool, optional (default = True)
  Whether or not to add end_symbol to the end of the source sequence.
- target_add_start_token : bool, optional (default = True)
  Whether or not to add start_symbol to the beginning of the target sequence.
- target_add_end_token : bool, optional (default = True)
  Whether or not to add end_symbol to the end of the target sequence.
- start_symbol : str, optional (default = START_SYMBOL)
  The special token to add to the beginning of the source or target sequence if source_add_start_token or target_add_start_token is set, respectively.
- end_symbol : str, optional (default = END_SYMBOL)
  The special token to add to the end of the source or target sequence if source_add_end_token or target_add_end_token is set, respectively.
- delimiter : str, optional (default = "\t")
  Delimiter for the tsv/csv file.
- source_max_tokens : int, optional
  Maximum number of tokens in a source sequence; longer sequences are truncated.
- target_max_tokens : int, optional
  Maximum number of tokens in a target sequence; longer sequences are truncated.
- quoting : int, optional (default = csv.QUOTE_MINIMAL)
  Quoting to use for the csv reader.
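As a usage illustration, here is a minimal Python sketch that constructs the reader and iterates over a tsv file. The path train.tsv and the token limits are invented for illustration; the tokenizer and indexers shown are the documented defaults, written out explicitly.

from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import SpacyTokenizer
from allennlp_models.generation.dataset_readers.seq2seq import Seq2SeqDatasetReader

# Construct the reader with default-equivalent tokenizer and indexers.
reader = Seq2SeqDatasetReader(
    source_tokenizer=SpacyTokenizer(),
    source_token_indexers={"tokens": SingleIdTokenIndexer()},
    source_max_tokens=254,  # hypothetical truncation limits
    target_max_tokens=254,
)

# read() yields one Instance per tsv line.
for instance in reader.read("train.tsv"):
    print(instance.fields["source_tokens"])
    print(instance.fields["target_tokens"])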
text_to_instance
class Seq2SeqDatasetReader(DatasetReader):
| ...
| def text_to_instance(
| self,
| source_string: str,
| target_string: str = None
| ) -> Instance
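A sketch of calling text_to_instance directly, reusing the reader from the example above (the strings are made up). When target_string is omitted, the returned Instance contains only the source_tokens field.

# Build a single Instance without going through a file.
instance = reader.text_to_instance(
    source_string="take me to the store",
    target_string="bring mich zum laden",
)
print(instance.fields.keys())  # dict_keys(['source_tokens', 'target_tokens'])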
apply_token_indexers
class Seq2SeqDatasetReader(DatasetReader):
| ...
| def apply_token_indexers(self, instance: Instance) -> None
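Token indexers are attached to the TextFields lazily, which keeps freshly created Instances cheap to pickle across data-loader workers. The data loader normally calls apply_token_indexers for you; a sketch of doing it by hand, again reusing the reader from above:

# Attach the configured source/target token indexers to the Instance's fields.
instance = reader.text_to_instance("a source sentence", "a target sentence")
reader.apply_token_indexers(instance)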