text_classification_json

allennlp.data.dataset_readers.text_classification_json

TextClassificationJsonReader#

@DatasetReader.register("text_classification_json")
class TextClassificationJsonReader(DatasetReader):
 | def __init__(
 |     self,
 |     token_indexers: Dict[str, TokenIndexer] = None,
 |     tokenizer: Tokenizer = None,
 |     segment_sentences: bool = False,
 |     max_sequence_length: int = None,
 |     skip_label_indexing: bool = False,
 |     **kwargs
 | ) -> None

Reads tokens and their labels from a labeled text classification dataset. Expects a "text" field and a "label" field in JSON format.

The output of read is a list of Instances with the fields: tokens : TextField and label : LabelField

Registered as a DatasetReader with name "text_classification_json".

Parameters

token_indexers : Dict[str, TokenIndexer], optional (default = {"tokens": SingleIdTokenIndexer()})
We use this to define the input representation for the text. See TokenIndexer.
tokenizer : Tokenizer, optional (default = SpacyTokenizer())
Tokenizer to use to split the input text into words or other kinds of tokens.
segment_sentences : bool, optional (default = False)
If True, we will first segment the text into sentences using SpaCy and then tokenize words. Necessary for some models that require pre-segmentation of sentences, like the Hierarchical Attention Network.
max_sequence_length : int, optional (default = None)
If specified, will truncate tokens to specified maximum length.
skip_label_indexing : bool, optional (default = False)
Whether or not to skip label indexing. You might want to skip label indexing if your labels are numbers, so the dataset reader doesn't re-number them starting from 0.

text_to_instance#

class TextClassificationJsonReader(DatasetReader):
 | ...
 | @overrides
 | def text_to_instance(
 |     self,
 |     text: str,
 |     label: Union[str, int] = None
 | ) -> Instance

Parameters

text : str
The text to classify
label : Union[str, int], optional (default = None)
The label for this text.

Returns

An Instance containing the following fields:
- tokens (TextField) : The tokens in the sentence or phrase.
- label (LabelField) : The label of the sentence or phrase.