allennlp.training.trainer

Trainer

class Trainer(Registrable):
 | def __init__(
 |     self,
 |     serialization_dir: str = None,
 |     cuda_device: Optional[Union[int, torch.device]] = None,
 |     distributed: bool = False,
 |     local_rank: int = 0,
 |     world_size: int = 1
 | ) -> None

The base class for an AllenNLP trainer. It places few constraints on what training looks like: your subclass should implement train, and probably also from_params so it can be constructed from a configuration file.
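
As a minimal sketch (the class name, registered name, and returned metrics below are hypothetical), a custom subclass might look like this:

from contextlib import contextmanager
from typing import Any, Dict, Iterator, Tuple

from allennlp.training.trainer import Trainer


@Trainer.register("my_trainer")  # hypothetical name, for use from configuration files
class MyTrainer(Trainer):
    def __init__(self, serialization_dir: str = None) -> None:
        super().__init__(serialization_dir=serialization_dir)

    def train(self) -> Dict[str, Any]:
        # Run your own training loop here and return the final metrics.
        return {"training_loss": 0.0}

    @contextmanager
    def get_checkpoint_state(self) -> Iterator[Tuple[Dict[str, Any], Dict[str, Any]]]:
        # (model state, training state) for a Checkpointer to serialize.
        yield {}, {}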

default_implementation

class Trainer(Registrable):
 | ...
 | default_implementation = "gradient_descent"

train

class Trainer(Registrable):
 | ...
 | def train(self) -> Dict[str, Any]

Train a model and return the results.

get_checkpoint_state

class Trainer(Registrable):
 | ...
 | @contextmanager
 | def get_checkpoint_state(
 |     self
 | ) -> Iterator[Tuple[Dict[str, Any], Dict[str, Any]]]

Returns a tuple of (model state, training state), where training state could have several internal components (e.g., for an optimizer, learning rate scheduler, etc.).

This is a context manager, and should be called as with trainer.get_checkpoint_state() as state:, so that the trainer has the opportunity to change and restore its internal state for checkpointing. This is used, e.g., for moving averages of model weights.
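
For example (a sketch, assuming trainer is an already-constructed Trainer instance):

with trainer.get_checkpoint_state() as state:
    model_state, training_state = state
    # Both dicts can now be serialized (e.g. with torch.save) while the
    # trainer has temporarily swapped in whatever weights it wants
    # checkpointed, such as moving averages of the parameters.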

TrainerCallback

class TrainerCallback(Registrable):
 | def __init__(self, serialization_dir: str) -> None

A general callback object that handles multiple events.

This class has on_batch, on_epoch, and on_end methods, corresponding to each callback type. Each one receives the state of the wrapper object as self. This enables easier state sharing between related callbacks.

Also, this callback type is instantiated with serialization_dir, and on_start is called with the trainer instance as an argument. This can be handy if a callback wants to log or save its own files next to the config, checkpoints, and logs.
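
A minimal sketch of a custom callback (the registered name, hook bodies, and output file name are hypothetical; it assumes the base constructor stores serialization_dir on self, as described above):

import json
import os
from typing import Any, Dict

from allennlp.training.trainer import GradientDescentTrainer, TrainerCallback


@TrainerCallback.register("log_epoch_metrics")  # hypothetical name
class LogEpochMetricsCallback(TrainerCallback):
    def on_start(self, trainer: "GradientDescentTrainer", is_primary: bool = True, **kwargs) -> None:
        super().on_start(trainer, is_primary=is_primary, **kwargs)
        self._history = []

    def on_epoch(self, trainer: "GradientDescentTrainer", metrics: Dict[str, Any], epoch: int,
                 is_primary: bool = True, **kwargs) -> None:
        if is_primary:
            self._history.append({"epoch": epoch, **metrics})

    def on_end(self, trainer: "GradientDescentTrainer", metrics: Dict[str, Any] = None,
               epoch: int = None, is_primary: bool = True, **kwargs) -> None:
        if is_primary:
            # Write the collected metrics next to the other training outputs.
            with open(os.path.join(self.serialization_dir, "epoch_metrics.json"), "w") as f:
                json.dump(self._history, f, indent=2)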

on_start

class TrainerCallback(Registrable):
 | ...
 | def on_start(
 |     self,
 |     trainer: "GradientDescentTrainer",
 |     is_primary: bool = True,
 |     **kwargs
 | ) -> None

This callback hook is called before the training is started.

on_batch

class TrainerCallback(Registrable):
 | ...
 | def on_batch(
 |     self,
 |     trainer: "GradientDescentTrainer",
 |     batch_inputs: List[List[TensorDict]],
 |     batch_outputs: List[Dict[str, Any]],
 |     batch_metrics: Dict[str, Any],
 |     epoch: int,
 |     batch_number: int,
 |     is_training: bool,
 |     is_primary: bool = True,
 |     batch_grad_norm: Optional[float] = None,
 |     **kwargs
 | ) -> None

This callback hook is called after the end of each batch.

on_epoch

class TrainerCallback(Registrable):
 | ...
 | def on_epoch(
 |     self,
 |     trainer: "GradientDescentTrainer",
 |     metrics: Dict[str, Any],
 |     epoch: int,
 |     is_primary: bool = True,
 |     **kwargs
 | ) -> None

This callback hook is called after the end of each epoch.

on_end

class TrainerCallback(Registrable):
 | ...
 | def on_end(
 |     self,
 |     trainer: "GradientDescentTrainer",
 |     metrics: Dict[str, Any] = None,
 |     epoch: int = None,
 |     is_primary: bool = True,
 |     **kwargs
 | ) -> None

This callback hook is called after the final training epoch.

TensorBoardCallback

@TrainerCallback.register("tensorboard")
class TensorBoardCallback(TrainerCallback):
 | def __init__(
 |     self,
 |     serialization_dir: str,
 |     tensorboard_writer: Lazy[TensorBoardWriter] = Lazy(TensorBoardWriter)
 | ) -> None

Log training statistics and metrics to TensorBoard using the TensorBoardWriter.
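
For example, constructing the callback directly in Python (a sketch; runs/my_model is a hypothetical output directory):

from allennlp.training.trainer import TensorBoardCallback

# Uses the default Lazy(TensorBoardWriter) since no writer is given.
callbacks = [TensorBoardCallback(serialization_dir="runs/my_model")]
# Pass this list as `callbacks` when constructing the GradientDescentTrainer.

From a configuration file, the same effect is achieved by adding an entry with type "tensorboard" to the trainer's callbacks list.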

on_start

class TensorBoardCallback(TrainerCallback):
 | ...
 | def on_start(
 |     self,
 |     trainer: "GradientDescentTrainer",
 |     is_primary: bool = True,
 |     **kwargs
 | ) -> None

on_batch

class TensorBoardCallback(TrainerCallback):
 | ...
 | def on_batch(
 |     self,
 |     trainer: "GradientDescentTrainer",
 |     batch_inputs: List[List[TensorDict]],
 |     batch_outputs: List[Dict[str, Any]],
 |     batch_metrics: Dict[str, Any],
 |     epoch: int,
 |     batch_number: int,
 |     is_training: bool,
 |     is_primary: bool = True,
 |     batch_grad_norm: Optional[float] = None,
 |     **kwargs
 | ) -> None

In the distributed case we need to call this from every worker, since every worker reports its own memory usage.

on_epoch

class TensorBoardCallback(TrainerCallback):
 | ...
 | def on_epoch(
 |     self,
 |     trainer: "GradientDescentTrainer",
 |     metrics: Dict[str, Any],
 |     epoch: int,
 |     is_primary: bool = True,
 |     **kwargs
 | ) -> None

on_end

class TensorBoardCallback(TrainerCallback):
 | ...
 | def on_end(
 |     self,
 |     trainer: "GradientDescentTrainer",
 |     metrics: Dict[str, Any] = None,
 |     epoch: int = None,
 |     is_primary: bool = True,
 |     **kwargs
 | ) -> None

TrackEpochCallback

@TrainerCallback.register("track_epoch_callback")
class TrackEpochCallback(TrainerCallback)

A callback that you can pass to the GradientDescentTrainer to access the current epoch number in your model during training. This callback sets model.epoch, which can be read inside model.forward(). The value is set to epoch + 1, so it denotes the number of completed epochs at a given point in training.
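
A sketch of how a model might read this counter (the model, its inputs, and the loss-weighting scheme are hypothetical):

from typing import Dict, Optional

import torch

from allennlp.data import Vocabulary
from allennlp.models import Model


class EpochAwareModel(Model):  # hypothetical example model
    def __init__(self, vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self._classifier = torch.nn.Linear(10, 2)

    def forward(self, features: torch.Tensor,
                label: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        # Set by TrackEpochCallback: the number of completed epochs so far.
        completed_epochs = getattr(self, "epoch", 0)
        logits = self._classifier(features)
        output = {"logits": logits}
        if label is not None:
            loss = torch.nn.functional.cross_entropy(logits, label)
            # Hypothetical use: down-weight the loss as more epochs complete.
            output["loss"] = loss / (1.0 + 0.1 * completed_epochs)
        return output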

on_start

class TrackEpochCallback(TrainerCallback):
 | ...
 | def on_start(
 |     self,
 |     trainer: "GradientDescentTrainer",
 |     is_primary: bool = True,
 |     **kwargs
 | ) -> None

on_epoch

class TrackEpochCallback(TrainerCallback):
 | ...
 | def on_epoch(
 |     self,
 |     trainer: "GradientDescentTrainer",
 |     metrics: Dict[str, Any],
 |     epoch: int,
 |     is_primary: bool = True,
 |     **kwargs
 | ) -> None

GradientDescentTrainer

@Trainer.register("gradient_descent", constructor="from_partial_objects")
class GradientDescentTrainer(Trainer):
 | def __init__(
 |     self,
 |     model: Model,
 |     optimizer: torch.optim.Optimizer,
 |     data_loader: DataLoader,
 |     patience: Optional[int] = None,
 |     validation_metric: Union[str, List[str]] = "-loss",
 |     validation_data_loader: DataLoader = None,
 |     num_epochs: int = 20,
 |     serialization_dir: Optional[str] = None,
 |     checkpointer: Checkpointer = None,
 |     cuda_device: Optional[Union[int, torch.device]] = None,
 |     grad_norm: Optional[float] = None,
 |     grad_clipping: Optional[float] = None,
 |     learning_rate_scheduler: Optional[LearningRateScheduler] = None,
 |     momentum_scheduler: Optional[MomentumScheduler] = None,
 |     moving_average: Optional[MovingAverage] = None,
 |     callbacks: List[TrainerCallback] = None,
 |     distributed: bool = False,
 |     local_rank: int = 0,
 |     world_size: int = 1,
 |     num_gradient_accumulation_steps: int = 1,
 |     use_amp: bool = False
 | ) -> None

A trainer for doing supervised learning with gradient descent. It just takes a labeled dataset and a DataLoader, and uses the supplied Optimizer to learn the weights for your model over some fixed number of epochs. You can also pass in a validation data_loader and enable early stopping. There are many other bells and whistles as well.

Registered as a Trainer with the name "gradient_descent" (and is also the default Trainer). The constructor that is registered is from_partial_objects - see the arguments to that function for the exact keys that should be used, if you are using a configuration file. They largely match the arguments to __init__, and we don't repeat their docstrings in from_partial_objects.

Parameters

  • model : Model
    An AllenNLP model to be optimized. Pytorch Modules can also be optimized if their forward method returns a dictionary with a "loss" key, containing a scalar tensor representing the loss function to be optimized.

    If you are training your model using GPUs, your model should already be on the correct device. (If you are using our train command this will be handled for you.)

    In a typical AllenNLP configuration file, this parameter does not get an entry under the "trainer", it gets constructed separately.

  • optimizer : torch.optim.Optimizer
    An instance of a Pytorch Optimizer, instantiated with the parameters of the model to be optimized.

  • data_loader : DataLoader
    A DataLoader containing your Dataset, yielding padded indexed batches.

    In a typical AllenNLP configuration file, this parameter does not get an entry under the "trainer", it gets constructed separately.

  • patience : Optional[int] > 0, optional (default = None)
    Number of epochs to be patient before early stopping: the training is stopped after patience epochs with no improvement. If given, it must be > 0. If None, early stopping is disabled.

  • validation_metric : Union[str, List[str]], optional (default = "-loss")
    Validation metric to measure for whether to stop training using patience and whether to serialize an is_best model each epoch. The metric name must be prepended with either "+" or "-", which specifies whether the metric is an increasing or decreasing function. If you specify more than one metric, the metrics will be summed to make the is_best decision.

  • validation_data_loader : DataLoader, optional (default = None)
    A DataLoader to use for the validation set. If None, then use the training DataLoader with the validation data.

    In a typical AllenNLP configuration file, this parameter does not get an entry under the "trainer", it gets constructed separately.

  • num_epochs : int, optional (default = 20)
    Number of training epochs.

  • serialization_dir : str, optional (default = None)
    Path to directory for saving and loading model files. Models will not be saved if this parameter is not passed.

    In a typical AllenNLP configuration file, this parameter does not get an entry under the "trainer", it gets constructed separately.

  • checkpointer : Checkpointer, optional (default = None)
    A Checkpointer is responsible for periodically saving model weights. If none is given here, we will construct one with default parameters.

  • cuda_device : Optional[Union[int, torch.device]], optional (default = None)
    An integer specifying the CUDA device to use for this process. If -1, the CPU is used. Data parallelism is controlled at the allennlp train level, so each trainer will have a single GPU.

  • grad_norm : float, optional (default = None)
    If provided, gradient norms will be rescaled to have a maximum of this value.

  • grad_clipping : float, optional (default = None)
    If provided, gradients will be clipped during the backward pass to have an (absolute) maximum of this value. If you are getting NaNs in your gradients during training that are not solved by using grad_norm, you may need this.

  • learning_rate_scheduler : LearningRateScheduler, optional (default = None)
    If specified, the learning rate will be decayed with respect to this schedule at the end of each epoch (or batch, if the scheduler implements the step_batch method). If you use torch.optim.lr_scheduler.ReduceLROnPlateau, this will use the validation_metric provided to determine if learning has plateaued. To support updating the learning rate on every batch, this can optionally implement step_batch(batch_num_total) which updates the learning rate given the batch number.

  • momentum_scheduler : MomentumScheduler, optional (default = None)
    If specified, the momentum will be updated at the end of each batch or epoch according to the schedule.

  • moving_average : MovingAverage, optional (default = None)
    If provided, we will maintain moving averages for all parameters. During training, we employ a shadow variable for each parameter, which maintains the moving average. During evaluation, we back up the original parameters and assign the moving averages to the corresponding parameters. Note that when saving a checkpoint, we save the moving averages of the parameters; this is necessary so that the saved model performs as well as the validated model if loaded later, but it may cause problems if you restart training from the checkpoint.

  • callbacks : List[TrainerCallback], optional (default = None)
    A list of callbacks that can be called at certain events: e.g. each batch, epoch, and at the start and end of training, etc.

  • distributed : bool, optional (default = False)
    If set, PyTorch's DistributedDataParallel is used to train the model in multiple GPUs. This also requires world_size to be greater than 1.

    In a typical AllenNLP configuration file, this parameter does not get an entry under the "trainer", it gets constructed separately (you need a top-level "distributed" key, next to the "trainer" entry, that specifies a list of "cuda_devices").

  • local_rank : int, optional (default = 0)
    This is the unique identifier of the Trainer in a distributed process group. The GPU device id is used as the rank.

    In a typical AllenNLP configuration file, this parameter does not get an entry under the "trainer", it gets constructed separately.

  • world_size : int, optional (default = 1)
    The number of Trainer workers participating in the distributed training.

    In a typical AllenNLP configuration file, this parameter does not get an entry under the "trainer", it gets constructed separately.

  • num_gradient_accumulation_steps : int, optional (default = 1)
    Gradients are accumulated for the given number of steps before doing an optimizer step. This can be useful to accommodate effective batch sizes that are too large to fit in memory. Refer to Thomas Wolf's post for details on gradient accumulation.

  • use_amp : bool, optional (default = False)
    If True, we'll train using Automatic Mixed Precision.
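
A sketch of constructing the trainer directly in Python (no configuration file), assuming model, train_loader, and validation_loader already exist and the model reports an "accuracy" metric; the directory name and hyperparameter values are illustrative:

from typing import Any, Dict

import torch

from allennlp.data import DataLoader
from allennlp.models import Model
from allennlp.training.trainer import GradientDescentTrainer


def build_and_run_trainer(model: Model, train_loader: DataLoader,
                          validation_loader: DataLoader) -> Dict[str, Any]:
    trainer = GradientDescentTrainer(
        model=model,
        optimizer=torch.optim.Adam(model.parameters(), lr=1e-3),
        data_loader=train_loader,
        validation_data_loader=validation_loader,
        validation_metric="+accuracy",      # "+" means higher is better
        patience=5,                         # early-stop after 5 epochs without improvement
        num_epochs=20,
        serialization_dir="runs/my_model",  # hypothetical output directory
        num_gradient_accumulation_steps=2,  # optimizer step every 2 batches
    )
    return trainer.train()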

rescale_gradients

class GradientDescentTrainer(Trainer):
 | ...
 | def rescale_gradients(self) -> float

Performs gradient rescaling (a no-op if gradient rescaling is not enabled) and returns the norm of the gradients.

batch_outputs

class GradientDescentTrainer(Trainer):
 | ...
 | def batch_outputs(
 |     self,
 |     batch: TensorDict,
 |     for_training: bool
 | ) -> Dict[str, torch.Tensor]

Does a forward pass on the given batch and returns the output dictionary that the model returns, after adding any specified regularization penalty to the loss (if training).
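
For example (a sketch; trainer is an existing GradientDescentTrainer and batch is a single TensorDict from its data loader):

import torch

with torch.no_grad():
    outputs = trainer.batch_outputs(batch, for_training=False)

# The dict is whatever the model's forward() returned (plus "loss" when the
# batch contains labels); for_training=True would also add any regularization
# penalty to the loss.
print({key: tuple(value.shape) for key, value in outputs.items()
       if isinstance(value, torch.Tensor)})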

train

class GradientDescentTrainer(Trainer):
 | ...
 | def train(self) -> Dict[str, Any]

Trains the supplied model with the supplied parameters.

get_checkpoint_state

class GradientDescentTrainer(Trainer):
 | ...
 | @contextmanager
 | def get_checkpoint_state(
 |     self
 | ) -> Iterator[Tuple[Dict[str, Any], Dict[str, Any]]]

from_partial_objects

class GradientDescentTrainer(Trainer):
 | ...
 | @classmethod
 | def from_partial_objects(
 |     cls,
 |     model: Model,
 |     serialization_dir: str,
 |     data_loader: DataLoader,
 |     validation_data_loader: DataLoader = None,
 |     local_rank: int = 0,
 |     patience: int = None,
 |     validation_metric: Union[str, List[str]] = "-loss",
 |     num_epochs: int = 20,
 |     cuda_device: Optional[Union[int, torch.device]] = None,
 |     grad_norm: float = None,
 |     grad_clipping: float = None,
 |     distributed: bool = False,
 |     world_size: int = 1,
 |     num_gradient_accumulation_steps: int = 1,
 |     use_amp: bool = False,
 |     no_grad: List[str] = None,
 |     optimizer: Lazy[Optimizer] = Lazy(Optimizer.default),
 |     learning_rate_scheduler: Lazy[LearningRateScheduler] = None,
 |     momentum_scheduler: Lazy[MomentumScheduler] = None,
 |     moving_average: Lazy[MovingAverage] = None,
 |     checkpointer: Lazy[Checkpointer] = Lazy(Checkpointer),
 |     callbacks: List[Lazy[TrainerCallback]] = None,
 |     trainer_callbacks: List[Lazy[TrainerCallback]] = None
 | ) -> "Trainer"

This method exists so that we can have a documented method to construct this class using FromParams. If you are not using FromParams or config files, you can safely ignore this method.

The reason we can't just use __init__ with FromParams here is because there are sequential dependencies to this class's arguments. Anything that has a Lazy[] type annotation needs something from one of the non-Lazy arguments. The Optimizer needs to have the parameters from the Model before it's constructed, and the Schedulers need to have the Optimizer. Because of this, the typical way we construct things FromParams doesn't work, so we use Lazy to allow for constructing the objects sequentially.

If you're not using FromParams, you can just construct these arguments in the right order yourself in your code and call the constructor directly.
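
A rough, simplified sketch of that sequential construction: the optimizer is built from the model's parameters, and the scheduler from that optimizer. Details such as no_grad handling and gradient accumulation are omitted, and the exact keyword names passed to construct are only assumed to match the real implementation:

from typing import Optional

from allennlp.common import Lazy
from allennlp.data import DataLoader
from allennlp.models import Model
from allennlp.training.learning_rate_schedulers import LearningRateScheduler
from allennlp.training.optimizers import Optimizer


def construct_in_order(model: Model,
                       data_loader: DataLoader,
                       optimizer: Lazy[Optimizer],
                       learning_rate_scheduler: Optional[Lazy[LearningRateScheduler]],
                       num_epochs: int):
    # 1. The optimizer needs the model's trainable parameters.
    parameters = [(name, param) for name, param in model.named_parameters()
                  if param.requires_grad]
    optimizer_ = optimizer.construct(model_parameters=parameters)

    # 2. The scheduler needs the constructed optimizer (and, for per-batch
    #    schedules, the number of steps per epoch; assumes the loader has a length).
    scheduler_ = None
    if learning_rate_scheduler is not None:
        scheduler_ = learning_rate_scheduler.construct(
            optimizer=optimizer_,
            num_epochs=num_epochs,
            num_steps_per_epoch=len(data_loader),
        )
    return optimizer_, scheduler_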