nn.optimizers
fn adagrad #
fn adagrad[T](config AdaGradOptimizerConfig) &AdaGradOptimizer[T]
adagrad creates a new AdaGradOptimizer.
fn adam_optimizer #
fn adam_optimizer[T](config AdamOptimizerConfig) &AdamOptimizer[T]
adam_optimizer creates a new AdamOptimizer with the given configuration.
Example
import vtl.nn.optimizers
mut opt := optimizers.adam_optimizer[f64](learning_rate: 0.001)
opt.build_params(model.layers())
// inside training loop:
opt.update()!
fn adamw #
fn adamw[T](config AdamWOptimizerConfig) &AdamWOptimizer[T]
adamw creates a new AdamWOptimizer.
fn cosine_annealing_lr #
fn cosine_annealing_lr[T](t_max int, lrd f64) &CosineAnnealingLR[T]
cosine_annealing_lr creates a CosineAnnealingLR scheduler: LR decays from current_lr to lrd following a cosine schedule over t_max steps.
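A minimal usage sketch; the t_max, lrd and starting LR values below are illustrative, not defaults:
import vtl.nn.optimizers
sched := optimizers.cosine_annealing_lr[f64](1000, 1e-5) // anneal over 1000 steps towards 1e-5
mut lr := 0.01
for step in 0 .. 1000 {
	lr = sched.next_lr(lr, step)
	// use lr for this step's parameter update
}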
fn exponential_lr #
fn exponential_lr[T](gamma f64) &ExponentialLR[T]
exponential_lr creates an ExponentialLR scheduler: LR decays by gamma at every step. new_lr = current_lr * gamma^step
fn reduce_lr_on_plateau #
fn reduce_lr_on_plateau[T](config ReduceLROnPlateauConfig) &ReduceLROnPlateau[T]
reduce_lr_on_plateau creates a ReduceLROnPlateau scheduler. It reduces the LR by factor when the monitored metric has not improved for patience steps. Pass metric_delta to next_lr at each step; a negative value means the metric improved.
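A minimal sketch of wiring this into a validation loop; the config values and the val_loss, prev_val_loss and step variables are placeholders from your own training code:
import vtl.nn.optimizers
mut sched := optimizers.reduce_lr_on_plateau[f64](factor: 0.5, patience: 5)
mut lr := 0.01
// after each validation pass:
delta := val_loss - prev_val_loss // negative = the metric improved
lr = sched.next_lr(lr, step, delta)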
fn rmsprop #
fn rmsprop[T](config RMSPropOptimizerConfig) &RMSPropOptimizer[T]
rmsprop creates a new RMSPropOptimizer.
fn sgd #
fn sgd[T](config SgdOptimizerConfig) &SgdOptimizer[T]
sgd creates a new SgdOptimizer.
fn step_lr #
fn step_lr[T](step_size int, gamma f64) &StepLR[T]
step_lr creates a StepLR scheduler: the LR decays by gamma every step_size steps. e.g. step_size=30, gamma=0.1 → the LR is multiplied by 0.1 every 30 steps.
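The calling pattern is the same as for the other per-step schedulers; a sketch, with step standing in for your iteration counter:
import vtl.nn.optimizers
sched := optimizers.step_lr[f64](30, 0.1)
mut lr := 0.1
// inside the training loop:
lr = sched.next_lr(lr, step) // multiplied by gamma once every 30 steps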
interface Scheduler #
interface Scheduler[T] {
next_lr(current_lr f64, step int) f64
}
Scheduler is the interface for learning rate schedulers. Implementations update the learning rate based on the current training step.
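Because V interfaces are satisfied structurally, any type with a matching next_lr method can be used as a Scheduler[T]. A hypothetical linear-warmup scheduler, as a sketch:
struct LinearWarmup[T] {
	warmup_steps int
	target_lr    f64
}

fn (s &LinearWarmup[T]) next_lr(current_lr f64, step int) f64 {
	if step >= s.warmup_steps {
		return s.target_lr
	}
	// ramp linearly from 0 up to target_lr over warmup_steps
	return s.target_lr * f64(step + 1) / f64(s.warmup_steps)
}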
fn (AdaGradOptimizer[T]) build_params #
fn (mut o AdaGradOptimizer[T]) build_params(layers []types.Layer[T])
build_params registers all trainable variables from layers. Call once before training.
fn (AdaGradOptimizer[T]) update #
fn (mut o AdaGradOptimizer[T]) update() !
update performs one AdaGrad parameter update and zeros all gradients.
fn (AdamOptimizer[T]) build_params #
fn (mut o AdamOptimizer[T]) build_params(layers []types.Layer[T])
build_params registers all trainable variables from layers into the optimizer. Call once after constructing the model, before the first update().
fn (AdamOptimizer[T]) update #
fn (mut o AdamOptimizer[T]) update() !
update performs one Adam parameter update step and zeros all gradients. Must be called after loss.backward().
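A sketch of where these calls sit in a training step; model, loss and epochs are placeholders for your own code, and only the optimizer calls come from this module:
import vtl.nn.optimizers
mut opt := optimizers.adam_optimizer[f64](learning_rate: 0.001)
opt.build_params(model.layers()) // once, before training
for _ in 0 .. epochs {
	// ... forward pass and loss computation (your own code) ...
	loss.backward() // populate gradients
	opt.update()! // Adam step, then all gradients are zeroed
}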
fn (AdamWOptimizer[T]) build_params #
fn (mut o AdamWOptimizer[T]) build_params(layers []types.Layer[T])
build_params registers all trainable variables from layers. Call once before training.
fn (AdamWOptimizer[T]) update #
fn (mut o AdamWOptimizer[T]) update() !
update performs one AdamW parameter update and zeros all gradients.
fn (CosineAnnealingLR[T]) next_lr #
fn (s &CosineAnnealingLR[T]) next_lr(current_lr f64, step int) f64
next_lr returns the cosine-annealed learning rate for the given step.
fn (ExponentialLR[T]) next_lr #
fn (s &ExponentialLR[T]) next_lr(current_lr f64, step int) f64
next_lr returns the learning rate decayed by gamma for the given step.
fn (RMSPropOptimizer[T]) build_params #
fn (mut o RMSPropOptimizer[T]) build_params(layers []types.Layer[T])
build_params registers all trainable variables from layers. Call once before training.
fn (RMSPropOptimizer[T]) update #
fn (mut o RMSPropOptimizer[T]) update() !
update performs one RMSProp parameter update and zeros all gradients.
fn (ReduceLROnPlateau[T]) next_lr #
fn (mut s ReduceLROnPlateau[T]) next_lr(current_lr f64, step int, metric_delta f64) f64
next_lr returns the (possibly reduced) learning rate; metric_delta is the change in the monitored metric since the last step (negative = improvement).
fn (SgdOptimizer[T]) build_params #
fn (mut o SgdOptimizer[T]) build_params(layers []types.Layer[T])
build_params registers all trainable variables from layers. Call once before training.
fn (SgdOptimizer[T]) update #
fn (mut o SgdOptimizer[T]) update() !
update performs one SGD parameter update and zeros all gradients.
fn (StepLR[T]) next_lr #
fn (s &StepLR[T]) next_lr(current_lr f64, step int) f64
next_lr returns the learning rate decayed by gamma once every step_size steps.
struct AdaGradOptimizer #
struct AdaGradOptimizer[T] {
learning_rate f64
epsilon f64
pub mut:
weight_decay f64
params []&autograd.Variable[T]
accumulated_sq_grads []&vtl.Tensor[T]
}
AdaGradOptimizer implements the AdaGrad (Adaptive Gradient) algorithm. Accumulates squared gradients and adapts the learning rate per parameter.
struct AdaGradOptimizerConfig #
struct AdaGradOptimizerConfig {
learning_rate f64 = 0.01
epsilon f64 = 1e-8
weight_decay f64 = 0.0
}
AdaGradOptimizerConfig configures AdaGradOptimizer.
struct AdamOptimizer #
struct AdamOptimizer[T] {
learning_rate f64
epsilon f64
pub mut:
beta1 f64
beta2 f64
beta1_t f64
beta2_t f64
params []&autograd.Variable[T]
first_moments []&vtl.Tensor[T]
second_moments []&vtl.Tensor[T]
}
AdamOptimizer implements the Adam optimiser (Adaptive Moment Estimation).
Maintains per-parameter first-moment (mean) and second-moment (uncentred variance) moving averages of the gradients, with bias correction applied at each step.
Update rule:
m = β₁·m + (1-β₁)·g
v = β₂·v + (1-β₂)·g²
θ = θ - lr · √(1-β₂ᵗ) / (1-β₁ᵗ) · m / (√v + ε)
Reference: Kingma & Ba, "Adam: A Method for Stochastic Optimization" (2014).
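A scalar sketch of the rule above, purely illustrative (the optimiser applies it element-wise to the registered tensors); the toy gradient is an assumption:
import math
beta1, beta2 := 0.9, 0.999
lr, eps := 0.001, 1e-8
mut theta := 1.0
mut m := 0.0
mut v := 0.0
for t := 1; t <= 10; t++ {
	g := 2.0 * theta // gradient of a toy loss θ², not part of the library
	m = beta1 * m + (1.0 - beta1) * g
	v = beta2 * v + (1.0 - beta2) * g * g
	tf := f64(t)
	theta -= lr * math.sqrt(1.0 - math.pow(beta2, tf)) / (1.0 - math.pow(beta1, tf)) * m / (math.sqrt(v) + eps)
}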
struct AdamOptimizerConfig #
struct AdamOptimizerConfig {
learning_rate f64 = 0.001
beta1 f64 = 0.9
beta2 f64 = 0.999
epsilon f64 = 1e-8
}
AdamOptimizerConfig configures AdamOptimizer.
Fields:
- learning_rate — step size α (default: 0.001)
- beta1 — exponential decay rate for first-moment estimates (default: 0.9)
- beta2 — exponential decay rate for second-moment estimates (default: 0.999)
- epsilon — small constant for numerical stability (default: 1e-8)
struct AdamWOptimizer #
struct AdamWOptimizer[T] {
learning_rate f64
epsilon f64
pub mut:
beta1 f64
beta2 f64
beta1_t f64
beta2_t f64
weight_decay f64
params []&autograd.Variable[T]
first_moments []&vtl.Tensor[T]
second_moments []&vtl.Tensor[T]
}
AdamWOptimizer implements AdamW (Adam with Decoupled Weight Decay).
Identical to Adam but weight decay is applied directly to the parameters (not through the gradient), which typically gives better generalisation.
Update rule (after bias correction): θ = θ - lr · (m̂ / (√v̂ + ε) + weight_decay · θ)
Reference: Loshchilov & Hutter, "Decoupled Weight Decay Regularization" (2017).
struct AdamWOptimizerConfig #
struct AdamWOptimizerConfig {
learning_rate f64 = 0.001
beta1 f64 = 0.9
beta2 f64 = 0.999
epsilon f64 = 1e-8
weight_decay f64 = 0.01
}
AdamWOptimizerConfig configures AdamWOptimizer.
Fields:
- learning_rate — step size (default: 0.001)
- beta1 — first-moment decay rate (default: 0.9)
- beta2 — second-moment decay rate (default: 0.999)
- epsilon — numerical stability constant (default: 1e-8)
- weight_decay — decoupled weight-decay coefficient λ (default: 0.01)
struct CosineAnnealingLR #
struct CosineAnnealingLR[T] {
pub:
t_max int // maximum number of iterations
lrd f64 // lower bound lr (default: 0)
}
CosineAnnealingLR decays the learning rate following a cosine schedule from its current value down to lrd over t_max steps.
struct ExponentialLR #
struct ExponentialLR[T] {
gamma f64
}
ExponentialLR decays the learning rate by gamma at every step.
struct RMSPropOptimizer #
struct RMSPropOptimizer[T] {
learning_rate f64
epsilon f64
pub mut:
alpha f64 // smoothing constant
weight_decay f64
params []&autograd.Variable[T]
sq_avg []&vtl.Tensor[T]
}
RMSPropOptimizer implements the RMSProp optimiser.
Maintains a running average of the squared gradient per parameter and normalises the update by it, allowing different effective learning rates per parameter.
Update rule:
sq_avg = α·sq_avg + (1-α)·g²
θ = θ - lr · (g / (√sq_avg + ε) + wd·θ)
Reference: Hinton, "Neural Networks for Machine Learning", Lecture 6e.
struct RMSPropOptimizerConfig #
struct RMSPropOptimizerConfig {
learning_rate f64 = 0.001
alpha f64 = 0.99
epsilon f64 = 1e-8
weight_decay f64 = 0.0
}
RMSPropOptimizerConfig configures RMSPropOptimizer.
Fields:
- learning_rate — step size (default: 0.001)
- alpha — smoothing constant for the squared-gradient moving average (default: 0.99)
- epsilon — numerical stability constant (default: 1e-8)
- weight_decay — L2 regularisation coefficient (default: 0.0)
struct ReduceLROnPlateau #
struct ReduceLROnPlateau[T] {
factor f64
patience int
threshold f64
epsilon f64
cooldown int
pub mut:
wait int
current_lr f64
}
ReduceLROnPlateau reduces LR when a metric has stopped improving.
struct ReduceLROnPlateauConfig #
struct ReduceLROnPlateauConfig {
factor f64 = 0.1
patience int = 10
threshold f64 = 1e-4
epsilon f64 = 1e-8
cooldown int
}
ReduceLROnPlateauConfig configures ReduceLROnPlateau.
struct SgdOptimizer #
struct SgdOptimizer[T] {
learning_rate f64
pub mut:
params []&autograd.Variable[T]
}
SgdOptimizer implements vanilla Stochastic Gradient Descent.
struct SgdOptimizerConfig #
struct SgdOptimizerConfig {
pub:
learning_rate f64 = 0.001
}
SgdOptimizerConfig configures SgdOptimizer.
Fields:
- learning_rate — step size α (default: 0.001)
struct StepLR #
struct StepLR[T] {
step_size int
gamma f64
}
StepLR decays the learning rate by gamma every step_size steps.