nn.optimizers

fn adagrad #

fn adagrad[T](config AdaGradOptimizerConfig) &AdaGradOptimizer[T]

adagrad creates a new AdaGradOptimizer.
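
Example (a minimal sketch, following the Adam example below; model.layers() is assumed to provide the model's layers as in that example):

import vtl.nn.optimizers
mut opt := optimizers.adagrad[f64](learning_rate: 0.01)
opt.build_params(model.layers())
// inside the training loop, after loss.backward():
opt.update()!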

fn adam_optimizer #

fn adam_optimizer[T](config AdamOptimizerConfig) &AdamOptimizer[T]

adam_optimizer creates a new AdamOptimizer with the given configuration.

Example

import vtl.nn.optimizers
opt := optimizers.adam_optimizer[f64](learning_rate: 0.001)
opt.build_params(model.layers())
// inside training loop:
opt.update()!

fn adamw #

fn adamw[T](config AdamWOptimizerConfig) &AdamWOptimizer[T]

adamw creates a new AdamWOptimizer.
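
Example (a minimal sketch, following the Adam example above; model.layers() is assumed to behave as in that example):

import vtl.nn.optimizers
mut opt := optimizers.adamw[f64](learning_rate: 0.001, weight_decay: 0.01)
opt.build_params(model.layers())
// inside the training loop, after loss.backward():
opt.update()!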

fn cosine_annealing_lr #

fn cosine_annealing_lr[T](t_max int, lrd f64) &CosineAnnealingLR[T]

cosine_annealing_lr creates a CosineAnnealingLR scheduler: LR decays from current_lr to lrd following a cosine schedule over t_max steps.
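
Example (a minimal sketch; the starting LR of 0.01 and the way the returned value is fed back into the optimizer are illustrative assumptions):

import vtl.nn.optimizers
sched := optimizers.cosine_annealing_lr[f64](100, 0.0)
mut current_lr := 0.01
for step in 0 .. 100 {
	// ... run one training step at current_lr ...
	current_lr = sched.next_lr(current_lr, step)
}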

fn exponential_lr #

fn exponential_lr[T](gamma f64) &ExponentialLR[T]

exponential_lr creates an ExponentialLR scheduler: LR decays by gamma at every step. new_lr = current_lr * gamma^step
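
Example (a minimal sketch; gamma = 0.95 and the starting LR are arbitrary illustrative values):

import vtl.nn.optimizers
sched := optimizers.exponential_lr[f64](0.95)
// per the formula above this returns current_lr * gamma^step
new_lr := sched.next_lr(0.01, 10)
println(new_lr)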

fn reduce_lr_on_plateau #

fn reduce_lr_on_plateau[T](config ReduceLROnPlateauConfig) &ReduceLROnPlateau[T]

reduce_lr_on_plateau creates a ReduceLROnPlateau scheduler. It reduces the LR by factor when the monitored metric stops improving for patience steps. Pass metric_delta to next_lr at each step; a negative value means the metric improved.
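
Example (a minimal sketch; current_lr, step, val_loss and prev_val_loss are hypothetical values tracked by the training loop):

import vtl.nn.optimizers
mut sched := optimizers.reduce_lr_on_plateau[f64](factor: 0.1, patience: 5)
// metric_delta = current metric - previous metric; negative means the metric improved
metric_delta := val_loss - prev_val_loss
current_lr = sched.next_lr(current_lr, step, metric_delta)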

fn rmsprop #

fn rmsprop[T](config RMSPropOptimizerConfig) &RMSPropOptimizer[T]

rmsprop creates a new RMSPropOptimizer.
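
Example (a minimal sketch, following the Adam example above; model.layers() is assumed to behave as in that example):

import vtl.nn.optimizers
mut opt := optimizers.rmsprop[f64](learning_rate: 0.001, alpha: 0.99)
opt.build_params(model.layers())
// inside the training loop, after loss.backward():
opt.update()!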

fn sgd #

fn sgd[T](config SgdOptimizerConfig) &SgdOptimizer[T]

sgd creates a new SgdOptimizer.
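
Example (a minimal sketch, following the Adam example above; model.layers() is assumed to behave as in that example):

import vtl.nn.optimizers
mut opt := optimizers.sgd[f64](learning_rate: 0.01)
opt.build_params(model.layers())
// inside the training loop, after loss.backward():
opt.update()!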

fn step_lr #

fn step_lr[T](step_size int, gamma f64) &StepLR[T]

step_lr creates a StepLR scheduler: LR decays by gamma every step_size steps. e.g. with step_size=30 and gamma=0.1, the LR becomes 10× smaller every 30 steps.
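
Example (a minimal sketch using the values from the note above; the starting LR is an arbitrary illustrative value):

import vtl.nn.optimizers
sched := optimizers.step_lr[f64](30, 0.1)
// per the rule above, the LR drops by a factor of gamma every 30 steps
new_lr := sched.next_lr(0.01, 30)
println(new_lr)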

interface Scheduler #

interface Scheduler[T] {
	next_lr(current_lr f64, step int) f64
}

Scheduler is the interface for learning rate schedulers. Implementations update the learning rate based on the current training step.
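
Any type with a matching next_lr method satisfies the interface. A minimal sketch (ConstantLR is a hypothetical type, not part of this module):

struct ConstantLR[T] {}

fn (s &ConstantLR[T]) next_lr(current_lr f64, step int) f64 {
	// keep the learning rate unchanged at every step
	return current_lr
}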

fn (AdaGradOptimizer[T]) build_params #

fn (mut o AdaGradOptimizer[T]) build_params(layers []types.Layer[T])

build_params registers all trainable variables from layers. Call once before training.

fn (AdaGradOptimizer[T]) update #

fn (mut o AdaGradOptimizer[T]) update() !

update performs one AdaGrad parameter update and zeros all gradients.

fn (AdamOptimizer[T]) build_params #

fn (mut o AdamOptimizer[T]) build_params(layers []types.Layer[T])

build_params registers all trainable variables from layers into the optimizer. Call once after constructing the model, before the first update().

fn (AdamOptimizer[T]) update #

fn (mut o AdamOptimizer[T]) update() !

update performs one Adam parameter update step and zeros all gradients. Must be called after loss.backward().

fn (AdamWOptimizer[T]) build_params #

fn (mut o AdamWOptimizer[T]) build_params(layers []types.Layer[T])

build_params registers all trainable variables from layers. Call once before training.

fn (AdamWOptimizer[T]) update #

fn (mut o AdamWOptimizer[T]) update() !

update performs one AdamW parameter update and zeros all gradients.

fn (CosineAnnealingLR[T]) next_lr #

fn (s &CosineAnnealingLR[T]) next_lr(current_lr f64, step int) f64

fn (ExponentialLR[T]) next_lr #

fn (s &ExponentialLR[T]) next_lr(current_lr f64, step int) f64

fn (RMSPropOptimizer[T]) build_params #

fn (mut o RMSPropOptimizer[T]) build_params(layers []types.Layer[T])

build_params registers all trainable variables from layers. Call once before training.

fn (RMSPropOptimizer[T]) update #

fn (mut o RMSPropOptimizer[T]) update() !

update performs one RMSProp parameter update and zeros all gradients.

fn (ReduceLROnPlateau[T]) next_lr #

fn (mut s ReduceLROnPlateau[T]) next_lr(current_lr f64, step int, metric_delta f64) f64

fn (SgdOptimizer[T]) build_params #

fn (mut o SgdOptimizer[T]) build_params(layers []types.Layer[T])

build_params registers all trainable variables from layers. Call once before training.

fn (SgdOptimizer[T]) update #

fn (mut o SgdOptimizer[T]) update() !

update performs one SGD parameter update and zeros all gradients.

fn (StepLR[T]) next_lr #

fn (s &StepLR[T]) next_lr(current_lr f64, step int) f64

struct AdaGradOptimizer #

struct AdaGradOptimizer[T] {
	learning_rate f64
	epsilon       f64
pub mut:
	weight_decay         f64
	params               []&autograd.Variable[T]
	accumulated_sq_grads []&vtl.Tensor[T]
}

AdaGradOptimizer implements the AdaGrad (Adaptive Gradient) algorithm. Accumulates squared gradients and adapts the learning rate per parameter.

struct AdaGradOptimizerConfig #

@[params]
struct AdaGradOptimizerConfig {
	learning_rate f64 = 0.01
	epsilon       f64 = 1e-8
	weight_decay  f64 = 0.0
}

struct AdamOptimizer #

struct AdamOptimizer[T] {
	learning_rate f64
	epsilon       f64
pub mut:
	beta1          f64
	beta2          f64
	beta1_t        f64
	beta2_t        f64
	params         []&autograd.Variable[T]
	first_moments  []&vtl.Tensor[T]
	second_moments []&vtl.Tensor[T]
}

AdamOptimizer implements the Adam optimiser (Adaptive Moment Estimation).

Maintains per-parameter first-moment (mean) and second-moment (uncentred variance) moving averages of the gradients, with bias correction applied at each step.

Update rule:
m = β₁·m + (1-β₁)·g
v = β₂·v + (1-β₂)·g²
θ = θ - lr · √(1-β₂ᵗ) / (1-β₁ᵗ) · m / (√v + ε)

Reference: Kingma & Ba, "Adam: A Method for Stochastic Optimization" (2014).
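
For a single scalar parameter the rule above can be written out directly; a standalone sketch with illustrative values (not the library's implementation):

import math

fn main() {
	lr := 0.001
	beta1 := 0.9
	beta2 := 0.999
	eps := 1e-8
	g := 0.1 // gradient of the loss w.r.t. theta at this step
	t := 1 // step counter, starting at 1
	mut theta := 0.5
	mut m := 0.0
	mut v := 0.0
	m = beta1 * m + (1.0 - beta1) * g
	v = beta2 * v + (1.0 - beta2) * g * g
	// bias correction folded into the step size, as in the update rule above
	alpha_t := lr * math.sqrt(1.0 - math.pow(beta2, f64(t))) / (1.0 - math.pow(beta1, f64(t)))
	theta -= alpha_t * m / (math.sqrt(v) + eps)
	println(theta)
}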

struct AdamOptimizerConfig #

@[params]
struct AdamOptimizerConfig {
	learning_rate f64 = 0.001
	beta1         f64 = 0.9
	beta2         f64 = 0.999
	epsilon       f64 = 1e-8
}

AdamOptimizerConfig configures AdamOptimizer.

Fields:

  • learning_rate — step size α (default: 0.001)
  • beta1 — exponential decay rate for first moment estimates (default: 0.9)
  • beta2 — exponential decay rate for second moment estimates (default: 0.999)
  • epsilon — small constant for numerical stability (default: 1e-8)

struct AdamWOptimizer #

struct AdamWOptimizer[T] {
	learning_rate f64
	epsilon       f64
pub mut:
	beta1          f64
	beta2          f64
	beta1_t        f64
	beta2_t        f64
	weight_decay   f64
	params         []&autograd.Variable[T]
	first_moments  []&vtl.Tensor[T]
	second_moments []&vtl.Tensor[T]
}

AdamWOptimizer implements AdamW (Adam with Decoupled Weight Decay).

Identical to Adam but weight decay is applied directly to the parameters (not through the gradient), which typically gives better generalisation.

Update rule (after bias correction): θ = θ - lr · (m̂ / (√v̂ + ε) + weight_decay · θ)

Reference: Loshchilov & Hutter, "Decoupled Weight Decay Regularization" (2017).
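
The decoupling can be seen in a scalar sketch of the step (illustrative only; m_hat and v_hat are the bias-corrected moments from the Adam rule):

import math

// decoupled weight decay: the decay term acts on the parameter directly,
// instead of being added to the gradient before the moment updates
fn adamw_step(theta f64, m_hat f64, v_hat f64, lr f64, eps f64, weight_decay f64) f64 {
	return theta - lr * (m_hat / (math.sqrt(v_hat) + eps) + weight_decay * theta)
}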

struct AdamWOptimizerConfig #

@[params]
struct AdamWOptimizerConfig {
	learning_rate f64 = 0.001
	beta1         f64 = 0.9
	beta2         f64 = 0.999
	epsilon       f64 = 1e-8
	weight_decay  f64 = 0.01
}

AdamWOptimizerConfig configures AdamWOptimizer.

Fields:

  • learning_rate — step size (default: 0.001)
  • beta1 — first-moment decay rate (default: 0.9)
  • beta2 — second-moment decay rate (default: 0.999)
  • epsilon — numerical stability constant (default: 1e-8)
  • weight_decay — decoupled weight-decay coefficient λ (default: 0.01)

struct CosineAnnealingLR #

struct CosineAnnealingLR[T] {
pub:
	t_max int // maximum number of iterations
	lrd   f64 // lower bound lr (default: 0)
}

CosineAnnealingLR decays the learning rate following a cosine schedule, from the current LR down to the lower bound lrd, over t_max steps.

struct ExponentialLR #

struct ExponentialLR[T] {
	gamma f64
}

ExponentialLR decays the learning rate by gamma at every step.

struct RMSPropOptimizer #

struct RMSPropOptimizer[T] {
	learning_rate f64
	epsilon       f64
pub mut:
	alpha        f64 // smoothing constant
	weight_decay f64
	params       []&autograd.Variable[T]
	sq_avg       []&vtl.Tensor[T]
}

RMSPropOptimizer implements the RMSProp optimiser.

Maintains a running average of the squared gradient per parameter and normalises the update by it, allowing different effective learning rates per parameter.

Update rule:
sq_avg = α·sq_avg + (1-α)·g²
θ = θ - lr · (g / (√sq_avg + ε) + wd·θ)

Reference: Hinton, "Neural Networks for Machine Learning", Lecture 6e.
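
A scalar sketch of the same rule (illustrative only, not the library's implementation):

import math

// returns the updated parameter and the updated running average of the squared gradient
fn rmsprop_step(theta f64, g f64, sq_avg f64, lr f64, alpha f64, eps f64, wd f64) (f64, f64) {
	new_sq_avg := alpha * sq_avg + (1.0 - alpha) * g * g
	new_theta := theta - lr * (g / (math.sqrt(new_sq_avg) + eps) + wd * theta)
	return new_theta, new_sq_avg
}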

struct RMSPropOptimizerConfig #

@[params]
struct RMSPropOptimizerConfig {
	learning_rate f64 = 0.001
	alpha         f64 = 0.99
	epsilon       f64 = 1e-8
	weight_decay  f64 = 0.0
}

RMSPropOptimizerConfig configures RMSPropOptimizer.

Fields:

  • learning_rate — step size (default: 0.001)
  • alpha — smoothing constant for squared-gradient moving average (default: 0.99)
  • epsilon — numerical stability constant (default: 1e-8)
  • weight_decay — L2 regularisation coefficient (default: 0.0)

struct ReduceLROnPlateau #

struct ReduceLROnPlateau[T] {
	factor    f64
	patience  int
	threshold f64
	epsilon   f64
	cooldown  int
pub mut:
	wait       int
	current_lr f64
}

ReduceLROnPlateau reduces LR when a metric has stopped improving.

struct ReduceLROnPlateauConfig #

@[params]
struct ReduceLROnPlateauConfig {
	factor    f64 = 0.1
	patience  int = 10
	threshold f64 = 1e-4
	epsilon   f64 = 1e-8
	cooldown  int
}

struct SgdOptimizer #

struct SgdOptimizer[T] {
	learning_rate f64
pub mut:
	params []&autograd.Variable[T]
}

SgdOptimizer implements vanilla Stochastic Gradient Descent.

struct SgdOptimizerConfig #

@[params]
struct SgdOptimizerConfig {
pub:
	learning_rate f64 = 0.001
}

SgdOptimizerConfig configures SgdOptimizer.

Fields:

  • learning_rate — step size α (default: 0.001)

struct StepLR #

struct StepLR[T] {
	step_size int
	gamma     f64
}

StepLR decays the learning rate by gamma every step_size steps.