nn.internal
fn avgpool2d_backward #
fn avgpool2d_backward[T](grad_out &vtl.Tensor[T], kernel []int, padding []int, stride []int) !&vtl.Tensor[T]
fn avgpool2d_forward #
fn avgpool2d_forward[T](input &vtl.Tensor[T], kernel []int, padding []int, stride []int) !&vtl.Tensor[T]
fn batchnorm1d_backward #
fn batchnorm1d_backward[T](gradient &vtl.Tensor[T], input &vtl.Tensor[T], gamma &vtl.Tensor[T], beta &vtl.Tensor[T], mean &vtl.Tensor[T], var_ &vtl.Tensor[T], eps f64) ![]&vtl.Tensor[T]
batchnorm1d_backward computes gradients w.r.t. input, gamma, beta.
fn batchnorm1d_forward #
fn batchnorm1d_forward[T](input &vtl.Tensor[T], gamma &vtl.Tensor[T], beta &vtl.Tensor[T], running_mean &vtl.Tensor[T], running_var &vtl.Tensor[T], eps f64) !&vtl.Tensor[T]
batchnorm1d_forward computes batch norm using running mean/var (inference path).
fn batchnorm1d_training #
fn batchnorm1d_training[T](input &vtl.Tensor[T], gamma &vtl.Tensor[T], beta &vtl.Tensor[T], eps f64) !(&vtl.Tensor[T], &vtl.Tensor[T], &vtl.Tensor[T])
batchnorm1d_training computes batch norm using batch stats (training path).
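A minimal sketch tying the two paths together, assuming the module is importable as `vtl.nn.internal`, that `vtl.from_array` builds a tensor from a flat array and a shape, and that the affine parameters have shape [1, features] (all assumptions; only the three signatures above are documented here):

```v
import vtl
import vtl.nn.internal

fn batchnorm_example() ! {
	// [batch=2, features=2]
	x := vtl.from_array([1.0, 2.0, 3.0, 4.0], [2, 2])!
	gamma := vtl.from_array([1.0, 1.0], [1, 2])!
	beta := vtl.from_array([0.0, 0.0], [1, 2])!
	// training path: normalise with batch statistics, which the caller
	// would fold into its running averages
	out_t, mean, var_ := internal.batchnorm1d_training(x, gamma, beta, 1e-5)!
	// inference path: normalise with the stored running statistics
	out_i := internal.batchnorm1d_forward(x, gamma, beta, mean, var_, 1e-5)!
	println(out_t)
	println(out_i)
}
```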
fn bce #
fn bce[T](input &vtl.Tensor[T], target &vtl.Tensor[T]) !&vtl.Tensor[T]
bce computes the binary cross entropy between input and target. input values should be in (0, 1); the caller is responsible for clamping or applying sigmoid.
fn bce_backward #
fn bce_backward[T](gradient &vtl.Tensor[T], input &vtl.Tensor[T], target &vtl.Tensor[T], from_logits bool) !&vtl.Tensor[T]
bce_backward computes the gradient of BCE. If from_logits is true, the gradient is taken w.r.t. the raw logits (before sigmoid): the upstream gradient is chained through the sigmoid derivative. Otherwise it is taken w.r.t. the probabilities directly.
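A usage sketch of the loss/gradient pair, assuming `vtl.from_array` as a constructor and the `vtl.nn.internal` import path (both assumptions; only the two signatures above are documented here):

```v
import vtl
import vtl.nn.internal

fn bce_example() ! {
	// predictions already squashed into (0, 1), as bce requires
	p := vtl.from_array([0.9, 0.2, 0.7], [3, 1])!
	y := vtl.from_array([1.0, 0.0, 1.0], [3, 1])!
	loss := internal.bce(p, y)!
	// upstream gradient of ones; from_logits = false because p holds
	// probabilities, not raw logits
	up := vtl.from_array([1.0, 1.0, 1.0], [3, 1])!
	grad := internal.bce_backward(up, p, y, false)!
	println(loss)
	println(grad)
}
```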
fn compute_fans #
fn compute_fans(shape []int) (int, int)
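compute_fans is undocumented here; the usual convention (an assumption about this implementation) is that a dense weight of shape [out, in] yields (fan_in, fan_out) = (in, out), and a conv weight [out_ch, in_ch, k_h, k_w] yields (in_ch * k_h * k_w, out_ch * k_h * k_w), i.e. the receptive-field size folds into both fans.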
fn conv2d_backward #
fn conv2d_backward[T](grad_out &vtl.Tensor[T],
input &vtl.Tensor[T],
weight &vtl.Tensor[T],
bias &vtl.Tensor[T],
kernel_size []int,
config Conv2DConfig) ![]&vtl.Tensor[T]
conv2d_backward computes gradients for input, weight, bias. Returns [d_input, d_weight, d_bias].
fn conv2d_forward #
fn conv2d_forward[T](input &vtl.Tensor[T],
weight &vtl.Tensor[T],
bias &vtl.Tensor[T],
kernel_size []int,
config Conv2DConfig) !&vtl.Tensor[T]
conv2d_forward implements the forward pass of 2D convolution.
input: [batch, in_ch, H, W]
weight: [out_ch, in_ch/groups, k_h, k_w]
bias: [1, out_ch]
config: padding, stride, dilation, groups
returns: [batch, out_ch, out_H, out_W]
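A shape-level sketch of a forward call, assuming the `vtl.nn.internal` import path, the `vtl.zeros`/`vtl.ones` constructors, and a public `shape` field on Tensor (all assumptions; only `conv2d_forward` and `Conv2DConfig` are documented here):

```v
import vtl
import vtl.nn.internal

fn conv_example() ! {
	// one 3-channel 8x8 image, four 3x3 filters
	input := vtl.zeros[f64]([1, 3, 8, 8])
	weight := vtl.ones[f64]([4, 3, 3, 3])
	bias := vtl.zeros[f64]([1, 4])
	cfg := internal.Conv2DConfig{
		padding: [1, 1]
	}
	out := internal.conv2d_forward(input, weight, bias, [3, 3], cfg)!
	// with stride 1, dilation 1 and padding 1, spatial size is preserved:
	// out_H = (8 + 2*1 - 1*(3-1) - 1) / 1 + 1 = 8
	assert out.shape == [1, 4, 8, 8]
}
```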
fn cross_entropy #
fn cross_entropy[T](input &vtl.Tensor[T], target &vtl.Tensor[T]) !&vtl.Tensor[T]
cross_entropy computes CrossEntropyLoss (LogSoftmax + NLL combined). This is the standard cross-entropy for multi-class classification.
input: [batch, n_classes] raw logits
target: [batch, n_classes] one-hot targets
fn cross_entropy_backward #
fn cross_entropy_backward[T](gradient &vtl.Tensor[T], input &vtl.Tensor[T], target &vtl.Tensor[T]) !&vtl.Tensor[T]
cross_entropy_backward computes the gradient of CrossEntropyLoss w.r.t. the input logits:
dL/dx_i = (softmax(x)_i - target_i) / batch_size
fn deriv_elu #
fn deriv_elu[T](gradient &vtl.Tensor[T], cached &vtl.Tensor[T], alpha T) !&vtl.Tensor[T]
deriv_elu computes the derivative of elu.
For x >= 0: d/dx = 1, so the upstream gradient passes through unchanged.
For x < 0: d/dx = alpha * exp(x) = elu(x) + alpha = cached + alpha
fn deriv_gelu #
fn deriv_gelu[T](gradient &vtl.Tensor[T], cached &vtl.Tensor[T]) !&vtl.Tensor[T]
deriv_gelu computes the derivative of GELU.
d/dx GELU(x) = 0.5 * (1 + tanh(z)) + 0.5 * x * sech^2(z) * dz/dx
where z = sqrt(2/pi) * (x + 0.044715 * x^3)
and dz/dx = sqrt(2/pi) * (1 + 3 * 0.044715 * x^2)
fn deriv_leaky_relu #
fn deriv_leaky_relu[T](gradient &vtl.Tensor[T], cached &vtl.Tensor[T], alpha T) !&vtl.Tensor[T]
deriv_leaky_relu computes the derivative of leaky_relu
fn deriv_mish #
fn deriv_mish[T](gradient &vtl.Tensor[T], cached &vtl.Tensor[T]) !&vtl.Tensor[T]
deriv_mish computes the derivative of Mish(x) = x * tanh(softplus(x)):
d/dx Mish = tanh(softplus(x)) + x * (1 - tanh^2(softplus(x))) * sigmoid(x)
fn deriv_relu #
fn deriv_relu[T](gradient &vtl.Tensor[T], cached &vtl.Tensor[T]) !&vtl.Tensor[T]
deriv_relu computes the derivative of relu
fn deriv_sigmoid #
fn deriv_sigmoid[T](gradient &vtl.Tensor[T], cached &vtl.Tensor[T]) !&vtl.Tensor[T]
deriv_sigmoid computes the derivative of sigmoid:
sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))
The gate caches the sigmoid output, so vals[1] = sigmoid(x).
fn deriv_softmax #
fn deriv_softmax[T](gradient &vtl.Tensor[T], input &vtl.Tensor[T], dim int) !&vtl.Tensor[T]
deriv_softmax computes the Jacobian-vector product for softmax.
For a softmax slice s_i = exp(x_i) / sum_j exp(x_j), the Jacobian entries are:
ds_i/dx_k = s_i * (delta_ik - s_k)   (i = k: s_i * (1 - s_i); i ≠ k: -s_i * s_k)
We implement the fast "Jacobian times gradient" version directly:
dL/dx_k = sum_i grad_out_i * ds_i/dx_k
        = sum_i grad_out_i * s_i * (delta_ik - s_k)
        = s_k * (grad_out_k - sum_i grad_out_i * s_i)
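A small numeric sketch of the identity above, assuming `vtl.from_array` as a constructor and the `vtl.nn.internal` import path:

```v
import vtl
import vtl.nn.internal

fn softmax_jvp_example() ! {
	x := vtl.from_array([1.0, 2.0, 3.0], [1, 3])!
	// pick out dL/ds_2 = 1, everything else 0
	g := vtl.from_array([0.0, 0.0, 1.0], [1, 3])!
	// per row along dim 1: dx_k = s_k * (g_k - sum_i g_i * s_i)
	dx := internal.deriv_softmax(g, x, 1)!
	println(dx)
}
```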
fn deriv_swish #
fn deriv_swish[T](gradient &vtl.Tensor[T], cached &vtl.Tensor[T]) !&vtl.Tensor[T]
deriv_swish computes the derivative of Swish(x) = x * sigmoid(x):
d/dx Swish = sigmoid(x) + x * sigmoid(x) * (1 - sigmoid(x))
           = sigmoid(x) * (1 + x * (1 - sigmoid(x)))
fn deriv_tanh #
fn deriv_tanh[T](gradient &vtl.Tensor[T], cached &vtl.Tensor[T]) !&vtl.Tensor[T]
deriv_tanh computes the derivative of tanh: tanh'(x) = 1 - tanh(x)^2
fn dropout #
fn dropout[T](input &vtl.Tensor[T], mask &vtl.Tensor[T], prob f64) !&vtl.Tensor[T]
fn dropout_backwards #
fn dropout_backwards[T](gradient &vtl.Tensor[T], mask &vtl.Tensor[T], prob f64) !&vtl.Tensor[T]
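Neither function is documented; under the usual inverted-dropout convention (an assumption about this implementation), dropout multiplies input by the binary mask and rescales by 1/(1 - prob) so that expected activations match between train and test time, and dropout_backwards applies the same mask and rescaling to the upstream gradient. The mask itself is generated by the caller and passed in, as the signatures show.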
fn elu #
fn elu[T](x &vtl.Tensor[T], alpha T) &vtl.Tensor[T]
elu applies the Exponential Linear Unit activation: x for x >= 0, alpha * (exp(x) - 1) otherwise.
fn embedding_backward #
fn embedding_backward[T](grad_out &vtl.Tensor[T], input &vtl.Tensor[T], weight &vtl.Tensor[T]) ![]&vtl.Tensor[T]
embedding_backward computes gradient w.r.t. weight. Gradients are accumulated into the weight rows corresponding to the input indices.
fn embedding_forward #
fn embedding_forward[T](input &vtl.Tensor[T], weight &vtl.Tensor[T]) !&vtl.Tensor[T]
embedding_forward looks up each integer index in the weight matrix.
input: [batch, seq_len] integer indices
weight: [vocab_size, embedding_dim]
returns: [batch, seq_len, embedding_dim]
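A shape sketch, assuming `vtl.from_array`, the `vtl.nn.internal` import path, and a public `shape` field on Tensor. Note the indices arrive in a `vtl.Tensor[T]`, so they are stored in the same element type as the weights (how non-integral values are handled is not documented here):

```v
import vtl
import vtl.nn.internal

fn embedding_example() ! {
	// [batch=2, seq_len=2] indices into a 3-word vocabulary
	idx := vtl.from_array([0.0, 2.0, 1.0, 2.0], [2, 2])!
	// [vocab_size=3, embedding_dim=2]
	w := vtl.from_array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6], [3, 2])!
	out := internal.embedding_forward(idx, w)!
	assert out.shape == [2, 2, 2]
}
```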
fn gelu #
fn gelu[T](x &vtl.Tensor[T]) &vtl.Tensor[T]
gelu applies the Gaussian Error Linear Unit activation. GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
fn global_avgpool2d_backward #
fn global_avgpool2d_backward[T](grad_out &vtl.Tensor[T], input &vtl.Tensor[T]) !&vtl.Tensor[T]
fn global_avgpool2d_forward #
fn global_avgpool2d_forward[T](input &vtl.Tensor[T]) !&vtl.Tensor[T]
fn gru_forward_single #
fn gru_forward_single[T](input &vtl.Tensor[T],
hidden0 &vtl.Tensor[T],
w_ih &vtl.Tensor[T],
w_hh &vtl.Tensor[T],
b_ih &vtl.Tensor[T],
b_hh &vtl.Tensor[T]) !(&vtl.Tensor[T], &vtl.Tensor[T])
gru_forward_single runs a single-layer GRU over a sequence.
Implements the standard GRU equations (PyTorch/CuDNN compatible):
r_t = sigmoid(x_t @ W_ir^T + h_{t-1} @ W_hr^T + b_ir + b_hr)
z_t = sigmoid(x_t @ W_iz^T + h_{t-1} @ W_hz^T + b_iz + b_hz)
n_t = tanh(x_t @ W_in^T + b_in + r_t * (h_{t-1} @ W_hn^T + b_hn))
h_t = (1 - z_t) * n_t + z_t * h_{t-1}
Shapes:
input: [seq_len, batch, input_size]
hidden0: [batch, hidden_size]
w_ih: [3*hidden_size, input_size] (r, z, n gates stacked)
w_hh: [3*hidden_size, hidden_size]
b_ih: [3*hidden_size]
b_hh: [3*hidden_size]
Returns (output [seq_len, batch, hidden_size], h_n [batch, hidden_size])
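A shape-level sketch, assuming the `vtl.nn.internal` import path, the `vtl.zeros` constructor, and a public `shape` field on Tensor (weights would normally be initialised randomly, e.g. via the initialisers below):

```v
import vtl
import vtl.nn.internal

fn gru_example() ! {
	seq_len, batch, input_size, hidden_size := 5, 2, 4, 3
	x := vtl.zeros[f64]([seq_len, batch, input_size])
	h0 := vtl.zeros[f64]([batch, hidden_size])
	w_ih := vtl.zeros[f64]([3 * hidden_size, input_size])
	w_hh := vtl.zeros[f64]([3 * hidden_size, hidden_size])
	b_ih := vtl.zeros[f64]([3 * hidden_size])
	b_hh := vtl.zeros[f64]([3 * hidden_size])
	out, hn := internal.gru_forward_single(x, h0, w_ih, w_hh, b_ih, b_hh)!
	assert out.shape == [seq_len, batch, hidden_size]
	assert hn.shape == [batch, hidden_size]
}
```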
fn huber #
fn huber[T](input &vtl.Tensor[T], target &vtl.Tensor[T], delta T) !&vtl.Tensor[T]
huber computes the Huber loss (smooth L1)
fn huber_backward #
fn huber_backward[T](gradient &vtl.Tensor[T], input &vtl.Tensor[T], target &vtl.Tensor[T], delta T) !&vtl.Tensor[T]
huber_backward computes the gradient of Huber loss w.r.t. input
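For reference, the standard smooth-L1 form (which these functions are expected to follow, though the docs above do not spell it out): with r = input - target, loss = 0.5 * r^2 when |r| <= delta, and delta * (|r| - 0.5 * delta) otherwise. The gradient is r in the quadratic region and delta * sign(r) outside it, times the upstream gradient.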
fn kaiming_normal #
fn kaiming_normal[T](shape []int) &vtl.Tensor[T]
fn kaiming_uniform #
fn kaiming_uniform[T](shape []int) &vtl.Tensor[T]
fn kl_div #
fn kl_div[T](input &vtl.Tensor[T], target &vtl.Tensor[T]) !&vtl.Tensor[T]
kl_div computes the KL Divergence loss D_KL(P || Q) = sum(P * log(P/Q)).
input: log-probs Q; target: probabilities P
fn kl_div_backward #
fn kl_div_backward[T](gradient &vtl.Tensor[T], input &vtl.Tensor[T], target &vtl.Tensor[T]) !&vtl.Tensor[T]
kl_div_backward computes gradient of KL Divergence w.r.t. input (log-probs Q)
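Since D_KL(P || Q) = sum(P * (log P - log Q)) and the input is log Q, the gradient w.r.t. each input element is simply -P times the upstream gradient (up to whatever reduction the implementation applies); this follows directly from differentiating -P * log Q in log-space.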
fn layer_norm_backward #
fn layer_norm_backward[T](gradient &vtl.Tensor[T], input &vtl.Tensor[T], gamma &vtl.Tensor[T], beta &vtl.Tensor[T], eps f64) ![]&vtl.Tensor[T]
layer_norm_backward computes gradient w.r.t. input, gamma, beta.
fn layer_norm_forward #
fn layer_norm_forward[T](input &vtl.Tensor[T], gamma &vtl.Tensor[T], beta &vtl.Tensor[T], eps f64) !&vtl.Tensor[T]
layer_norm_forward computes layer normalization over all elements of input. gamma, beta: same shape as input (optional affine params, pass nil to skip)
fn leaky_relu #
fn leaky_relu[T](x &vtl.Tensor[T], alpha T) &vtl.Tensor[T]
leaky_relu applies the Leaky ReLU activation: x for x >= 0, alpha * x otherwise.
fn lstm_forward_single #
fn lstm_forward_single[T](input &vtl.Tensor[T],
hidden0 &vtl.Tensor[T],
cell0 &vtl.Tensor[T],
w_ih &vtl.Tensor[T],
w_hh &vtl.Tensor[T],
b_ih &vtl.Tensor[T],
b_hh &vtl.Tensor[T]) !(&vtl.Tensor[T], &vtl.Tensor[T], &vtl.Tensor[T])
lstm_forward_single runs a single-layer LSTM over a sequence.
Implements the standard LSTM equations:
i_t = sigmoid(x_t @ W_ii^T + h_{t-1} @ W_hi^T + b_ii + b_hi)
f_t = sigmoid(x_t @ W_if^T + h_{t-1} @ W_hf^T + b_if + b_hf)
g_t = tanh(x_t @ W_ig^T + h_{t-1} @ W_hg^T + b_ig + b_hg)
o_t = sigmoid(x_t @ W_io^T + h_{t-1} @ W_ho^T + b_io + b_ho)
c_t = f_t * c_{t-1} + i_t * g_t
h_t = o_t * tanh(c_t)
Shapes:
input: [seq_len, batch, input_size]
hidden0: [batch, hidden_size]
cell0: [batch, hidden_size]
w_ih: [4*hidden_size, input_size]
w_hh: [4*hidden_size, hidden_size]
b_ih: [4*hidden_size]
b_hh: [4*hidden_size]
Returns (output [seq_len, batch, hidden_size], h_n [batch, hidden_size], c_n [batch, hidden_size])
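The same shape-level sketch as for the GRU above, now with the extra cell state and the four stacked gates (import path, `vtl.zeros`, and the `shape` field assumed as before):

```v
import vtl
import vtl.nn.internal

fn lstm_example() ! {
	seq_len, batch, input_size, hidden_size := 5, 2, 4, 3
	x := vtl.zeros[f64]([seq_len, batch, input_size])
	h0 := vtl.zeros[f64]([batch, hidden_size])
	c0 := vtl.zeros[f64]([batch, hidden_size])
	w_ih := vtl.zeros[f64]([4 * hidden_size, input_size])
	w_hh := vtl.zeros[f64]([4 * hidden_size, hidden_size])
	b_ih := vtl.zeros[f64]([4 * hidden_size])
	b_hh := vtl.zeros[f64]([4 * hidden_size])
	out, hn, cn := internal.lstm_forward_single(x, h0, c0, w_ih, w_hh, b_ih, b_hh)!
	assert out.shape == [seq_len, batch, hidden_size]
	assert hn.shape == [batch, hidden_size]
	assert cn.shape == [batch, hidden_size]
}
```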
fn maxpool2d #
fn maxpool2d[T](input &vtl.Tensor[T], kernel []int, padding []int, stride []int) (&vtl.Tensor[int], &vtl.Tensor[T])
fn maxpool2d_backward #
fn maxpool2d_backward[T](shape []int, max_indices &vtl.Tensor[int], grad_output &vtl.Tensor[T]) !&vtl.Tensor[T]
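Inferred from the signatures alone: maxpool2d returns the flat indices of each window maximum alongside the pooled values, and maxpool2d_backward uses those indices to scatter grad_output back into a zero tensor of the original input shape. This is the standard max-pool gradient routing; the functions themselves are undocumented.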
fn mish #
fn mish[T](x &vtl.Tensor[T]) &vtl.Tensor[T]
mish applies the Mish activation: x * tanh(softplus(x)). softplus(x) = log(1 + exp(x))
fn mse #
fn mse[T](input &vtl.Tensor[T], target &vtl.Tensor[T]) !&vtl.Tensor[T]
mse computes the mean squared error between the labels and the predictions
fn mse_backward #
fn mse_backward[T](gradient &vtl.Tensor[T], cache &vtl.Tensor[T], target &vtl.Tensor[T]) ![]&vtl.Tensor[T]
fn nll #
fn nll[T](input &vtl.Tensor[T], target &vtl.Tensor[T]) !&vtl.Tensor[T]
nll computes the Negative Log Likelihood loss (assumes input is log-probs). Target is one-hot or class probabilities; we compute -sum(target * input).
fn nll_backward #
fn nll_backward[T](gradient &vtl.Tensor[T], input &vtl.Tensor[T], target &vtl.Tensor[T]) !&vtl.Tensor[T]
nll_backward computes gradient of NLL w.r.t. input (log-probs)
fn relu #
fn relu[T](x &vtl.Tensor[T]) &vtl.Tensor[T]
relu applies the Rectified Linear Unit activation: max(x, 0).
fn relu_with_backend #
fn relu_with_backend[T](x &vtl.Tensor[T], backend vtl.Backend, strict bool) !&vtl.Tensor[T]
fn sgd_optimize #
fn sgd_optimize[T](mut value vtl.Tensor[T], gradient &vtl.Tensor[T], learning_rate f64) !
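sgd_optimize applies the vanilla stochastic-gradient-descent update in place: value <- value - learning_rate * gradient. (That this is the plain, momentum-free update is an inference from the signature, which carries no optimizer state.)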
fn sigmoid #
fn sigmoid[T](x &vtl.Tensor[T]) &vtl.Tensor[T]
sigmoid takes a real-valued number and squashes it to the range (0, 1)
fn sigmoid_cross_entropy #
fn sigmoid_cross_entropy[T](input &vtl.Tensor[T], target &vtl.Tensor[T]) !&vtl.Tensor[T]
sigmoid_cross_entropy computes the sigmoid cross entropy between the labels and the predictions. Uses the numerically stable logsumexp formulation (Arraymancer source of truth):
loss = mean( -y*x + max(x, 0) + log1p(exp(-|x|)) )
fn sigmoid_cross_entropy_backward #
fn sigmoid_cross_entropy_backward[T](gradient &vtl.Tensor[T], cache &vtl.Tensor[T], target &vtl.Tensor[T]) ![]&vtl.Tensor[T]
fn sigmoid_with_backend #
fn sigmoid_with_backend[T](x &vtl.Tensor[T], backend vtl.Backend, strict bool) !&vtl.Tensor[T]
fn softmax_cross_entropy #
fn softmax_cross_entropy[T](input &vtl.Tensor[T], target &vtl.Tensor[T]) !&vtl.Tensor[T]
softmax_cross_entropy computes the mean cross-entropy loss for a batch of logit vectors using the numerically stable log-sum-exp trick.
input: [batch_size, n_classes] raw logits (unnormalised scores)
target: [batch_size, n_classes] one-hot or soft label targets
returns: scalar loss tensor of shape [1]
Formula (Arraymancer-style, numerically stable), for each sample i:
max_i = max_j logit_ij
lse_i = log( sum_j exp(logit_ij - max_i) ) + max_i
loss_i = lse_i - sum_j( target_ij * logit_ij )
mean_loss = mean over the batch of loss_i
fn softmax_cross_entropy_backward #
fn softmax_cross_entropy_backward[T](gradient &vtl.Tensor[T], cache &vtl.Tensor[T], target &vtl.Tensor[T]) ![]&vtl.Tensor[T]
softmax_cross_entropy_backward computes the gradient of the SCE loss w.r.t. the logits. The gradient for each logit is: upstream * (softmax(logits)_i - target_i) / batch_size
This is the standard gradient derived from the log-softmax formulation (equivalent to Arraymancer's implementation).
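A usage sketch of the loss/gradient pair, assuming `vtl.from_array`, the `vtl.nn.internal` import path, and that the `cache` argument of the backward pass is the cached logits tensor (all assumptions):

```v
import vtl
import vtl.nn.internal

fn sce_example() ! {
	// two samples, three classes
	logits := vtl.from_array([2.0, 0.5, 0.1, 0.2, 1.5, 0.3], [2, 3])!
	target := vtl.from_array([1.0, 0.0, 0.0, 0.0, 1.0, 0.0], [2, 3])!
	loss := internal.softmax_cross_entropy(logits, target)!
	// scalar upstream gradient matching the scalar loss shape [1]
	up := vtl.from_array([1.0], [1])!
	grads := internal.softmax_cross_entropy_backward(up, logits, target)!
	println(loss)
	println(grads[0]) // d loss / d logits: (softmax(logits) - target) / 2
}
```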
fn softmax_forward #
fn softmax_forward[T](input &vtl.Tensor[T], dim int) !&vtl.Tensor[T]
softmax_forward computes softmax along a specified dimension.
fn swish #
fn swish[T](x &vtl.Tensor[T]) &vtl.Tensor[T]
swish applies the Swish activation: x * sigmoid(beta * x), beta=1.
fn tanh #
fn tanh[T](x &vtl.Tensor[T]) &vtl.Tensor[T]
tanh squashes a real-valued number to the range [-1, 1]
fn tanh_with_backend #
fn tanh_with_backend[T](x &vtl.Tensor[T], backend vtl.Backend, strict bool) !&vtl.Tensor[T]
fn variance_scaled #
fn variance_scaled[T](shape []int, scale T, fan_mode FanMode, distribution Distribution) &vtl.Tensor[T]
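Reading the names (none of this is documented here, so treat it as a hedged interpretation): variance_scaled presumably draws from the given Distribution with variance proportional to scale / fan, where fan is selected by FanMode from the result of compute_fans; under that reading, kaiming_normal and kaiming_uniform above would be the fan_in specialisations with scale = 2, i.e. the standard Kaiming scheme.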
fn Distribution.from #
fn Distribution.from[W](input W) !Distribution
fn FanMode.from #
fn FanMode.from[W](input W) !FanMode
enum Distribution #
enum Distribution {
uniform
normal
}
enum FanMode #
enum FanMode {
fan_avg
fan_in
fan_out
}
struct Conv2DConfig #
struct Conv2DConfig {
pub:
padding []int = [0, 0]
stride []int = [1, 1]
dilation []int = [1, 1]
groups int = 1
}
Conv2DConfig mirrors vtl.nn.layers.Conv2DConfig to avoid an import cycle.
struct LSTMIntermediate #
struct LSTMIntermediate[T] {
mut:
gates []&vtl.Tensor[T] // full gate tensor per timestep
cells []&vtl.Tensor[T] // cell state per timestep
hiddens []&vtl.Tensor[T] // hidden state per timestep
}
LSTMIntermediate stores per-timestep gate values for backprop.