Skip to content

nn.internal

fn avgpool2d_backward #

fn avgpool2d_backward[T](grad_out &vtl.Tensor[T], kernel []int, padding []int, stride []int) !&vtl.Tensor[T]

fn avgpool2d_forward #

fn avgpool2d_forward[T](input &vtl.Tensor[T], kernel []int, padding []int, stride []int) !&vtl.Tensor[T]

fn batchnorm1d_backward #

fn batchnorm1d_backward[T](gradient &vtl.Tensor[T], input &vtl.Tensor[T], gamma &vtl.Tensor[T], beta &vtl.Tensor[T], mean &vtl.Tensor[T], var_ &vtl.Tensor[T], eps f64) ![]&vtl.Tensor[T]

batchnorm1d_backward computes gradients w.r.t. input, gamma, beta.

fn batchnorm1d_forward #

fn batchnorm1d_forward[T](input &vtl.Tensor[T], gamma &vtl.Tensor[T], beta &vtl.Tensor[T], running_mean &vtl.Tensor[T], running_var &vtl.Tensor[T], eps f64) !&vtl.Tensor[T]

batchnorm1d_forward computes batch norm using running mean/var (inference path).

fn batchnorm1d_training #

fn batchnorm1d_training[T](input &vtl.Tensor[T], gamma &vtl.Tensor[T], beta &vtl.Tensor[T], eps f64) !(&vtl.Tensor[T], &vtl.Tensor[T], &vtl.Tensor[T])

batchnorm1d_training computes batch norm using batch stats (training path).

fn bce #

fn bce[T](input &vtl.Tensor[T], target &vtl.Tensor[T]) !&vtl.Tensor[T]

bce computes binary cross entropy between input and target. input values should be in (0,1) — caller is responsible for clamping/sigmoid.

fn bce_backward #

fn bce_backward[T](gradient &vtl.Tensor[T], input &vtl.Tensor[T], target &vtl.Tensor[T], from_logits bool) !&vtl.Tensor[T]

bce_backward computes the gradient of BCE w.r.t. the raw logits (before sigmoid). If from_logits=true, the upstream gradient is multiplied by the sigmoid derivative.

fn compute_fans #

fn compute_fans(shape []int) (int, int)

fn conv2d_backward #

fn conv2d_backward[T](grad_out &vtl.Tensor[T],
	input &vtl.Tensor[T],
	weight &vtl.Tensor[T],
	bias &vtl.Tensor[T],
	kernel_size []int,
	config Conv2DConfig) ![]&vtl.Tensor[T]

conv2d_backward computes gradients for input, weight, bias. Returns [d_input, d_weight, d_bias].

fn conv2d_forward #

fn conv2d_forward[T](input &vtl.Tensor[T],
	weight &vtl.Tensor[T],
	bias &vtl.Tensor[T],
	kernel_size []int,
	config Conv2DConfig) !&vtl.Tensor[T]

conv2d_forward implements the forward pass of 2D convolution. input: [batch, in_ch, H, W] weight: [out_ch, in_ch/groups, k_h, k_w] bias: [1, out_ch] config: padding, stride, dilation, groups returns: [batch, out_ch, out_H, out_W]

fn cross_entropy #

fn cross_entropy[T](input &vtl.Tensor[T], target &vtl.Tensor[T]) !&vtl.Tensor[T]

cross_entropy computes CrossEntropyLoss (LogSoftmax + NLL combined). This is the standard cross-entropy for multi-class classification. input: [batch, n_classes] raw logits target: [batch, n_classes] one-hot targets

fn cross_entropy_backward #

fn cross_entropy_backward[T](gradient &vtl.Tensor[T], input &vtl.Tensor[T], target &vtl.Tensor[T]) !&vtl.Tensor[T]

cross_entropy_backward computes gradient of CrossEntropyLoss w.r.t. input logits. dL/dx_i = (softmax(x)_i - target_i) / batch_size

fn deriv_elu #

fn deriv_elu[T](gradient &vtl.Tensor[T], cached &vtl.Tensor[T], alpha T) !&vtl.Tensor[T]

deriv_elu computes the derivative of elu For x >= 0: d/dx = 1 → upstream * 1 For x < 0: d/dx = alpha * exp(x) = elu(x) + alpha = cached + alpha

fn deriv_gelu #

fn deriv_gelu[T](gradient &vtl.Tensor[T], cached &vtl.Tensor[T]) !&vtl.Tensor[T]

deriv_gelu computes the derivative of GELU. d/dx GELU(x) = 0.5 * (1 + tanh(z)) + 0.5 * x * sech^2(z) * dz/dx where z = sqrt(2/pi) * (x + 0.044715 * x^3) and dz/dx = sqrt(2/pi) * (1 + 3 * 0.044715 * x^2)

fn deriv_leaky_relu #

fn deriv_leaky_relu[T](gradient &vtl.Tensor[T], cached &vtl.Tensor[T], alpha T) !&vtl.Tensor[T]

deriv_leaky_relu computes the derivative of leaky_relu

fn deriv_mish #

fn deriv_mish[T](gradient &vtl.Tensor[T], cached &vtl.Tensor[T]) !&vtl.Tensor[T]

deriv_mish computes the derivative of Mish(x) = x * tanh(softplus(x)). d/dx Mish = tanh(softplus(x)) + x * (1 - tanh^2(softplus(x))) * sigmoid(x)

fn deriv_relu #

fn deriv_relu[T](gradient &vtl.Tensor[T], cached &vtl.Tensor[T]) !&vtl.Tensor[T]

deriv_relu computes the derivative of relu

fn deriv_sigmoid #

fn deriv_sigmoid[T](gradient &vtl.Tensor[T], cached &vtl.Tensor[T]) !&vtl.Tensor[T]

deriv_sigmoid computes the derivative of sigmoid sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)) The gate caches the sigmoid output, so vals[1] = sigmoid(x).

fn deriv_softmax #

fn deriv_softmax[T](gradient &vtl.Tensor[T], input &vtl.Tensor[T], dim int) !&vtl.Tensor[T]

deriv_softmax computes the Jacobian-vector product for softmax. For a softmax slice s_i = exp(x_i) / sum_j exp(x_j), the Jacobian is: dL/dx_k = sum_i L_i * ds_i/dx_k ds_i/dx_k = s_i * (delta_ik - s_k) (i = k: s_i*(1-s_i), i ≠ k: -s_i*s_k)

We implement the fast "jacobian * grad" version: grad_out * J^T gives dL/dx_k = sum_i grad_out_i * ds_i/dx_k = grad_out_i * s_i * (delta_ik - s_k) = s_k * (grad_out_k - sum_i grad_out_i * s_i)

fn deriv_swish #

fn deriv_swish[T](gradient &vtl.Tensor[T], cached &vtl.Tensor[T]) !&vtl.Tensor[T]

deriv_swish computes the derivative of Swish(x) = x * sigmoid(x). d/dx Swish = sigmoid(x) + x * sigmoid(x) * (1 - sigmoid(x)) = sigmoid(x) * (1 + x * (1 - sigmoid(x)))

fn deriv_tanh #

fn deriv_tanh[T](gradient &vtl.Tensor[T], cached &vtl.Tensor[T]) !&vtl.Tensor[T]

deriv_tanh computes the derivative of tanh

fn dropout #

fn dropout[T](input &vtl.Tensor[T], mask &vtl.Tensor[T], prob f64) !&vtl.Tensor[T]

fn dropout_backwards #

fn dropout_backwards[T](gradient &vtl.Tensor[T], mask &vtl.Tensor[T], prob f64) !&vtl.Tensor[T]

fn elu #

fn elu[T](x &vtl.Tensor[T], alpha T) &vtl.Tensor[T]

elu activation function

fn embedding_backward #

fn embedding_backward[T](grad_out &vtl.Tensor[T], input &vtl.Tensor[T], weight &vtl.Tensor[T]) ![]&vtl.Tensor[T]

embedding_backward computes gradient w.r.t. weight. Gradients are accumulated into the weight rows corresponding to the input indices.

fn embedding_forward #

fn embedding_forward[T](input &vtl.Tensor[T], weight &vtl.Tensor[T]) !&vtl.Tensor[T]

embedding_forward looks up each integer index in the weight matrix. input: [batch, seq_len] integer indices weight: [vocab_size, embedding_dim] returns: [batch, seq_len, embedding_dim]

fn gelu #

fn gelu[T](x &vtl.Tensor[T]) &vtl.Tensor[T]

gelu applies the Gaussian Error Linear Unit activation. GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))

fn global_avgpool2d_backward #

fn global_avgpool2d_backward[T](grad_out &vtl.Tensor[T], input &vtl.Tensor[T]) !&vtl.Tensor[T]

fn global_avgpool2d_forward #

fn global_avgpool2d_forward[T](input &vtl.Tensor[T]) !&vtl.Tensor[T]

fn gru_forward_single #

fn gru_forward_single[T](input &vtl.Tensor[T],
	hidden0 &vtl.Tensor[T],
	w_ih &vtl.Tensor[T],
	w_hh &vtl.Tensor[T],
	b_ih &vtl.Tensor[T],
	b_hh &vtl.Tensor[T]) !(&vtl.Tensor[T], &vtl.Tensor[T])

gru_forward_single runs a single-layer GRU over a sequence.

Implements the standard GRU equations (PyTorch/CuDNN compatible): r_t = sigmoid(x_t @ W_ir^T + h_{t-1} @ W_hr^T + b_ir + b_hr) z_t = sigmoid(x_t @ W_iz^T + h_{t-1} @ W_hz^T + b_iz + b_hz) n_t = tanh(x_t @ W_in^T + b_in + r_t * (h_{t-1} @ W_hn^T + b_hn)) h_t = (1 - z_t) * n_t + z_t * h_{t-1}

Shapes: input: [seq_len, batch, input_size] hidden0: [batch, hidden_size] w_ih: [3*hidden_size, input_size] (r, z, n gates stacked) w_hh: [3*hidden_size, hidden_size] b_ih: [3*hidden_size] b_hh: [3*hidden_size]

Returns (output [seq_len, batch, hidden_size], h_n [batch, hidden_size])

fn huber #

fn huber[T](input &vtl.Tensor[T], target &vtl.Tensor[T], delta T) !&vtl.Tensor[T]

huber computes the Huber loss (smooth L1)

fn huber_backward #

fn huber_backward[T](gradient &vtl.Tensor[T], input &vtl.Tensor[T], target &vtl.Tensor[T], delta T) !&vtl.Tensor[T]

huber_backward computes the gradient of Huber loss w.r.t. input

fn kaiming_normal #

fn kaiming_normal[T](shape []int) &vtl.Tensor[T]

fn kaiming_uniform #

fn kaiming_uniform[T](shape []int) &vtl.Tensor[T]

fn kl_div #

fn kl_div[T](input &vtl.Tensor[T], target &vtl.Tensor[T]) !&vtl.Tensor[T]

kl_div computes KL Divergence loss D_KL(P || Q) = sum(P * log(P/Q)) input: log-probs Q, target: probabilities P

fn kl_div_backward #

fn kl_div_backward[T](gradient &vtl.Tensor[T], input &vtl.Tensor[T], target &vtl.Tensor[T]) !&vtl.Tensor[T]

kl_div_backward computes gradient of KL Divergence w.r.t. input (log-probs Q)

fn layer_norm_backward #

fn layer_norm_backward[T](gradient &vtl.Tensor[T], input &vtl.Tensor[T], gamma &vtl.Tensor[T], beta &vtl.Tensor[T], eps f64) ![]&vtl.Tensor[T]

layer_norm_backward computes gradient w.r.t. input, gamma, beta.

fn layer_norm_forward #

fn layer_norm_forward[T](input &vtl.Tensor[T], gamma &vtl.Tensor[T], beta &vtl.Tensor[T], eps f64) !&vtl.Tensor[T]

layer_norm_forward computes layer normalization over all elements of input. gamma, beta: same shape as input (optional affine params, pass nil to skip)

fn leaky_relu #

fn leaky_relu[T](x &vtl.Tensor[T], alpha T) &vtl.Tensor[T]

leaky_relu activation function

fn lstm_forward_single #

fn lstm_forward_single[T](input &vtl.Tensor[T],
	hidden0 &vtl.Tensor[T],
	cell0 &vtl.Tensor[T],
	w_ih &vtl.Tensor[T],
	w_hh &vtl.Tensor[T],
	b_ih &vtl.Tensor[T],
	b_hh &vtl.Tensor[T]) !(&vtl.Tensor[T], &vtl.Tensor[T], &vtl.Tensor[T])

lstm_forward_single runs a single-layer LSTM over a sequence.

Implements the standard LSTM equations: i_t = sigmoid(x_t @ W_ii^T + h_{t-1} @ W_hi^T + b_ii + b_hi) f_t = sigmoid(x_t @ W_if^T + h_{t-1} @ W_hf^T + b_if + b_hf) g_t = tanh(x_t @ W_ig^T + h_{t-1} @ W_hg^T + b_ig + b_hg) o_t = sigmoid(x_t @ W_io^T + h_{t-1} @ W_ho^T + b_io + b_ho) c_t = f_t * c_{t-1} + i_t * g_t h_t = o_t * tanh(c_t)

Shapes: input: [seq_len, batch, input_size] hidden0: [batch, hidden_size] cell0: [batch, hidden_size] w_ih: [4*hidden_size, input_size] (i, f, g, o gates stacked) w_hh: [4*hidden_size, hidden_size] b_ih: [4*hidden_size] b_hh: [4*hidden_size]

Returns (output [seq_len, batch, hidden_size], h_n [batch, hidden_size], c_n [batch, hidden_size])

fn maxpool2d #

fn maxpool2d[T](input &vtl.Tensor[T], kernel []int, padding []int, stride []int) (&vtl.Tensor[int], &vtl.Tensor[T])

fn maxpool2d_backward #

fn maxpool2d_backward[T](shape []int, max_indices &vtl.Tensor[int], grad_output &vtl.Tensor[T]) !&vtl.Tensor[T]

fn mish #

fn mish[T](x &vtl.Tensor[T]) &vtl.Tensor[T]

mish applies the Mish activation: x * tanh(softplus(x)). softplus(x) = log(1 + exp(x))

fn mse #

fn mse[T](input &vtl.Tensor[T], target &vtl.Tensor[T]) !&vtl.Tensor[T]

mse computes the mean squared error between the labels and the predictions

fn mse_backward #

fn mse_backward[T](gradient &vtl.Tensor[T], cache &vtl.Tensor[T], target &vtl.Tensor[T]) ![]&vtl.Tensor[T]

fn nll #

fn nll[T](input &vtl.Tensor[T], target &vtl.Tensor[T]) !&vtl.Tensor[T]

nll computes Negative Log Likelihood loss (assumes input is log-probs) Target is one-hot or class probabilities; we compute -sum(target * log(input))

fn nll_backward #

fn nll_backward[T](gradient &vtl.Tensor[T], input &vtl.Tensor[T], target &vtl.Tensor[T]) !&vtl.Tensor[T]

nll_backward computes gradient of NLL w.r.t. input (log-probs)

fn relu #

fn relu[T](x &vtl.Tensor[T]) &vtl.Tensor[T]

relu activation function

fn relu_with_backend #

fn relu_with_backend[T](x &vtl.Tensor[T], backend vtl.Backend, strict bool) !&vtl.Tensor[T]

fn sgd_optimize #

fn sgd_optimize[T](mut value vtl.Tensor[T], gradient &vtl.Tensor[T], learning_rate f64) !

fn sigmoid #

fn sigmoid[T](x &vtl.Tensor[T]) &vtl.Tensor[T]

sigmoid takes a real-valued number and squashes it to the open interval (0, 1)

fn sigmoid_cross_entropy #

fn sigmoid_cross_entropy[T](input &vtl.Tensor[T], target &vtl.Tensor[T]) !&vtl.Tensor[T]

sigmoid_cross_entropy computes the sigmoid cross entropy between the labels and the predictions. Uses the numerically stable logsumexp formulation (Arraymancer source of truth): loss = mean( -y*x + max(x,0) + log1p(exp(-|x|)) )

fn sigmoid_cross_entropy_backward #

fn sigmoid_cross_entropy_backward[T](gradient &vtl.Tensor[T], cache &vtl.Tensor[T], target &vtl.Tensor[T]) ![]&vtl.Tensor[T]

fn sigmoid_with_backend #

fn sigmoid_with_backend[T](x &vtl.Tensor[T], backend vtl.Backend, strict bool) !&vtl.Tensor[T]

fn softmax_cross_entropy #

fn softmax_cross_entropy[T](input &vtl.Tensor[T], target &vtl.Tensor[T]) !&vtl.Tensor[T]

softmax_cross_entropy computes the mean cross-entropy loss for a batch of logit vectors using the numerically stable log-sum-exp trick.

input: [batch_size, n_classes] — raw logits (unnormalised scores) target: [batch_size, n_classes] — one-hot or soft label targets returns: scalar loss tensor of shape [1]

Formula (Arraymancer-style, numerically stable): For each sample i: lse_i = log( sum_j exp(logit_ij - max_i) ) + max_i loss_i = lse_i - sum_j(target_ij * logit_ij) mean_loss = mean over batch

fn softmax_cross_entropy_backward #

fn softmax_cross_entropy_backward[T](gradient &vtl.Tensor[T], cache &vtl.Tensor[T], target &vtl.Tensor[T]) ![]&vtl.Tensor[T]

softmax_cross_entropy_backward computes the gradient of the SCE loss w.r.t. the logits. The gradient for each logit is: upstream * (softmax(logit_i) - target_i) / batch_size

This is the standard gradient derived from the log-softmax formulation (equivalent to Arraymancer's implementation).

fn softmax_forward #

fn softmax_forward[T](input &vtl.Tensor[T], dim int) !&vtl.Tensor[T]

softmax_forward computes softmax along a specified dimension.

fn swish #

fn swish[T](x &vtl.Tensor[T]) &vtl.Tensor[T]

swish applies the Swish activation: x * sigmoid(beta * x), beta=1.

fn tanh #

fn tanh[T](x &vtl.Tensor[T]) &vtl.Tensor[T]

tanh squashes a real-valued number to the open interval (-1, 1)

fn tanh_with_backend #

fn tanh_with_backend[T](x &vtl.Tensor[T], backend vtl.Backend, strict bool) !&vtl.Tensor[T]

fn variance_scaled #

fn variance_scaled[T](shape []int, scale T, fan_mode FanMode, distribution Distribution) &vtl.Tensor[T]

fn Distribution.from #

fn Distribution.from[W](input W) !Distribution

fn FanMode.from #

fn FanMode.from[W](input W) !FanMode

enum Distribution #

enum Distribution {
	uniform
	normal
}

enum FanMode #

enum FanMode {
	fan_avg
	fan_in
	fan_out
}

struct Conv2DConfig #

struct Conv2DConfig {
pub:
	padding  []int = [0, 0]
	stride   []int = [1, 1]
	dilation []int = [1, 1]
	groups   int   = 1
}

Conv2DConfig mirrors vtl.nn.layers.Conv2DConfig to avoid import cycle.

struct LSTMIntermediate #

struct LSTMIntermediate[T] {
mut:
	gates   []&vtl.Tensor[T] // full gate tensor per timestep
	cells   []&vtl.Tensor[T] // cell state per timestep
	hiddens []&vtl.Tensor[T] // hidden state per timestep
}

LSTMIntermediate stores per-timestep gate values for backprop.