Skip to content

datasets #

VTL Datasets

VTL provides dataset loaders and batching utilities for ML examples and tests.

Available datasets

| Dataset | Loader | Purpose |
| --- | --- | --- |
| MNIST | `datasets.load_mnist()` | Handwritten digit images (28x28) |
| IMDB | `datasets.load_imdb()` | Sentiment analysis reviews |
| CIFAR-10 | `datasets.load_cifar10()` | Image classification examples |
| DataLoader | `datasets.DataLoader[T]` | Batch, shuffle, and iterate tensors/labels |

Examples

Run from ~/.vmodules:

v run vtl/examples/datasets_mnist/main.v
v run vtl/examples/datasets_imdb/main.v
v run vtl/examples/nn_cifar10_tiny_synth/main.v

Use synthetic examples (nn_cifar10_tiny_synth, nn_cifar10_f32_tiny_synth) for CI and quick local checks. Real dataset examples may download/cache data and should be treated as local integration tests.

Constants #

const cifar10_base_url = 'https://www.cs.toronto.edu/~kriz/'

cifar10_base_url is the base URL of the location that hosts the CIFAR-10 archive.

const cifar10_file = 'cifar-10-binary.tar.gz'

cifar10_file is the file name of the CIFAR-10 binary archive.

const imdb_file_name = 'aclImdb_v1.tar.gz'

imdb_file_name is the file name of the IMDB review archive.

const imdb_base_url = 'http://ai.stanford.edu/~amaas/data/sentiment/'

imdb_base_url is the base URL of the location that hosts the IMDB archive.

const mnist_base_url = 'https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data/'

mnist_base_url is the base URL of the location that hosts the MNIST data files.

const mnist_train_images_file = 'train-images-idx3-ubyte.gz'

mnist_train_images_file is the file name of the gzipped MNIST training images.

const mnist_train_labels_file = 'train-labels-idx1-ubyte.gz'

mnist_train_labels_file is the file name of the gzipped MNIST training labels.

const mnist_test_images_file = 't10k-images-idx3-ubyte.gz'

mnist_test_images_file is the file name of the gzipped MNIST test images.

const mnist_test_labels_file = 't10k-labels-idx1-ubyte.gz'

mnist_test_labels_file is the file name of the gzipped MNIST test labels.

fn class_names #

fn class_names() []string

class_names returns the CIFAR-10 class names.

fn load_cifar10 #

fn load_cifar10() !Cifar10Dataset

load_cifar10 loads the CIFAR-10 dataset. Returns train/test features (normalized [0,1]) and one-hot encoded labels.

fn load_cifar10_with_config #

fn load_cifar10_with_config(cfg Cifar10Config) !Cifar10Dataset

load_cifar10_with_config loads CIFAR-10 with custom configuration.

fn load_imdb #

fn load_imdb() !ImdbDataset

load_imdb loads the IMDB dataset.

fn load_mnist #

fn load_mnist() !MnistDataset

load_mnist loads the MNIST dataset.

fn new_data_loader #

fn new_data_loader[T](dataset &vtl.Tensor[T], config DataLoaderConfig) &DataLoader[T]

new_data_loader creates a DataLoader from a dataset tensor (features only).

fn new_data_loader_with_labels #

fn new_data_loader_with_labels[T](dataset &vtl.Tensor[T], labels &vtl.Tensor[T], config DataLoaderConfig) &DataLoader[T]

new_data_loader_with_labels creates a DataLoader with both features and labels tensors. Both tensors must have the same first dimension (number of samples).

fn (DataLoaderIterator[T]) next #

fn (mut it DataLoaderIterator[T]) next[T]() ?&vtl.Tensor[T]

next returns the next batch. Returns none when exhausted.

fn (DataLoader[T]) batch #

fn (dl &DataLoader[T]) batch(i int) ?&vtl.Tensor[T]

batch returns the batch at index i as a zero-copy view into the dataset. Returns none if the index is out of range.

fn (DataLoader[T]) batch_with_labels #

fn (dl &DataLoader[T]) batch_with_labels(i int) ?(&vtl.Tensor[T], &vtl.Tensor[T])

batch_with_labels returns both features and labels for batch i. Both tensors are extracted using the same index set. Returns none if the index is out of range or if no labels tensor was provided.

fn (DataLoader[T]) len #

fn (dl &DataLoader[T]) len() int

len returns the number of batches (drop_last affects count).

fn (DataLoader[T]) total_samples #

fn (dl &DataLoader[T]) total_samples() int

total_samples returns the number of samples in the dataset.

fn (DataLoader[T]) reset #

fn (mut dl DataLoader[T]) reset()

reset re-shuffles the indices (call between epochs). Uses seed if set.

fn (DataLoader[T]) iter #

fn (dl &DataLoader[T]) iter[T]() DataLoaderIterator[T]

iter returns an iterator over the DataLoader batches.

fn (DataLoader[T]) for_each #

fn (dl &DataLoader[T]) for_each(fn_each fn (batch &vtl.Tensor[T]) bool) int

for_each applies fn_each to each batch until the batches are exhausted or fn_each returns false.

fn (DataLoader[T]) for_each_with_labels #

fn (dl &DataLoader[T]) for_each_with_labels(fn_each fn (features &vtl.Tensor[T], labels &vtl.Tensor[T]) bool) int

for_each_with_labels applies fn_each to each (features, labels) batch. It only works if the DataLoader was created with labels.

fn (DataLoader[T]) split #

fn (dl &DataLoader[T]) split(val_fraction f64) (&DataLoader[T], &DataLoader[T])

split splits a DataLoader into train and validation DataLoaders.

fn (DataLoader[T]) mini_batch_count #

fn (dl &DataLoader[T]) mini_batch_count() int

mini_batch_count returns the total number of batches in one epoch.

struct Cifar10Config #

struct Cifar10Config {
pub:
	channels    int = 3
	height      int = 32
	width       int = 32
	num_classes int = 10
	train_count int = 50000
	test_count  int = 10000
}

Cifar10Config holds configuration for CIFAR-10 dataset loading.

struct Cifar10Dataset #

struct Cifar10Dataset {
pub:
	train_features &vtl.Tensor[f64] = unsafe { nil }
	train_labels   &vtl.Tensor[f64] = unsafe { nil }
	test_features  &vtl.Tensor[f64] = unsafe { nil }
	test_labels    &vtl.Tensor[f64] = unsafe { nil }
}

Cifar10Dataset holds the CIFAR-10 dataset.

struct DataLoader #

@[heap]
struct DataLoader[T] {
pub:
	dataset    &vtl.Tensor[T]
	labels     &vtl.Tensor[T] = unsafe { nil }
	batch_size int
	shuffle    bool
	drop_last  bool
	seed       u64
pub mut:
	indices []int
	epoch   int
}

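DataLoader batches, optionally shuffles, and iterates over a dataset tensor and an optional labels tensor. It holds the batching configuration (batch_size, shuffle, drop_last, seed) together with the current sample indices and epoch counter.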

struct DataLoaderConfig #

struct DataLoaderConfig {
pub:
	batch_size int  = 32
	shuffle    bool = true
	drop_last  bool = true
	seed       u64
}

DataLoaderConfig holds configuration for creating a DataLoader.

struct DataLoaderIterator #

struct DataLoaderIterator[T] {
mut:
	loader  &DataLoader[T]
	current int
}

DataLoaderIterator provides a for-in loop interface over a DataLoader.

struct ImdbDataset #

struct ImdbDataset {
pub:
	train_features &vtl.Tensor[string] = unsafe { nil }
	train_labels   &vtl.Tensor[int]    = unsafe { nil }
	test_features  &vtl.Tensor[string] = unsafe { nil }
	test_labels    &vtl.Tensor[int]    = unsafe { nil }
}

ImdbDataset is a dataset for sentiment analysis.

struct MnistDataset #

struct MnistDataset {
pub:
	train_features &vtl.Tensor[u8] = unsafe { nil }
	train_labels   &vtl.Tensor[u8] = unsafe { nil }
	test_features  &vtl.Tensor[u8] = unsafe { nil }
	test_labels    &vtl.Tensor[u8] = unsafe { nil }
}

MnistDataset is a dataset of MNIST handwritten digits.