Datasets - acwooding/dimension_reduction GitHub Wiki
Timings
As a frame of reference, running all of
- PCA
- MDS
- t-SNE
- Isomap
- UMAP
with default settings takes, in total:
- Digits: 2 minutes
- COIL-20: 8 minutes
- Frey faces: 4 minutes
- HIVA: 17 minutes
- LVQ-PAK Phoneme Dataset: 15 minutes
- MNIST and f-MNIST are too large to run MDS on a local machine, because MDS needs the full matrix of pairwise distances.
Synthetic
From MPH2009
- Swiss roll
- Helix
- Twin peaks
- Broken Swiss roll
- High-dimensional

Here is the MATLAB code used to generate the datasets:
function [X, labels, t] = generate_data(dataname, n, noise)
%GENERATE_DATA Generates an artificial dataset (manifold)
%
%   [X, labels, t] = generate_data(dataname, n, noise)
%
% Generates an artificial dataset. Possible values of dataname are:
%
%   'swiss'          - Swiss roll
%   'brokenswiss'    - Swiss roll with a gap in the middle of the roll
%   'changing_swiss' - Swiss roll with non-uniform sampling along the roll
%   'helix'          - helix
%   'twinpeaks'      - twin peaks
%   '3d_clusters'    - five clusters in 3D
%   'intersect'      - self-intersecting curve extruded along a height axis
%   'difficult'      - 10-dimensional embedding of a 5-dimensional grid
%
% The variable n indicates the number of datapoints to generate
% (default = 1000). The variable noise indicates the amount of noise that
% is added to the data (default = 0.05). The function returns the
% high-dimensional dataset in X, and corresponding labels in labels. In
% addition, the function returns the coordinates of the datapoints on the
% underlying manifold in t.
%
% Notes on the outputs:
%   * 'swiss', 'brokenswiss' and 'twinpeaks' return t as an n-by-2 matrix;
%     'helix' and 'intersect' return an n-by-1 vector; 'changing_swiss'
%     returns a 1-by-n row vector; '3d_clusters' returns t = [].
%   * 'difficult' builds a grid with round(n ^ (1 / 5)) values per manifold
%     dimension, so the number of rows in X generally differs from n.
%   * An unknown dataname raises the error 'Unknown dataset name.'.
%
%
% This file is part of the Matlab Toolbox for Dimensionality Reduction.
% The toolbox can be obtained from http://homepage.tudelft.nl/19j49
% You are free to use, change, or redistribute this code in any way you
% want for non-commercial purposes. However, it is appreciated if you
% maintain the name of the original author.
%
% (C) Laurens van der Maaten, Delft University of Technology

    % Toolbox helper defined outside this file.
    welcome;

    % Apply defaults for omitted arguments.
    if ~exist('n', 'var')
        n = 1000;
    end
    if ~exist('noise', 'var')
        noise = 0.05;
    end

    switch dataname

        case 'swiss'
            % Angle along the roll, uniform in [1.5*pi, 4.5*pi].
            t = (3 * pi / 2) * (1 + 2 * rand(n, 1));
            height = 30 * rand(n, 1);
            X = [t .* cos(t) height t .* sin(t)] + noise * randn(n, 3);
            %labels = uint8(t);
            % Binary checkerboard over (angle, height).
            labels = rem(sum([round(t / 2) round(height / 12)], 2), 2);
            t = [t height];

        case 'brokenswiss'
            % First half of the points is drawn from the lower 40% of the
            % angle range, second half from the upper 40%; the middle 20%
            % is left empty.
            t = [(3 * pi / 2) * (1 + 2 * rand(ceil(n / 2), 1) * .4); ...
                 (3 * pi / 2) * (1 + 2 * (rand(floor(n / 2), 1) * .4 + .6))];
            height = 30 * rand(n, 1);
            X = [t .* cos(t) height t .* sin(t)] + noise * randn(n, 3);
            labels = uint8(t);
            %labels = rem(sum([round(t / 2) round(height / 12)], 2), 2);
            t = [t height];

        case 'changing_swiss'
            % Rejection sampling: a candidate rr is accepted when a second
            % uniform draw exceeds it, so small values are accepted more
            % often than large ones.
            r = zeros(1, n);
            for i = 1:n
                pass = 0;
                while ~pass
                    rr = rand(1);
                    if rand(1) > rr
                        r(i) = rr;
                        pass = 1;
                    end
                end
            end
            t = (3 * pi / 2) * (1 + 2 * r);
            height = 21 * rand(1, n);
            X = [t .* cos(t); height; t .* sin(t)]' + noise * randn(n, 3);
            %labels = uint8(t)';
            labels = rem(sum([round(t / 2); round(height / 10)], 1), 2)';

        case 'helix'
            t = (1:n)' / n;
            t = t .^ (1.0) * 2 * pi;
            X = [(2 + cos(8 * t)) .* cos(t) (2 + cos(8 * t)) .* sin(t) sin(8 * t)] + noise * randn(n, 3);
            %labels = uint8(t);
            labels = rem(round(t * 1.5), 2);

        case 'twinpeaks'
            % Uniform samples in (-1, 1] x (-1, 1].
            xy = 1 - 2 * rand(2, n);
            X = [xy; sin(pi * xy(1,:)) .* tanh(3 * xy(2,:))]' + noise * randn(n, 3);
            % Stretch the third axis (applied after the noise was added).
            X(:,3) = X(:,3) * 10;
            t = xy';
            %labels = uint8(X(:,3));
            % NOTE(review): this adds min(X) rather than subtracting it;
            % kept as-is because changing it would change the labels.
            labels = rem(sum(round((X + repmat(min(X, [], 1), [size(X, 1) 1])) ./ 10), 2), 2);

        case '3d_clusters'
            numClusters = 5;
            centers = 10 * rand(numClusters, 3);
            % L2_distance is defined outside this file; presumably returns
            % the pairwise distance matrix between the cluster centers.
            D = L2_distance(centers', centers');
            minDistance = min(D(D > 0));
            % Fill all n rows with cluster points. Previously rows were
            % reserved but never generated, leaving trailing rows at the
            % origin with label 0.
            perCluster = ceil(n / numClusters);
            X = zeros(n, 3);
            labels = zeros(n, 1);
            for k = 1:n
                i = ceil(k / perCluster);   % cluster index, 1..numClusters
                X(k, 1:3) = centers(i, 1:3) + (rand(1, 3) - 0.5) * minDistance / sqrt(12);
                labels(k) = i;
            end
            X = X + noise * randn(size(X, 1), 3);
            t = [];

        case 'intersect'
            t = (1:n)' ./ n .* (2 * pi);
            x = cos(t);
            y = sin(t);
            height = rand(length(x), 1) * 5;
            X = [x x .* y height] + noise * randn(n, 3);
            %labels = uint8(5 * t);
            labels = rem(sum([round(t / 2) round(height / 2)], 2), 2);

        case 'difficult'
            % Generate underlying manifold
            no_dims = 5;
            no_points_per_dim = round(n ^ (1 / no_dims));
            l = linspace(0, 1, no_points_per_dim);
            % combn is defined outside this file; presumably returns all
            % combinations of no_dims values taken from l, one per row.
            t = combn(l, no_dims);

            % Generate high-dimensional dataset
            X = [cos(t(:,1)) tanh(3 * t(:,2)) t(:,1) + t(:,3) t(:,4) .* sin(t(:,2)) sin(t(:,1) + t(:,5)) t(:,5) .* cos(t(:,2)) t(:,5) + t(:,4) t(:,2) t(:,3) .* t(:,4) t(:,1)];
            X = X + noise * randn(size(X));

            % Generate labels for dataset (2x2x2x2x2 checkerboard pattern)
            tt = 1 + round(t);
            labels = rem(sum(tt, 2), 2);

        otherwise
            error('Unknown dataset name.');
    end