""" ======================= MNIST dataset benchmark ======================= Benchmark on the MNIST dataset. The dataset comprises 70,000 samples and 784 features. Here, we consider the task of predicting 10 classes - digits from 0 to 9 from their raw images. By contrast to the covertype dataset, the feature space is homogeneous. Example of output : [..] Classification performance: =========================== Classifier train-time test-time error-rate ------------------------------------------------------------ MLP_adam 53.46s 0.11s 0.0224 Nystroem-SVM 112.97s 0.92s 0.0228 MultilayerPerceptron 24.33s 0.14s 0.0287 ExtraTrees 42.99s 0.57s 0.0294 RandomForest 42.70s 0.49s 0.0318 SampledRBF-SVM 135.81s 0.56s 0.0486 LinearRegression-SAG 16.67s 0.06s 0.0824 CART 20.69s 0.02s 0.1219 dummy 0.00s 0.01s 0.8973 """ # Author: Issam H. Laradji # Arnaud Joly # License: BSD 3 clause import os from time import time import argparse import numpy as np from joblib import Memory from sklearn.datasets import fetch_openml from sklearn.datasets import get_data_home from sklearn.ensemble import ExtraTreesClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.dummy import DummyClassifier from sklearn.kernel_approximation import Nystroem from sklearn.kernel_approximation import RBFSampler from sklearn.metrics import zero_one_loss from sklearn.pipeline import make_pipeline from sklearn.svm import LinearSVC from sklearn.tree import DecisionTreeClassifier from sklearn.utils import check_array from sklearn.linear_model import LogisticRegression from sklearn.neural_network import MLPClassifier # Memoize the data extraction and memory map the resulting # train / test splits in readonly mode memory = Memory(os.path.join(get_data_home(), "mnist_benchmark_data"), mmap_mode="r") @memory.cache def load_data(dtype=np.float32, order="F"): """Load the data, then cache and memmap the train/test split""" ###################################################################### # Load dataset print("Loading dataset...") data = fetch_openml("mnist_784") X = check_array(data["data"], dtype=dtype, order=order) y = data["target"] # Normalize features X = X / 255 # Create train-test split (as [Joachims, 2006]) print("Creating train-test split...") n_train = 60000 X_train = X[:n_train] y_train = y[:n_train] X_test = X[n_train:] y_test = y[n_train:] return X_train, X_test, y_train, y_test ESTIMATORS = { "dummy": DummyClassifier(), "CART": DecisionTreeClassifier(), "ExtraTrees": ExtraTreesClassifier(), "RandomForest": RandomForestClassifier(), "Nystroem-SVM": make_pipeline( Nystroem(gamma=0.015, n_components=1000), LinearSVC(C=100) ), "SampledRBF-SVM": make_pipeline( RBFSampler(gamma=0.015, n_components=1000), LinearSVC(C=100) ), "LogisticRegression-SAG": LogisticRegression(solver="sag", tol=1e-1, C=1e4), "LogisticRegression-SAGA": LogisticRegression(solver="saga", tol=1e-1, C=1e4), "MultilayerPerceptron": MLPClassifier( hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, solver="sgd", learning_rate_init=0.2, momentum=0.9, verbose=1, tol=1e-4, random_state=1, ), "MLP-adam": MLPClassifier( hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, solver="adam", learning_rate_init=0.001, verbose=1, tol=1e-4, random_state=1, ), } if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--classifiers", nargs="+", choices=ESTIMATORS, type=str, default=["ExtraTrees", "Nystroem-SVM"], help="list of classifiers to benchmark.", ) parser.add_argument( "--n-jobs", nargs="?", default=1, type=int, help=( "Number of concurrently running workers for " "models that support parallelism." ), ) parser.add_argument( "--order", nargs="?", default="C", type=str, choices=["F", "C"], help="Allow to choose between fortran and C ordered data", ) parser.add_argument( "--random-seed", nargs="?", default=0, type=int, help="Common seed used by random number generator.", ) args = vars(parser.parse_args()) print(__doc__) X_train, X_test, y_train, y_test = load_data(order=args["order"]) print("") print("Dataset statistics:") print("===================") print("%s %d" % ("number of features:".ljust(25), X_train.shape[1])) print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size)) print("%s %s" % ("data type:".ljust(25), X_train.dtype)) print( "%s %d (size=%dMB)" % ( "number of train samples:".ljust(25), X_train.shape[0], int(X_train.nbytes / 1e6), ) ) print( "%s %d (size=%dMB)" % ( "number of test samples:".ljust(25), X_test.shape[0], int(X_test.nbytes / 1e6), ) ) print() print("Training Classifiers") print("====================") error, train_time, test_time = {}, {}, {} for name in sorted(args["classifiers"]): print("Training %s ... " % name, end="") estimator = ESTIMATORS[name] estimator_params = estimator.get_params() estimator.set_params( **{ p: args["random_seed"] for p in estimator_params if p.endswith("random_state") } ) if "n_jobs" in estimator_params: estimator.set_params(n_jobs=args["n_jobs"]) time_start = time() estimator.fit(X_train, y_train) train_time[name] = time() - time_start time_start = time() y_pred = estimator.predict(X_test) test_time[name] = time() - time_start error[name] = zero_one_loss(y_test, y_pred) print("done") print() print("Classification performance:") print("===========================") print( "{0: <24} {1: >10} {2: >11} {3: >12}".format( "Classifier ", "train-time", "test-time", "error-rate" ) ) print("-" * 60) for name in sorted(args["classifiers"], key=error.get): print( "{0: <23} {1: >10.2f}s {2: >10.2f}s {3: >12.4f}".format( name, train_time[name], test_time[name], error[name] ) ) print()