forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbench_glmnet.py
137 lines (110 loc) · 3.87 KB
/
bench_glmnet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""
To run this, you'll need to have installed.
* glmnet-python
* scikit-learn (of course)
Does two benchmarks
First, we fix a training set and increase the number of
samples. Then we plot the computation time as function of
the number of samples.
In the second benchmark, we increase the number of dimensions of the
training set. Then we plot the computation time as function of
the number of dimensions.
In both cases, only 10% of the features are informative.
"""
import numpy as np
import gc
from time import time
from sklearn.datasets import make_regression
alpha = 0.1
# alpha = 0.01
def rmse(a, b):
return np.sqrt(np.mean((a - b) ** 2))
def bench(factory, X, Y, X_test, Y_test, ref_coef):
gc.collect()
# start time
tstart = time()
clf = factory(alpha=alpha).fit(X, Y)
delta = time() - tstart
# stop time
print("duration: %0.3fs" % delta)
print("rmse: %f" % rmse(Y_test, clf.predict(X_test)))
print("mean coef abs diff: %f" % abs(ref_coef - clf.coef_.ravel()).mean())
return delta
if __name__ == "__main__":
from glmnet.elastic_net import Lasso as GlmnetLasso
from sklearn.linear_model import Lasso as ScikitLasso
# Delayed import of matplotlib.pyplot
import matplotlib.pyplot as plt
scikit_results = []
glmnet_results = []
n = 20
step = 500
n_features = 1000
n_informative = n_features / 10
n_test_samples = 1000
for i in range(1, n + 1):
print("==================")
print("Iteration %s of %s" % (i, n))
print("==================")
X, Y, coef_ = make_regression(
n_samples=(i * step) + n_test_samples,
n_features=n_features,
noise=0.1,
n_informative=n_informative,
coef=True,
)
X_test = X[-n_test_samples:]
Y_test = Y[-n_test_samples:]
X = X[: (i * step)]
Y = Y[: (i * step)]
print("benchmarking scikit-learn: ")
scikit_results.append(bench(ScikitLasso, X, Y, X_test, Y_test, coef_))
print("benchmarking glmnet: ")
glmnet_results.append(bench(GlmnetLasso, X, Y, X_test, Y_test, coef_))
plt.clf()
xx = range(0, n * step, step)
plt.title("Lasso regression on sample dataset (%d features)" % n_features)
plt.plot(xx, scikit_results, "b-", label="scikit-learn")
plt.plot(xx, glmnet_results, "r-", label="glmnet")
plt.legend()
plt.xlabel("number of samples to classify")
plt.ylabel("Time (s)")
plt.show()
# now do a benchmark where the number of points is fixed
# and the variable is the number of features
scikit_results = []
glmnet_results = []
n = 20
step = 100
n_samples = 500
for i in range(1, n + 1):
print("==================")
print("Iteration %02d of %02d" % (i, n))
print("==================")
n_features = i * step
n_informative = n_features / 10
X, Y, coef_ = make_regression(
n_samples=(i * step) + n_test_samples,
n_features=n_features,
noise=0.1,
n_informative=n_informative,
coef=True,
)
X_test = X[-n_test_samples:]
Y_test = Y[-n_test_samples:]
X = X[:n_samples]
Y = Y[:n_samples]
print("benchmarking scikit-learn: ")
scikit_results.append(bench(ScikitLasso, X, Y, X_test, Y_test, coef_))
print("benchmarking glmnet: ")
glmnet_results.append(bench(GlmnetLasso, X, Y, X_test, Y_test, coef_))
xx = np.arange(100, 100 + n * step, step)
plt.figure("scikit-learn vs. glmnet benchmark results")
plt.title("Regression in high dimensional spaces (%d samples)" % n_samples)
plt.plot(xx, scikit_results, "b-", label="scikit-learn")
plt.plot(xx, glmnet_results, "r-", label="glmnet")
plt.legend()
plt.xlabel("number of features")
plt.ylabel("Time (s)")
plt.axis("tight")
plt.show()