
ENH: improve FA vs PCA example
dengemann committed Oct 25, 2013
1 parent 0e218c6 commit 4f3722c
Showing 1 changed file with 33 additions and 6 deletions.
39 changes: 33 additions & 6 deletions examples/decomposition/plot_pca_vs_fa_model_selection.py
100644 → 100755
@@ -8,30 +8,38 @@
Probabilistic PCA and Factor Analysis are probabilistic models.
The consequence is that the likelihood of new data can be used
for model selection. Here we compare PCA and FA with cross-validation
on low rank data corrupted with homoscedastic noise (noise variance
for model selection and covariance estimation.
Here we compare PCA and FA with cross-validation on low rank data corrupted
with homoscedastic noise (noise variance
is the same for each feature) or heteroscedastic noise (noise variance
is the different for each feature).
is different for each feature). In a second step we compare the model
likelihood to the likelihoods obtained from shrinkage covariance estimators.
One can observe that with homoscedastic noise both FA and PCA succeed
in recovering the size of the low rank subspace. The likelihood with PCA
is higher than FA in this case. However PCA fails and overestimates
the rank when heteroscedastic noise is present. The automatic estimation from
the rank when heteroscedastic noise is present. Under appropriate
circumstances, the low-rank models have a higher likelihood than the shrinkage models.
The automatic estimation from
Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604
by Thomas P. Minka is also compared.
"""
print(__doc__)

# Authors: Alexandre Gramfort
# Denis A. Engemann
# License: BSD 3 clause

import numpy as np
import pylab as pl
from scipy import linalg

from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.covariance import ShrunkCovariance, LedoitWolf
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV

###############################################################################
# Create the data
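The data-creation code itself is collapsed in this view. A plausible sketch of low-rank data corrupted by the two noise types described in the docstring is shown below; every variable name, size, and noise level here is an assumption for illustration, not the commit's actual values.

# Illustrative sketch only -- the commit's real data-creation block is
# collapsed above; all sizes and noise levels below are assumed.
import numpy as np

rng = np.random.RandomState(42)
n_samples, n_features, rank = 500, 25, 5
W = rng.randn(n_features, rank)
X_signal = np.dot(rng.randn(n_samples, rank), W.T)   # exactly rank-5 signal

sigma = 1.0
X_homo = X_signal + sigma * rng.randn(n_samples, n_features)     # same noise variance for every feature
sigmas = sigma * rng.rand(n_features) + sigma / 2.0               # feature-specific noise levels
X_hetero = X_signal + rng.randn(n_samples, n_features) * sigmas   # heteroscedastic noise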
@@ -67,7 +75,19 @@ def compute_scores(X):
        fa_scores.append(np.mean(cross_val_score(fa, X)))

    return pca_scores, fa_scores
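A hedged usage sketch of compute_scores as it is consumed further down: the selected rank is simply the argmax of each cross-validation curve. X_homo and the n_components grid are assumed to come from the collapsed code above.

# Usage sketch; X_homo and n_components are defined in the collapsed block.
pca_scores, fa_scores = compute_scores(X_homo)
print("rank selected by PCA CV: %d" % n_components[np.argmax(pca_scores)])
print("rank selected by FA CV: %d" % n_components[np.argmax(fa_scores)])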



def shrunk_cov_score(X):
    # Tune the shrinkage coefficient of a ShrunkCovariance estimator by
    # grid search, then return its cross-validated mean log-likelihood.
    shrinkages = np.logspace(-100, 0, 30)
    tuned_parameters = [{'shrinkage': shrinkages}]
    cv = GridSearchCV(ShrunkCovariance(), tuned_parameters)
    return np.mean(cross_val_score(cv.fit(X).best_estimator_, X, cv=3))


def lw_score(X):
    # Cross-validated mean log-likelihood of a Ledoit-Wolf covariance fit.
    return np.mean(cross_val_score(LedoitWolf(), X, cv=3))
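Both helpers return a single cross-validated log-likelihood that serves as a flat baseline against the PCA/FA curves. A short sketch of how one might inspect the shrinkage each baseline actually uses follows; the X_homo name and the narrower shrinkage grid are assumptions for illustration, and the imports at the top of the file are assumed.

# Inspection sketch (illustrative): which shrinkage does each baseline pick?
cv = GridSearchCV(ShrunkCovariance(),
                  {'shrinkage': np.logspace(-2, 0, 30)}).fit(X_homo)
print("shrinkage picked by cross-validation: %.3f" % cv.best_estimator_.shrinkage)
print("closed-form Ledoit-Wolf shrinkage: %.3f" % LedoitWolf().fit(X_homo).shrinkage_)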


for X, title in [(X_homo, 'Homoscedastic Noise'),
                 (X_hetero, 'Heteroscedastic Noise')]:
    pca_scores, fa_scores = compute_scores(X)
@@ -77,7 +97,7 @@ def compute_scores(X):
    pca = PCA(n_components='mle')
    pca.fit(X)
    n_components_pca_mle = pca.n_components_

    print("best n_components by PCA CV = %d" % n_components_pca)
    print("best n_components by FactorAnalysis CV = %d" % n_components_fa)
    print("best n_components by PCA MLE = %d" % n_components_pca_mle)
@@ -92,6 +112,13 @@ def compute_scores(X):
               label='FactorAnalysis CV: %d' % n_components_fa, linestyle='--')
    pl.axvline(n_components_pca_mle, color='k',
               label='PCA MLE: %d' % n_components_pca_mle, linestyle='--')

    # compare with other covariance estimators (these baselines do not
    # depend on n_components, hence the horizontal lines)
    pl.axhline(shrunk_cov_score(X), color='violet',
               label='Shrunk Covariance MLE', linestyle='--')
    pl.axhline(lw_score(X), color='orange',
               label='LedoitWolf MLE', linestyle='--')

    pl.xlabel('nb of components')
    pl.ylabel('CV scores')
    pl.legend(loc='lower right')
