API deprecate CalibratedClassifierCV(..., cv=prefit) for FrozenEstimator (#30171)

Co-authored-by: Adam Li <[email protected]>
adrinjalali and adam2392 authored Oct 30, 2024
1 parent ba2dd5d commit b4eef25
Showing 5 changed files with 74 additions and 40 deletions.
9 changes: 5 additions & 4 deletions doc/modules/calibration.rst
@@ -193,10 +193,11 @@ The main advantage of using `ensemble=False` is computational: it reduces the
overall fit time by training only a single base classifier and calibrator
pair, decreases the final model size and increases prediction speed.

-Alternatively an already fitted classifier can be calibrated by setting
-`cv="prefit"`. In this case, the data is not split and all of it is used to
-fit the regressor. It is up to the user to
-make sure that the data used for fitting the classifier is disjoint from the
-data used for fitting the regressor.
+Alternatively an already fitted classifier can be calibrated by using a
+:class:`~sklearn.frozen.FrozenEstimator` as
+``CalibratedClassifierCV(estimator=FrozenEstimator(estimator))``.
+It is up to the user to make sure that the data used for fitting the classifier
+is disjoint from the data used for fitting the regressor.

:class:`CalibratedClassifierCV` supports the use of two regression techniques
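As a minimal sketch of the pattern this hunk documents, assuming scikit-learn >= 1.6 (where `sklearn.frozen.FrozenEstimator` exists); the dataset and the GaussianNB classifier are illustrative choices, not part of the patch:

from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.frozen import FrozenEstimator
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(n_samples=1000, random_state=0)
# Keep the calibration data disjoint from the training data, as the doc requires.
X_train, X_calib, y_train, y_calib = train_test_split(X, y, random_state=0)

clf = GaussianNB().fit(X_train, y_train)

# Wrapping in FrozenEstimator replaces the deprecated cv="prefit": the
# classifier is not refitted, and (X_calib, y_calib) only fits the calibrator.
cal_clf = CalibratedClassifierCV(FrozenEstimator(clf)).fit(X_calib, y_calib)
proba = cal_clf.predict_proba(X_calib)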
@@ -0,0 +1,4 @@
+- `cv="prefit"` is deprecated for :class:`~sklearn.calibration.CalibratedClassifierCV`.
+  Use :class:`~sklearn.frozen.FrozenEstimator` instead, as
+  `CalibratedClassifierCV(FrozenEstimator(estimator))`.
+  By `Adrin Jalali`_.
3 changes: 2 additions & 1 deletion examples/calibration/plot_calibration_multiclass.py
@@ -64,10 +64,11 @@ class of an instance (red: class 1, green: class 2, blue: class 3).
# using the valid data subset (400 samples) in a 2-stage process.

from sklearn.calibration import CalibratedClassifierCV
+from sklearn.frozen import FrozenEstimator

clf = RandomForestClassifier(n_estimators=25)
clf.fit(X_train, y_train)
-cal_clf = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit")
+cal_clf = CalibratedClassifierCV(FrozenEstimator(clf), method="sigmoid")
cal_clf.fit(X_valid, y_valid)

# %%
60 changes: 37 additions & 23 deletions sklearn/calibration.py
@@ -23,6 +23,7 @@
    _fit_context,
    clone,
)
+from .frozen import FrozenEstimator
from .isotonic import IsotonicRegression
from .model_selection import LeaveOneOut, check_cv, cross_val_predict
from .preprocessing import LabelEncoder, label_binarize
@@ -34,6 +35,7 @@
)
from .utils._param_validation import (
    HasMethods,
+    Hidden,
    Interval,
    StrOptions,
    validate_params,
@@ -75,8 +77,8 @@ class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator)
    `probabilities=True` for :class:`~sklearn.svm.SVC` and :class:`~sklearn.svm.NuSVC`
    estimators (see :ref:`User Guide <scores_probabilities>` for details).

-    Already fitted classifiers can be calibrated via the parameter
-    `cv="prefit"`. In this case, no cross-validation is used and all provided
+    Already fitted classifiers can be calibrated by wrapping the model in a
+    :class:`~sklearn.frozen.FrozenEstimator`. In this case all provided
    data is used for calibration. The user has to take care manually that data
    for model fitting and calibration are disjoint.
@@ -106,8 +108,7 @@ class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator)
        use isotonic calibration with too few calibration samples
        ``(<<1000)`` since it tends to overfit.

-    cv : int, cross-validation generator, iterable or "prefit", \
-            default=None
+    cv : int, cross-validation generator, or iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
@@ -124,12 +125,13 @@ class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator)
        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

-        If "prefit" is passed, it is assumed that `estimator` has been
-        fitted already and all data is used for calibration.

        .. versionchanged:: 0.22
            ``cv`` default value if None changed from 3-fold to 5-fold.

+        .. versionchanged:: 1.6
+            `"prefit"` is deprecated. Use :class:`~sklearn.frozen.FrozenEstimator`
+            instead.

    n_jobs : int, default=None
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
@@ -142,9 +144,11 @@ class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator)
        .. versionadded:: 0.24

-    ensemble : bool, default=True
-        Determines how the calibrator is fitted when `cv` is not `'prefit'`.
-        Ignored if `cv='prefit'`.
+    ensemble : bool, or "auto", default="auto"
+        Determines how the calibrator is fitted.
+
+        "auto" will use `False` if the `estimator` is a
+        :class:`~sklearn.frozen.FrozenEstimator`, and `True` otherwise.

        If `True`, the `estimator` is fitted using training data, and
        calibrated using testing data, for each `cv` fold. The final estimator
@@ -161,6 +165,9 @@ class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator)
        .. versionadded:: 0.24

+        .. versionchanged:: 1.6
+            `"auto"` option is added and is the default.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
@@ -178,17 +185,13 @@ class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator)
        .. versionadded:: 1.0

-    calibrated_classifiers_ : list (len() equal to cv or 1 if `cv="prefit"` \
-            or `ensemble=False`)
+    calibrated_classifiers_ : list (len() equal to cv or 1 if `ensemble=False`)
        The list of classifier and calibrator pairs.

-        - When `cv="prefit"`, the fitted `estimator` and fitted
-          calibrator.
-        - When `cv` is not "prefit" and `ensemble=True`, `n_cv` fitted
-          `estimator` and calibrator pairs. `n_cv` is the number of
-          cross-validation folds.
-        - When `cv` is not "prefit" and `ensemble=False`, the `estimator`,
-          fitted on all the data, and fitted calibrator.
+        - When `ensemble=True`, `n_cv` fitted `estimator` and calibrator pairs.
+          `n_cv` is the number of cross-validation folds.
+        - When `ensemble=False`, the `estimator`, fitted on all the data, and fitted
+          calibrator.

        .. versionchanged:: 0.24
            Single calibrated classifier case when `ensemble=False`.
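A short sketch of how the new `ensemble="auto"` default interacts with `calibrated_classifiers_`, per the docstring above. Assumptions: scikit-learn >= 1.6, the default 5-fold `cv`, and a toy dataset; reusing the training data for calibration is only to keep the sketch short, since the docs require disjoint data:

from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.frozen import FrozenEstimator
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(n_samples=200, random_state=0)

# Not frozen: "auto" resolves to ensemble=True, one pair per cv fold.
ensembled = CalibratedClassifierCV(GaussianNB()).fit(X, y)
assert len(ensembled.calibrated_classifiers_) == 5

# Frozen: "auto" resolves to ensemble=False, a single pair.
single = CalibratedClassifierCV(FrozenEstimator(GaussianNB().fit(X, y))).fit(X, y)
assert len(single.calibrated_classifiers_) == 1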
@@ -240,7 +243,8 @@ class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator)
    >>> base_clf = GaussianNB()
    >>> base_clf.fit(X_train, y_train)
    GaussianNB()
-    >>> calibrated_clf = CalibratedClassifierCV(base_clf, cv="prefit")
+    >>> from sklearn.frozen import FrozenEstimator
+    >>> calibrated_clf = CalibratedClassifierCV(FrozenEstimator(base_clf))
    >>> calibrated_clf.fit(X_calib, y_calib)
    CalibratedClassifierCV(...)
    >>> len(calibrated_clf.calibrated_classifiers_)
@@ -256,9 +260,9 @@ class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator)
            None,
        ],
        "method": [StrOptions({"isotonic", "sigmoid"})],
-        "cv": ["cv_object", StrOptions({"prefit"})],
+        "cv": ["cv_object", Hidden(StrOptions({"prefit"}))],
        "n_jobs": [Integral, None],
-        "ensemble": ["boolean"],
+        "ensemble": ["boolean", StrOptions({"auto"})],
    }

def __init__(
@@ -268,7 +272,7 @@ def __init__(
method="sigmoid",
cv=None,
n_jobs=None,
ensemble=True,
ensemble="auto",
):
self.estimator = estimator
self.method = method
@@ -323,8 +327,18 @@ def fit(self, X, y, sample_weight=None, **fit_params):

        estimator = self._get_estimator()

+        _ensemble = self.ensemble
+        if _ensemble == "auto":
+            _ensemble = not isinstance(estimator, FrozenEstimator)

        self.calibrated_classifiers_ = []
        if self.cv == "prefit":
+            # TODO(1.8): Remove this code branch and cv='prefit'
+            warnings.warn(
+                "The `cv='prefit'` option is deprecated in 1.6 and will be removed in"
+                " 1.8. You can use CalibratedClassifierCV(FrozenEstimator(estimator))"
+                " instead."
+            )
            # `classes_` should be consistent with that of estimator
            check_is_fitted(self.estimator, attributes=["classes_"])
            self.classes_ = self.estimator.classes_
@@ -404,7 +418,7 @@ def fit(self, X, y, sample_weight=None, **fit_params):
        )
        cv = check_cv(self.cv, y, classifier=True)

-        if self.ensemble:
+        if _ensemble:
            parallel = Parallel(n_jobs=self.n_jobs)
            self.calibrated_classifiers_ = parallel(
                delayed(_fit_classifier_calibrator_pair)(
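The deprecated path still works in 1.6 but warns. A sketch of what a caller sees; the warning category is not visible in this hunk (the test change below suggests it is a FutureWarning), so the check here matches on the message only:

import warnings

from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(n_samples=100, random_state=0)
clf = GaussianNB().fit(X, y)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Deprecated in 1.6, scheduled for removal in 1.8.
    CalibratedClassifierCV(clf, cv="prefit").fit(X, y)

assert any("cv='prefit'" in str(w.message) for w in caught)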
38 changes: 26 additions & 12 deletions sklearn/tests/test_calibration.py
@@ -22,6 +22,7 @@
)
from sklearn.exceptions import NotFittedError
from sklearn.feature_extraction import DictVectorizer
+from sklearn.frozen import FrozenEstimator
from sklearn.impute import SimpleImputer
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression, SGDClassifier
@@ -45,6 +46,7 @@
    assert_almost_equal,
    assert_array_almost_equal,
    assert_array_equal,
+    ignore_warnings,
)
from sklearn.utils.extmath import softmax
from sklearn.utils.fixes import CSR_CONTAINERS
@@ -299,9 +301,11 @@ def predict(self, X):
    assert_allclose(probas, 1.0 / clf.n_classes_)


+@ignore_warnings(category=FutureWarning)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_calibration_prefit(csr_container):
    """Test calibration for prefitted classifiers"""
+    # TODO(1.8): Remove cv="prefit" options here and the @ignore_warnings of the test
    n_samples = 50
    X, y = make_classification(n_samples=3 * n_samples, n_features=6, random_state=42)
    sample_weight = np.random.RandomState(seed=42).uniform(size=y.size)
@@ -333,17 +337,25 @@ def test_calibration_prefit(csr_container):
        (csr_container(X_calib), csr_container(X_test)),
    ]:
        for method in ["isotonic", "sigmoid"]:
-            cal_clf = CalibratedClassifierCV(clf, method=method, cv="prefit")
+            cal_clf_prefit = CalibratedClassifierCV(clf, method=method, cv="prefit")
+            cal_clf_frozen = CalibratedClassifierCV(FrozenEstimator(clf), method=method)

            for sw in [sw_calib, None]:
-                cal_clf.fit(this_X_calib, y_calib, sample_weight=sw)
-                y_prob = cal_clf.predict_proba(this_X_test)
-                y_pred = cal_clf.predict(this_X_test)
-                prob_pos_cal_clf = y_prob[:, 1]
-                assert_array_equal(y_pred, np.array([0, 1])[np.argmax(y_prob, axis=1)])
+                cal_clf_prefit.fit(this_X_calib, y_calib, sample_weight=sw)
+                cal_clf_frozen.fit(this_X_calib, y_calib, sample_weight=sw)
+
+                y_prob_prefit = cal_clf_prefit.predict_proba(this_X_test)
+                y_prob_frozen = cal_clf_frozen.predict_proba(this_X_test)
+                y_pred_prefit = cal_clf_prefit.predict(this_X_test)
+                y_pred_frozen = cal_clf_frozen.predict(this_X_test)
+                prob_pos_cal_clf_prefit = y_prob_prefit[:, 1]
+                prob_pos_cal_clf_frozen = y_prob_frozen[:, 1]
+                assert_array_equal(y_pred_prefit, y_pred_frozen)
+                assert_array_equal(
+                    y_pred_prefit, np.array([0, 1])[np.argmax(y_prob_prefit, axis=1)]
+                )
                assert brier_score_loss(y_test, prob_pos_clf) > brier_score_loss(
-                    y_test, prob_pos_cal_clf
+                    y_test, prob_pos_cal_clf_frozen
                )


@@ -515,8 +527,10 @@ def dict_data():
{"state": "NY", "age": "adult"},
{"state": "TX", "age": "adult"},
{"state": "VT", "age": "child"},
{"state": "CT", "age": "adult"},
{"state": "BR", "age": "child"},
]
text_labels = [1, 0, 1]
text_labels = [1, 0, 1, 1, 0]
return dict_data, text_labels


@@ -540,7 +554,7 @@ def test_calibration_dict_pipeline(dict_data, dict_data_pipeline):
"""
X, y = dict_data
clf = dict_data_pipeline
calib_clf = CalibratedClassifierCV(clf, cv="prefit")
calib_clf = CalibratedClassifierCV(FrozenEstimator(clf), cv=2)
calib_clf.fit(X, y)
# Check attributes are obtained from fitted estimator
assert_array_equal(calib_clf.classes_, clf.classes_)
@@ -584,7 +598,7 @@ def test_calibration_inconsistent_prefit_n_features_in():
    # is consistent with training set
    X, y = make_classification(n_samples=10, n_features=5, n_classes=2, random_state=7)
    clf = LinearSVC(C=1).fit(X, y)
-    calib_clf = CalibratedClassifierCV(clf, cv="prefit")
+    calib_clf = CalibratedClassifierCV(FrozenEstimator(clf))

    msg = "X has 3 features, but LinearSVC is expecting 5 features as input."
    with pytest.raises(ValueError, match=msg):
@@ -602,7 +616,7 @@ def test_calibration_votingclassifier():
    )
    vote.fit(X, y)

-    calib_clf = CalibratedClassifierCV(estimator=vote, cv="prefit")
+    calib_clf = CalibratedClassifierCV(estimator=FrozenEstimator(vote))
    # smoke test: should not raise an error
    calib_clf.fit(X, y)
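For context on why these tests can pass the same data to the frozen wrapper: `FrozenEstimator.fit` is a no-op, so `CalibratedClassifierCV` cannot refit the wrapped model. A small sketch, where LogisticRegression and the dataset are illustrative:

import numpy as np

from sklearn.datasets import make_classification
from sklearn.frozen import FrozenEstimator
from sklearn.linear_model import LogisticRegression

X, y = make_classification(random_state=0)
clf = LogisticRegression().fit(X, y)
coef_before = clf.coef_.copy()

frozen = FrozenEstimator(clf)
frozen.fit(X[:10], y[:10])  # no-op: the wrapped model is untouched
assert np.allclose(clf.coef_, coef_before)
assert np.array_equal(frozen.predict(X), clf.predict(X))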
