REVERT ENH add the parameter prefit in the FixedThresholdClassifier (#…
adrinjalali authored Nov 5, 2024
1 parent 004cf9e commit 70aab36
Showing 7 changed files with 165 additions and 68 deletions.
4 changes: 3 additions & 1 deletion doc/modules/classification_threshold.rst
@@ -144,7 +144,9 @@ Manually setting the decision threshold
The previous sections discussed strategies to find an optimal decision threshold. It is
also possible to manually set the decision threshold using the class
:class:`~sklearn.model_selection.FixedThresholdClassifier`. In case that you don't want
to refit the model when calling `fit`, you can set the parameter `prefit=True`.
to refit the model when calling `fit`, wrap your sub-estimator with a
:class:`~sklearn.frozen.FrozenEstimator` and do
``FixedThresholdClassifier(FrozenEstimator(estimator), ...)``.
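
For reference, here is a minimal sketch of the pattern described above (the variable
names and the threshold value are illustrative only, and assume a scikit-learn version
that provides :class:`~sklearn.frozen.FrozenEstimator`):

from sklearn.datasets import make_classification
from sklearn.frozen import FrozenEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import FixedThresholdClassifier

X, y = make_classification(random_state=0)
clf = LogisticRegression().fit(X, y)  # already fitted

# FrozenEstimator turns `fit` into a no-op on the wrapped classifier, so calling
# `fit` on the meta-estimator does not refit `clf`.
fixed = FixedThresholdClassifier(FrozenEstimator(clf), threshold=0.8).fit(X, y)
print(fixed.predict(X[:5]))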

Examples
--------

This file was deleted.

@@ -0,0 +1,4 @@
- There is no need to call `fit` on a
  :class:`~sklearn.model_selection.FixedThresholdClassifier` if the underlying
  estimator is already fitted.
  By :user:`Adrin Jalali <adrinjalali>`
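
As a rough illustration of this entry (hypothetical variable names, mirroring the test
added further down in this commit), the meta-estimator can be used for prediction
without any call to `fit` when the wrapped classifier is already fitted:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import FixedThresholdClassifier

X, y = make_classification(random_state=0)
fitted_clf = LogisticRegression().fit(X, y)

# No `fit` call on the meta-estimator: prediction delegates to `fitted_clf` directly.
print(FixedThresholdClassifier(estimator=fitted_clf).predict(X[:5]))
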
98 changes: 98 additions & 0 deletions examples/frozen/plot_frozen_examples.py
@@ -0,0 +1,98 @@
"""
===================================
Examples of Using `FrozenEstimator`
===================================

This example showcases some use cases of :class:`~sklearn.frozen.FrozenEstimator`.

:class:`~sklearn.frozen.FrozenEstimator` is a utility class that allows freezing a
fitted estimator. This is useful, for instance, when we want to pass a fitted estimator
to a meta-estimator, such as :class:`~sklearn.model_selection.FixedThresholdClassifier`,
without letting the meta-estimator refit the estimator.
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

# %%
# Setting a decision threshold for a pre-fitted classifier
# --------------------------------------------------------
# Fitted classifiers in scikit-learn use an arbitrary decision threshold to decide
# which class the given sample belongs to. The decision threshold is either `0.0` on the
# value returned by :term:`decision_function`, or `0.5` on the probability returned by
# :term:`predict_proba`.
#
# However, one might want to set a custom decision threshold. We can do this by
# using :class:`~sklearn.model_selection.FixedThresholdClassifier` and wrapping the
# classifier with :class:`~sklearn.frozen.FrozenEstimator`.
from sklearn.datasets import make_classification
from sklearn.frozen import FrozenEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import FixedThresholdClassifier, train_test_split

X, y = make_classification(n_samples=1000, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
classifier = LogisticRegression().fit(X_train, y_train)

print(
    "Probability estimates for three data points:\n"
    f"{classifier.predict_proba(X_test[-3:]).round(3)}"
)
print(
    "Predicted class for the same three data points:\n"
    f"{classifier.predict(X_test[-3:])}"
)
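
# %%
# As a quick sanity check (illustrative, not part of the original example), the default
# `predict` behaviour of this classifier corresponds to thresholding the positive-class
# probability at 0.5.
import numpy as np

manual_predictions = (classifier.predict_proba(X_test)[:, 1] >= 0.5).astype(int)
print(
    "Manual 0.5 threshold matches `predict`: "
    f"{np.array_equal(manual_predictions, classifier.predict(X_test))}"
)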

# %%
# Now imagine you'd want to set a different decision threshold on the probability
# estimates. We can do this by wrapping the classifier with
# :class:`~sklearn.frozen.FrozenEstimator` and passing it to
# :class:`~sklearn.model_selection.FixedThresholdClassifier`.

threshold_classifier = FixedThresholdClassifier(
    estimator=FrozenEstimator(classifier), threshold=0.9
)

# %%
# Note that in the above piece of code we do not need to call `fit` on the
# :class:`~sklearn.model_selection.FixedThresholdClassifier`: the underlying classifier
# is already fitted, and since it is wrapped in :class:`~sklearn.frozen.FrozenEstimator`
# it would not be refitted even if `fit` were called.
#
# Now, let's see how the predictions changed with respect to the probability
# threshold.
print(
    "Probability estimates for three data points with FixedThresholdClassifier:\n"
    f"{threshold_classifier.predict_proba(X_test[-3:]).round(3)}"
)
print(
    "Predicted class for the same three data points with FixedThresholdClassifier:\n"
    f"{threshold_classifier.predict(X_test[-3:])}"
)

# %%
# We see that the probability estimates stay the same, but since a different decision
# threshold is used, the predicted classes are different.
#
# Please refer to
# :ref:`sphx_glr_auto_examples_model_selection_plot_cost_sensitive_learning.py`
# to learn about cost-sensitive learning and decision threshold tuning.
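
# %%
# To make the effect of the 0.9 threshold concrete (an illustrative check, not part of
# the original example), we can count how many test samples reach a positive-class
# probability of at least 0.9 and compare with the number of samples predicted as the
# positive class.
import numpy as np

proba_positive = threshold_classifier.predict_proba(X_test)[:, 1]
print("Samples with P(class=1) >= 0.9:", int(np.sum(proba_positive >= 0.9)))
print(
    "Samples predicted as class 1 by FixedThresholdClassifier:",
    int(np.sum(threshold_classifier.predict(X_test) == 1)),
)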

# %%
# Calibration of a pre-fitted classifier
# --------------------------------------
# You can use :class:`~sklearn.frozen.FrozenEstimator` to calibrate a pre-fitted
# classifier using :class:`~sklearn.calibration.CalibratedClassifierCV`.
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss

calibrated_classifier = CalibratedClassifierCV(
    estimator=FrozenEstimator(classifier)
).fit(X_train, y_train)

prob_pos_clf = classifier.predict_proba(X_test)[:, 1]
clf_score = brier_score_loss(y_test, prob_pos_clf)
print(f"No calibration: {clf_score:.3f}")

prob_pos_calibrated = calibrated_classifier.predict_proba(X_test)[:, 1]
calibrated_score = brier_score_loss(y_test, prob_pos_calibrated)
print(f"With calibration: {calibrated_score:.3f}")
9 changes: 6 additions & 3 deletions examples/model_selection/plot_cost_sensitive_learning.py
@@ -660,15 +660,18 @@ def business_metric(y_true, y_pred, amount):
#
# The class :class:`~sklearn.model_selection.FixedThresholdClassifier` allows us to
# manually set the decision threshold. At prediction time, it behaves as the previous
# tuned model but no search is performed during the fitting process.
# tuned model but no search is performed during the fitting process. Note that here
# we use :class:`~sklearn.frozen.FrozenEstimator` to wrap the predictive model to
# avoid any refitting.
#
# Here, we will reuse the decision threshold found in the previous section to create a
# new model and check that it gives the same results.
from sklearn.frozen import FrozenEstimator
from sklearn.model_selection import FixedThresholdClassifier

model_fixed_threshold = FixedThresholdClassifier(
    estimator=model, threshold=tuned_model.best_threshold_, prefit=True
).fit(data_train, target_train)
    estimator=FrozenEstimator(model), threshold=tuned_model.best_threshold_
)

# %%
business_score = business_scorer(
59 changes: 34 additions & 25 deletions sklearn/model_selection/_classification_threshold.py
@@ -43,6 +43,13 @@
from ._split import StratifiedShuffleSplit, check_cv


def _check_is_fitted(estimator):
    """Check that either the wrapped estimator or the meta-estimator is fitted."""
    try:
        check_is_fitted(estimator.estimator)
    except NotFittedError:
        check_is_fitted(estimator, "estimator_")


def _estimator_has(attr):
"""Check if we can delegate a method to the underlying estimator.
@@ -170,8 +177,9 @@ def predict_proba(self, X):
        probabilities : ndarray of shape (n_samples, n_classes)
            The class probabilities of the input samples.
        """
        check_is_fitted(self, "estimator_")
        return self.estimator_.predict_proba(X)
        _check_is_fitted(self)
        estimator = getattr(self, "estimator_", self.estimator)
        return estimator.predict_proba(X)

    @available_if(_estimator_has("predict_log_proba"))
    def predict_log_proba(self, X):
@@ -188,8 +196,9 @@ def predict_log_proba(self, X):
        log_probabilities : ndarray of shape (n_samples, n_classes)
            The logarithm class probabilities of the input samples.
        """
        check_is_fitted(self, "estimator_")
        return self.estimator_.predict_log_proba(X)
        _check_is_fitted(self)
        estimator = getattr(self, "estimator_", self.estimator)
        return estimator.predict_log_proba(X)

    @available_if(_estimator_has("decision_function"))
    def decision_function(self, X):
@@ -206,8 +215,9 @@ def decision_function(self, X):
        decisions : ndarray of shape (n_samples,)
            The decision function computed by the fitted estimator.
        """
        check_is_fitted(self, "estimator_")
        return self.estimator_.decision_function(X)
        _check_is_fitted(self)
        estimator = getattr(self, "estimator_", self.estimator)
        return estimator.decision_function(X)

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
@@ -264,13 +274,6 @@ class FixedThresholdClassifier(BaseThresholdClassifier):
        If the method is not implemented by the classifier, it will raise an
        error.

    prefit : bool, default=False
        Whether a pre-fitted model is expected to be passed into the constructor
        directly or not. If `True`, `estimator` must be a fitted estimator. If `False`,
        `estimator` is fitted and updated by calling `fit`.

        .. versionadded:: 1.6

    Attributes
    ----------
    estimator_ : estimator instance
@@ -322,7 +325,6 @@ class FixedThresholdClassifier(BaseThresholdClassifier):
        **BaseThresholdClassifier._parameter_constraints,
        "threshold": [StrOptions({"auto"}), Real],
        "pos_label": [Real, str, "boolean", None],
        "prefit": ["boolean"],
    }

    def __init__(
@@ -332,12 +334,22 @@ def __init__(
        threshold="auto",
        pos_label=None,
        response_method="auto",
        prefit=False,
    ):
        super().__init__(estimator=estimator, response_method=response_method)
        self.pos_label = pos_label
        self.threshold = threshold
        self.prefit = prefit

    @property
    def classes_(self):
        if estimator := getattr(self, "estimator_", None):
            return estimator.classes_
        try:
            check_is_fitted(self.estimator)
            return self.estimator.classes_
        except NotFittedError:
            raise AttributeError(
                "The underlying estimator is not fitted yet."
            ) from NotFittedError

    def _fit(self, X, y, **params):
        """Fit the classifier.
@@ -360,13 +372,7 @@ def _fit(self, X, y, **params):
            Returns an instance of self.
        """
        routed_params = process_routing(self, "fit", **params)
        if self.prefit:
            check_is_fitted(self.estimator)
            self.estimator_ = self.estimator
        else:
            self.estimator_ = clone(self.estimator).fit(
                X, y, **routed_params.estimator.fit
            )
        self.estimator_ = clone(self.estimator).fit(X, y, **routed_params.estimator.fit)
        return self

    def predict(self, X):
@@ -382,9 +388,12 @@ def predict(self, X):
        class_labels : ndarray of shape (n_samples,)
            The predicted class.
        """
        check_is_fitted(self, "estimator_")
        _check_is_fitted(self)

        estimator = getattr(self, "estimator_", self.estimator)

        y_score, _, response_method_used = _get_response_values_binary(
            self.estimator_,
            estimator,
            X,
            self._get_response_method(),
            pos_label=self.pos_label,
55 changes: 20 additions & 35 deletions sklearn/model_selection/tests/test_classification_threshold.py
@@ -2,7 +2,7 @@
import pytest

from sklearn import config_context
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.base import clone
from sklearn.datasets import (
    load_breast_cancer,
    load_iris,
@@ -593,41 +593,26 @@ def test_fixed_threshold_classifier_metadata_routing():
    assert_allclose(classifier_default_threshold.estimator_.coef_, classifier.coef_)


class ClassifierLoggingFit(ClassifierMixin, BaseEstimator):
    """Classifier that logs the number of `fit` calls."""

    def __init__(self, fit_calls=0):
        self.fit_calls = fit_calls

    def fit(self, X, y, **fit_params):
        self.fit_calls += 1
        self.is_fitted_ = True
        return self

    def predict_proba(self, X):
        return np.ones((X.shape[0], 2), np.float64)  # pragma: nocover


def test_fixed_threshold_classifier_prefit():
    """Check the behaviour of the `FixedThresholdClassifier` with the `prefit`
    parameter."""
@pytest.mark.parametrize(
    "method", ["predict_proba", "decision_function", "predict", "predict_log_proba"]
)
def test_fixed_threshold_classifier_fitted_estimator(method):
    """Check that if the underlying estimator is already fitted, no fit is required."""
    X, y = make_classification(random_state=0)
    classifier = LogisticRegression().fit(X, y)
    fixed_threshold_classifier = FixedThresholdClassifier(estimator=classifier)
    # This should not raise an error
    getattr(fixed_threshold_classifier, method)(X)

    estimator = ClassifierLoggingFit()
    model = FixedThresholdClassifier(estimator=estimator, prefit=True)
    with pytest.raises(NotFittedError):
        model.fit(X, y)

    # check that we don't clone the classifier when `prefit=True`.
    estimator.fit(X, y)
    model.fit(X, y)
    assert estimator.fit_calls == 1
    assert model.estimator_ is estimator
def test_fixed_threshold_classifier_classes_():
    """Check that the classes_ attribute is properly set."""
    X, y = make_classification(random_state=0)
    with pytest.raises(
        AttributeError, match="The underlying estimator is not fitted yet."
    ):
        FixedThresholdClassifier(estimator=LogisticRegression()).classes_

    # check that we clone the classifier when `prefit=False`.
    estimator = ClassifierLoggingFit()
    model = FixedThresholdClassifier(estimator=estimator, prefit=False)
    model.fit(X, y)
    assert estimator.fit_calls == 0
    assert model.estimator_.fit_calls == 1
    assert model.estimator_ is not estimator
    classifier = LogisticRegression().fit(X, y)
    fixed_threshold_classifier = FixedThresholdClassifier(estimator=classifier)
    assert_array_equal(fixed_threshold_classifier.classes_, classifier.classes_)
