FEA add zero_division to matthews_corrcoef (#28509)

Co-authored-by: Marc Torrellas Socastro <[email protected]> Co-authored-by: Guillaume Lemaitre <[email protected]>
scikit-learn · Oct 30, 2024 · ba2dd5d · ba2dd5d
1 parent 49c5948
commit ba2dd5d
Show file tree

Hide file tree

Showing 3 changed files with 53 additions and 12 deletions.
diff --git a/doc/whats_new/upcoming_changes/sklearn.metrics/28509.feature.rst b/doc/whats_new/upcoming_changes/sklearn.metrics/28509.feature.rst
@@ -0,0 +1,3 @@
+- Adds the `zero_division` parameter to :func:`metrics.matthews_corrcoef`.
+  When the metric is undefined (zero division), the value of `zero_division` is returned.
+  By :user:`Marc Torrellas Socastro <marctorsoc>` and :user:`Noam Keidar <redjest>`
diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
@@ -1015,10 +1015,15 @@ def jaccard_score(
         "y_true": ["array-like"],
         "y_pred": ["array-like"],
         "sample_weight": ["array-like", None],
+        "zero_division": [
+            Options(Real, {0.0, 1.0}),
+            "nan",
+            StrOptions({"warn"}),
+        ],
     },
     prefer_skip_nested_validation=True,
 )
-def matthews_corrcoef(y_true, y_pred, *, sample_weight=None):
+def matthews_corrcoef(y_true, y_pred, *, sample_weight=None, zero_division="warn"):
     """Compute the Matthews correlation coefficient (MCC).
 
     The Matthews correlation coefficient is used in machine learning as a
@@ -1049,6 +1054,13 @@ def matthews_corrcoef(y_true, y_pred, *, sample_weight=None):
 
         .. versionadded:: 0.18
 
+    zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn"
+        Sets the value to return when there is a zero division, i.e. when
+        `y_true` or `y_pred` has zero variance (only one class is present). If
+        set to "warn", this acts like 0, but a warning is also raised.
+
+        .. versionadded:: 1.6
+
     Returns
     -------
     mcc : float
@@ -1102,7 +1114,13 @@ def matthews_corrcoef(y_true, y_pred, *, sample_weight=None):
     cov_ytyt = n_samples**2 - np.dot(t_sum, t_sum)
 
     if cov_ypyp * cov_ytyt == 0:
-        return 0.0
+        if zero_division == "warn":
+            msg = (
+                "Matthews correlation coefficient is ill-defined and being set to 0.0. "
+                "Use `zero_division` to control this behaviour."
+            )
+            warnings.warn(msg, UndefinedMetricWarning, stacklevel=2)
+        return _check_zero_division(zero_division)
     else:
         return cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
 

diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
@@ -795,9 +795,22 @@ def test_cohen_kappa():
     )
 
 
-def test_matthews_corrcoef_nan():
-    assert matthews_corrcoef([0], [1]) == 0.0
-    assert matthews_corrcoef([0, 0], [0, 1]) == 0.0
+@pytest.mark.parametrize("zero_division", ["warn", 0, 1, np.nan])
+@pytest.mark.parametrize("y_true, y_pred", [([0], [1]), ([0, 0], [0, 1])])
+def test_matthews_corrcoef_zero_division(zero_division, y_true, y_pred):
+    """Check the behaviour of `zero_division` in `matthews_corrcoef`."""
+    expected_result = 0.0 if zero_division == "warn" else zero_division
+
+    if zero_division == "warn":
+        with pytest.warns(UndefinedMetricWarning):
+            result = matthews_corrcoef(y_true, y_pred, zero_division=zero_division)
+    else:
+        result = matthews_corrcoef(y_true, y_pred, zero_division=zero_division)
+
+    if np.isnan(expected_result):
+        assert np.isnan(result)
+    else:
+        assert result == expected_result
 
 
 @pytest.mark.parametrize("zero_division", [0, 1, np.nan])
@@ -924,15 +937,19 @@ def test_matthews_corrcoef():
 
     # For the zero vector case, the corrcoef cannot be calculated and should
     # output 0
-    assert_almost_equal(matthews_corrcoef([0, 0, 0, 0], [0, 0, 0, 0]), 0.0)
+    assert_almost_equal(
+        matthews_corrcoef([0, 0, 0, 0], [0, 0, 0, 0], zero_division=0), 0.0
+    )
 
     # And also for any other vector with 0 variance
-    assert_almost_equal(matthews_corrcoef(y_true, ["a"] * len(y_true)), 0.0)
+    assert_almost_equal(
+        matthews_corrcoef(y_true, ["a"] * len(y_true), zero_division=0), 0.0
+    )
 
     # These two vectors have 0 correlation and hence mcc should be 0
     y_1 = [1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1]
     y_2 = [1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1]
-    assert_almost_equal(matthews_corrcoef(y_1, y_2), 0.0)
+    assert_almost_equal(matthews_corrcoef(y_1, y_2, zero_division=0), 0.0)
 
     # Check that sample weight is able to selectively exclude
     mask = [1] * 10 + [0] * 10
@@ -965,17 +982,17 @@ def test_matthews_corrcoef_multiclass():
     # Zero variance will result in an mcc of zero
     y_true = [0, 1, 2]
     y_pred = [3, 3, 3]
-    assert_almost_equal(matthews_corrcoef(y_true, y_pred), 0.0)
+    assert_almost_equal(matthews_corrcoef(y_true, y_pred, zero_division=0), 0.0)
 
     # Also for ground truth with zero variance
     y_true = [3, 3, 3]
     y_pred = [0, 1, 2]
-    assert_almost_equal(matthews_corrcoef(y_true, y_pred), 0.0)
+    assert_almost_equal(matthews_corrcoef(y_true, y_pred, zero_division=0), 0.0)
 
     # These two vectors have 0 correlation and hence mcc should be 0
     y_1 = [0, 1, 2, 0, 1, 2, 0, 1, 2]
     y_2 = [1, 1, 1, 2, 2, 2, 0, 0, 0]
-    assert_almost_equal(matthews_corrcoef(y_1, y_2), 0.0)
+    assert_almost_equal(matthews_corrcoef(y_1, y_2, zero_division=0), 0.0)
 
     # We can test that binary assumptions hold using the multiclass computation
     # by masking the weight of samples not in the first two classes
@@ -994,7 +1011,10 @@ def test_matthews_corrcoef_multiclass():
     y_pred = [0, 0, 1, 2]
     sample_weight = [1, 1, 0, 0]
     assert_almost_equal(
-        matthews_corrcoef(y_true, y_pred, sample_weight=sample_weight), 0.0
+        matthews_corrcoef(
+            y_true, y_pred, sample_weight=sample_weight, zero_division=0.0
+        ),
+        0.0,
     )