Skip to content

Commit

Permalink
FEA add zero_division to matthews_corrcoef (#28509)
Browse files Browse the repository at this point in the history
Co-authored-by: Marc Torrellas Socastro <[email protected]>
Co-authored-by: Guillaume Lemaitre <[email protected]>
  • Loading branch information
3 people authored Oct 30, 2024
1 parent 49c5948 commit ba2dd5d
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 12 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- Adds `zero_division` to :func:`metrics.matthews_corrcoef`.
When there is a zero division, the metric is undefined and this value is returned.
By :user:`Marc Torrellas Socastro <marctorsoc>` and :user:`Noam Keidar <redjest>`
22 changes: 20 additions & 2 deletions sklearn/metrics/_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -1015,10 +1015,15 @@ def jaccard_score(
"y_true": ["array-like"],
"y_pred": ["array-like"],
"sample_weight": ["array-like", None],
"zero_division": [
Options(Real, {0.0, 1.0}),
"nan",
StrOptions({"warn"}),
],
},
prefer_skip_nested_validation=True,
)
def matthews_corrcoef(y_true, y_pred, *, sample_weight=None):
def matthews_corrcoef(y_true, y_pred, *, sample_weight=None, zero_division="warn"):
"""Compute the Matthews correlation coefficient (MCC).
The Matthews correlation coefficient is used in machine learning as a
Expand Down Expand Up @@ -1049,6 +1054,13 @@ def matthews_corrcoef(y_true, y_pred, *, sample_weight=None):
.. versionadded:: 0.18
zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn"
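Sets the value to return when there is a zero division, i.e. when `y_true`
or `y_pred` contains a single class so that the denominator of the MCC is
zero (for instance when all predictions and labels are negative). If set to
"warn", this acts like 0, but a warning is also raised.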
.. versionadded:: 1.6
Returns
-------
mcc : float
Expand Down Expand Up @@ -1102,7 +1114,13 @@ def matthews_corrcoef(y_true, y_pred, *, sample_weight=None):
cov_ytyt = n_samples**2 - np.dot(t_sum, t_sum)

if cov_ypyp * cov_ytyt == 0:
return 0.0
if zero_division == "warn":
msg = (
"Matthews correlation coefficient is ill-defined and being set to 0.0. "
"Use `zero_division` to control this behaviour."
)
warnings.warn(msg, UndefinedMetricWarning, stacklevel=2)
return _check_zero_division(zero_division)
else:
return cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

Expand Down
40 changes: 30 additions & 10 deletions sklearn/metrics/tests/test_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -795,9 +795,22 @@ def test_cohen_kappa():
)


def test_matthews_corrcoef_nan():
assert matthews_corrcoef([0], [1]) == 0.0
assert matthews_corrcoef([0, 0], [0, 1]) == 0.0
@pytest.mark.parametrize("zero_division", ["warn", 0, 1, np.nan])
@pytest.mark.parametrize("y_true, y_pred", [([0], [1]), ([0, 0], [0, 1])])
def test_matthews_corrcoef_zero_division(zero_division, y_true, y_pred):
    """Check the behaviour of `zero_division` in `matthews_corrcoef`.

    The parametrized inputs are expected to hit the zero-division branch of
    `matthews_corrcoef`. With ``zero_division="warn"`` the result must be 0.0
    and an `UndefinedMetricWarning` must be raised; with an explicit value the
    result must be that value and no `UndefinedMetricWarning` may be raised.
    """
    # Local import so the test does not depend on the module-level imports.
    import warnings

    if zero_division == "warn":
        expected_result = 0.0
        with pytest.warns(UndefinedMetricWarning):
            result = matthews_corrcoef(y_true, y_pred, zero_division=zero_division)
    else:
        expected_result = zero_division
        # An explicit replacement value must be returned silently: escalate
        # only UndefinedMetricWarning to an error so unrelated warnings do not
        # make the test flaky.
        with warnings.catch_warnings():
            warnings.simplefilter("error", UndefinedMetricWarning)
            result = matthews_corrcoef(y_true, y_pred, zero_division=zero_division)

    # NaN never compares equal to itself, so it needs a dedicated check.
    if np.isnan(expected_result):
        assert np.isnan(result)
    else:
        assert result == expected_result


@pytest.mark.parametrize("zero_division", [0, 1, np.nan])
Expand Down Expand Up @@ -924,15 +937,19 @@ def test_matthews_corrcoef():

# For the zero vector case, the corrcoef cannot be calculated and should
# output 0
assert_almost_equal(matthews_corrcoef([0, 0, 0, 0], [0, 0, 0, 0]), 0.0)
assert_almost_equal(
matthews_corrcoef([0, 0, 0, 0], [0, 0, 0, 0], zero_division=0), 0.0
)

# And also for any other vector with 0 variance
assert_almost_equal(matthews_corrcoef(y_true, ["a"] * len(y_true)), 0.0)
assert_almost_equal(
matthews_corrcoef(y_true, ["a"] * len(y_true), zero_division=0), 0.0
)

# These two vectors have 0 correlation and hence mcc should be 0
y_1 = [1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1]
y_2 = [1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1]
assert_almost_equal(matthews_corrcoef(y_1, y_2), 0.0)
assert_almost_equal(matthews_corrcoef(y_1, y_2, zero_division=0), 0.0)

# Check that sample weight is able to selectively exclude
mask = [1] * 10 + [0] * 10
Expand Down Expand Up @@ -965,17 +982,17 @@ def test_matthews_corrcoef_multiclass():
# Zero variance will result in an mcc of zero
y_true = [0, 1, 2]
y_pred = [3, 3, 3]
assert_almost_equal(matthews_corrcoef(y_true, y_pred), 0.0)
assert_almost_equal(matthews_corrcoef(y_true, y_pred, zero_division=0), 0.0)

# Also for ground truth with zero variance
y_true = [3, 3, 3]
y_pred = [0, 1, 2]
assert_almost_equal(matthews_corrcoef(y_true, y_pred), 0.0)
assert_almost_equal(matthews_corrcoef(y_true, y_pred, zero_division=0), 0.0)

# These two vectors have 0 correlation and hence mcc should be 0
y_1 = [0, 1, 2, 0, 1, 2, 0, 1, 2]
y_2 = [1, 1, 1, 2, 2, 2, 0, 0, 0]
assert_almost_equal(matthews_corrcoef(y_1, y_2), 0.0)
assert_almost_equal(matthews_corrcoef(y_1, y_2, zero_division=0), 0.0)

# We can test that binary assumptions hold using the multiclass computation
# by masking the weight of samples not in the first two classes
Expand All @@ -994,7 +1011,10 @@ def test_matthews_corrcoef_multiclass():
y_pred = [0, 0, 1, 2]
sample_weight = [1, 1, 0, 0]
assert_almost_equal(
matthews_corrcoef(y_true, y_pred, sample_weight=sample_weight), 0.0
matthews_corrcoef(
y_true, y_pred, sample_weight=sample_weight, zero_division=0.0
),
0.0,
)


Expand Down

0 comments on commit ba2dd5d

Please sign in to comment.