Skip to content

Commit

Permalink
TST Add minimal setup to be able to run test suite on float32 (scikit…
Browse files Browse the repository at this point in the history
…-learn#22690)

Co-authored-by: Thomas J. Fan <[email protected]>
Co-authored-by: Olivier Grisel <[email protected]>
Co-authored-by: Jérémie du Boisberranger <[email protected]>
  • Loading branch information
4 people authored Mar 17, 2022
1 parent 5a9b2ce commit 613773d
Show file tree
Hide file tree
Showing 7 changed files with 141 additions and 7 deletions.
1 change: 1 addition & 0 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ jobs:
MATPLOTLIB_VERSION: 'min'
THREADPOOLCTL_VERSION: '2.2.0'
SKLEARN_ENABLE_DEBUG_CYTHON_DIRECTIVES: '1'
SKLEARN_RUN_FLOAT32_TESTS: '1'
SKLEARN_TESTS_GLOBAL_RANDOM_SEED: '2' # non-default seed
# Linux environment to test the latest available dependencies.
# It runs tests requiring lightgbm, pandas and PyAMG.
Expand Down
8 changes: 8 additions & 0 deletions doc/computing/parallelism.rst
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,14 @@ When this environment variable is set to a non zero value, the tests that need
network access are skipped. When this environment variable is not set then
network tests are skipped.

`SKLEARN_RUN_FLOAT32_TESTS`
~~~~~~~~~~~~~~~~~~~~~~~~~~~

When this environment variable is set to '1', the tests using the
`global_dtype` fixture are also run on float32 data.
When this environment variable is not set, or is set to any other value,
the tests are only run on float64 data.

`SKLEARN_ENABLE_DEBUG_CYTHON_DIRECTIVES`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
16 changes: 16 additions & 0 deletions doc/developers/develop.rst
Original file line number Diff line number Diff line change
Expand Up @@ -774,3 +774,19 @@ The reason for this setup is reproducibility:
when an estimator is ``fit`` twice to the same data,
it should produce an identical model both times,
hence the validation in ``fit``, not ``__init__``.

Numerical assertions in tests
-----------------------------

When asserting the quasi-equality of arrays of continuous values,
do use :func:`sklearn.utils._testing.assert_allclose`.

The relative tolerance is automatically inferred from the dtypes of the
provided arrays (for float32 and float64 dtypes in particular) but you can
override it via ``rtol``.

When comparing against arrays whose expected values are zero, please do
provide a non-zero value for the absolute tolerance via ``atol``.

For more information, please refer to the docstring of
:func:`sklearn.utils._testing.assert_allclose`.
12 changes: 12 additions & 0 deletions sklearn/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys

import pytest
import numpy as np
from threadpoolctl import threadpool_limits
from _pytest.doctest import DoctestItem

Expand Down Expand Up @@ -41,6 +42,17 @@
"fetch_rcv1_fxt": fetch_rcv1,
}

# float32 parametrizations are opt-in: they are marked as skipped unless the
# SKLEARN_RUN_FLOAT32_TESTS environment variable is exactly "1".
_SKIP32_MARK = pytest.mark.skipif(
    environ.get("SKLEARN_RUN_FLOAT32_TESTS", "0") != "1",
    reason="Set SKLEARN_RUN_FLOAT32_TESTS=1 to run float32 dtype tests",
)


# Global fixtures
@pytest.fixture(
    params=[
        pytest.param(np.float32, marks=_SKIP32_MARK),
        np.float64,
    ]
)
def global_dtype(request):
    """Provide the dtype a test should build its input data with.

    The fixture is parametrized over ``np.float32`` and ``np.float64``.
    The ``np.float32`` parametrization carries ``_SKIP32_MARK`` and is
    therefore skipped unless ``SKLEARN_RUN_FLOAT32_TESTS`` is set to "1".
    """
    yield request.param


def _fetch_fixture(f):
"""Fetch dataset (download if missing and requested by environment)."""
Expand Down
16 changes: 10 additions & 6 deletions sklearn/feature_selection/tests/test_mutual_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
from scipy.sparse import csr_matrix

from sklearn.utils import check_random_state
from sklearn.utils._testing import assert_array_equal, assert_almost_equal
from sklearn.utils._testing import (
assert_array_equal,
assert_almost_equal,
assert_allclose,
)
from sklearn.feature_selection._mutual_info import _compute_mi
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif

Expand All @@ -21,7 +25,7 @@ def test_compute_mi_dd():
assert_almost_equal(_compute_mi(x, y, True, True), I_xy)


def test_compute_mi_cc():
def test_compute_mi_cc(global_dtype):
# For two continuous variables a good approach is to test on bivariate
# normal distribution, where mutual information is known.

Expand All @@ -43,15 +47,15 @@ def test_compute_mi_cc():
I_theory = np.log(sigma_1) + np.log(sigma_2) - 0.5 * np.log(np.linalg.det(cov))

rng = check_random_state(0)
Z = rng.multivariate_normal(mean, cov, size=1000)
Z = rng.multivariate_normal(mean, cov, size=1000).astype(global_dtype, copy=False)

x, y = Z[:, 0], Z[:, 1]

# Theory and computed values won't be very close, assert that the
# first figures after decimal point match.
# Theory and computed values won't be very close
# We here check with a large relative tolerance
for n_neighbors in [3, 5, 7]:
I_computed = _compute_mi(x, y, False, False, n_neighbors)
assert_almost_equal(I_computed, I_theory, 1)
assert_allclose(I_computed, I_theory, rtol=1e-1)


def test_compute_mi_cd():
Expand Down
76 changes: 75 additions & 1 deletion sklearn/utils/_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
except NameError:
WindowsError = None

from numpy.testing import assert_allclose
from numpy.testing import assert_allclose as np_assert_allclose
from numpy.testing import assert_almost_equal
from numpy.testing import assert_approx_equal
from numpy.testing import assert_array_equal
Expand Down Expand Up @@ -387,6 +387,80 @@ def assert_raise_message(exceptions, message, function, *args, **kwargs):
raise AssertionError("%s not raised by %s" % (names, function.__name__))


def assert_allclose(
    actual, desired, rtol=None, atol=0.0, equal_nan=True, err_msg="", verbose=True
):
    """dtype-aware variant of numpy.testing.assert_allclose

    This variant introspects the least precise floating point dtype
    in the input arguments and automatically sets the relative tolerance
    parameter to 1e-4 for float32 and uses 1e-7 otherwise (typically float64
    in scikit-learn).

    `atol` is always left to 0. by default. It should be adjusted manually
    to an assertion-specific value in case there are null values expected
    in `desired`.

    The aggregate tolerance is `atol + rtol * abs(desired)`.

    Parameters
    ----------
    actual : array_like
        Array obtained.
    desired : array_like
        Array desired.
    rtol : float, optional, default=None
        Relative tolerance.
        If None, it is set based on the provided arrays' dtypes.
    atol : float, optional, default=0.
        Absolute tolerance. It is forwarded unchanged to
        `numpy.testing.assert_allclose`; unlike `rtol`, it is never
        inferred from the dtypes.
    equal_nan : bool, optional, default=True
        If True, NaNs will compare equal.
    err_msg : str, optional, default=''
        The error message to be printed in case of failure.
    verbose : bool, optional, default=True
        If True, the conflicting values are appended to the error message.

    Raises
    ------
    AssertionError
        If actual and desired are not equal up to specified precision.

    See Also
    --------
    numpy.testing.assert_allclose

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils._testing import assert_allclose
    >>> x = [1e-5, 1e-3, 1e-1]
    >>> y = np.arccos(np.cos(x))
    >>> assert_allclose(x, y, rtol=1e-5, atol=0)
    >>> a = np.full(shape=10, fill_value=1e-5, dtype=np.float32)
    >>> assert_allclose(a, 1e-5)
    """
    actual, desired = np.asanyarray(actual), np.asanyarray(desired)

    if rtol is None:
        # The least precise operand drives the tolerance: a single float32
        # input is enough to relax rtol from 1e-7 to 1e-4.
        has_float32 = any(
            dtype == np.float32 for dtype in (actual.dtype, desired.dtype)
        )
        rtol = 1e-4 if has_float32 else 1e-7

    np_assert_allclose(
        actual,
        desired,
        rtol=rtol,
        atol=atol,
        equal_nan=equal_nan,
        err_msg=err_msg,
        verbose=verbose,
    )


def assert_allclose_dense_sparse(x, y, rtol=1e-07, atol=1e-9, err_msg=""):
"""Assert allclose for sparse and dense data.
Expand Down
19 changes: 19 additions & 0 deletions sklearn/utils/tests/test_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
_delete_folder,
_convert_container,
raises,
assert_allclose,
)

from sklearn.tree import DecisionTreeClassifier
Expand Down Expand Up @@ -854,3 +855,21 @@ def test_raises():
with pytest.raises(AssertionError):
with raises((TypeError, ValueError)):
pass


def test_float32_aware_assert_allclose():
    """Check the dtype-dependent default tolerances of assert_allclose."""
    # The relative tolerance for float32 inputs is 1e-4
    f32_within = np.array([1.0 + 2e-5], dtype=np.float32)
    f32_beyond = np.array([1.0 + 2e-4], dtype=np.float32)
    assert_allclose(f32_within, 1.0)
    with pytest.raises(AssertionError):
        assert_allclose(f32_beyond, 1.0)

    # The relative tolerance for other inputs is left to 1e-7 as in
    # the original numpy version.
    f64_within = np.array([1.0 + 2e-8], dtype=np.float64)
    f64_beyond = np.array([1.0 + 2e-7], dtype=np.float64)
    assert_allclose(f64_within, 1.0)
    with pytest.raises(AssertionError):
        assert_allclose(f64_beyond, 1.0)

    # atol is left to 0.0 by default, even for float32
    f32_near_zero = np.array([1e-5], dtype=np.float32)
    with pytest.raises(AssertionError):
        assert_allclose(f32_near_zero, 0.0)
    assert_allclose(f32_near_zero, 0.0, atol=2e-5)

0 comments on commit 613773d

Please sign in to comment.