NumPy 2.0 compatibility (#1632)

* Replace `np.in1d` with `np.isin`

  This comes by checking against the NPY201 rule for Ruff that checks deprecations for a NumPy <2 codebase

* Clean up setup script slightly
* Bump Cython, NumPy build-time requirements
* Run linter
* Remove <2 pin on NumPy runtime dependency
* Update lockfile
* Reset and exercise poetry caches
* Increment virtual environment cache
* Bump scikit-learn, polars, scipy, pandas, matplotlib, sympy
* Update lockfile
* Bump build number to invalidate caches
* Ensure Python floats are output
* ruff
* fix remaining tests
* format
* Redefine
* Increment cache number
* Include poetry cache key in venv cache key
* Update lockfile for Polars
* Revert "Include poetry cache key in venv cache key"

  This reverts commit 2548909.

* Use `cache@v4` and `checkout@v4`
* Apply suggestions from code review

---------

Co-authored-by: Max Halford <[email protected]>
Co-authored-by: Saulo Martiello Mastelini <[email protected]>
online-ml · Nov 19, 2024 · e069b67
1 parent ada5ada
commit e069b67
Show file tree

Hide file tree

Showing 31 changed files with 2,231 additions and 2,006 deletions.
diff --git a/.github/actions/install-env/action.yml b/.github/actions/install-env/action.yml
@@ -26,7 +26,7 @@ runs:
       uses: actions/cache@v4
       with:
         path: ~/.local # the path depends on the OS
-        key: poetry-2 # increment to reset cache
+        key: poetry-2 # modify to reset cache
 
     - name: Install poetry
       uses: snok/install-poetry@v1

diff --git a/.github/workflows/code-quality.yml b/.github/workflows/code-quality.yml
@@ -12,7 +12,7 @@ jobs:
   ubuntu:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Build River
         uses: ./.github/actions/install-env

diff --git a/.github/workflows/dev-docs.yml b/.github/workflows/dev-docs.yml
@@ -10,7 +10,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Build River
         uses: ./.github/actions/install-env

diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml
@@ -25,7 +25,7 @@ jobs:
           ]
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Set up rust
         if: matrix.os != 'ubuntu-20.04' && matrix.os != 'ubuntu-22.04'
@@ -104,7 +104,7 @@ jobs:
     name: Build source distribution
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Build River
         uses: ./.github/actions/install-env

diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
@@ -18,21 +18,21 @@ jobs:
     runs-on: ${{ matrix.os }}
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Build River
         uses: ./.github/actions/install-env
         with:
           python-version: "3.12"
 
       - name: Cache River datasets
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/river_data
           key: ${{ runner.os }}
 
       - name: Cache scikit-learn datasets
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/scikit_learn_data
           key: ${{ runner.os }}

diff --git a/build.py b/build.py
@@ -1,36 +1,23 @@
 import platform
-from distutils.command.build_ext import build_ext
-from distutils.errors import CCompilerError, DistutilsExecError, DistutilsPlatformError
+
+import numpy
 import setuptools
+from Cython.Build import cythonize
+from setuptools.command.build_ext import build_ext
+from setuptools.errors import CCompilerError
 from setuptools_rust import Binding, RustExtension
 
-try:
-    from numpy import __version__ as numpy_version
-    from numpy import get_include
-except ImportError:
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy"])
-    from numpy import __version__ as numpy_version
-    from numpy import get_include
-
-try:
-    from Cython.Build import cythonize
-except ImportError:
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "Cython"])
-    from Cython.Build import cythonize  # type: ignore
-
-
 ext_modules = cythonize(
     module_list=[
         setuptools.Extension(
             "*",
-            sources=["**/*.pyx"],
-            include_dirs=[get_include()],
+            sources=["river/**/*.pyx"],
+            include_dirs=[numpy.get_include()],
             libraries=[] if platform.system() == "Windows" else ["m"],
             define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")],
         )
     ],
     compiler_directives={
-        "language_level": 3,
         "binding": True,
         "embedsignature": True,
     },
@@ -47,13 +34,13 @@ class ExtBuilder(build_ext):
     def run(self):
         try:
             build_ext.run(self)
-        except (DistutilsPlatformError, FileNotFoundError):
+        except (FileNotFoundError):
             raise BuildFailed("File not found. Could not compile C extension.")
 
     def build_extension(self, ext):
         try:
             build_ext.build_extension(self, ext)
-        except (CCompilerError, DistutilsExecError, DistutilsPlatformError, ValueError):
+        except (CCompilerError, ValueError):
             raise BuildFailed("Could not compile C extension.")
 
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,5 +1,11 @@
 [build-system]
-requires = ["poetry-core>=1.0.0", "cython", "numpy", "setuptools", "wheel", "setuptools-rust"]
+requires = [
+    "poetry-core>=1.0.0",
+    "cython>3",
+    "numpy>=2.0.0",
+    "setuptools>=70.1.0",
+    "setuptools-rust",
+]
 build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
@@ -20,7 +26,7 @@ include = [
     "river/datasets/*.zip",
     "river/stream/*.zip",
     "Cargo.toml",
-    "rust_src/**/*"
+    "rust_src/**/*",
 ]
 
 [tool.poetry.build]
@@ -29,34 +35,34 @@ script = "build.py"
 
 [tool.poetry.dependencies]
 python = "^3.9"
-numpy = "^1.23.0"
-scipy = "^1.12.1"
-pandas = "^2.1"
+numpy = ">=1.23.0"
+scipy = "^1.13.1"
+pandas = "^2.2.3"
 
 [tool.poetry.group.dev.dependencies]
 graphviz = "^0.20.1"
 gymnasium = "^0.29.0"
-matplotlib = "^3.0.2"
+matplotlib = "^3.8.4"
 mypy = "^1.11.1"
 pre-commit = "^3.5.0"
 pytest = "^7.4.2"
 ruff = "^0.4.10"
-scikit-learn = "^1.3.1"
+scikit-learn = "^1.5.1"
 sqlalchemy = "^2.0.22"
-sympy = "^1.10.1"
-pytest-xdist = {extras = ["psutil"], version = "^3.3.1"}
+sympy = "^1.12.1"
+pytest-xdist = { extras = ["psutil"], version = "^3.3.1" }
 ipykernel = "^6.26.0"
 ipython = "^8.17.2"
 rich = "^13.6.0"
 jupyter = "^1.0.0"
 mike = "^2.0.0"
-polars = "^0.20.8"
+polars = "^1.1.0"
 
 [tool.poetry.group.compat]
 optional = true
 
 [tool.poetry.group.compat.dependencies]
-scikit-learn = "^1.0.1"
+scikit-learn = "^1.5.1"
 sqlalchemy = "^2.0.0"
 
 [tool.poetry.group.docs]
@@ -84,7 +90,7 @@ optional = true
 
 [tool.poetry.group.benchmark.dependencies]
 "dominate" = "2.8.0"
-"scikit-learn" = "1.3.1"
+"scikit-learn" = "1.5.1"
 "tabulate" = "0.9.0"
 "vowpalwabbit" = "9.9.0"
 "watermark" = "2.4.3"
@@ -161,7 +167,7 @@ module = [
     "requests.*",
     "gymnasium.*",
     "sympy.*",
-    "polars.*"
+    "polars.*",
 ]
 ignore_missing_imports = true
 

diff --git a/river/compose/test_product.py b/river/compose/test_product.py
@@ -83,10 +83,10 @@ def test_issue_1253():
     >>> model = group1 + group1 * group2
     >>> XT = model.transform_many(X)
 
-    >>> XT.memory_usage().sum() // 1000
+    >>> XT.memory_usage().sum().item() // 1000
     85
 
-    >>> XT.sparse.to_dense().memory_usage().sum() // 1000
+    >>> XT.sparse.to_dense().memory_usage().sum().item() // 1000
     4455
 
     >>> X, y = datasets.make_regression(n_samples=6, n_features=2)

diff --git a/river/datasets/synth/anomaly_sine.py b/river/datasets/synth/anomaly_sine.py
@@ -139,4 +139,4 @@ def __iter__(self):
         self._generate_data()
 
         for xi, yi in itertools.zip_longest(self.X, self.y if hasattr(self.y, "__iter__") else []):
-            yield dict(zip(["sine", "cosine"], xi)), bool(yi)
+            yield dict(zip(["sine", "cosine"], xi.tolist())), bool(yi)
diff --git a/river/datasets/synth/logical.py b/river/datasets/synth/logical.py
@@ -67,7 +67,10 @@ def __iter__(self):
         X, Y = self._make_logical(n_tiles=self.n_tiles, shuffle=self.shuffle)
 
         for xi, yi in itertools.zip_longest(X, Y if hasattr(Y, "__iter__") else []):
-            yield dict(zip(self.feature_names, xi)), dict(zip(self.target_names, yi))
+            yield (
+                dict(zip(self.feature_names, xi.tolist())),
+                dict(zip(self.target_names, yi.tolist())),
+            )
 
     def _make_logical(self, n_tiles: int = 1, shuffle: bool = True):
         """Make toy dataset"""

diff --git a/river/facto/ffm.py b/river/facto/ffm.py
@@ -255,7 +255,7 @@ def __init__(
 
     def predict_one(self, x):
         x = self._ohe_cat_features(x)
-        return self._raw_dot(x)
+        return self._raw_dot(x).item()
 
 
 class FFMClassifier(FFM, base.Classifier):

diff --git a/river/facto/fm.py b/river/facto/fm.py
@@ -238,7 +238,7 @@ def __init__(
 
     def predict_one(self, x):
         x = self._ohe_cat_features(x)
-        return self._raw_dot(x)
+        return self._raw_dot(x).item()
 
 
 class FMClassifier(FM, base.Classifier):

diff --git a/river/facto/fwfm.py b/river/facto/fwfm.py
@@ -275,7 +275,7 @@ def __init__(
 
     def predict_one(self, x):
         x = self._ohe_cat_features(x)
-        return self._raw_dot(x)
+        return self._raw_dot(x).item()
 
 
 class FwFMClassifier(FwFM, base.Classifier):

diff --git a/river/facto/hofm.py b/river/facto/hofm.py
@@ -267,7 +267,7 @@ def __init__(
 
     def predict_one(self, x):
         x = self._ohe_cat_features(x)
-        return self._raw_dot(x)
+        return self._raw_dot(x).item()
 
 
 class HOFMClassifier(HOFM, base.Classifier):

diff --git a/river/forest/adaptive_random_forest.py b/river/forest/adaptive_random_forest.py
@@ -160,10 +160,12 @@ def learn_one(self, x: dict, y: base.typing.Target, **kwargs):
             # Update performance evaluator
             self._metrics[i].update(
                 y_true=y,
-                y_pred=model.predict_proba_one(x)
-                if isinstance(self.metric, metrics.base.ClassificationMetric)
-                and not self.metric.requires_labels
-                else y_pred,
+                y_pred=(
+                    model.predict_proba_one(x)
+                    if isinstance(self.metric, metrics.base.ClassificationMetric)
+                    and not self.metric.requires_labels
+                    else y_pred
+                ),
             )
 
             k = poisson(rate=self.lambda_value, rng=self._rng)

diff --git a/river/linear_model/bayesian_lin_reg.py b/river/linear_model/bayesian_lin_reg.py
@@ -211,7 +211,7 @@ def predict_one(self, x, with_dist=False):
         """
 
         # Bishop equation 3.58
-        y_pred_mean = utils.math.dot(self._m, x)
+        y_pred_mean = 0.0 if not len(self._m) else utils.math.dot(self._m, x).item()
         if not with_dist:
             return y_pred_mean
 

diff --git a/river/naive_bayes/base.py b/river/naive_bayes/base.py
@@ -83,7 +83,7 @@ def one_hot_encode(y: pd.Series) -> pd.DataFrame:
     """
     classes = np.unique(y)
     indices = np.searchsorted(classes, y)
-    indptr = np.hstack((0, np.cumsum(np.in1d(y, classes))))
+    indptr = np.hstack((0, np.cumsum(np.isin(y, classes))))
     data = np.empty_like(indices)
     data.fill(1)
     return pd.DataFrame.sparse.from_spmatrix(

diff --git a/river/optim/initializers.py b/river/optim/initializers.py
@@ -80,7 +80,7 @@ class Normal(Initializer):
     >>> init = optim.initializers.Normal(mu=0, sigma=1, seed=42)
 
     >>> init(shape=1)
-    0.496714
+    np.float64(0.4967141...)
 
     >>> init(shape=2)
     array([-0.1382643 ,  0.64768854])

diff --git a/river/optim/newton.py b/river/optim/newton.py
@@ -45,10 +45,10 @@ def sherman_morrison(A_inv: dict, u: dict, v: dict) -> dict:
 
     den = 1 + utils.math.dot(utils.math.dotvecmat(u, A_inv), v)
 
-    for k, v in utils.math.matmul2d(
+    for k, val in utils.math.matmul2d(
         utils.math.matmul2d(A_inv, utils.math.outer(u, v)), A_inv
     ).items():
-        A_inv[k] = A_inv.get(k, 0) - v / den
+        A_inv[k] = A_inv.get(k, 0) - val / den
 
     return A_inv
 

diff --git a/river/preprocessing/lda.py b/river/preprocessing/lda.py
@@ -209,7 +209,7 @@ def transform_one(self, x):
         # Sample empirical topic assignment:
         _, components = self._compute_statistics_components(words_indexes_list)
 
-        return dict(enumerate(components))
+        return dict(enumerate(components.tolist()))
 
     def _update_indexes(self, word_list: typing.Iterable[str]):
         """

diff --git a/river/preprocessing/scale.py b/river/preprocessing/scale.py
@@ -212,10 +212,12 @@ def learn_many(self, X: pd.DataFrame):
             a = old_count / (old_count + new_count)
             b = new_count / (old_count + new_count)
 
-            self.means[col] = a * old_mean + b * new_mean
+            self.means[col] = (a * old_mean + b * new_mean).item()
             if self.with_std:
-                self.vars[col] = a * old_var + b * new_var + a * b * (old_mean - new_mean) ** 2
-            self.counts[col] += new_count
+                self.vars[col] = (
+                    a * old_var + b * new_var + a * b * (old_mean - new_mean) ** 2
+                ).item()
+            self.counts[col] += new_count.item()
 
     def transform_many(self, X: pd.DataFrame):
         """Scale a mini-batch of features.

diff --git a/river/proba/beta.py b/river/proba/beta.py
@@ -92,20 +92,20 @@ def revert(self, x):
         else:
             self.beta -= 1
 
-    def __call__(self, p: float):
+    def __call__(self, p: float) -> float:
         return (
             p ** (self.alpha - 1) * (1 - p) ** (self.beta - 1) / _beta_func(self.alpha, self.beta)
         )
 
-    def sample(self):
+    def sample(self) -> float:
         return self._rng.betavariate(self.alpha, self.beta)
 
     @property
-    def mode(self):
+    def mode(self) -> float:
         try:
             return (self.alpha - 1) / (self.alpha + self.beta - 2)
         except ZeroDivisionError:
             return 0.5
 
-    def cdf(self, x):
-        return scipy.special.betainc(self.alpha, self.beta, x)
+    def cdf(self, x) -> float:
+        return scipy.special.betainc(self.alpha, self.beta, x).item()