@@ -0,0 +1,3 @@
- :class:`SGDRegressor` now has a ``gradient_clip_norm`` parameter that clips gradients whose norm exceeds the specified value at each update step.

By :user:`John Zhang <john-zhang-uoft>`
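
A minimal usage sketch of the proposed parameter (the `gradient_clip_norm` keyword only exists on the branch in this PR, not in released scikit-learn):

import numpy as np
from sklearn.linear_model import SGDRegressor

X = np.array([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]])
y = np.array([10.0, 20.0, 30.0])

# Clip the per-sample weight gradient to an L2 norm of 1.0 at every update;
# gradient_clip_norm=0 (the default) keeps the current, unclipped behaviour.
reg = SGDRegressor(gradient_clip_norm=1.0, random_state=0)
reg.fit(X, y)
print(reg.coef_, reg.intercept_)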
18 changes: 16 additions & 2 deletions sklearn/linear_model/_sgd_fast.pyx.tp
@@ -30,7 +30,7 @@ import numpy as np
from time import time

from cython cimport floating
from libc.math cimport exp, fabs, isfinite, log, pow, INFINITY
from libc.math cimport exp, fabs, isfinite, log, pow, INFINITY, sqrt

from .._loss._loss cimport CyLossFunction
from ..utils._typedefs cimport uint32_t, uint8_t
@@ -306,6 +306,7 @@ def _plain_sgd{{name_suffix}}(
double t=1.0,
double intercept_decay=1.0,
int average=0,
double gradient_clip_norm=-1,
):
"""SGD for generic loss functions and penalties with optional averaging

@@ -380,7 +381,10 @@ def _plain_sgd{{name_suffix}}(
average : int
The number of iterations before averaging starts. average=1 is
equivalent to averaging for all iterations.

gradient_clip_norm : double
The maximum L2 norm of the gradient of the weights for a single update.
If set to a non-positive value, no clipping is applied.
Default: -1 (no clipping).

Returns
-------
@@ -429,6 +433,8 @@ def _plain_sgd{{name_suffix}}(
cdef double optimal_init = 0.0
cdef double dloss = 0.0
cdef double MAX_DLOSS = 1e12
cdef double gradient_norm = 0.0
cdef double scaling_factor = 0.0

cdef long long sample_index

@@ -502,6 +508,14 @@ def _plain_sgd{{name_suffix}}(
dloss = -MAX_DLOSS
elif dloss > MAX_DLOSS:
dloss = MAX_DLOSS

if gradient_clip_norm > 0:
# Scale down dloss when the gradient norm
# is larger than the threshold
gradient_norm = sqrt(sqnorm(x_data_ptr, x_ind_ptr, xnnz)) * fabs(dloss)
if gradient_norm > gradient_clip_norm:
scaling_factor = gradient_clip_norm / gradient_norm
dloss *= scaling_factor
update = -eta * dloss

if learning_rate >= PA1:
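The hunk above clips the per-sample gradient dloss * x by rescaling dloss. A plain-Python sketch of the same arithmetic (illustrative only, not the compiled code path):

import numpy as np

def clip_dloss(dloss, x, gradient_clip_norm):
    # The weight gradient for one sample is dloss * x, so its L2 norm is
    # |dloss| * ||x||; rescale dloss when that norm exceeds the threshold.
    if gradient_clip_norm <= 0:
        return dloss  # clipping disabled
    gradient_norm = np.sqrt(np.dot(x, x)) * abs(dloss)
    if gradient_norm > gradient_clip_norm:
        dloss *= gradient_clip_norm / gradient_norm
    return dloss

# dloss = -5000 on x = [1, 1] gives a gradient norm of about 7071; a
# threshold of 0.5 rescales dloss to roughly -0.354.
print(clip_dloss(-5000.0, np.array([1.0, 1.0]), 0.5))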
12 changes: 12 additions & 0 deletions sklearn/linear_model/_stochastic_gradient.py
@@ -113,6 +113,7 @@ def __init__(
n_iter_no_change=5,
warm_start=False,
average=False,
gradient_clip_norm=0,
):
self.loss = loss
self.penalty = penalty
@@ -132,6 +133,7 @@ def __init__(
self.n_iter_no_change = n_iter_no_change
self.warm_start = warm_start
self.average = average
self.gradient_clip_norm = gradient_clip_norm
self.max_iter = max_iter
self.tol = tol

@@ -1422,6 +1424,7 @@ def __init__(
n_iter_no_change=5,
warm_start=False,
average=False,
gradient_clip_norm=0,
):
super().__init__(
loss=loss,
@@ -1443,6 +1446,7 @@ def __init__(
n_iter_no_change=n_iter_no_change,
warm_start=warm_start,
average=average,
gradient_clip_norm=gradient_clip_norm,
)

def _partial_fit(
@@ -1753,6 +1757,7 @@ def _fit_regressor(
self.t_,
intercept_decay,
self.average,
self.gradient_clip_norm,
)

self.t_ += self.n_iter_ * X.shape[0]
@@ -1950,6 +1955,10 @@ class SGDRegressor(BaseSGDRegressor):
samples seen reaches `average`. So ``average=10`` will begin
averaging after seeing 10 samples.

gradient_clip_norm : float, default=0
If greater than 0, the L2 norm of the gradient of the weights is
clipped to `gradient_clip_norm` at each update step. If 0 (default),
no clipping is performed.

Attributes
----------
coef_ : ndarray of shape (n_features,)
@@ -2016,6 +2025,7 @@ class SGDRegressor(BaseSGDRegressor):
],
"epsilon": [Interval(Real, 0, None, closed="left")],
"eta0": [Interval(Real, 0, None, closed="left")],
"gradient_clip_norm": [Interval(Real, 0, None, closed="left")],
}

def __init__(
@@ -2040,6 +2050,7 @@ def __init__(
n_iter_no_change=5,
warm_start=False,
average=False,
gradient_clip_norm=0,
):
super().__init__(
loss=loss,
@@ -2061,6 +2072,7 @@ def __init__(
n_iter_no_change=n_iter_no_change,
warm_start=warm_start,
average=average,
gradient_clip_norm=gradient_clip_norm,
)


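At the estimator level the new keyword defaults to 0 (no clipping) and is validated against `Interval(Real, 0, None, closed="left")`, so negative values should be rejected when `fit` runs scikit-learn's parameter validation. A small sketch, assuming the branch in this PR:

import numpy as np
from sklearn.linear_model import SGDRegressor

X, y = np.array([[1.0, 2.0], [2.0, 3.0]]), np.array([1.0, 2.0])

# The default, gradient_clip_norm=0, keeps the existing unclipped behaviour.
SGDRegressor(gradient_clip_norm=0, random_state=0).fit(X, y)

# Negative values fall outside [0, inf) and should raise at fit time.
try:
    SGDRegressor(gradient_clip_norm=-1.0).fit(X, y)
except Exception as exc:
    print(type(exc).__name__)  # expected: InvalidParameterError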
154 changes: 154 additions & 0 deletions sklearn/linear_model/tests/test_sgd.py
@@ -2180,3 +2180,157 @@ def test_sgd_one_class_svm_estimator_type():
"""
sgd_ocsvm = SGDOneClassSVM()
assert get_tags(sgd_ocsvm).estimator_type == "outlier_detector"


def test_sgd_regressor_gradient_clip_norm():

clipped_weights_results = []
for gradient_clip_norm in [0.01, 0.1, 0.5]:
# Step 1: Train without gradient clipping
X = np.array([[1, 1], [1, 1], [1, 1], [1, 1]]) # Simple 2D data
y = np.array(
[5000, 5000, 5000, 5000]
) # Labels with large values to induce large gradients

# These targets produce very large gradients: with zeroed weights the
# initial weight gradient has norm 5000 * sqrt(2) (about 7071), far above
# every clip threshold tested here, so clipping is guaranteed to occur.
clf_no_clip = SGDRegressor(
max_iter=1,
tol=None,
eta0=0.05,
warm_start=True,
random_state=42,
gradient_clip_norm=0, # No clipping
penalty=None,
alpha=0,
)
clf_no_clip.fit(X, y)
clf_no_clip.coef_ = np.zeros_like(clf_no_clip.coef_)
clf_no_clip.intercept_ = np.zeros_like(clf_no_clip.intercept_)
clf_no_clip.fit(X, y)
weights_no_clip = clf_no_clip.coef_.copy()

clf_with_clip = SGDRegressor(
max_iter=1,
tol=None,
eta0=0.05,
warm_start=True,
random_state=42,
gradient_clip_norm=gradient_clip_norm, # Clipping threshold
penalty=None,
alpha=0,
)
clf_with_clip.fit(X, y)
clf_with_clip.coef_ = np.zeros_like(clf_with_clip.coef_)
clf_with_clip.intercept_ = np.zeros_like(clf_with_clip.intercept_)
clf_with_clip.fit(X, y)
weights_with_clip = clf_with_clip.coef_.copy()

weight_norm_with_clip = np.linalg.norm(weights_with_clip)
assert (
weight_norm_with_clip <= gradient_clip_norm
), f"Norm {weight_norm_with_clip} exceeds threshold {gradient_clip_norm}"

weight_norm_no_clip = np.linalg.norm(weights_no_clip)
assert (
weight_norm_no_clip > gradient_clip_norm * 100
), f"Unclipped norm should exceed the threshold {gradient_clip_norm} * 100"
clipped_weights_results.append(weight_norm_with_clip)

# Check that the clipped weights are strictly increasing with increasing clip norm
assert np.all(
np.diff(clipped_weights_results) > 0
), "Clipped weights should increase with increasing clip norm."


@pytest.mark.parametrize("gradient_clip_norm", [0.5, 1.0, 5.0])
def test_sgd_gradient_clipping_math(gradient_clip_norm):
"""Test that the gradient clipping math is correct."""
X = np.array([[1, 1]])
y = np.array([5000])

clf = SGDRegressor(
max_iter=1,
tol=None,
alpha=0,
warm_start=True,
random_state=42,
gradient_clip_norm=gradient_clip_norm,
penalty=None,
eta0=1.0,
)
clf.fit(X, y)
clf.coef_ = np.zeros_like(clf.coef_)
clf.intercept_ = np.zeros_like(clf.intercept_)
clf.fit(X, y)

weights_with_clip = clf.coef_.copy()
weight_norm_with_clip = np.linalg.norm(weights_with_clip)
assert (
abs(weight_norm_with_clip - gradient_clip_norm) < 1e-9
), f"Weight norm {weight_norm_with_clip} is not {gradient_clip_norm}"


def test_sgd_regressor_no_clipping():
"""Test that the gradient is not clipped when gradient_clip_norm=0."""
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5]])
y = np.array([1, 2, 3, 4])

reg_no_clip = SGDRegressor(
max_iter=1,
tol=None,
alpha=0.01,
learning_rate="constant",
eta0=0.1,
gradient_clip_norm=0, # No clipping
random_state=42,
)
reg_clip = SGDRegressor(
max_iter=1,
tol=None,
alpha=0.01,
learning_rate="constant",
eta0=0.1,
gradient_clip_norm=1000, # Large enough to have no effect
random_state=42,
)

reg_no_clip.fit(X, y)
reg_clip.fit(X, y)

assert_array_almost_equal(reg_no_clip.coef_, reg_clip.coef_, decimal=6)


def test_sgd_regressor_weights_change_with_clipping():
"""Test that coefficients are affected by gradient clipping."""
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5]])
y = np.array([1, 2, 3, 4])

reg_clip = SGDRegressor(
max_iter=10,
tol=None,
alpha=0.01,
learning_rate="constant",
eta0=0.1,
gradient_clip_norm=1.0,
random_state=42,
)

reg_no_clip = SGDRegressor(
max_iter=10,
tol=None,
alpha=0.01,
learning_rate="constant",
eta0=0.1,
gradient_clip_norm=0, # No gradient clipping
random_state=42,
)

reg_clip.fit(X, y)
reg_no_clip.fit(X, y)

# The coefficients should differ, showing that gradient clipping had an effect
assert not np.allclose(
reg_clip.coef_, reg_no_clip.coef_, atol=1e-2
), "Gradient clipping should affect the weights."