@@ -0,0 +1,3 @@
- :class:`SGDRegressor` now has a ``gradient_clip_norm`` parameter that clips gradients whose norm exceeds the specified value at each update step.

By :user:`John Zhang <john-zhang-uoft>`
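
A minimal usage sketch of the proposed parameter (the `gradient_clip_norm` keyword only exists on the branch in this PR, not in released scikit-learn):

import numpy as np
from sklearn.linear_model import SGDRegressor

X = np.array([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]])
y = np.array([10.0, 20.0, 30.0])

# Clip the per-sample weight gradient to an L2 norm of 1.0 at every update;
# gradient_clip_norm=0 (the default) keeps the current, unclipped behaviour.
reg = SGDRegressor(gradient_clip_norm=1.0, random_state=0)
reg.fit(X, y)
print(reg.coef_, reg.intercept_)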
18 changes: 16 additions & 2 deletions sklearn/linear_model/_sgd_fast.pyx.tp
@@ -30,7 +30,7 @@ import numpy as np
from time import time

from cython cimport floating
from libc.math cimport exp, fabs, isfinite, log, pow, INFINITY
from libc.math cimport exp, fabs, isfinite, log, pow, INFINITY, sqrt

from .._loss._loss cimport CyLossFunction
from ..utils._typedefs cimport uint32_t, uint8_t
@@ -306,6 +306,7 @@ def _plain_sgd{{name_suffix}}(
double t=1.0,
double intercept_decay=1.0,
int average=0,
double gradient_clip_norm=-1,
):
"""SGD for generic loss functions and penalties with optional averaging

@@ -380,7 +381,10 @@ def _plain_sgd{{name_suffix}}(
average : int
The number of iterations before averaging starts. average=1 is
equivalent to averaging for all iterations.

gradient_clip_norm : double
The maximum L2 norm of the gradient of the weights for a single update.
If set to a non-positive value, no clipping is applied.
Default: -1 (no clipping).

Returns
-------
@@ -429,6 +433,8 @@ def _plain_sgd{{name_suffix}}(
cdef double optimal_init = 0.0
cdef double dloss = 0.0
cdef double MAX_DLOSS = 1e12
cdef double gradient_norm = 0.0
cdef double scaling_factor = 0.0

cdef long long sample_index

@@ -502,6 +508,14 @@ def _plain_sgd{{name_suffix}}(
dloss = -MAX_DLOSS
elif dloss > MAX_DLOSS:
dloss = MAX_DLOSS

if gradient_clip_norm > 0:
# Scale down dloss when the gradient norm
# is larger than the threshold
gradient_norm = sqrt(sqnorm(x_data_ptr, x_ind_ptr, xnnz)) * fabs(dloss)
if gradient_norm > gradient_clip_norm:
scaling_factor = gradient_clip_norm / gradient_norm
dloss *= scaling_factor
update = -eta * dloss

if learning_rate >= PA1:
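The hunk above clips the per-sample gradient dloss * x by rescaling dloss. A plain-Python sketch of the same arithmetic (illustrative only, not the compiled code path):

import numpy as np

def clip_dloss(dloss, x, gradient_clip_norm):
    # The weight gradient for one sample is dloss * x, so its L2 norm is
    # |dloss| * ||x||; rescale dloss when that norm exceeds the threshold.
    if gradient_clip_norm <= 0:
        return dloss  # clipping disabled
    gradient_norm = np.sqrt(np.dot(x, x)) * abs(dloss)
    if gradient_norm > gradient_clip_norm:
        dloss *= gradient_clip_norm / gradient_norm
    return dloss

# dloss = -5000 on x = [1, 1] gives a gradient norm of about 7071; a
# threshold of 0.5 rescales dloss to roughly -0.354.
print(clip_dloss(-5000.0, np.array([1.0, 1.0]), 0.5))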
12 changes: 12 additions & 0 deletions sklearn/linear_model/_stochastic_gradient.py
@@ -113,6 +113,7 @@ def __init__(
n_iter_no_change=5,
warm_start=False,
average=False,
gradient_clip_norm=0,
):
self.loss = loss
self.penalty = penalty
@@ -132,6 +133,7 @@ def __init__(
self.n_iter_no_change = n_iter_no_change
self.warm_start = warm_start
self.average = average
self.gradient_clip_norm = gradient_clip_norm
self.max_iter = max_iter
self.tol = tol

@@ -1422,6 +1424,7 @@ def __init__(
n_iter_no_change=5,
warm_start=False,
average=False,
gradient_clip_norm=0,
):
super().__init__(
loss=loss,
@@ -1443,6 +1446,7 @@ def __init__(
n_iter_no_change=n_iter_no_change,
warm_start=warm_start,
average=average,
gradient_clip_norm=gradient_clip_norm,
)

def _partial_fit(
@@ -1753,6 +1757,7 @@ def _fit_regressor(
self.t_,
intercept_decay,
self.average,
self.gradient_clip_norm,
)

self.t_ += self.n_iter_ * X.shape[0]
@@ -1950,6 +1955,10 @@ class SGDRegressor(BaseSGDRegressor):
samples seen reaches `average`. So ``average=10`` will begin
averaging after seeing 10 samples.

gradient_clip_norm : float, default=0
If greater than 0, the L2 norm of the gradient of the weights is
clipped to `gradient_clip_norm` at each update step. If 0 (default),
no clipping is performed.

Attributes
----------
coef_ : ndarray of shape (n_features,)
@@ -2016,6 +2025,7 @@ class SGDRegressor(BaseSGDRegressor):
],
"epsilon": [Interval(Real, 0, None, closed="left")],
"eta0": [Interval(Real, 0, None, closed="left")],
"gradient_clip_norm": [Interval(Real, 0, None, closed="left")],
}

def __init__(
@@ -2040,6 +2050,7 @@ def __init__(
n_iter_no_change=5,
warm_start=False,
average=False,
gradient_clip_norm=0,
):
super().__init__(
loss=loss,
@@ -2061,6 +2072,7 @@ def __init__(
n_iter_no_change=n_iter_no_change,
warm_start=warm_start,
average=average,
gradient_clip_norm=gradient_clip_norm,
)


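At the estimator level the new keyword defaults to 0 (no clipping) and is validated against `Interval(Real, 0, None, closed="left")`, so negative values should be rejected when `fit` runs scikit-learn's parameter validation. A small sketch, assuming the branch in this PR:

import numpy as np
from sklearn.linear_model import SGDRegressor

X, y = np.array([[1.0, 2.0], [2.0, 3.0]]), np.array([1.0, 2.0])

# The default, gradient_clip_norm=0, keeps the existing unclipped behaviour.
SGDRegressor(gradient_clip_norm=0, random_state=0).fit(X, y)

# Negative values fall outside [0, inf) and should raise at fit time.
try:
    SGDRegressor(gradient_clip_norm=-1.0).fit(X, y)
except Exception as exc:
    print(type(exc).__name__)  # expected: InvalidParameterError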
154 changes: 154 additions & 0 deletions sklearn/linear_model/tests/test_sgd.py
@@ -2180,3 +2180,157 @@ def test_sgd_one_class_svm_estimator_type():
"""
sgd_ocsvm = SGDOneClassSVM()
assert get_tags(sgd_ocsvm).estimator_type == "outlier_detector"


def test_sgd_regressor_gradient_clip_norm():

clipped_weights_results = []
for gradient_clip_norm in [0.01, 0.1, 0.5]:
# Step 1: Train without gradient clipping
X = np.array([[1, 1], [1, 1], [1, 1], [1, 1]]) # Simple 2D data
y = np.array(
[5000, 5000, 5000, 5000]
) # Labels with large values to induce large gradients

# These targets produce very large gradients: with zeroed weights the
# initial weight gradient has norm 5000 * sqrt(2) (about 7071), far above
# every clip threshold tested here, so clipping is guaranteed to occur.
clf_no_clip = SGDRegressor(
max_iter=1,
tol=None,
eta0=0.05,
warm_start=True,
random_state=42,
gradient_clip_norm=0, # No clipping
penalty=None,
alpha=0,
)
clf_no_clip.fit(X, y)
clf_no_clip.coef_ = np.zeros_like(clf_no_clip.coef_)
clf_no_clip.intercept_ = np.zeros_like(clf_no_clip.intercept_)
clf_no_clip.fit(X, y)
weights_no_clip = clf_no_clip.coef_.copy()

clf_with_clip = SGDRegressor(
max_iter=1,
tol=None,
eta0=0.05,
warm_start=True,
random_state=42,
gradient_clip_norm=gradient_clip_norm, # Clipping threshold
penalty=None,
alpha=0,
)
clf_with_clip.fit(X, y)
clf_with_clip.coef_ = np.zeros_like(clf_with_clip.coef_)
clf_with_clip.intercept_ = np.zeros_like(clf_with_clip.intercept_)
clf_with_clip.fit(X, y)
weights_with_clip = clf_with_clip.coef_.copy()

weight_norm_with_clip = np.linalg.norm(weights_with_clip)
assert (
weight_norm_with_clip <= gradient_clip_norm
), f"Norm {weight_norm_with_clip} exceeds threshold {gradient_clip_norm}"

weight_norm_no_clip = np.linalg.norm(weights_no_clip)
assert (
weight_norm_no_clip > gradient_clip_norm * 100
), f"Unclipped norm should exceed the threshold {gradient_clip_norm} * 100"
clipped_weights_results.append(weight_norm_with_clip)

# Check that the clipped weights are strictly increasing with increasing clip norm
assert np.all(
np.diff(clipped_weights_results) > 0
), "Clipped weights should increase with increasing clip norm."


@pytest.mark.parametrize("gradient_clip_norm", [0.5, 1.0, 5.0])
def test_sgd_gradient_clipping_math(gradient_clip_norm):
"""Test that the gradient clipping math is correct."""
X = np.array([[1, 1]])
y = np.array([5000])

clf = SGDRegressor(
max_iter=1,
tol=None,
alpha=0,
warm_start=True,
random_state=42,
gradient_clip_norm=gradient_clip_norm,
penalty=None,
eta0=1.0,
)
clf.fit(X, y)
clf.coef_ = np.zeros_like(clf.coef_)
clf.intercept_ = np.zeros_like(clf.intercept_)
clf.fit(X, y)

weights_with_clip = clf.coef_.copy()
weight_norm_with_clip = np.linalg.norm(weights_with_clip)
assert (
abs(weight_norm_with_clip - gradient_clip_norm) < 1e-9
), f"Weight norm {weight_norm_with_clip} is not {gradient_clip_norm}"


def test_sgd_regressor_no_clipping():
"""Test that the gradient is not clipped when gradient_clip_norm=0."""
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5]])
y = np.array([1, 2, 3, 4])

reg_no_clip = SGDRegressor(
max_iter=1,
tol=None,
alpha=0.01,
learning_rate="constant",
eta0=0.1,
gradient_clip_norm=0, # No clipping
random_state=42,
)
reg_clip = SGDRegressor(
max_iter=1,
tol=None,
alpha=0.01,
learning_rate="constant",
eta0=0.1,
gradient_clip_norm=1000, # Large enough to have no effect
random_state=42,
)

reg_no_clip.fit(X, y)
reg_clip.fit(X, y)

assert_array_almost_equal(reg_no_clip.coef_, reg_clip.coef_, decimal=6)


def test_sgd_regressor_weights_change_with_clipping():
"""Test that coefficients are affected by gradient clipping."""
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5]])
y = np.array([1, 2, 3, 4])

reg_clip = SGDRegressor(
max_iter=10,
tol=None,
alpha=0.01,
learning_rate="constant",
eta0=0.1,
gradient_clip_norm=1.0,
random_state=42,
)

reg_no_clip = SGDRegressor(
max_iter=10,
tol=None,
alpha=0.01,
learning_rate="constant",
eta0=0.1,
gradient_clip_norm=0, # No gradient clipping
random_state=42,
)

reg_clip.fit(X, y)
reg_no_clip.fit(X, y)

# The coefficients should differ, showing that gradient clipping had an effect
assert not np.allclose(
reg_clip.coef_, reg_no_clip.coef_, atol=1e-2
), "Gradient clipping should affect the weights."