[WIP] Add sparse matrix support for HistGradientBoostingClassifier #19187

Open · wants to merge 4 commits into main
121 changes: 109 additions & 12 deletions sklearn/ensemble/_hist_gradient_boosting/binning.py
@@ -8,6 +8,7 @@
# Author: Nicolas Hug

import numpy as np
import scipy.sparse as sp

from ...utils import check_random_state, check_array
from ...base import BaseEstimator, TransformerMixin
@@ -17,6 +18,52 @@
from ._bitset import set_bitset_memoryview


def _pack_matrix(sparse_matrix):
Review comment (Member): We can convert the input to CSC before feeding it into _BinMapper and then build the bins from there.
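A minimal sketch of that suggestion, assuming a CSC input and using only public scipy.sparse attributes (the variable names are illustrative):

    import scipy.sparse as sp

    X_csc = sp.random(100, 5, density=0.3, random_state=0, format="csc")
    for j in range(X_csc.shape[1]):
        # In CSC format the stored values of column j are the contiguous
        # slice data[indptr[j]:indptr[j + 1]], with the matching row
        # numbers in indices, so bins could be built column by column
        # without a packing step.
        start, end = X_csc.indptr[j], X_csc.indptr[j + 1]
        col_values = X_csc.data[start:end]
        col_rows = X_csc.indices[start:end]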

"""Pack a sparse matrix into a dense one.

Packing is done column-wise.

Parameters
----------
sparse_matrix : csr_matrix, shape (n_samples, n_features)

Return
------
dense_matrix : ndarray of shape(n_non_zero_samples, n_features)
The sparse_matrix packed, the number of rows is equal to
the biggest number of non-zero samples per column.
row_indices : ndarray of shape(n_non_zero_samples, n_features)
The initial row indice of the packed values
"""
columns = [[] for _ in range(sparse_matrix.shape[1])]
indices = [[] for _ in range(sparse_matrix.shape[1])]
row_indices, column_indices, values = sp.find(sparse_matrix)
for i in range(len(values)):
# sp.find return indices sorted by column so we can just
# append to the list
columns[column_indices[i]].append(values[i])
indices[column_indices[i]].append(row_indices[i])
n_rows = max(map(lambda x: len(x), columns))
# Fill the missing values
for j in range(len(columns)):
columns[j].extend([0 for _ in range(n_rows-len(columns[j]))])
indices[j].extend([np.nan for _ in range(n_rows-len(indices[j]))])
return np.array(columns, dtype=sparse_matrix.dtype).T, np.array(indices).T
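For illustration, a tiny worked example of the packed layout this produces (a sketch, not part of the diff):

    X = sp.csr_matrix(np.array([[1., 0.],
                                [0., 2.],
                                [3., 0.]]))
    dense, rows = _pack_matrix(X)
    # Column 0 has two non-zeros and column 1 has one, so both outputs
    # have two rows, padded with 0 (values) and NaN (row indices):
    # dense == [[1., 2.],     rows == [[0.,  1.],
    #           [3., 0.]]              [2., nan]]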


def _unpack_matrix(dense_matrix, row_indices, shape):
    """Unpack a matrix packed by _pack_matrix back into a sparse one."""
    sparse_matrix = sp.lil_matrix(shape, dtype=dense_matrix.dtype)
Review comment (Member): From my experience, using lil_matrix to construct medium-size matrices and then converting with tocsr is not great time-wise or memory-wise.

If we are building a binned matrix, we know for sure that it will have the same non-zero elements as the input sparse matrix. We can construct the output by copying over the indptr and indices, and setting the data to be the same shape but with the binned dtype.
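A minimal sketch of the construction described above, assuming a CSR input whose stored values have already been binned into a flat array (both _binned_csr_like and binned_nonzero are illustrative names):

    def _binned_csr_like(X_csr, binned_nonzero):
        # The binned matrix keeps the exact sparsity pattern of X_csr,
        # so indptr and indices can be copied verbatim; only the data
        # array is replaced, using the binned dtype.
        return sp.csr_matrix(
            (binned_nonzero.astype(X_BINNED_DTYPE),
             X_csr.indices.copy(), X_csr.indptr.copy()),
            shape=X_csr.shape)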

    for i in range(dense_matrix.shape[0]):
        for j in range(dense_matrix.shape[1]):
            # NaN row indices mark the padding added by _pack_matrix;
            # they carry no value to restore.
            if not np.isnan(row_indices[i, j]):
                sparse_matrix[int(row_indices[i, j]), j] = dense_matrix[i, j]
    return sparse_matrix.tocsr()


def _find_binning_thresholds(col_data, max_bins):
"""Extract quantiles from a continuous feature.

@@ -39,12 +86,21 @@ def _find_binning_thresholds(col_data, max_bins):
    A given value x will be mapped into bin value i iff
    binning_thresholds[i - 1] < x <= binning_thresholds[i]
    """
# ignore missing values when computing bin thresholds
missing_mask = np.isnan(col_data)
if missing_mask.any():
col_data = col_data[~missing_mask]
col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE)
distinct_values = np.unique(col_data)
    if sp.issparse(col_data):
        # col_data is only used to get the distinct values and to
        # compute percentiles, so the positions of the zeros do not
        # matter.
        distinct_values = col_data[col_data.nonzero()[0]].toarray()
        if col_data.nnz < col_data.shape[0]:
            # The column also holds implicit zeros, so 0 is one of the
            # distinct values.
            distinct_values = np.append(distinct_values, 0)
        distinct_values = np.unique(distinct_values)
        col_data = col_data.toarray()
else:
# ignore missing values when computing bin thresholds
missing_mask = np.isnan(col_data)
if missing_mask.any():
col_data = col_data[~missing_mask]
col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE)
distinct_values = np.unique(col_data)
if len(distinct_values) <= max_bins:
midpoints = distinct_values[:-1] + distinct_values[1:]
midpoints *= .5
@@ -172,13 +228,17 @@ def fit(self, X, y=None):
raise ValueError('n_bins={} should be no smaller than 3 '
'and no larger than 256.'.format(self.n_bins))

X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
X = check_array(X, dtype=[X_DTYPE], force_all_finite=False,
accept_sparse=True)
max_bins = self.n_bins - 1

rng = check_random_state(self.random_state)
if self.subsample is not None and X.shape[0] > self.subsample:
subset = rng.choice(X.shape[0], self.subsample, replace=False)
X = X.take(subset, axis=0)
if not sp.issparse(X):
X = X.take(subset, axis=0)
else:
X = X[subset, :]

if self.is_categorical is None:
self.is_categorical_ = np.zeros(X.shape[1], dtype=np.uint8)
@@ -248,17 +308,54 @@ def transform(self, X):
X_binned : array-like of shape (n_samples, n_features)
The binned data (fortran-aligned).
"""
X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
X = check_array(X, dtype=[X_DTYPE], force_all_finite=False,
accept_sparse=True)
check_is_fitted(self)
if X.shape[1] != self.n_bins_non_missing_.shape[0]:
raise ValueError(
'This estimator was fitted with {} features but {} got passed '
'to transform()'.format(self.n_bins_non_missing_.shape[0],
X.shape[1])
)
binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F')
_map_to_bins(X, self.bin_thresholds_, self.missing_values_bin_idx_,
binned)
if not sp.issparse(X):
binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F')
_map_to_bins(X, self.bin_thresholds_, self.missing_values_bin_idx_,
binned)
else:
            # Compute actual_bin_zeros, the bin index that the value 0
            # is mapped to.
zero = np.zeros((1, X.shape[1]), dtype=X_DTYPE)
binned = np.zeros_like(
zero, dtype=X_BINNED_DTYPE, order='F'
).reshape(-1, 1)
_map_to_bins(zero, self.bin_thresholds_,
self.missing_values_bin_idx_, binned)
actual_bin_zeros = binned.copy()
# Pack the matrix to be compatible with _map_to_bins
packed_data, indices = _pack_matrix(X)
packed_binned_data = np.zeros_like(
packed_data, dtype=X_BINNED_DTYPE
).reshape(-1, 1)
_map_to_bins(packed_data, self.bin_thresholds_,
self.missing_values_bin_idx_, packed_binned_data)

            def shift_bin(bin_index):
                """Shift the bins below actual_bin_zeros one position
                to the right and map actual_bin_zeros itself to bin 0.
                """
                if 0 <= bin_index < actual_bin_zeros:
                    return bin_index + 1
                elif bin_index == actual_bin_zeros:
                    return 0
                else:
                    return bin_index
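            # For example, with actual_bin_zeros == 3 the mapping is
            # 0 -> 1, 1 -> 2, 2 -> 3 and 3 -> 0, while larger bins are
            # unchanged: bin 0 is freed up so that the implicit zeros of
            # the output sparse matrix encode the bin of the value 0.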

packed_binned_data = np.array(
list(map(shift_bin, packed_binned_data)),
dtype=X_BINNED_DTYPE
).reshape(-1, 1)
# Unpack the matrix back to a sparse one
binned = _unpack_matrix(packed_binned_data, indices, X.shape)
return binned, actual_bin_zeros
return binned

def make_known_categories_bitsets(self):
82 changes: 81 additions & 1 deletion sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py
@@ -1,11 +1,14 @@
import numpy as np
import scipy.sparse as sp
from numpy.testing import assert_array_equal, assert_allclose
import pytest

from sklearn.ensemble._hist_gradient_boosting.binning import (
_BinMapper,
_find_binning_thresholds,
_map_to_bins
_map_to_bins,
_pack_matrix,
_unpack_matrix
)
from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
@@ -56,6 +59,42 @@ def test_find_binning_thresholds_random_data():
np.array([9.99, 10.00, 10.01]), atol=1e-2)


def test_find_binning_thresholds_sparse_data():
data = np.linspace(0, 10, 11)
data = sp.csr_matrix(
(data, (data.astype(int), [0]*len(data))),
(len(data)+10, 1)
)

bin_thresholds = _find_binning_thresholds(data, max_bins=5)
assert_allclose(bin_thresholds, [0, 0, 2, 6])

bin_thresholds = _find_binning_thresholds(data, max_bins=10)
assert_allclose(bin_thresholds, [0, 0, 0, 0, 0, 2, 4, 6, 8])

    bin_thresholds = _find_binning_thresholds(data, max_bins=21)
    assert_allclose(bin_thresholds, np.arange(10) + .5)

    bin_thresholds = _find_binning_thresholds(data, max_bins=255)
    assert_allclose(bin_thresholds, np.arange(10) + .5)


@pytest.mark.parametrize("n_samples, zero_fraction, max_bins", [
    (10, 0.8, 10),
    (100, 0.5, 10),
    (1000, 0.2, 50)
])
def test_find_binning_thresholds_large_sparse_data(n_samples, zero_fraction,
                                                   max_bins):
    rng = np.random.RandomState(42)
    data = rng.randn(n_samples).reshape(-1, 1)
    data[:int(zero_fraction * n_samples)] = 0
    sparse_data = sp.csr_matrix(data)
    thresholds = _find_binning_thresholds(data, max_bins=max_bins)
    thresholds_sparse = _find_binning_thresholds(
        sparse_data, max_bins=max_bins)
    assert_array_equal(thresholds, thresholds_sparse)


def test_find_binning_thresholds_low_n_bins():
bin_thresholds = [_find_binning_thresholds(DATA[:, i], max_bins=128)
for i in range(2)]
@@ -151,6 +190,47 @@ def test_bin_mapper_small_random_data(n_samples, max_bins):
np.arange(n_samples))


@pytest.mark.parametrize("n_samples, density, max_bins", [
(5, 0.5, 5),
(50, 0.1, 10),
(1000, 0.2, 50)
])
def test_bin_mapper_sparse(n_samples, density, max_bins):
data = sp.random(n_samples, 1, density=density,
random_state=42, format="csr")

# max_bins is the number of bins for non-missing values
n_bins = max_bins + 1
mapper = _BinMapper(n_bins=n_bins, random_state=42)
binned, actual_bin_zero = mapper.fit_transform(data)

assert binned.shape == data.shape
assert binned.dtype == np.uint8


@pytest.mark.parametrize("n_samples, n_features", [
(10, 5),
(100, 50),
(100, 255),
(1000, 50)
])
def test_pack_unpack_matrix(n_samples, n_features):
data = sp.random(n_samples, n_features, density=0.5,
random_state=42, format="csr")
packed, indices = _pack_matrix(data)
assert packed.shape[1] == n_features
    # The packed matrix has as many rows as the largest number of
    # stored non-zero entries in any single column.
    max_n_samples = max(data.getcol(j).nnz for j in range(n_features))
    assert packed.shape[0] == max_n_samples
unpacked = _unpack_matrix(packed, indices, data.shape)
assert unpacked.shape == data.shape
assert_array_equal(data.toarray().ravel(), unpacked.toarray().ravel())


@pytest.mark.parametrize("max_bins, n_distinct, multiplier", [
(5, 5, 1),
(5, 5, 3),