[WIP] Add sparse matrix support for HistGradientBoostingClassifier #19187

Open · wants to merge 4 commits into main
121 changes: 109 additions & 12 deletions sklearn/ensemble/_hist_gradient_boosting/binning.py
@@ -8,6 +8,7 @@
# Author: Nicolas Hug

import numpy as np
import scipy.sparse as sp

from ...utils import check_random_state, check_array
from ...base import BaseEstimator, TransformerMixin
@@ -17,6 +18,52 @@
from ._bitset import set_bitset_memoryview


def _pack_matrix(sparse_matrix):
Review comment (Member): We can convert the input to CSC before feeding it into _BinMapper and then build the bins from there.
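A minimal sketch of that suggestion, assuming a CSC input and using only public scipy.sparse attributes (the variable names are illustrative):

    import scipy.sparse as sp

    X_csc = sp.random(100, 5, density=0.3, random_state=0, format="csc")
    for j in range(X_csc.shape[1]):
        # In CSC format the stored values of column j are the contiguous
        # slice data[indptr[j]:indptr[j + 1]], with the matching row
        # numbers in indices, so bins could be built column by column
        # without a packing step.
        start, end = X_csc.indptr[j], X_csc.indptr[j + 1]
        col_values = X_csc.data[start:end]
        col_rows = X_csc.indices[start:end]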

"""Pack a sparse matrix into a dense one.

Packing is done column-wise.

Parameters
----------
sparse_matrix : csr_matrix, shape (n_samples, n_features)

Return
------
dense_matrix : ndarray of shape(n_non_zero_samples, n_features)
The sparse_matrix packed, the number of rows is equal to
the biggest number of non-zero samples per column.
row_indices : ndarray of shape(n_non_zero_samples, n_features)
The initial row indice of the packed values
"""
columns = [[] for _ in range(sparse_matrix.shape[1])]
indices = [[] for _ in range(sparse_matrix.shape[1])]
row_indices, column_indices, values = sp.find(sparse_matrix)
for i in range(len(values)):
# sp.find return indices sorted by column so we can just
# append to the list
columns[column_indices[i]].append(values[i])
indices[column_indices[i]].append(row_indices[i])
n_rows = max(map(lambda x: len(x), columns))
# Fill the missing values
for j in range(len(columns)):
columns[j].extend([0 for _ in range(n_rows-len(columns[j]))])
indices[j].extend([np.nan for _ in range(n_rows-len(indices[j]))])
return np.array(columns, dtype=sparse_matrix.dtype).T, np.array(indices).T
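For illustration, a tiny worked example of the packed layout this produces (a sketch, not part of the diff):

    X = sp.csr_matrix(np.array([[1., 0.],
                                [0., 2.],
                                [3., 0.]]))
    dense, rows = _pack_matrix(X)
    # Column 0 has two non-zeros and column 1 has one, so both outputs
    # have two rows, padded with 0 (values) and NaN (row indices):
    # dense == [[1., 2.],     rows == [[0.,  1.],
    #           [3., 0.]]              [2., nan]]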


def _unpack_matrix(dense_matrix, row_indices, shape):
    """Unpack a matrix packed by _pack_matrix back into a sparse one."""
    sparse_matrix = sp.lil_matrix(shape, dtype=dense_matrix.dtype)
Review comment (Member): From my experience, using lil_matrix to construct medium-size matrices and then converting with tocsr is not great time-wise or memory-wise.

If we are building a binned matrix, we know for sure that it will have the same non-zero elements as the input sparse matrix. We can construct the output by copying over the indptr and indices, and setting the data to be the same shape but with the binned dtype.
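A minimal sketch of the construction described above, assuming a CSR input whose stored values have already been binned into a flat array (both _binned_csr_like and binned_nonzero are illustrative names):

    def _binned_csr_like(X_csr, binned_nonzero):
        # The binned matrix keeps the exact sparsity pattern of X_csr,
        # so indptr and indices can be copied verbatim; only the data
        # array is replaced, using the binned dtype.
        return sp.csr_matrix(
            (binned_nonzero.astype(X_BINNED_DTYPE),
             X_csr.indices.copy(), X_csr.indptr.copy()),
            shape=X_csr.shape)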

    for i in range(dense_matrix.shape[0]):
        for j in range(dense_matrix.shape[1]):
            # NaN row indices mark the padding added by _pack_matrix;
            # they carry no value to restore.
            if not np.isnan(row_indices[i, j]):
                sparse_matrix[int(row_indices[i, j]), j] = dense_matrix[i, j]
    return sparse_matrix.tocsr()


def _find_binning_thresholds(col_data, max_bins):
"""Extract quantiles from a continuous feature.

@@ -39,12 +86,21 @@ def _find_binning_thresholds(col_data, max_bins):
    A given value x will be mapped into bin value i iff
    binning_thresholds[i - 1] < x <= binning_thresholds[i]
    """
# ignore missing values when computing bin thresholds
missing_mask = np.isnan(col_data)
if missing_mask.any():
col_data = col_data[~missing_mask]
col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE)
distinct_values = np.unique(col_data)
    if sp.issparse(col_data):
        # col_data is only used to get the distinct values and to
        # compute percentiles, so the positions of the zeros do not
        # matter.
        distinct_values = col_data[col_data.nonzero()[0]].toarray()
        if col_data.nnz < col_data.shape[0]:
            # The column also holds implicit zeros, so 0 is one of the
            # distinct values.
            distinct_values = np.append(distinct_values, 0)
        distinct_values = np.unique(distinct_values)
        col_data = col_data.toarray()
else:
# ignore missing values when computing bin thresholds
missing_mask = np.isnan(col_data)
if missing_mask.any():
col_data = col_data[~missing_mask]
col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE)
distinct_values = np.unique(col_data)
if len(distinct_values) <= max_bins:
midpoints = distinct_values[:-1] + distinct_values[1:]
midpoints *= .5
@@ -172,13 +228,17 @@ def fit(self, X, y=None):
raise ValueError('n_bins={} should be no smaller than 3 '
'and no larger than 256.'.format(self.n_bins))

X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
X = check_array(X, dtype=[X_DTYPE], force_all_finite=False,
accept_sparse=True)
max_bins = self.n_bins - 1

rng = check_random_state(self.random_state)
if self.subsample is not None and X.shape[0] > self.subsample:
subset = rng.choice(X.shape[0], self.subsample, replace=False)
X = X.take(subset, axis=0)
if not sp.issparse(X):
X = X.take(subset, axis=0)
else:
X = X[subset, :]

if self.is_categorical is None:
self.is_categorical_ = np.zeros(X.shape[1], dtype=np.uint8)
@@ -248,17 +308,54 @@ def transform(self, X):
X_binned : array-like of shape (n_samples, n_features)
The binned data (fortran-aligned).
"""
X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
X = check_array(X, dtype=[X_DTYPE], force_all_finite=False,
accept_sparse=True)
check_is_fitted(self)
if X.shape[1] != self.n_bins_non_missing_.shape[0]:
raise ValueError(
'This estimator was fitted with {} features but {} got passed '
'to transform()'.format(self.n_bins_non_missing_.shape[0],
X.shape[1])
)
binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F')
_map_to_bins(X, self.bin_thresholds_, self.missing_values_bin_idx_,
binned)
if not sp.issparse(X):
binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F')
_map_to_bins(X, self.bin_thresholds_, self.missing_values_bin_idx_,
binned)
else:
            # Compute actual_bin_zeros, the bin index that the value 0
            # is mapped to.
zero = np.zeros((1, X.shape[1]), dtype=X_DTYPE)
binned = np.zeros_like(
zero, dtype=X_BINNED_DTYPE, order='F'
).reshape(-1, 1)
_map_to_bins(zero, self.bin_thresholds_,
self.missing_values_bin_idx_, binned)
actual_bin_zeros = binned.copy()
# Pack the matrix to be compatible with _map_to_bins
packed_data, indices = _pack_matrix(X)
packed_binned_data = np.zeros_like(
packed_data, dtype=X_BINNED_DTYPE
).reshape(-1, 1)
_map_to_bins(packed_data, self.bin_thresholds_,
self.missing_values_bin_idx_, packed_binned_data)

            def shift_bin(bin_index):
                """Shift the bins below actual_bin_zeros one position
                to the right and map actual_bin_zeros itself to bin 0.
                """
                if 0 <= bin_index < actual_bin_zeros:
                    return bin_index + 1
                elif bin_index == actual_bin_zeros:
                    return 0
                else:
                    return bin_index
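            # For example, with actual_bin_zeros == 3 the mapping is
            # 0 -> 1, 1 -> 2, 2 -> 3 and 3 -> 0, while larger bins are
            # unchanged: bin 0 is freed up so that the implicit zeros of
            # the output sparse matrix encode the bin of the value 0.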

packed_binned_data = np.array(
list(map(shift_bin, packed_binned_data)),
dtype=X_BINNED_DTYPE
).reshape(-1, 1)
# Unpack the matrix back to a sparse one
binned = _unpack_matrix(packed_binned_data, indices, X.shape)
return binned, actual_bin_zeros
return binned

def make_known_categories_bitsets(self):
82 changes: 81 additions & 1 deletion sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py
@@ -1,11 +1,14 @@
import numpy as np
import scipy.sparse as sp
from numpy.testing import assert_array_equal, assert_allclose
import pytest

from sklearn.ensemble._hist_gradient_boosting.binning import (
_BinMapper,
_find_binning_thresholds,
_map_to_bins
_map_to_bins,
_pack_matrix,
_unpack_matrix
)
from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
@@ -56,6 +59,42 @@ def test_find_binning_thresholds_random_data():
np.array([9.99, 10.00, 10.01]), atol=1e-2)


def test_find_binning_thresholds_sparse_data():
data = np.linspace(0, 10, 11)
data = sp.csr_matrix(
(data, (data.astype(int), [0]*len(data))),
(len(data)+10, 1)
)

bin_thresholds = _find_binning_thresholds(data, max_bins=5)
assert_allclose(bin_thresholds, [0, 0, 2, 6])

bin_thresholds = _find_binning_thresholds(data, max_bins=10)
assert_allclose(bin_thresholds, [0, 0, 0, 0, 0, 2, 4, 6, 8])

    bin_thresholds = _find_binning_thresholds(data, max_bins=21)
    assert_allclose(bin_thresholds, np.arange(10) + .5)

    bin_thresholds = _find_binning_thresholds(data, max_bins=255)
    assert_allclose(bin_thresholds, np.arange(10) + .5)


@pytest.mark.parametrize("n_samples, zero_fraction, max_bins", [
    (10, 0.8, 10),
    (100, 0.5, 10),
    (1000, 0.2, 50)
])
def test_find_binning_thresholds_large_sparse_data(n_samples, zero_fraction,
                                                   max_bins):
    rng = np.random.RandomState(42)
    data = rng.randn(n_samples).reshape(-1, 1)
    data[:int(zero_fraction * n_samples)] = 0
    sparse_data = sp.csr_matrix(data)
    thresholds = _find_binning_thresholds(data, max_bins=max_bins)
    thresholds_sparse = _find_binning_thresholds(
        sparse_data, max_bins=max_bins)
    assert_array_equal(thresholds, thresholds_sparse)


def test_find_binning_thresholds_low_n_bins():
bin_thresholds = [_find_binning_thresholds(DATA[:, i], max_bins=128)
for i in range(2)]
@@ -151,6 +190,47 @@ def test_bin_mapper_small_random_data(n_samples, max_bins):
np.arange(n_samples))


@pytest.mark.parametrize("n_samples, density, max_bins", [
(5, 0.5, 5),
(50, 0.1, 10),
(1000, 0.2, 50)
])
def test_bin_mapper_sparse(n_samples, density, max_bins):
data = sp.random(n_samples, 1, density=density,
random_state=42, format="csr")

# max_bins is the number of bins for non-missing values
n_bins = max_bins + 1
mapper = _BinMapper(n_bins=n_bins, random_state=42)
binned, actual_bin_zero = mapper.fit_transform(data)

assert binned.shape == data.shape
assert binned.dtype == np.uint8


@pytest.mark.parametrize("n_samples, n_features", [
(10, 5),
(100, 50),
(100, 255),
(1000, 50)
])
def test_pack_unpack_matrix(n_samples, n_features):
data = sp.random(n_samples, n_features, density=0.5,
random_state=42, format="csr")
packed, indices = _pack_matrix(data)
assert packed.shape[1] == n_features
    # The packed matrix has as many rows as the largest number of
    # stored non-zero entries in any single column.
    max_n_samples = max(data.getcol(j).nnz for j in range(n_features))
    assert packed.shape[0] == max_n_samples
unpacked = _unpack_matrix(packed, indices, data.shape)
assert unpacked.shape == data.shape
assert_array_equal(data.toarray().ravel(), unpacked.toarray().ravel())


@pytest.mark.parametrize("max_bins, n_distinct, multiplier", [
(5, 5, 1),
(5, 5, 3),