Skip to content

Commit

Permalink
FIX Fix ExtraTreeRegressor missing data handling (#30318)
Browse files Browse the repository at this point in the history
  • Loading branch information
lesteve authored Nov 22, 2024
1 parent 32a228d commit 27a903b
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
support missing values in the data matrix ``X``. Missing values are handled by
randomly moving all of the samples to the left or right child node as the tree is
traversed.
By :user:`Adam Li <adam2392>`
By :user:`Adam Li <adam2392>` and :user:`Loïc Estève <lesteve>`
5 changes: 5 additions & 0 deletions doc/whats_new/upcoming_changes/sklearn.tree/30318.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
- :class:`tree.ExtraTreeClassifier` and :class:`tree.ExtraTreeRegressor` now
support missing values in the data matrix ``X``. Missing values are handled by
randomly moving all of the samples to the left or right child node as the tree is
traversed.
By :user:`Adam Li <adam2392>` and :user:`Loïc Estève <lesteve>`
2 changes: 1 addition & 1 deletion sklearn/tree/_partitioner.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ cdef class DensePartitioner:
"""Partition samples for feature_values at the current_threshold."""
cdef:
intp_t p = self.start
intp_t partition_end = self.end
intp_t partition_end = self.end - self.n_missing
intp_t[::1] samples = self.samples
float32_t[::1] feature_values = self.feature_values

Expand Down
20 changes: 16 additions & 4 deletions sklearn/tree/tests/test_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -2689,10 +2689,8 @@ def test_regression_tree_missing_values_toy(Tree, X, criterion):
impurity = tree.tree_.impurity
assert all(impurity >= 0), impurity.min() # MSE should always be positive

# Note: the impurity matches after the first split only on greedy trees
if Tree is DecisionTreeRegressor:
# Check the impurity match after the first split
assert_allclose(tree.tree_.impurity[:2], tree_ref.tree_.impurity[:2])
# Check the impurity match after the first split
assert_allclose(tree.tree_.impurity[:2], tree_ref.tree_.impurity[:2])

# Find the leaves with a single sample where the MSE should be 0
leaves_idx = np.flatnonzero(
Expand All @@ -2701,6 +2699,20 @@ def test_regression_tree_missing_values_toy(Tree, X, criterion):
assert_allclose(tree.tree_.impurity[leaves_idx], 0.0)


def test_regression_extra_tree_missing_values_toy(global_random_seed):
    """Check that ExtraTreeRegressor impurities stay non-negative with NaNs.

    A single-feature toy dataset of 100 samples is built in which 20 feature
    values are replaced by NaN; the rows are then shuffled so the missing
    entries are spread across the dataset. A depth-limited extra tree is
    fitted and every node impurity is required to be >= 0.

    Parameters
    ----------
    global_random_seed : int
        Seed used both for shuffling the rows of ``X`` and as the
        ``random_state`` of the estimator.
    """
    n_samples, n_missing = 100, 20

    # One column holding 0..n_samples-1, with the trailing rows set to NaN.
    X = np.arange(n_samples, dtype=np.float64)[:, np.newaxis]
    X[n_samples - n_missing :] = np.nan

    # Shuffle the rows in place so that the NaNs are no longer grouped at the
    # end of the array.
    np.random.RandomState(global_random_seed).shuffle(X)
    y = np.arange(n_samples)

    reg = ExtraTreeRegressor(max_depth=5, random_state=global_random_seed)
    reg.fit(X, y)

    node_impurity = reg.tree_.impurity
    # The impurity (an MSE here) can never be negative; show the whole array
    # on failure to help locate the offending node.
    assert all(node_impurity >= 0), node_impurity


def test_classification_tree_missing_values_toy():
"""Check that we properly handle missing values in clasification trees using a toy
dataset.
Expand Down

0 comments on commit 27a903b

Please sign in to comment.