Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/expected categories #1597

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Prev Previous commit
Next Next commit
Adjust default params, update respective docs
  • Loading branch information
ColdTeapot273K committed Oct 1, 2024
commit 4ce2ade7f5bc872d6634ed39e46fa5c1af10efa4
31 changes: 23 additions & 8 deletions river/preprocessing/one_hot.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,27 @@ class OneHotEncoder(base.MiniBatchTransformer):

Parameters
----------
categories
Categories (unique values) per feature:
`None` : Determine categories automatically from the training data.

dict of dicts : Expected categories for each feature. The outer dict maps each feature to its inner dict.
The inner dict maps each category to its code.

The used categories can be found in the `values` attribute.
drop_zeros
Whether or not 0s should be made explicit.
drop_first
Whether to get `k - 1` dummies out of `k` categorical levels by removing the first key.
This is useful in some statistical models where perfectly collinear features cause
problems.

Attributes
----------
values
A dict of dicts. The outer dict maps each feature to its inner dict. The inner dict maps
each category to its code.

Examples
--------

Expand Down Expand Up @@ -157,6 +171,7 @@ class OneHotEncoder(base.MiniBatchTransformer):
>>> from pprint import pprint
>>> import random
>>> import string
>>> import pandas as pd

>>> random.seed(42)
>>> alphabet = list(string.ascii_lowercase)
Expand Down Expand Up @@ -218,23 +233,23 @@ class OneHotEncoder(base.MiniBatchTransformer):

>>> oh = preprocessing.OneHotEncoder(categories=categories)

# oh = preprocessing.OneHotEncoder()

>>> oh.learn_many(X)
>>> df = oh.transform_many(X)
>>> df.sort_index(axis="columns")
c1_a c1_h c2_e c2_x
0 0 0 0 0
1 1 0 0 1
2 0 0 0 0

"""

def __init__(self, categories = "auto", drop_zeros=False, drop_first=False):
def __init__(self, categories: dict | None = None, drop_zeros=False, drop_first=False):
self.drop_zeros = drop_zeros
self.drop_first = drop_first
self.categories = categories

if self.categories == "auto":
if self.categories is None:
self.values = collections.defaultdict(set)
else:
self.values = self.categories
Expand All @@ -245,7 +260,7 @@ def learn_one(self, x):

# NOTE: assume if category mappings are explicitly provided,
# they're intended to be kept fixed.
if self.categories == "auto":
if self.categories is None:
for i, xi in x.items():
if isinstance(xi, list) or isinstance(xi, set):
for xj in xi:
Expand All @@ -263,7 +278,7 @@ def transform_one(self, x, y=None):
# Add 1
# NOTE: assume if category mappings are explicitly provided,
# no other category values are allowed for output. Aligns with `sklearn` behavior.
if self.categories == "auto":
if self.categories is None:
for i, xi in x.items():
if isinstance(xi, list) or isinstance(xi, set):
for xj in xi:
Expand Down Expand Up @@ -291,7 +306,7 @@ def learn_many(self, X):

# NOTE: assume if category mappings are explicitly provided,
# they're intended to be kept fixed.
if self.categories == "auto":
if self.categories is None:
for col in X.columns:
self.values[col].update(X[col].unique())

Expand All @@ -300,7 +315,7 @@ def transform_many(self, X):

# NOTE: assume if category mappings are explicitly provided,
# no other category values are allowed for output. Aligns with `sklearn` behavior.
if self.categories != "auto":
if self.categories is not None:
seen_in_the_past = {f"{col}_{val}" for col, vals in self.values.items() for val in vals}
to_remove = set(oh.columns) - seen_in_the_past
oh.drop(columns=list(to_remove), inplace=True)
Expand Down
23 changes: 14 additions & 9 deletions river/preprocessing/ordinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,14 @@ class OrdinalEncoder(base.MiniBatchTransformer):

Parameters
----------
categories
Categories (unique values) per feature:
`None` : Determine categories automatically from the training data.

dict of dicts : Expected categories for each feature. The outer dict maps each feature to its inner dict.
The inner dict maps each category to its code.

The used categories can be found in the `values` attribute.
unknown_value
The value to use for unknown categories seen during `transform_one`. Unknown categories
will be mapped to an integer once they are seen during `learn_one`. This value can be set
Expand All @@ -31,7 +39,7 @@ class OrdinalEncoder(base.MiniBatchTransformer):

Attributes
----------
categories
values
A dict of dicts. The outer dict maps each feature to its inner dict. The inner dict maps
each category to its code.

Expand Down Expand Up @@ -107,7 +115,7 @@ class OrdinalEncoder(base.MiniBatchTransformer):

def __init__(
self,
categories = "auto",
categories: dict | None = None,
unknown_value: int | None = 0,
none_value: int = -1,
):
Expand All @@ -116,7 +124,7 @@ def __init__(
self.categories = categories
self.values: collections.defaultdict | dict | None = None

if self.categories == "auto":
if self.categories is None:
# We're going to have one auto-incrementing counter per feature. This counter will generate
# the category codes for each feature.
self._counters: collections.defaultdict = collections.defaultdict(
Expand All @@ -129,15 +137,14 @@ def __init__(
else:
self.values = self.categories


def transform_one(self, x):
return {
i: self.none_value if xi is None else self.values[i].get(xi, self.unknown_value)
for i, xi in x.items()
}

def learn_one(self, x):
if self.categories == "auto":
if self.categories is None:
for i, xi in x.items():
if xi is not None and xi not in self.values[i]:
self.values[i][xi] = next(self._counters[i])
Expand All @@ -146,17 +153,15 @@ def transform_many(self, X):
return pd.DataFrame(
{
i: pd.Series(
X[i]
.map({**self.values[i], None: self.none_value})
.fillna(self.unknown_value),
X[i].map({**self.values[i], None: self.none_value}).fillna(self.unknown_value),
dtype=np.int64,
)
for i in X.columns
}
)

def learn_many(self, X, y=None):
if self.categories == "auto":
if self.categories is None:
for i in X.columns:
for xi in X[i].dropna().unique():
if xi not in self.values[i]:
Expand Down