Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/expected categories #1597

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Next Next commit
Add support for explicitly expected categories for OHE
  • Loading branch information
ColdTeapot273K committed Aug 23, 2024
commit 2fce4f3ba5e7a441888668773fc4c27aecff79f6
133 changes: 112 additions & 21 deletions river/preprocessing/one_hot.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,11 @@ class OneHotEncoder(base.MiniBatchTransformer):
... ]
>>> pprint(X)
[{'c1': 'u', 'c2': 'd'},
{'c1': 'a', 'c2': 'x'},
{'c1': 'i', 'c2': 'h'},
{'c1': 'h', 'c2': 'e'}]
{'c1': 'a', 'c2': 'x'},
{'c1': 'i', 'c2': 'h'},
{'c1': 'h', 'c2': 'e'}]

e can now apply one-hot encoding. All the provided are one-hot encoded, there is therefore
We can now apply one-hot encoding. All the provided features are one-hot encoded, there is therefore
no need to specify which features to encode.

>>> from river import preprocessing
Expand Down Expand Up @@ -85,6 +85,45 @@ class OneHotEncoder(base.MiniBatchTransformer):
{'c2_h': 1}
{'c2_e': 1}

Like in `scikit-learn`, you can also specify the expected categories manually.
This is handy when you want to constrain the category encoding space,
e.g. to the top 20% most popular category values you have picked in advance.

X = [
{
'c1': random.choice(alphabet),
'c2': random.choice(alphabet),
}
for _ in range(4)
]
pprint(X)

>>> categories = {'c1': {'a', 'h'}, 'c2': {'x', 'e'}}
>>> oh = preprocessing.OneHotEncoder(categories=categories)
>>> # oh = preprocessing.OneHotEncoder()
ColdTeapot273K marked this conversation as resolved.
Show resolved Hide resolved
>>> for x in X:
... oh.learn_one(x)
... pprint(oh.transform_one(x))
{'c1_a': 0, 'c1_h': 0, 'c2_e': 0, 'c2_x': 0}
{'c1_a': 1, 'c1_h': 0, 'c2_e': 0, 'c2_x': 1}
{'c1_a': 0, 'c1_h': 0, 'c2_e': 0, 'c2_x': 0}
{'c1_a': 0, 'c1_h': 1, 'c2_e': 1, 'c2_x': 0}

>>> for key in sorted(oh.values.keys()):
... print(key)
... print(sorted(oh.values[key]))
c1
['a', 'h']
c2
['e', 'x']


oh.values.items()
[{'c1': {'a', 'h'}, 'c2': {'e', 'x'}}]
[{'c1': {'a', 'h'}, 'c2': {'e', 'x'}}]

{'c1': {'a', 'h', 'i', 'u'}, 'c2': {'d', 'e', 'h', 'x'}}

A subset of the features can be one-hot encoded by piping a `compose.Select` into the
`OneHotEncoder`.

Expand Down Expand Up @@ -192,23 +231,53 @@ class OneHotEncoder(base.MiniBatchTransformer):
c2_x Sparse[uint8, 0]
dtype: object

Explicit categories:

>>> oh = preprocessing.OneHotEncoder(categories=categories)

# oh = preprocessing.OneHotEncoder()
>>> oh.learn_many(X)
>>> df = oh.transform_many(X)
>>> df.sort_index(axis="columns")
c1_a c1_h c2_e c2_x
0 0 0 0 0
1 1 0 0 1
2 0 0 0 0

# c1_a c1_i c1_u c2_d c2_h c2_x
# 0 0 0 1 1 0 0
# 1 1 0 0 0 0 1
# 2 0 1 0 0 1 0

c1_a c1_h c2_e c2_x
0 0 0 0 0
1 1 0 0 1
2 0 0 0 0
"""
ColdTeapot273K marked this conversation as resolved.
Show resolved Hide resolved

def __init__(self, drop_zeros=False, drop_first=False):
def __init__(self, categories: str | dict = "auto", drop_zeros=False, drop_first=False):
    """One-hot encoder.

    Parameters
    ----------
    categories
        Either ``"auto"`` (default), in which case the categories of each
        feature are learned incrementally, or a ``dict`` mapping feature
        names to the collection of allowed category values. Explicitly
        provided categories are kept fixed, mirroring scikit-learn's
        behavior.
    drop_zeros
        Whether or not to drop the entries that equal zero.
    drop_first
        Whether to drop the first encoded feature (in lexicographic
        order) to avoid collinearity.
    """
    self.drop_zeros = drop_zeros
    self.drop_first = drop_first
    self.categories = categories

    if self.categories == "auto":
        self.values = collections.defaultdict(set)
    else:
        # Defensive copy into a defaultdict of sets. This (a) avoids
        # aliasing the caller's dict, so outside mutation can't silently
        # change the encoder, and (b) prevents a KeyError when a feature
        # absent from `categories` shows up in the stream — membership
        # tests like `xi in self.values[i]` then see an empty set instead
        # of raising.
        self.values = collections.defaultdict(
            set, {feature: set(vals) for feature, vals in self.categories.items()}
        )

def learn_one(self, x):
    """Record the category values observed in `x`.

    Nothing is learned when `drop_zeros` is active, nor when explicit
    categories were provided at construction time — an explicit mapping
    is assumed to be intentionally fixed.
    """
    # Guard clauses: bail out whenever there is nothing to learn.
    if self.drop_zeros or self.categories != "auto":
        return

    for feature, value in x.items():
        # A feature may carry a collection of category values at once.
        if isinstance(value, (list, set)):
            self.values[feature].update(value)
        else:
            self.values[feature].add(value)

def transform_one(self, x, y=None):
oh = {}
Expand All @@ -217,13 +286,25 @@ def transform_one(self, x, y=None):
if not self.drop_zeros:
oh = {f"{i}_{v}": 0 for i, values in self.values.items() for v in values}

# Add 1s
for i, xi in x.items():
if isinstance(xi, list) or isinstance(xi, set):
for xj in xi:
oh[f"{i}_{xj}"] = 1
else:
oh[f"{i}_{xi}"] = 1
# Add 1
# NOTE: assume if category mappings are explicitly provided,
# no other category values are allowed for output. Aligns with `sklearn` behavior.
if self.categories == "auto":
for i, xi in x.items():
if isinstance(xi, list) or isinstance(xi, set):
for xj in xi:
oh[f"{i}_{xj}"] = 1
else:
oh[f"{i}_{xi}"] = 1
else:
for i, xi in x.items():
if isinstance(xi, list) or isinstance(xi, set):
for xj in xi:
if xj in self.values[i]:
oh[f"{i}_{xj}"] = 1
else:
if xi in self.values[i]:
oh[f"{i}_{xi}"] = 1

if self.drop_first:
oh.pop(min(oh.keys()))
def learn_many(self, X):
    """Record the category values observed in each column of `X`.

    Nothing is learned when `drop_zeros` is active, nor when explicit
    categories were provided at construction time — an explicit mapping
    is assumed to be intentionally fixed.

    Parameters
    ----------
    X
        A pandas DataFrame of categorical columns.
    """
    if self.drop_zeros:
        return

    # NOTE: assume if category mappings are explicitly provided,
    # they're intended to be kept fixed.
    if self.categories == "auto":
        for col in X.columns:
            self.values[col].update(X[col].unique())

def transform_many(self, X):
oh = pd.get_dummies(X, columns=X.columns, sparse=True, dtype="uint8")

# NOTE: assume if category mappings are explicitly provided,
# no other category values are allowed for output. Aligns with `sklearn` behavior.
if self.categories != "auto":
seen_in_the_past = {f"{col}_{val}" for col, vals in self.values.items() for val in vals}
to_remove = set(oh.columns) - seen_in_the_past
oh.drop(columns=list(to_remove), inplace=True)

if not self.drop_zeros:
seen_in_the_past = {f"{col}_{val}" for col, vals in self.values.items() for val in vals}
to_add = seen_in_the_past - set(oh.columns)
Expand Down