Adjust default params, update respective docs

online-ml · ColdTeapot273K · Aug 23, 2024 · Aug 23, 2024 · Aug 24, 2024 · Aug 24, 2024
commit 4ce2ade7f5bc872d6634ed39e46fa5c1af10efa4
@@ -19,13 +19,27 @@ class OneHotEncoder(base.MiniBatchTransformer):
 
     Parameters
     ----------
+    categories
+        Categories (unique values) per feature:
+            `None` : Determine categories automatically from the training data.
+
+            dict of dicts : Expected categories for each feature. The outer dict maps each feature to its inner dict.
+            The inner dict maps each category to its code.
+
+        The used categories can be found in the `values` attribute.
     drop_zeros
         Whether or not 0s should be made explicit.
     drop_first
         Whether to get `k - 1` dummies out of `k` categorical levels by removing the first key.
         This is useful in some statistical models where perfectly collinear features cause
         problems.
 
+    Attributes
+    ----------
+    values
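+        A dict mapping each feature to its categories. When `categories` is `None`, each feature
+        maps to the set of values seen so far; otherwise this is the dict of dicts passed as
+        `categories`, whose inner dicts map each category to its code.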
+
     Examples
     --------
 
@@ -157,6 +171,7 @@ class OneHotEncoder(base.MiniBatchTransformer):
     >>> from pprint import pprint
     >>> import random
     >>> import string
+    >>> import pandas as pd
 
     >>> random.seed(42)
     >>> alphabet = list(string.ascii_lowercase)
@@ -218,23 +233,23 @@ class OneHotEncoder(base.MiniBatchTransformer):
 
     >>> oh = preprocessing.OneHotEncoder(categories=categories)
 
-    # oh = preprocessing.OneHotEncoder()
+
     >>> oh.learn_many(X)
     >>> df = oh.transform_many(X)
     >>> df.sort_index(axis="columns")
        c1_a  c1_h  c2_e  c2_x
     0     0     0     0     0
     1     1     0     0     1
     2     0     0     0     0
-    
+
     """
 
-    def __init__(self, categories = "auto", drop_zeros=False, drop_first=False):
+    def __init__(self, categories: dict | None = None, drop_zeros=False, drop_first=False):
         self.drop_zeros = drop_zeros
         self.drop_first = drop_first
         self.categories = categories
 
-        if self.categories == "auto":
+        if self.categories is None:
             self.values = collections.defaultdict(set)
         else:
             self.values = self.categories
@@ -245,7 +260,7 @@ def learn_one(self, x):
 
         # NOTE: assume if category mappings are explicitly provided,
         # they're intended to be kept fixed.
-        if self.categories == "auto":
+        if self.categories is None:
             for i, xi in x.items():
                 if isinstance(xi, list) or isinstance(xi, set):
                     for xj in xi:
@@ -263,7 +278,7 @@ def transform_one(self, x, y=None):
         # Add 1
         # NOTE: assume if category mappings are explicitly provided,
         # no other category values are allowed for output. Aligns with `sklearn` behavior.
-        if self.categories == "auto":
+        if self.categories is None:
             for i, xi in x.items():
                 if isinstance(xi, list) or isinstance(xi, set):
                     for xj in xi:
@@ -291,7 +306,7 @@ def learn_many(self, X):
 
         # NOTE: assume if category mappings are explicitly provided,
         # they're intended to be kept fixed.
-        if self.categories == "auto":
+        if self.categories is None:
             for col in X.columns:
                 self.values[col].update(X[col].unique())
 
@@ -300,7 +315,7 @@ def transform_many(self, X):
 
         # NOTE: assume if category mappings are explicitly provided,
         # no other category values are allowed for output. Aligns with `sklearn` behavior.
-        if self.categories != "auto":
+        if self.categories is not None:
             seen_in_the_past = {f"{col}_{val}" for col, vals in self.values.items() for val in vals}
             to_remove = set(oh.columns) - seen_in_the_past
             oh.drop(columns=list(to_remove), inplace=True)

@@ -22,6 +22,14 @@ class OrdinalEncoder(base.MiniBatchTransformer):
 
     Parameters
     ----------
+    categories
+        Categories (unique values) per feature:
+            `None` : Determine categories automatically from the training data.
+
+            dict of dicts : Expected categories for each feature. The outer dict maps each feature to its inner dict.
+            The inner dict maps each category to its code.
+
+        The used categories can be found in the `values` attribute.
     unknown_value
         The value to use for unknown categories seen during `transform_one`. Unknown categories
         will be mapped to an integer once they are seen during `learn_one`. This value can be set
@@ -31,7 +39,7 @@ class OrdinalEncoder(base.MiniBatchTransformer):
 
     Attributes
     ----------
-    categories
+    values
         A dict of dicts. The outer dict maps each feature to its inner dict. The inner dict maps
         each category to its code.
 
@@ -107,7 +115,7 @@ class OrdinalEncoder(base.MiniBatchTransformer):
 
     def __init__(
         self,
-        categories = "auto",
+        categories: dict | None = None,
         unknown_value: int | None = 0,
         none_value: int = -1,
     ):
@@ -116,7 +124,7 @@ def __init__(
         self.categories = categories
         self.values: collections.defaultdict | dict | None = None
 
-        if self.categories == "auto":
+        if self.categories is None:
             # We're going to have one auto-incrementing counter per feature. This counter will generate
             # the category codes for each feature.
             self._counters: collections.defaultdict = collections.defaultdict(
@@ -129,15 +137,14 @@ def __init__(
         else:
             self.values = self.categories
 
-
     def transform_one(self, x):
         return {
             i: self.none_value if xi is None else self.values[i].get(xi, self.unknown_value)
             for i, xi in x.items()
         }
 
     def learn_one(self, x):
-        if self.categories == "auto":
+        if self.categories is None:
             for i, xi in x.items():
                 if xi is not None and xi not in self.values[i]:
                     self.values[i][xi] = next(self._counters[i])
@@ -146,17 +153,15 @@ def transform_many(self, X):
         return pd.DataFrame(
             {
                 i: pd.Series(
-                    X[i]
-                    .map({**self.values[i], None: self.none_value})
-                    .fillna(self.unknown_value),
+                    X[i].map({**self.values[i], None: self.none_value}).fillna(self.unknown_value),
                     dtype=np.int64,
                 )
                 for i in X.columns
             }
         )
 
     def learn_many(self, X, y=None):
-        if self.categories == "auto":
+        if self.categories is None:
             for i in X.columns:
                 for xi in X[i].dropna().unique():
                     if xi not in self.values[i]: