-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Description
pycaret version checks
-
I have checked that this issue has not already been reported here.
-
I have confirmed this bug exists on the latest version of pycaret.
-
I have confirmed this bug exists on the master branch of pycaret (pip install -U git+https://github.com/pycaret/pycaret.git@master).
Issue Description
I am working on a classification problem. When I pass fix_imbalance=True to setup, compare_models works fine. After that, I tune the model (tuned_model), which also works fine.
When I then call finalize_model, I get the error below. I am not able to reproduce this with a toy dataset like the one below. Any pointers would be very helpful.
import pandas as pd
from pycaret.datasets import get_data
from pycaret.classification import setup, compare_models, finalize_model
# Load a sample dataset
data = get_data('credit')
# Introduce imbalance by reducing the number of instances of one class
# Assuming 'default' is the target column
class_0 = data[data['default'] == 0]
class_1 = data[data['default'] == 1]
# Reduce the number of instances in class 1 to create imbalance
class_1_reduced = class_1.sample(frac=0.1, random_state=42)
# Combine the datasets to form an imbalanced dataset
imbalanced_data = pd.concat([class_0, class_1_reduced], axis=0)
# Setup the pycaret classification environment
clf1 = setup(data=imbalanced_data, target='default', fix_imbalance=True)
# Compare models to see how they perform on imbalanced data
best_model = compare_models()
finalize_model(best_model)
Reproducible Example
N/A
Expected Behavior
finalize_model(tuned_model) should work flawlessly.
Actual Results
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
File ~/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/pycaret/internal/preprocess/transformers.py:135, in TransformerWrapper._reorder_cols(self, df, original_df)
134 try:
--> 135 original_df.index = df.index
136 except ValueError: # Length mismatch
File ~/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/pandas/core/generic.py:5588, in NDFrame.__setattr__(self, name, value)
5587 object.__getattribute__(self, name)
-> 5588 return object.__setattr__(self, name, value)
5589 except AttributeError:
File ~/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/pandas/_libs/properties.pyx:70, in pandas._libs.properties.AxisProperty.__set__()
File ~/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/pandas/core/generic.py:769, in NDFrame._set_axis(self, axis, labels)
768 labels = ensure_index(labels)
--> 769 self._mgr.set_axis(axis, labels)
770 self._clear_item_cache()
File ~/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/pandas/core/internals/managers.py:214, in BaseBlockManager.set_axis(self, axis, new_labels)
212 def set_axis(self, axis: int, new_labels: Index) -> None:
213 # Caller is responsible for ensuring we have an Index object.
--> 214 self._validate_set_axis(axis, new_labels)
215 self.axes[axis] = new_labels
File ~/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/pandas/core/internals/base.py:69, in DataManager._validate_set_axis(self, axis, new_labels)
68 elif new_len != old_len:
---> 69 raise ValueError(
70 f"Length mismatch: Expected axis has {old_len} elements, new "
71 f"values have {new_len} elements"
72 )
ValueError: Length mismatch: Expected axis has 10000 elements, new values have 19894 elements
During handling of the above exception, another exception occurred:
IndexError Traceback (most recent call last)
Cell In[72], line 2
1 # finalize_model(best_model)
----> 2 finalize_model(tuned_model)
File ~/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/pycaret/utils/generic.py:965, in check_if_global_is_not_none.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
963 if globals_d[name] is None:
964 raise ValueError(message)
--> 965 return func(*args, **kwargs)
File ~/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/pycaret/classification/functional.py:2229, in finalize_model(estimator, fit_kwargs, groups, model_only, experiment_custom_tags)
2177 @check_if_global_is_not_none(globals(), _CURRENT_EXPERIMENT_DECORATOR_DICT)
2178 def finalize_model(
2179 estimator,
(...)
2183 experiment_custom_tags: Optional[Dict[str, Any]] = None,
2184 ) -> Any:
2185 """
2186 This function trains a given estimator on the entire dataset including the
2187 holdout set.
(...)
2226
2227 """
-> 2229 return _CURRENT_EXPERIMENT.finalize_model(
2230 estimator=estimator,
2231 fit_kwargs=fit_kwargs,
2232 groups=groups,
2233 model_only=model_only,
2234 experiment_custom_tags=experiment_custom_tags,
2235 )
File ~/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/pycaret/classification/oop.py:2875, in ClassificationExperiment.finalize_model(self, estimator, fit_kwargs, groups, model_only, experiment_custom_tags)
2823 def finalize_model(
2824 self,
2825 estimator,
(...)
2829 experiment_custom_tags: Optional[Dict[str, Any]] = None,
2830 ) -> Any:
2831 """
2832 This function trains a given estimator on the entire dataset including the
2833 holdout set.
(...)
2872
2873 """
-> 2875 return super().finalize_model(
2876 estimator=estimator,
2877 fit_kwargs=fit_kwargs,
2878 groups=groups,
2879 model_only=model_only,
2880 experiment_custom_tags=experiment_custom_tags,
2881 )
File ~/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/pycaret/internal/pycaret_experiment/supervised_experiment.py:4730, in _SupervisedExperiment.finalize_model(self, estimator, fit_kwargs, groups, model_only, experiment_custom_tags)
4727 np.random.seed(self.seed)
4729 self.logger.info(f"Finalizing {estimator}")
-> 4730 pipeline_final, model_fit_time = self._create_model(
4731 estimator=estimator,
4732 cross_validation=False,
4733 verbose=False,
4734 system=False,
4735 X_train_data=self.X,
4736 y_train_data=self.y,
4737 fit_kwargs=fit_kwargs or {},
4738 predict=False,
4739 groups=self._get_groups(groups, data=self.X),
4740 add_to_model_list=False,
4741 model_only=False,
4742 )
4744 # dashboard logging
4745 if self.logging_param:
File ~/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/pycaret/internal/pycaret_experiment/supervised_experiment.py:1507, in _SupervisedExperiment._create_model(self, estimator, fold, round, cross_validation, predict, fit_kwargs, groups, refit, probability_threshold, experiment_custom_tags, verbose, system, add_to_model_list, X_train_data, y_train_data, metrics, display, model_only, return_train_score, error_score, **kwargs)
1502 """
1503 MONITOR UPDATE ENDS
1504 """
1506 if not cross_validation:
-> 1507 model, model_fit_time = self._create_model_without_cv(
1508 model=model,
1509 data_X=data_X,
1510 data_y=data_y,
1511 fit_kwargs=fit_kwargs,
1512 round=round,
1513 predict=predict,
1514 system=system,
1515 display=display,
1516 model_only=model_only,
1517 return_train_score=return_train_score,
1518 )
1520 display.move_progress()
1522 self.logger.info(str(model))
File ~/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/pycaret/internal/pycaret_experiment/supervised_experiment.py:1032, in _SupervisedExperiment._create_model_without_cv(self, model, data_X, data_y, fit_kwargs, round, predict, system, display, model_only, return_train_score)
1030 model_fit_start = time.time()
1031 with redirect_output(self.logger):
-> 1032 pipeline_with_model.fit(data_X, data_y, **fit_kwargs)
1033 model_fit_end = time.time()
1035 model_fit_time = np.array(model_fit_end - model_fit_start).round(2)
File ~/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/pycaret/internal/pipeline.py:270, in Pipeline.fit(self, X, y, **fit_params)
268 def fit(self, X=None, y=None, **fit_params):
269 fit_params_steps = self._check_fit_params(**fit_params)
--> 270 X, y, _ = self._fit(X, y, **fit_params_steps)
272 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
273 if self._final_estimator != "passthrough":
File ~/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/pycaret/internal/pipeline.py:253, in Pipeline._fit(self, X, y, **fit_params_steps)
245 # Fit or load the current transformer from cache
246 fitted_transformer = self._memory_fit(
247 transformer=cloned,
248 X=X,
(...)
251 **fit_params_steps.get(name, {}),
252 )
--> 253 X, y = self._memory_transform(
254 transformer=fitted_transformer,
255 X=X,
256 y=y,
257 )
259 # Replace the transformer of the step with the fitted
260 # transformer (necessary when loading from the cache)
261 self.steps[step_idx] = (name, fitted_transformer)
File ~/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/joblib/memory.py:353, in NotMemorizedFunc.__call__(self, *args, **kwargs)
352 def __call__(self, *args, **kwargs):
--> 353 return self.func(*args, **kwargs)
File ~/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/pycaret/internal/pipeline.py:79, in _transform_one(transformer, X, y)
77 if "y" in signature(transformer.transform).parameters:
78 args.append(y)
---> 79 output = transformer.transform(*args)
81 if isinstance(output, tuple):
82 X, y = output[0], output[1]
File ~/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/sklearn/utils/_set_output.py:140, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
138 @wraps(f)
139 def wrapped(self, X, *args, **kwargs):
--> 140 data_to_wrap = f(self, X, *args, **kwargs)
141 if isinstance(data_to_wrap, tuple):
142 # only wrap the first output for cross decomposition
143 return (
144 _wrap_data_with_container(method, data_to_wrap[0], X, self),
145 *data_to_wrap[1:],
146 )
File ~/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/pycaret/internal/preprocess/transformers.py:259, in TransformerWrapper.transform(self, X, y)
257 # Transform can return X, y or both
258 if isinstance(output, tuple):
--> 259 new_X = self._prepare_df(X, output[0])
260 new_y = to_series(output[1], index=new_X.index, name=y.name)
261 else:
File ~/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/pycaret/internal/preprocess/transformers.py:196, in TransformerWrapper._prepare_df(self, X, out)
194 # Reorder columns if only a subset was used
195 if len(self._include) != X.shape[1]:
--> 196 return self._reorder_cols(out, X)
197 else:
198 return out
File ~/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/pycaret/internal/preprocess/transformers.py:137, in TransformerWrapper._reorder_cols(self, df, original_df)
135 original_df.index = df.index
136 except ValueError: # Length mismatch
--> 137 raise IndexError(
138 f"Length of values ({len(df)}) does not match length of "
139 f"index ({len(original_df)}). This usually happens when "
140 "transformations that drop rows aren't applied on all "
141 "the columns."
142 )
144 # Define new column order
145 # Use OrderedDict as ordered set (only keys matter)
146 # We want a set to avoid duplicate column names, which can happen
147 # if we have eg. COL_A and COL_A_2 encoded using OHE
148 columns = OrderedDict()
IndexError: Length of values (19894) does not match length of index (10000). This usually happens when transformations that drop rows aren't applied on all the columns.
Installed Versions
Details
System:
python: 3.8.18 (default, Sep 11 2023, 08:17:33) [Clang 14.0.6 ]
executable: /Users/uxf/opt/anaconda3/envs/pycaret/bin/python
machine: macOS-10.16-x86_64-i386-64bit
PyCaret required dependencies:
pip: 24.2
setuptools: 68.0.0
pycaret: 3.2.0
IPython: 8.12.0
ipywidgets: 8.1.3
tqdm: 4.66.5
numpy: 1.24.3
pandas: 1.4.4
jinja2: 3.1.2
scipy: 1.10.1
joblib: 1.3.2
sklearn: 1.2.2
pyod: 2.0.1
imblearn: 0.12.3
category_encoders: 2.6.3
lightgbm: 4.5.0
numba: 0.58.1
requests: 2.31.0
matplotlib: 3.6.0
scikitplot: 0.3.7
yellowbrick: 1.5
plotly: 5.23.0
plotly-resampler: Not installed
kaleido: 0.2.1
schemdraw: 0.15
statsmodels: 0.14.1
sktime: 0.21.1
tbats: 1.1.3
pmdarima: 2.0.4
psutil: 6.0.0
markupsafe: 2.1.3
pickle5: Not installed
cloudpickle: 3.0.0
deprecation: 2.1.0
xxhash: 3.5.0
wurlitzer: 3.1.1
PyCaret optional dependencies:
shap: 0.44.1
interpret: Not installed
umap: Not installed
ydata_profiling: Not installed
explainerdashboard: 0.4.7
autoviz: Not installed
fairlearn: Not installed
deepchecks: Not installed
xgboost: 2.1.1
catboost: Not installed
kmodes: Not installed
mlxtend: Not installed
statsforecast: Not installed
tune_sklearn: Not installed
ray: Not installed
hyperopt: Not installed
optuna: Not installed
skopt: Not installed
mlflow: Not installed
gradio: Not installed
fastapi: Not installed
uvicorn: Not installed
m2cgen: Not installed
evidently: Not installed
fugue: Not installed
streamlit: Not installed
prophet: Not installed