-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Description
pycaret version checks
- I have checked that this issue has not already been reported here.
- I have confirmed this bug exists on the latest version of pycaret.
- I have confirmed this bug exists on the master branch of pycaret (pip install -U git+https://github.com/pycaret/pycaret.git@master).
Issue Description
The setup function in PyCaret's ClassificationExperiment is failing when trying to process categorical variables during data preprocessing. The error occurs specifically when attempting to impute missing values in categorical columns using the iterative imputer.
Reproducible Example
# Perform stratified cross-validation for the
skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
for fold, (idx_train, idx_test) in enumerate(skf.split(data_df, data_df[label_name])):
# Get the training and test data if no independent test set was given
test_df = data_df.iloc[idx_test] if test_df is None else test_df
# Initialize and Set up the classification experiment Object
clf_exp = ClassificationExperiment()
clf_exp.setup(data=data_df.iloc[idx_train], target=label_name, test_data=data_df.iloc[idx_test], **exp_params)
exp_params = {'exp_name': f'classification-exp-{fold}',
'train_size': 0.8,
'numeric_imputation': 'knn',
'categorical_imputation': 'mode',
'imputation_type': 'iterative',
'feature_selection': True,
'feature_selection_method': 'univariate',
'feature_selection_estimator': 'lightgbm',
'n_features_to_select': 0.4,
'iterative_imputation_iters': 5,
'numeric_iterative_imputer': 'rf',
'categorical_iterative_imputer': 'knn',
'text_features_method': 'tf-idf',
'max_encoding_ohe': 25,
'encoding_method': None,
'rare_to_value': None,
'rare_value': 'rare',
'polynomial_features': False,
'polynomial_degree': 2,
'low_variance_threshold': 0.01,
'remove_multicollinearity': True,
'multicollinearity_threshold': 0.8,
'bin_numeric_features': None,
'remove_outliers': False,
'outliers_method': 'iforest',
'outliers_threshold': 0.05,
'fix_imbalance': True,
'fix_imbalance_method': 'RandomOverSampler',
'transformation': False,
'transformation_method': 'yeo-johnson',
'normalize': True,
'normalize_method': 'zscore',
'pca': False,
'pca_method': 'linear',
'pca_components': None,
'fold_strategy': 'stratifiedkfold',
'fold': 5,
'fold_groups': None,
'n_jobs': -1,
'use_gpu': False,
'custom_pipeline': None,
'custom_pipeline_position': -1,
'data_split_shuffle': True,
'data_split_stratify': True,
'log_experiment': False,
'log_plots': False,
'log_profile': False,
'log_data': False,
'verbose': True,
'memory': True,
'profile': True,
'profile_kwargs': {},
'html': True,
'session_id': 24,
'system_log': True,
'experiment_custom_tags': None,
}
Expected Behavior
The setup function should successfully preprocess the data, including imputing missing values in categorical columns using the iterative imputer. If I omit test_data from the experiment setup and then call preds = clf_exp.predict_model(best_model, data=test_df), everything works perfectly. The limitation in that case is that I am not able to set my train size to 1.0 (i.e., use the whole dataset for training).
Actual Results
Traceback (most recent call last):
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/typer/main.py", line 326, in __call__
raise e
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/typer/main.py", line 309, in __call__
return get_command(self)(*args, **kwargs)
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/click/core.py", line 1157, in __call__
return self.main(*args, **kwargs)
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/typer/core.py", line 661, in main
return _main(
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/typer/core.py", line 193, in _main
rv = self.invoke(ctx)
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/click/core.py", line 1434, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/click/core.py", line 783, in invoke
return __callback(*args, **kwargs)
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/typer/main.py", line 692, in wrapper
return callback(**use_params)
File "/Users/mmonzon/Projects/piord-data-analysis/bin/4_classification_with_pycaret.py", line 325, in main
predictions, best_models, training_metrics = classification_experiment(features_df, exp_params, label_name, score =score, n=n_ensemble,
File "/Users/mmonzon/Projects/piord-data-analysis/bin/4_classification_with_pycaret.py", line 175, in classification_experiment
clf_exp.setup(data=data_df.iloc[idx_train], target=label_name, test_data=data_df.iloc[idx_test], **exp_params)
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/pycaret/classification/oop.py", line 897, in setup
self.pipeline.fit(self.X_train, self.y_train)
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/pycaret/internal/pipeline.py", line 277, in fit
X, y, _ = self._fit(X, y, routed_params)
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/pycaret/internal/pipeline.py", line 253, in _fit
fitted_transformer = self._memory_fit(
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/joblib/memory.py", line 655, in __call__
return self._cached_call(args, kwargs)[0]
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/pycaret/internal/memory.py", line 392, in _cached_call
out, metadata = self.call(*args, **kwargs)
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/pycaret/internal/memory.py", line 308, in call
output = self.func(*args, **kwargs)
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/pycaret/internal/pipeline.py", line 73, in _fit_one
transformer.fit(*args)
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/pycaret/internal/preprocess/transformers.py", line 229, in fit
self.transformer.fit(*args, **fit_params)
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/sklearn/impute/_iterative.py", line 880, in fit
self.fit_transform(X)
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/sklearn/utils/_set_output.py", line 295, in wrapped
data_to_wrap = f(self, X, *args, **kwargs)
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/pycaret/internal/preprocess/iterative_imputer.py", line 339, in fit_transform
X, Xt, mask_missing_values, complete_mask = self._initial_imputation(
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/pycaret/internal/preprocess/iterative_imputer.py", line 189, in _initial_imputation
X = self._validate_data(
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/sklearn/base.py", line 633, in _validate_data
out = check_array(X, input_name="X", **check_params)
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/sklearn/utils/validation.py", line 997, in check_array
array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/sklearn/utils/_array_api.py", line 521, in _asarray_with_order
array = numpy.asarray(array, order=order, dtype=dtype)
File "/Users/mmonzon/Projects/piord-data-analysis/venv-eda/lib/python3.10/site-packages/pandas/core/generic.py", line 2153, in __array__
arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'Doctorate'