Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

How to use categorical data without one hot encoding in SequentialFeatureSelector, I am receiving an error #1093

Open
ago302 opened this issue Apr 24, 2024 · 0 comments
Labels

Comments

@ago302
Copy link

ago302 commented Apr 24, 2024

Hi, this is my code, but I am receiving an error. Could you please help me with this error?

import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool, EShapCalcType, EFeaturesSelectionAlgorithm
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Generating numerical features

X, y = make_regression(n_samples=100, n_features=9, n_informative=2, noise=0.1, random_state=0)

# Generating categorical features

X_categorical = np.random.choice(['pooh', 'rabbit', 'piglet', 'Christopher'], size=(100, 2))

# Combine into a DataFrame

X_combined = np.hstack((X, X_categorical))
feature_names = ['F{}'.format(i) for i in range(X_combined.shape[1])]
df = pd.DataFrame(X_combined, columns=feature_names)
df[['F9', 'F10']]=df[['F9', 'F10']].astype("category")
df.loc[:, ~df.columns.isin(['F9', 'F10'])]=df.loc[:, ~df.columns.isin(['F9', 'F10'])].astype("float")
num_col=df.columns.drop(['F9', 'F10'])
for column in num_col:
df[column] = pd.to_numeric(df[column], errors='coerce')

X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.25, random_state=0)

import xgboost as xgb
clf = xgb.XGBRegressor( enable_categorical=True)
clf.fit(X_train, y_train)

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import xgboost as xgb
clf = xgb.XGBRegressor( enable_categorical=True)
sfs1 = SFS(clf,
k_features="best",
forward=False,
floating=False,
verbose=2,
scoring="neg_mean_absolute_error",
clone_estimator=False,
n_jobs=1,
cv=0,
)

sfs1 = sfs1.fit(X_train, y_train)

And this is the error:


ValueError Traceback (most recent call last)
Cell In[47], line 13
3 clf = xgb.XGBRegressor( enable_categorical=True)
4 sfs1 = SFS(clf,
5 k_features="best",
6 forward=False,
(...)
10 n_jobs=-1,
11 cv=5)
---> 13 sfs1 = sfs1.fit(X_train, y_train)

File c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\mlxtend\feature_selection\sequential_feature_selector.py:518, in SequentialFeatureSelector.fit(self, X, y, groups, **fit_params)
516 k = len(k_idx)
517 if k > 0:
--> 518 k_idx, k_score = calc_score(
519 self,
520 X,
521 y,
522 k_idx,
523 groups=groups,
524 feature_groups=self.feature_groups_,
525 **fit_params,
526 )
527 self.subsets_[k] = {
528 "feature_idx": k_idx,
529 "cv_scores": k_score,
530 "avg_score": np.nanmean(k_score),
531 }
533 orig_set = set(range(self.k_ub))

File c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\mlxtend\feature_selection\utilities.py:98, in calc_score(selector, X, y, indices, groups, feature_groups, **fit_params)
96 IDX = merge_lists(feature_groups, indices)
97 if selector.cv:
---> 98 scores = cross_val_score(
99 selector.est,
100 X[:, IDX],
101 y,
102 groups=groups,
103 cv=selector.cv,
104 scoring=selector.scorer,
105 n_jobs=1,
106 pre_dispatch=selector.pre_dispatch,
107 fit_params=fit_params,
108 )
109 else:
110 selector.est.fit(X[:, IDX], y, **fit_params)

File c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\_param_validation.py:213, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
207 try:
208 with config_context(
209 skip_parameter_validation=(
210 prefer_skip_nested_validation or global_skip_validation
211 )
212 ):
--> 213 return func(*args, **kwargs)
214 except InvalidParameterError as e:
215 # When the function is just a wrapper around an estimator, we allow
216 # the function to delegate validation to the estimator, but we replace
217 # the name of the estimator by the name of the function in the error
218 # message to avoid confusion.
219 msg = re.sub(
220 r"parameter of \w+ must be",
221 f"parameter of {func.__qualname__} must be",
222 str(e),
223 )

File c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py:719, in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, params, pre_dispatch, error_score)
716 # To ensure multimetric format is not supported
717 scorer = check_scoring(estimator, scoring=scoring)
--> 719 cv_results = cross_validate(
720 estimator=estimator,
721 X=X,
722 y=y,
723 groups=groups,
724 scoring={"score": scorer},
725 cv=cv,
726 n_jobs=n_jobs,
727 verbose=verbose,
728 fit_params=fit_params,
729 params=params,
730 pre_dispatch=pre_dispatch,
731 error_score=error_score,
732 )
733 return cv_results["test_score"]

File c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\_param_validation.py:213, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
207 try:
208 with config_context(
209 skip_parameter_validation=(
210 prefer_skip_nested_validation or global_skip_validation
211 )
212 ):
--> 213 return func(*args, **kwargs)
214 except InvalidParameterError as e:
215 # When the function is just a wrapper around an estimator, we allow
216 # the function to delegate validation to the estimator, but we replace
217 # the name of the estimator by the name of the function in the error
218 # message to avoid confusion.
219 msg = re.sub(
220 r"parameter of \w+ must be",
221 f"parameter of {func.__qualname__} must be",
222 str(e),
223 )

File c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py:450, in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, params, pre_dispatch, return_train_score, return_estimator, return_indices, error_score)
429 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
430 results = parallel(
431 delayed(_fit_and_score)(
432 clone(estimator),
(...)
447 for train, test in indices
448 )
--> 450 _warn_or_raise_about_fit_failures(results, error_score)
452 # For callable scoring, the return type is only know after calling. If the
453 # return type is a dictionary, the error scores can now be inserted with
454 # the correct key.
455 if callable(scoring):

File c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py:536, in _warn_or_raise_about_fit_failures(results, error_score)
529 if num_failed_fits == num_fits:
530 all_fits_failed_message = (
531 f"\nAll the {num_fits} fits failed.\n"
532 "It is very likely that your model is misconfigured.\n"
533 "You can try to debug the error by setting error_score='raise'.\n\n"
534 f"Below are more details about the failures:\n{fit_errors_summary}"
535 )
--> 536 raise ValueError(all_fits_failed_message)
538 else:
539 some_fits_failed_message = (
540 f"\n{num_failed_fits} fits failed out of a total of {num_fits}.\n"
541 "The score on these train-test partitions for these parameters"
(...)
545 f"Below are more details about the failures:\n{fit_errors_summary}"
546 )

ValueError:
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:

1 fits failed with the following error:
Traceback (most recent call last):
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 730, in inner_f
return func(**kwargs)
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\sklearn.py", line 1055, in fit
train_dmatrix, evals = _wrap_evaluation_matrices(
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\sklearn.py", line 521, in _wrap_evaluation_matrices
train_dmatrix = create_dmatrix(
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\sklearn.py", line 958, in _create_dmatrix
return QuantileDMatrix(
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 730, in inner_f
return func(**kwargs)
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 1529, in __init__
self._init(
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 1588, in _init
it.reraise()
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 576, in reraise
raise exc # pylint: disable=raising-bad-type
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 557, in _handle_exception
return fn()
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 641, in <lambda>
return self._handle_exception(lambda: self.next(input_data), 0)
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\data.py", line 1280, in next
input_data(**self.kwargs)
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 730, in inner_f
return func(**kwargs)
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 624, in input_data
new, cat_codes, feature_names, feature_types = _proxy_transform(
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\data.py", line 1305, in _proxy_transform
data, _ = _ensure_np_dtype(data, data.dtype)
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\data.py", line 176, in _ensure_np_dtype
data = data.astype(dtype, copy=False)
ValueError: could not convert string to float: 'piglet'


4 fits failed with the following error:
Traceback (most recent call last):
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 730, in inner_f
return func(**kwargs)
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\sklearn.py", line 1055, in fit
train_dmatrix, evals = _wrap_evaluation_matrices(
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\sklearn.py", line 521, in _wrap_evaluation_matrices
train_dmatrix = create_dmatrix(
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\sklearn.py", line 958, in _create_dmatrix
return QuantileDMatrix(
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 730, in inner_f
return func(**kwargs)
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 1529, in __init__
self._init(
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 1588, in _init
it.reraise()
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 576, in reraise
raise exc # pylint: disable=raising-bad-type
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 557, in _handle_exception
return fn()
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 641, in <lambda>
return self._handle_exception(lambda: self.next(input_data), 0)
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\data.py", line 1280, in next
input_data(**self.kwargs)
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 730, in inner_f
return func(**kwargs)
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 624, in input_data
new, cat_codes, feature_names, feature_types = _proxy_transform(
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\data.py", line 1305, in _proxy_transform
data, _ = _ensure_np_dtype(data, data.dtype)
File "c:\Users\Aligo\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\data.py", line 176, in _ensure_np_dtype
data = data.astype(dtype, copy=False)
ValueError: could not convert string to float: 'Christopher'

Why does this package not work with categorical features?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

No branches or pull requests

1 participant