Describe the bug
In some cases (for example, when there are categorical columns), FLAML's preprocessing (at least in zero-shot mode) re-arranges the columns of the training dataframe. For XGBRegressor/XGBClassifier, however, it fails to apply the same re-arrangement to the validation dataframes passed via `eval_set`.
Steps to reproduce
```python
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# --- 1. Create synthetic dataset with numeric + categorical features ---
np.random.seed(42)
n = 1000
df = pd.DataFrame({
    "num1": np.random.randn(n),
    "num2": np.random.rand(n) * 10,
    "cat1": np.random.choice(["A", "B", "C"], size=n),
    "cat2": np.random.choice(["X", "Y"], size=n),
    "target": np.random.choice([0, 1], size=n),
})

# --- 2. Split data ---
X = df.drop(columns="target")
y = df["target"]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

# --- 3. Convert categorical columns to pandas 'category' dtype ---
for col in X_train.select_dtypes(include="object").columns:
    X_train[col] = X_train[col].astype("category")
    X_valid[col] = X_valid[col].astype("category")

# --- 4. Define XGBoost model ---
model = XGBClassifier(
    tree_method="hist",       # Efficient, supports categorical features
    enable_categorical=True,  # Important!
    eval_metric="logloss",
    use_label_encoder=False,
    early_stopping_rounds=10,
    random_state=0,
)

# --- 5. Fit model with early stopping ---
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],  # validation set for early stopping
    verbose=True,
)
```

```
[0] validation_0-logloss:0.69096
[1] validation_0-logloss:0.69439
[2] validation_0-logloss:0.70184
[3] validation_0-logloss:0.70530
[4] validation_0-logloss:0.70542
[5] validation_0-logloss:0.70719
[6] validation_0-logloss:0.71508
[7] validation_0-logloss:0.71836
[8] validation_0-logloss:0.72136
[9] validation_0-logloss:0.72541
```
```python
import flaml.default as flaml_zeroshot

model = flaml_zeroshot.XGBClassifier(
    tree_method="hist",       # Efficient, supports categorical features
    enable_categorical=True,  # Important!
    eval_metric="logloss",
    use_label_encoder=False,
    early_stopping_rounds=10,
    random_state=0,
)

# --- 5. Fit model with early stopping ---
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],  # validation set for early stopping
    verbose=True,
)
```
```
ValueError                                Traceback (most recent call last)
Cell In[4], line 13
      3 model = flaml_zeroshot.XGBClassifier(
      4     tree_method="hist",       # Efficient, supports categorical features
      5     enable_categorical=True,  # Important!
   (...)
      9     random_state=0
     10 )
     12 # --- 5. Fit model with early stopping ---
---> 13 model.fit(
     14     X_train, y_train,
     15     eval_set=[(X_valid, y_valid)],  # validation set for early stopping
     16     verbose=True
     17 )

File /venv/main/lib/python3.12/site-packages/flaml/default/estimator.py:106, in flamlize_estimator.<locals>.EstimatorClass.fit(self, X, y, *args, **params)
     97 self.set_params(**hyperparams)
     98 if self._label_transformer and estimator_name in [
     99     "rf",
    100     "extra_tree",
   (...)
    104 ]:
    105     # rf and et have trouble in handling boolean labels; xgboost requires integer labels
--> 106     fitted = super().fit(X, y_transformed, *args, **params)
    107 # if hasattr(self, "_classes"):
    108 #     self._classes = self.label_transformer.classes
    109 # else:
    110 try:

File /venv/main/lib/python3.12/site-packages/xgboost/core.py:774, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
    772 for k, arg in zip(sig.parameters, args):
    773     kwargs[k] = arg
--> 774 return func(**kwargs)

File /venv/main/lib/python3.12/site-packages/xgboost/sklearn.py:1803, in XGBClassifier.fit(self, X, y, sample_weight, base_margin, eval_set, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights)
   1783 evals_result: EvalsLog = {}
   1784 train_dmatrix, evals = _wrap_evaluation_matrices(
   1785     missing=self.missing,
   1786     X=X,
   (...)
   1800     feature_types=feature_types,
   1801 )
-> 1803 self._Booster = train(
   1804     params,
   1805     train_dmatrix,
   1806     self.get_num_boosting_rounds(),
   1807     evals=evals,
   1808     early_stopping_rounds=self.early_stopping_rounds,
   1809     evals_result=evals_result,
   1810     obj=obj,
   1811     custom_metric=metric,
   1812     verbose_eval=verbose,
   1813     xgb_model=model,
   1814     callbacks=self.callbacks,
   1815 )
   1817 if not callable(self.objective):
   1818     self.objective = params["objective"]

File /venv/main/lib/python3.12/site-packages/xgboost/core.py:774, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
    772 for k, arg in zip(sig.parameters, args):
    773     kwargs[k] = arg
--> 774 return func(**kwargs)

File /venv/main/lib/python3.12/site-packages/xgboost/training.py:182, in train(params, dtrain, num_boost_round, evals, obj, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, custom_metric)
    175 if (
    176     isinstance(va, _RefMixIn)
    177     and va.ref is not weakref.ref(dtrain)
    178     and va is not dtrain
    179 ):
    180     raise ValueError(_RefError)
--> 182 bst = Booster(params, [dtrain] + [d[0] for d in evals], model_file=xgb_model)
    183 start_iteration = 0
    185 if verbose_eval:

File /venv/main/lib/python3.12/site-packages/xgboost/core.py:2005, in Booster.__init__(self, params, cache, model_file)
   1998 _check_call(
   1999     _LIB.XGBoosterCreate(
   2000         dmats, c_bst_ulong(len(cache)), ctypes.byref(self.handle)
   2001     )
   2002 )
   2003 for d in cache:
   2004     # Validate feature only after the feature names are saved into booster.
-> 2005     self._assign_dmatrix_features(d)
   2007 if isinstance(model_file, Booster):
   2008     assert self.handle is not None

File /venv/main/lib/python3.12/site-packages/xgboost/core.py:3396, in Booster._assign_dmatrix_features(self, data)
   3393 if self.feature_types is None:
   3394     self.feature_types = ft
-> 3396 self._validate_features(fn)

File /venv/main/lib/python3.12/site-packages/xgboost/core.py:3431, in Booster._validate_features(self, feature_names)
   3425 if my_missing:
   3426     msg += (
   3427         "\ntraining data did not have the following fields: "
   3428         + ", ".join(str(s) for s in my_missing)
   3429     )
-> 3431 raise ValueError(msg.format(self.feature_names, feature_names))

ValueError: feature_names mismatch: ['cat1', 'cat2', 'num1', 'num2'] ['num1', 'num2', 'cat1', 'cat2']
```
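Until this is fixed, a plausible caller-side workaround is to re-arrange the validation frame the same way before calling `fit`. The `feature_names` in the error suggest FLAML simply moves categorical columns in front of numeric ones, so (assuming that ordering, and assuming the preprocessing does not also change values) something like this should avoid the mismatch:

```python
# Workaround sketch: mirror the column order FLAML produced for training
# (categorical columns first), as inferred from the error message above.
cat_cols = X_valid.select_dtypes(include="category").columns.tolist()
num_cols = [c for c in X_valid.columns if c not in cat_cols]

model.fit(
    X_train, y_train,
    eval_set=[(X_valid[cat_cols + num_cols], y_valid)],  # reordered to match
    verbose=True,
)
```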
Model Used
No response
Expected Behavior
FLAML should apply the same preprocessing (including the column re-arrangement) to every validation set passed via `eval_set`, so that the validation data matches the transformed training data.
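Presumably the fix belongs in the wrapped `fit` in flaml/default/estimator.py: run each `eval_set` dataframe through the same feature transformer that was fitted on `X` before delegating to the underlying estimator. A rough sketch of the idea (the `self._feature_transformer` attribute and the surrounding context are guesses at FLAML's internals, not the actual code):

```python
# Sketch only: assumes the flamlized fit() keeps the fitted feature
# transformer on self._feature_transformer (an assumption, not verified).
eval_set = params.get("eval_set")
if eval_set is not None and self._feature_transformer is not None:
    # Re-arrange/transform each validation frame exactly like X was.
    params["eval_set"] = [
        (self._feature_transformer.transform(X_val), y_val)
        for X_val, y_val in eval_set
    ]
fitted = super().fit(X, y_transformed, *args, **params)
```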
Surprisingly, the flamlized LGBMClassifier already handles this correctly:
```python
model = flaml_zeroshot.LGBMClassifier(
    early_stopping_rounds=10,
    verbose=2,
    random_state=0,
)

# --- 5. Fit model with early stopping ---
model.fit(
    X_train, y_train,
    eval_set=(X_valid, y_valid),  # validation set for early stopping
)
```

```
Early stopping, best iteration is:
[1] valid_0's binary_logloss: 0.691358
```
I'm also not a fan of FLAML copying the entire X dataframe during preprocessing; this makes FLAML impractical for big datasets. Could a flag be added that tells FLAML to do its preprocessing in place instead of copying the whole X dataframe?
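For illustration, the requested option might look like this (the `inplace` parameter below is hypothetical and does not exist in FLAML today):

```python
# Hypothetical API, for illustration only -- no such flag exists in FLAML.
model = flaml_zeroshot.XGBClassifier(tree_method="hist", enable_categorical=True)
model.fit(
    X_train, y_train,
    inplace=True,  # hypothetical: preprocess X_train in place instead of copying
)
```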
Screenshots and logs
No response
Additional Information
No response