
[Bug]: flaml.default.XGBRegressor does not preprocess eval_set #1463

@fingoldo

Description


Describe the bug

Sometimes (for example, when there are categorical columns) FLAML's preprocessing (at least in zero-shot mode) re-arranges the columns of X. However, for XGBRegressor/XGBClassifier it does not apply the same preprocessing to the validation dataframes passed via eval_set.
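Concretely, the transformed training frame ends up with the categorical columns moved to the front (see the feature_names mismatch in the traceback below), while the frames passed in eval_set keep their original layout. The re-ordering can be observed with the lower-level zero-shot helper. The sketch below assumes that flaml.default.preprocess_and_suggest_hyperparams uses the same feature transformer as the flamlized estimators, and that "xgboost" is an accepted estimator name; both are my assumptions:

import numpy as np
import pandas as pd
from flaml.default import preprocess_and_suggest_hyperparams

# Tiny frame with one numeric and one categorical column
X = pd.DataFrame({
    "num1": np.random.randn(100),
    "cat1": pd.Series(np.random.choice(["A", "B"], 100), dtype="category"),
})
y = pd.Series(np.random.choice([0, 1], 100))

# "xgboost" (or "xgb_limitdepth") is my guess at the estimator name used by
# flaml.default.XGBClassifier; that it shares the same transformer is also an assumption.
_, _, X_t, _, feature_transformer, _ = preprocess_and_suggest_hyperparams(
    "classification", X, y, "xgboost"
)

print(list(X.columns))    # ['num1', 'cat1']
print(list(X_t.columns))  # categorical columns re-ordered/encoded first
# eval_set frames would need feature_transformer.transform(...) to match this layout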

Steps to reproduce

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# --- 1. Create synthetic dataset with numeric + categorical features ---
np.random.seed(42)
n = 1000

df = pd.DataFrame({
    "num1": np.random.randn(n),
    "num2": np.random.rand(n) * 10,
    "cat1": np.random.choice(["A", "B", "C"], size=n),
    "cat2": np.random.choice(["X", "Y"], size=n),
    "target": np.random.choice([0, 1], size=n)
})

# --- 2. Split data ---
X = df.drop(columns="target")
y = df["target"]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

# --- 3. Convert categorical columns to pandas 'category' dtype ---
for col in X_train.select_dtypes(include="object").columns:
    X_train[col] = X_train[col].astype("category")
    X_valid[col] = X_valid[col].astype("category")


# --- 4. Define XGBoost model ---
model = XGBClassifier(
    tree_method="hist",              # Efficient, supports categorical features
    enable_categorical=True,         # Important!
    eval_metric="logloss",
    use_label_encoder=False,
    early_stopping_rounds=10,
    random_state=0
)

# --- 5. Fit model with early stopping ---
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],   # validation set for early stopping    
    verbose=True
)

[0] validation_0-logloss:0.69096
[1] validation_0-logloss:0.69439
[2] validation_0-logloss:0.70184
[3] validation_0-logloss:0.70530
[4] validation_0-logloss:0.70542
[5] validation_0-logloss:0.70719
[6] validation_0-logloss:0.71508
[7] validation_0-logloss:0.71836
[8] validation_0-logloss:0.72136
[9] validation_0-logloss:0.72541

import flaml.default as flaml_zeroshot

model = flaml_zeroshot.XGBClassifier(
    tree_method="hist",              # Efficient, supports categorical features
    enable_categorical=True,         # Important!
    eval_metric="logloss",
    use_label_encoder=False,
    early_stopping_rounds=10,
    random_state=0
)

# --- 5. Fit model with early stopping ---
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],   # validation set for early stopping    
    verbose=True
)

ValueError Traceback (most recent call last)
Cell In[4], line 13
3 model = flaml_zeroshot.XGBClassifier(
4 tree_method="hist", # Efficient, supports categorical features
5 enable_categorical=True, # Important!
(...) 9 random_state=0
10 )
12 # --- 5. Fit model with early stopping ---
---> 13 model.fit(
14 X_train, y_train,
15 eval_set=[(X_valid, y_valid)], # validation set for early stopping
16 verbose=True
17 )

File /venv/main/lib/python3.12/site-packages/flaml/default/estimator.py:106, in flamlize_estimator.<locals>.EstimatorClass.fit(self, X, y, *args, **params)
97 self.set_params(**hyperparams)
98 if self._label_transformer and estimator_name in [
99 "rf",
100 "extra_tree",
(...) 104 ]:
105 # rf and et have trouble in handling boolean labels; xgboost requires integer labels
--> 106 fitted = super().fit(X, y_transformed, *args, **params)
107 # if hasattr(self, "_classes"):
108 # self._classes = self.label_transformer.classes
109 # else:
110 try:

File /venv/main/lib/python3.12/site-packages/xgboost/core.py:774, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
772 for k, arg in zip(sig.parameters, args):
773 kwargs[k] = arg
--> 774 return func(**kwargs)

File /venv/main/lib/python3.12/site-packages/xgboost/sklearn.py:1803, in XGBClassifier.fit(self, X, y, sample_weight, base_margin, eval_set, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights)
1783 evals_result: EvalsLog = {}
1784 train_dmatrix, evals = _wrap_evaluation_matrices(
1785 missing=self.missing,
1786 X=X,
(...) 1800 feature_types=feature_types,
1801 )
-> 1803 self._Booster = train(
1804 params,
1805 train_dmatrix,
1806 self.get_num_boosting_rounds(),
1807 evals=evals,
1808 early_stopping_rounds=self.early_stopping_rounds,
1809 evals_result=evals_result,
1810 obj=obj,
1811 custom_metric=metric,
1812 verbose_eval=verbose,
1813 xgb_model=model,
1814 callbacks=self.callbacks,
1815 )
1817 if not callable(self.objective):
1818 self.objective = params["objective"]

File /venv/main/lib/python3.12/site-packages/xgboost/core.py:774, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
772 for k, arg in zip(sig.parameters, args):
773 kwargs[k] = arg
--> 774 return func(**kwargs)

File /venv/main/lib/python3.12/site-packages/xgboost/training.py:182, in train(params, dtrain, num_boost_round, evals, obj, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, custom_metric)
175 if (
176 isinstance(va, _RefMixIn)
177 and va.ref is not weakref.ref(dtrain)
178 and va is not dtrain
179 ):
180 raise ValueError(_RefError)
--> 182 bst = Booster(params, [dtrain] + [d[0] for d in evals], model_file=xgb_model)
183 start_iteration = 0
185 if verbose_eval:

File /venv/main/lib/python3.12/site-packages/xgboost/core.py:2005, in Booster.__init__(self, params, cache, model_file)
1998 _check_call(
1999 _LIB.XGBoosterCreate(
2000 dmats, c_bst_ulong(len(cache)), ctypes.byref(self.handle)
2001 )
2002 )
2003 for d in cache:
2004 # Validate feature only after the feature names are saved into booster.
-> 2005 self._assign_dmatrix_features(d)
2007 if isinstance(model_file, Booster):
2008 assert self.handle is not None

File /venv/main/lib/python3.12/site-packages/xgboost/core.py:3396, in Booster._assign_dmatrix_features(self, data)
3393 if self.feature_types is None:
3394 self.feature_types = ft
-> 3396 self._validate_features(fn)

File /venv/main/lib/python3.12/site-packages/xgboost/core.py:3431, in Booster._validate_features(self, feature_names)
3425 if my_missing:
3426 msg += (
3427 "\ntraining data did not have the following fields: "
3428 + ", ".join(str(s) for s in my_missing)
3429 )
-> 3431 raise ValueError(msg.format(self.feature_names, feature_names))

ValueError: feature_names mismatch: ['cat1', 'cat2', 'num1', 'num2'] ['num1', 'num2', 'cat1', 'cat2']

Model Used

No response

Expected Behavior

All validation sets passed via eval_set should be preprocessed the same way as the training data.

Surprisingly, flaml.default.LGBMClassifier already handles this correctly:

model = flaml_zeroshot.LGBMClassifier(
    early_stopping_rounds=10,
    verbose=2,
    random_state=0
)

# --- 5. Fit model with early stopping ---
model.fit(
    X_train, y_train,
    eval_set=(X_valid, y_valid),   # validation set for early stopping    
)

Early stopping, best iteration is:
[1] valid_0's binary_logloss: 0.691358
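Until the XGBoost wrappers do the same, one workaround is to run the zero-shot preprocessing explicitly and pass already-transformed frames to a plain XGBClassifier. This is only a sketch continuing from the repro above; the "xgboost" estimator-name string and the assumption that preprocess_and_suggest_hyperparams shares the transformer used inside the flamlized fit are mine:

from flaml.default import preprocess_and_suggest_hyperparams

(
    hyperparams,          # zero-shot hyperparameters for this dataset
    estimator_class,      # plain xgboost.XGBClassifier
    X_train_t,            # training frame after FLAML's preprocessing (columns re-ordered)
    y_train_t,
    feature_transformer,
    label_transformer,
) = preprocess_and_suggest_hyperparams("classification", X_train, y_train, "xgboost")

# Apply the same preprocessing to the validation frame so feature_names match.
X_valid_t = feature_transformer.transform(X_valid)

# Merge dicts so the portfolio hyperparameters don't clash with explicit kwargs.
params = {
    **hyperparams,
    "tree_method": "hist",
    "enable_categorical": True,
    "eval_metric": "logloss",
    "early_stopping_rounds": 10,
    "random_state": 0,
}
model = estimator_class(**params)

# Labels are already 0/1 here, so y_valid needs no label transform.
model.fit(X_train_t, y_train_t, eval_set=[(X_valid_t, y_valid)], verbose=True)

With the transformed validation frame, feature_names on both sides agree and early stopping runs as in the vanilla XGBoost example above.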

I'm also not a fan of copying the entire X dataframe; this makes FLAML impractical for big datasets. Could a flag be added that tells FLAML to do its preprocessing in place instead of copying the entire X dataframe?

Screenshots and logs

No response

Additional Information

No response
