Random Forest TabPFN doesn't handle NaNs #32

noahho · 2025-02-24T15:12:07Z

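Calling `AutoTabPFNRegressor.fit` on data that contains NaNs fails: the `RandomForestTabPFNRegressor` base model passes `X` straight to scikit-learn's forest `fit`, whose input validation rejects NaN values. Full traceback:

ValueError Traceback (most recent call last)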
in <cell line: 6>()
4 #clf = TabPFNRegressor()
5 clf = AutoTabPFNRegressor(max_time=30, device="cuda")
----> 6 clf.fit(X_t, y)
7 test_preds = clf.predict(X_test)

/kaggle/working/tabpfn_extensions/post_hoc_ensembles/sklearn_interface.py in fit(self, X, y, categorical_feature_indices)
220 )
221
--> 222 self.predictor_.fit(
223 X,
224 y,

/kaggle/working/tabpfn_extensions/post_hoc_ensembles/pfn_phe.py in fit(self, X, y, categorical_feature_indices)
331 )
332
--> 333 self._ens_model.fit(X, y)
334
335 return self

/kaggle/working/tabpfn_extensions/post_hoc_ensembles/greedy_weighted_ensemble.py in fit(self, X, y)
232
233 def fit(self, X, y):
--> 234 weights = self.get_weights(X, y)
235
236 final_weights = []

/kaggle/working/tabpfn_extensions/post_hoc_ensembles/greedy_weighted_ensemble.py in get_weights(self, X, y)
171
172 def get_weights(self, X, y):
--> 173 oof_proba = self.get_oof_per_estimator(X, y)
174 self.model_family_per_estimator = (
175 self.model_family_per_estimator

/kaggle/working/tabpfn_extensions/post_hoc_ensembles/abstract_validation_utils.py in get_oof_per_estimator(self, X, y, return_loss_per_estimator, impute_dropped_instances, _extra_processing)
370 holdout_index_hit_counts = current_repeat
371
--> 372 self._fill_predictions_in_place(
373 model_i=model_i,
374 base_model=base_model,

/kaggle/working/tabpfn_extensions/post_hoc_ensembles/abstract_validation_utils.py in _fill_predictions_in_place(self, model_i, base_model, oof_proba_list, X, y, train_index, test_index, loss_per_estimator, holdout_index_hits, _extra_processing, split_i)
125
126 # Default base models case
--> 127 base_model.fit(fold_X_train, fold_y_train)
128
129 pred = self._predict_oof(base_model, fold_X_test)

/kaggle/working/tabpfn_extensions/rf_pfn/SklearnBasedRandomForestTabPFN.py in fit(self, X, y, sample_weight)
74 if torch.is_tensor(y):
75 y = y.numpy()
---> 76 super().fit(X, y)
77 except TypeError as e:
78 print("Error in fit with data", X, y)

/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py in fit(self, X, y, sample_weight)
343 if issparse(y):
344 raise ValueError("sparse multilabel-indicator for y is not supported.")
--> 345 X, y = self._validate_data(
346 X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE
347 )

/usr/local/lib/python3.10/dist-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
582 y = check_array(y, input_name="y", **check_y_params)
583 else:
--> 584 X, y = check_X_y(X, y, **check_params)
585 out = X, y
586

/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
1104 )
1105
-> 1106 X = check_array(
1107 X,
1108 accept_sparse=accept_sparse,

/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
919
920 if force_all_finite:
--> 921 _assert_all_finite(
922 array,
923 input_name=input_name,

/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan, msg_dtype, estimator_name, input_name)
159 "#estimators-that-handle-nan-values"
160 )
--> 161 raise ValueError(msg_err)
162
163

ValueError: Input X contains NaN.
RandomForestTabPFNRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Random Forest TabPFN doesn't handle NaNs #32

Random Forest TabPFN doesn't handle NaNs #32

noahho commented Feb 24, 2025

Random Forest TabPFN doesn't handle NaNs #32

Random Forest TabPFN doesn't handle NaNs #32

Comments

noahho commented Feb 24, 2025