Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Random Forest TabPFN doesn't handle NaNs #32

Open
noahho opened this issue Feb 24, 2025 · 0 comments
Open

Random Forest TabPFN doesn't handle NaNs #32

noahho opened this issue Feb 24, 2025 · 0 comments

Comments

@noahho
Copy link
Contributor

noahho commented Feb 24, 2025


ValueError Traceback (most recent call last)
in <cell line: 6>()
4 #clf = TabPFNRegressor()
5 clf = AutoTabPFNRegressor(max_time=30, device="cuda")
----> 6 clf.fit(X_t, y)
7 test_preds = clf.predict(X_test)

/kaggle/working/tabpfn_extensions/post_hoc_ensembles/sklearn_interface.py in fit(self, X, y, categorical_feature_indices)
220 )
221
--> 222 self.predictor_.fit(
223 X,
224 y,

/kaggle/working/tabpfn_extensions/post_hoc_ensembles/pfn_phe.py in fit(self, X, y, categorical_feature_indices)
331 )
332
--> 333 self._ens_model.fit(X, y)
334
335 return self

/kaggle/working/tabpfn_extensions/post_hoc_ensembles/greedy_weighted_ensemble.py in fit(self, X, y)
232
233 def fit(self, X, y):
--> 234 weights = self.get_weights(X, y)
235
236 final_weights = []

/kaggle/working/tabpfn_extensions/post_hoc_ensembles/greedy_weighted_ensemble.py in get_weights(self, X, y)
171
172 def get_weights(self, X, y):
--> 173 oof_proba = self.get_oof_per_estimator(X, y)
174 self.model_family_per_estimator = (
175 self.model_family_per_estimator

/kaggle/working/tabpfn_extensions/post_hoc_ensembles/abstract_validation_utils.py in get_oof_per_estimator(self, X, y, return_loss_per_estimator, impute_dropped_instances, _extra_processing)
370 holdout_index_hit_counts = current_repeat
371
--> 372 self._fill_predictions_in_place(
373 model_i=model_i,
374 base_model=base_model,

/kaggle/working/tabpfn_extensions/post_hoc_ensembles/abstract_validation_utils.py in _fill_predictions_in_place(self, model_i, base_model, oof_proba_list, X, y, train_index, test_index, loss_per_estimator, holdout_index_hits, _extra_processing, split_i)
125
126 # Default base models case
--> 127 base_model.fit(fold_X_train, fold_y_train)
128
129 pred = self._predict_oof(base_model, fold_X_test)

/kaggle/working/tabpfn_extensions/rf_pfn/SklearnBasedRandomForestTabPFN.py in fit(self, X, y, sample_weight)
74 if torch.is_tensor(y):
75 y = y.numpy()
---> 76 super().fit(X, y)
77 except TypeError as e:
78 print("Error in fit with data", X, y)

/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py in fit(self, X, y, sample_weight)
343 if issparse(y):
344 raise ValueError("sparse multilabel-indicator for y is not supported.")
--> 345 X, y = self._validate_data(
346 X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE
347 )

/usr/local/lib/python3.10/dist-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
582 y = check_array(y, input_name="y", **check_y_params)
583 else:
--> 584 X, y = check_X_y(X, y, **check_params)
585 out = X, y
586

/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
1104 )
1105
-> 1106 X = check_array(
1107 X,
1108 accept_sparse=accept_sparse,

/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
919
920 if force_all_finite:
--> 921 _assert_all_finite(
922 array,
923 input_name=input_name,

/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan, msg_dtype, estimator_name, input_name)
159 "#estimators-that-handle-nan-values"
160 )
--> 161 raise ValueError(msg_err)
162
163

ValueError: Input X contains NaN.
RandomForestTabPFNRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant