emdgroup · Feb 17, 2025
diff --git a/baybe/campaign.py b/baybe/campaign.py
+16-42
diff --git a/baybe/recommenders/pure/base.py b/baybe/recommenders/pure/base.py
+21
diff --git a/baybe/recommenders/pure/bayesian/base.py b/baybe/recommenders/pure/bayesian/base.py
+13-1
diff --git a/baybe/recommenders/pure/nonpredictive/base.py b/baybe/recommenders/pure/nonpredictive/base.py
+1-1
diff --git a/baybe/simulation/core.py b/baybe/simulation/core.py
+1-1
diff --git a/baybe/telemetry.py b/baybe/telemetry.py
+4-19
@@ -42,8 +42,9 @@
 )
 from baybe.utils.basic import UNSPECIFIED, UnspecifiedType, is_all_instance
 from baybe.utils.boolean import eq_dataframe
-from baybe.utils.dataframe import filter_df, fuzzy_row_match
+from baybe.utils.dataframe import _ValidatedDataFrame, filter_df, fuzzy_row_match
 from baybe.utils.plotting import to_string
+from baybe.utils.validation import validate_parameter_input, validate_target_input
 
 if TYPE_CHECKING:
     from botorch.posteriors import Posterior
@@ -264,48 +265,25 @@ def add_measurements(
         Each addition of data is considered a new batch. Added results are checked for
         validity. Categorical values need to have an exact match. For numerical values,
         a campaign flag determines if values that lie outside a specified tolerance
-        are accepted.
-        Note that this modifies the provided data in-place.
+        are accepted. Possible validation exceptions are documented in
+        :func:`baybe.utils.validation.validate_target_input` and
+        :func:`baybe.utils.validation.validate_parameter_input`.
 
         Args:
             data: The data to be added (with filled values for targets). Preferably
                 created via :func:`baybe.campaign.Campaign.recommend`.
             numerical_measurements_must_be_within_tolerance: Flag indicating if
                 numerical parameters need to be within their tolerances.
-
-        Raises:
-            ValueError: If one of the targets has missing values or NaNs in the provided
-                dataframe.
-            TypeError: If the target has non-numeric entries in the provided dataframe.
         """
         # Invalidate recommendation cache first (in case of uncaught exceptions below)
         self._cached_recommendation = pd.DataFrame()
 
-        # Check if all targets have valid values
-        for target in self.targets:
-            if data[target.name].isna().any():
-                raise ValueError(
-                    f"The target '{target.name}' has missing values or NaNs in the "
-                    f"provided dataframe. Missing target values are not supported."
-                )
-            if data[target.name].dtype.kind not in "iufb":
-                raise TypeError(
-                    f"The target '{target.name}' has non-numeric entries in the "
-                    f"provided dataframe. Non-numeric target values are not supported."
-                )
-
-        # Check if all targets have valid values
-        for param in self.parameters:
-            if data[param.name].isna().any():
-                raise ValueError(
-                    f"The parameter '{param.name}' has missing values or NaNs in the "
-                    f"provided dataframe. Missing parameter values are not supported."
-                )
-            if param.is_numerical and (data[param.name].dtype.kind not in "iufb"):
-                raise TypeError(
-                    f"The numerical parameter '{param.name}' has non-numeric entries in"
-                    f" the provided dataframe."
-                )
+        # Validate target and parameter input values
+        validate_target_input(data, self.targets)
+        validate_parameter_input(
+            data, self.parameters, numerical_measurements_must_be_within_tolerance
+        )
+        data.__class__ = _ValidatedDataFrame
 
         # Read in measurements and add them to the database
         self.n_batches_done += 1
@@ -320,20 +298,14 @@ def add_measurements(
         # Update metadata
         if self.searchspace.type in (SearchSpaceType.DISCRETE, SearchSpaceType.HYBRID):
             idxs_matched = fuzzy_row_match(
-                self.searchspace.discrete.exp_rep,
-                data,
-                self.parameters,
-                numerical_measurements_must_be_within_tolerance,
+                self.searchspace.discrete.exp_rep, data, self.parameters
             )
             self._searchspace_metadata.loc[idxs_matched, _MEASURED] = True
 
         # Telemetry
         telemetry_record_value(TELEM_LABELS["COUNT_ADD_RESULTS"], 1)
         telemetry_record_recommended_measurement_percentage(
-            self._cached_recommendation,
-            data,
-            self.parameters,
-            numerical_measurements_must_be_within_tolerance,
+            self._cached_recommendation, data, self.parameters
         )
 
     def toggle_discrete_candidates(  # noqa: DOC501
@@ -423,8 +395,10 @@ def recommend(
             )
 
         # Invalidate cached recommendation if pending experiments are provided
-        if (pending_experiments is not None) and (len(pending_experiments) > 0):
+        if (pending_experiments is not None) and not pending_experiments.empty:
             self._cached_recommendation = pd.DataFrame()
+            validate_parameter_input(pending_experiments, self.parameters)
+            pending_experiments.__class__ = _ValidatedDataFrame
 
         # If there are cached recommendations and the batch size of those is equal to
         # the previously requested one, we just return those
 
@@ -15,6 +15,8 @@
 from baybe.searchspace.continuous import SubspaceContinuous
 from baybe.searchspace.core import SearchSpaceType
 from baybe.searchspace.discrete import SubspaceDiscrete
+from baybe.utils.dataframe import _ValidatedDataFrame
+from baybe.utils.validation import validate_parameter_input, validate_target_input
 
 _DEPRECATION_ERROR_MESSAGE = (
     "The attribute '{}' is no longer available for recommenders. "
@@ -96,6 +98,25 @@ def recommend(
         measurements: pd.DataFrame | None = None,
         pending_experiments: pd.DataFrame | None = None,
     ) -> pd.DataFrame:
+        # Validation
+        if (
+            measurements is not None
+            and not isinstance(measurements, _ValidatedDataFrame)
+            and not measurements.empty
+            and objective is not None
+            and searchspace is not None
+        ):
+            validate_target_input(measurements, objective.targets)
+            validate_parameter_input(measurements, searchspace.parameters)
+            measurements.__class__ = _ValidatedDataFrame
+        if (
+            pending_experiments is not None
+            and not isinstance(pending_experiments, _ValidatedDataFrame)
+            and searchspace is not None
+        ):
+            validate_parameter_input(pending_experiments, searchspace.parameters)
+            pending_experiments.__class__ = _ValidatedDataFrame
+
         if searchspace.type is SearchSpaceType.CONTINUOUS:
             return self._recommend_continuous(
                 subspace_continuous=searchspace.continuous, batch_size=batch_size
 
@@ -17,6 +17,8 @@
 from baybe.searchspace import SearchSpace
 from baybe.surrogates import CustomONNXSurrogate, GaussianProcessSurrogate
 from baybe.surrogates.base import IndependentGaussianSurrogate, SurrogateProtocol
+from baybe.utils.dataframe import _ValidatedDataFrame
+from baybe.utils.validation import validate_parameter_input, validate_target_input
 
 
 @define
@@ -104,11 +106,21 @@ def recommend(
                 f"that an objective is specified."
             )
 
-        if (measurements is None) or (len(measurements) == 0):
+        # Experimental input validation
+        if (measurements is None) or measurements.empty:
             raise NotImplementedError(
                 f"Recommenders of type '{BayesianRecommender.__name__}' do not support "
                 f"empty training data."
             )
+        if not isinstance(measurements, _ValidatedDataFrame):
+            validate_target_input(measurements, objective.targets)
+            validate_parameter_input(measurements, searchspace.parameters)
+            measurements.__class__ = _ValidatedDataFrame
+        if pending_experiments is not None and not isinstance(
+            pending_experiments, _ValidatedDataFrame
+        ):
+            validate_parameter_input(pending_experiments, searchspace.parameters)
+            pending_experiments.__class__ = _ValidatedDataFrame
 
         if (
             isinstance(self._surrogate_model, IndependentGaussianSurrogate)
 
@@ -35,7 +35,7 @@ def recommend(
                 f"experiments from the candidate set, adjust the search space "
                 f"accordingly."
             )
-        if (measurements is not None) and (len(measurements) != 0):
+        if (measurements is not None) and not measurements.empty:
             warnings.warn(
                 f"'{self.recommend.__name__}' was called with a non-empty "
                 f"set of measurements but '{self.__class__.__name__}' does not "
 
@@ -118,7 +118,7 @@ def simulate_experiment(
         campaign = deepcopy(campaign)
 
         # Add the initial data
-        if initial_data is not None:
+        if (initial_data is not None) and not initial_data.empty:
             campaign.add_measurements(initial_data)
 
         # For impute_mode 'ignore', do not recommend space entries that are not
 
@@ -214,7 +214,6 @@ def telemetry_record_recommended_measurement_percentage(
     cached_recommendation: pd.DataFrame,
     measurements: pd.DataFrame,
     parameters: Sequence[Parameter],
-    numerical_measurements_must_be_within_tolerance: bool,
 ) -> None:
     """Submit the percentage of added measurements.
 
@@ -232,31 +231,17 @@ def telemetry_record_recommended_measurement_percentage(
         measurements: The measurements which are supposed to be checked against cached
             recommendations.
         parameters: The list of parameters spanning the entire search space.
-        numerical_measurements_must_be_within_tolerance: If ``True``, numerical
-            parameter entries are matched with the reference elements only if there is
-            a match within the parameter tolerance. If ``False``, the closest match
-            is considered, irrespective of the distance.
     """
     if is_enabled():
-        if len(cached_recommendation) > 0:
+        if cached_recommendation.empty:
+            _submit_scalar_value(TELEM_LABELS["NAKED_INITIAL_MEASUREMENTS"], 1)
+        else:
             recommended_measurements_percentage = (
-                len(
-                    fuzzy_row_match(
-                        cached_recommendation,
-                        measurements,
-                        parameters,
-                        numerical_measurements_must_be_within_tolerance,
-                    )
-                )
+                len(fuzzy_row_match(cached_recommendation, measurements, parameters))
                 / len(cached_recommendation)
                 * 100.0
             )
             _submit_scalar_value(
                 TELEM_LABELS["RECOMMENDED_MEASUREMENTS_PERCENTAGE"],
                 recommended_measurements_percentage,
             )
-        else:
-            _submit_scalar_value(
-                TELEM_LABELS["NAKED_INITIAL_MEASUREMENTS"],
-                1,
-            )
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -35,7 +35,7 @@ def recommend(` |
| `35` | `35` | `f"experiments from the candidate set, adjust the search space "` |
| `36` | `36` | `f"accordingly."` |
| `37` | `37` | `)` |
| `38` | | `- if (measurements is not None) and (len(measurements) != 0):` |
| | `38` | `+ if (measurements is not None) and not measurements.empty:` |
| `39` | `39` | `warnings.warn(` |
| `40` | `40` | `f"'{self.recommend.__name__}' was called with a non-empty "` |
| `41` | `41` | `f"set of measurements but '{self.__class__.__name__}' does not "` |