Skip to content

Commit 949b897

Browse files
authored Feb 17, 2025
Merge: Rework validation for measurements and pending_experiments (#456)
Fixes #453 Notes re `measurements` validation - Two utilities (one for parameters and one for targets) for input validation have been extracted. Additional validations for binary targets have been added. The utilities contain parts from `add_measurements` and `fuzzy_row_match` - As a result, `fuzzy_row_match` does not perform any validation anymore. Whether numerical parameter measurements outside of numerical tolerances are allowed is now decided outside of this function; it will always match numerical parts on the smallest distance - `add_measurements` now simply calls the utilities - Tests for invalid parameter input have been extended - Any duplicated validation is avoided because validated dataframes are wrapped in `_ValidatedDataFrame` Notes re `pending_experiments` validation - `Campaign.recommend` now performs validation of `pending_experiments` - Tests for invalid `pending_experiments` have been added Notes on recommenders - Pure recommenders now validate `measurements` and `pending_experiments` in `recommend` - Despite being derived from pure recommenders, Bayesian recommenders have their own implementation of the `measurements` and `pending_experiments` validation because the validation needs to happen before the call to the base class - Meta recommenders don't perform any validation because at the base level there will always be a pure recommender which performs validation if still needed Notes on `numerical_measurements_must_be_within_tolerance` - The only option to activate this behavior is for `measurements` in `Campaign.add_measurements` - In particular, i) `pending_experiments` are always valid independent of any numerical tolerance deviation (both in recommenders and campaign) and ii) `measurements` are always valid independent of numerical tolerances for recommenders
2 parents 4fb609b + cdeafea commit 949b897

File tree

12 files changed

+337
-171
lines changed

12 files changed

+337
-171
lines changed
 

‎baybe/campaign.py

+16-42
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,9 @@
4242
)
4343
from baybe.utils.basic import UNSPECIFIED, UnspecifiedType, is_all_instance
4444
from baybe.utils.boolean import eq_dataframe
45-
from baybe.utils.dataframe import filter_df, fuzzy_row_match
45+
from baybe.utils.dataframe import _ValidatedDataFrame, filter_df, fuzzy_row_match
4646
from baybe.utils.plotting import to_string
47+
from baybe.utils.validation import validate_parameter_input, validate_target_input
4748

4849
if TYPE_CHECKING:
4950
from botorch.posteriors import Posterior
@@ -264,48 +265,25 @@ def add_measurements(
264265
Each addition of data is considered a new batch. Added results are checked for
265266
validity. Categorical values need to have an exact match. For numerical values,
266267
a campaign flag determines if values that lie outside a specified tolerance
267-
are accepted.
268-
Note that this modifies the provided data in-place.
268+
are accepted. Possible validation exceptions are documented in
269+
:func:`baybe.utils.validation.validate_target_input` and
270+
:func:`baybe.utils.validation.validate_parameter_input`.
269271
270272
Args:
271273
data: The data to be added (with filled values for targets). Preferably
272274
created via :func:`baybe.campaign.Campaign.recommend`.
273275
numerical_measurements_must_be_within_tolerance: Flag indicating if
274276
numerical parameters need to be within their tolerances.
275-
276-
Raises:
277-
ValueError: If one of the targets has missing values or NaNs in the provided
278-
dataframe.
279-
TypeError: If the target has non-numeric entries in the provided dataframe.
280277
"""
281278
# Invalidate recommendation cache first (in case of uncaught exceptions below)
282279
self._cached_recommendation = pd.DataFrame()
283280

284-
# Check if all targets have valid values
285-
for target in self.targets:
286-
if data[target.name].isna().any():
287-
raise ValueError(
288-
f"The target '{target.name}' has missing values or NaNs in the "
289-
f"provided dataframe. Missing target values are not supported."
290-
)
291-
if data[target.name].dtype.kind not in "iufb":
292-
raise TypeError(
293-
f"The target '{target.name}' has non-numeric entries in the "
294-
f"provided dataframe. Non-numeric target values are not supported."
295-
)
296-
297-
# Check if all targets have valid values
298-
for param in self.parameters:
299-
if data[param.name].isna().any():
300-
raise ValueError(
301-
f"The parameter '{param.name}' has missing values or NaNs in the "
302-
f"provided dataframe. Missing parameter values are not supported."
303-
)
304-
if param.is_numerical and (data[param.name].dtype.kind not in "iufb"):
305-
raise TypeError(
306-
f"The numerical parameter '{param.name}' has non-numeric entries in"
307-
f" the provided dataframe."
308-
)
281+
# Validate target and parameter input values
282+
validate_target_input(data, self.targets)
283+
validate_parameter_input(
284+
data, self.parameters, numerical_measurements_must_be_within_tolerance
285+
)
286+
data.__class__ = _ValidatedDataFrame
309287

310288
# Read in measurements and add them to the database
311289
self.n_batches_done += 1
@@ -320,20 +298,14 @@ def add_measurements(
320298
# Update metadata
321299
if self.searchspace.type in (SearchSpaceType.DISCRETE, SearchSpaceType.HYBRID):
322300
idxs_matched = fuzzy_row_match(
323-
self.searchspace.discrete.exp_rep,
324-
data,
325-
self.parameters,
326-
numerical_measurements_must_be_within_tolerance,
301+
self.searchspace.discrete.exp_rep, data, self.parameters
327302
)
328303
self._searchspace_metadata.loc[idxs_matched, _MEASURED] = True
329304

330305
# Telemetry
331306
telemetry_record_value(TELEM_LABELS["COUNT_ADD_RESULTS"], 1)
332307
telemetry_record_recommended_measurement_percentage(
333-
self._cached_recommendation,
334-
data,
335-
self.parameters,
336-
numerical_measurements_must_be_within_tolerance,
308+
self._cached_recommendation, data, self.parameters
337309
)
338310

339311
def toggle_discrete_candidates( # noqa: DOC501
@@ -423,8 +395,10 @@ def recommend(
423395
)
424396

425397
# Invalidate cached recommendation if pending experiments are provided
426-
if (pending_experiments is not None) and (len(pending_experiments) > 0):
398+
if (pending_experiments is not None) and not pending_experiments.empty:
427399
self._cached_recommendation = pd.DataFrame()
400+
validate_parameter_input(pending_experiments, self.parameters)
401+
pending_experiments.__class__ = _ValidatedDataFrame
428402

429403
# If there are cached recommendations and the batch size of those is equal to
430404
# the previously requested one, we just return those

‎baybe/recommenders/pure/base.py

+21
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
from baybe.searchspace.continuous import SubspaceContinuous
1616
from baybe.searchspace.core import SearchSpaceType
1717
from baybe.searchspace.discrete import SubspaceDiscrete
18+
from baybe.utils.dataframe import _ValidatedDataFrame
19+
from baybe.utils.validation import validate_parameter_input, validate_target_input
1820

1921
_DEPRECATION_ERROR_MESSAGE = (
2022
"The attribute '{}' is no longer available for recommenders. "
@@ -96,6 +98,25 @@ def recommend(
9698
measurements: pd.DataFrame | None = None,
9799
pending_experiments: pd.DataFrame | None = None,
98100
) -> pd.DataFrame:
101+
# Validation
102+
if (
103+
measurements is not None
104+
and not isinstance(measurements, _ValidatedDataFrame)
105+
and not measurements.empty
106+
and objective is not None
107+
and searchspace is not None
108+
):
109+
validate_target_input(measurements, objective.targets)
110+
validate_parameter_input(measurements, searchspace.parameters)
111+
measurements.__class__ = _ValidatedDataFrame
112+
if (
113+
pending_experiments is not None
114+
and not isinstance(pending_experiments, _ValidatedDataFrame)
115+
and searchspace is not None
116+
):
117+
validate_parameter_input(pending_experiments, searchspace.parameters)
118+
pending_experiments.__class__ = _ValidatedDataFrame
119+
99120
if searchspace.type is SearchSpaceType.CONTINUOUS:
100121
return self._recommend_continuous(
101122
subspace_continuous=searchspace.continuous, batch_size=batch_size

‎baybe/recommenders/pure/bayesian/base.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
from baybe.searchspace import SearchSpace
1818
from baybe.surrogates import CustomONNXSurrogate, GaussianProcessSurrogate
1919
from baybe.surrogates.base import IndependentGaussianSurrogate, SurrogateProtocol
20+
from baybe.utils.dataframe import _ValidatedDataFrame
21+
from baybe.utils.validation import validate_parameter_input, validate_target_input
2022

2123

2224
@define
@@ -104,11 +106,21 @@ def recommend(
104106
f"that an objective is specified."
105107
)
106108

107-
if (measurements is None) or (len(measurements) == 0):
109+
# Experimental input validation
110+
if (measurements is None) or measurements.empty:
108111
raise NotImplementedError(
109112
f"Recommenders of type '{BayesianRecommender.__name__}' do not support "
110113
f"empty training data."
111114
)
115+
if not isinstance(measurements, _ValidatedDataFrame):
116+
validate_target_input(measurements, objective.targets)
117+
validate_parameter_input(measurements, searchspace.parameters)
118+
measurements.__class__ = _ValidatedDataFrame
119+
if pending_experiments is not None and not isinstance(
120+
pending_experiments, _ValidatedDataFrame
121+
):
122+
validate_parameter_input(pending_experiments, searchspace.parameters)
123+
pending_experiments.__class__ = _ValidatedDataFrame
112124

113125
if (
114126
isinstance(self._surrogate_model, IndependentGaussianSurrogate)

‎baybe/recommenders/pure/nonpredictive/base.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def recommend(
3535
f"experiments from the candidate set, adjust the search space "
3636
f"accordingly."
3737
)
38-
if (measurements is not None) and (len(measurements) != 0):
38+
if (measurements is not None) and not measurements.empty:
3939
warnings.warn(
4040
f"'{self.recommend.__name__}' was called with a non-empty "
4141
f"set of measurements but '{self.__class__.__name__}' does not "

‎baybe/simulation/core.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def simulate_experiment(
118118
campaign = deepcopy(campaign)
119119

120120
# Add the initial data
121-
if initial_data is not None:
121+
if (initial_data is not None) and not initial_data.empty:
122122
campaign.add_measurements(initial_data)
123123

124124
# For impute_mode 'ignore', do not recommend space entries that are not

‎baybe/telemetry.py

+4-19
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,6 @@ def telemetry_record_recommended_measurement_percentage(
214214
cached_recommendation: pd.DataFrame,
215215
measurements: pd.DataFrame,
216216
parameters: Sequence[Parameter],
217-
numerical_measurements_must_be_within_tolerance: bool,
218217
) -> None:
219218
"""Submit the percentage of added measurements.
220219
@@ -232,31 +231,17 @@ def telemetry_record_recommended_measurement_percentage(
232231
measurements: The measurements which are supposed to be checked against cached
233232
recommendations.
234233
parameters: The list of parameters spanning the entire search space.
235-
numerical_measurements_must_be_within_tolerance: If ``True``, numerical
236-
parameter entries are matched with the reference elements only if there is
237-
a match within the parameter tolerance. If ``False``, the closest match
238-
is considered, irrespective of the distance.
239234
"""
240235
if is_enabled():
241-
if len(cached_recommendation) > 0:
236+
if cached_recommendation.empty:
237+
_submit_scalar_value(TELEM_LABELS["NAKED_INITIAL_MEASUREMENTS"], 1)
238+
else:
242239
recommended_measurements_percentage = (
243-
len(
244-
fuzzy_row_match(
245-
cached_recommendation,
246-
measurements,
247-
parameters,
248-
numerical_measurements_must_be_within_tolerance,
249-
)
250-
)
240+
len(fuzzy_row_match(cached_recommendation, measurements, parameters))
251241
/ len(cached_recommendation)
252242
* 100.0
253243
)
254244
_submit_scalar_value(
255245
TELEM_LABELS["RECOMMENDED_MEASUREMENTS_PERCENTAGE"],
256246
recommended_measurements_percentage,
257247
)
258-
else:
259-
_submit_scalar_value(
260-
TELEM_LABELS["NAKED_INITIAL_MEASUREMENTS"],
261-
1,
262-
)

0 commit comments

Comments
 (0)
Please sign in to comment.