
Commit 6281868

Authored Mar 5, 2025
Merge: Vectorize fuzzy_row_match (#489)
- Use vectorized operations instead of the for loop.
- Fixed column validations.
- I tested that the result of the new version is always exactly equal to the old version.
- Added some basic pytests for the utility.
- Related to #344.

Here is a resulting test looking at the speedup:
[Figure: speedup of the vectorized implementation](https://github.com/user-attachments/assets/094bd96c-1e0f-4c4b-a10e-fdd5d680eb16)

- The speedup for the most realistic cases (`left_df` large versus `right_df`) approaches 4x from above.
- For less relevant cases (`left_df` and `right_df` comparable in size or overall very small), the speedup can even reach 40x.
2 parents 4ba501d + 1d0d922 commit 6281868
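To make the change concrete, here is a minimal, self-contained sketch of the broadcasting idea that replaces the per-row loop; the data and column name below are invented for illustration and are not part of the commit:

```python
import numpy as np
import pandas as pd

# Hypothetical toy data: the search space (left) and two measurements (right).
left_df = pd.DataFrame({"temperature": [10.0, 20.0, 30.0, 40.0]})
right_df = pd.DataFrame({"temperature": [19.2, 41.0]})

# Loop-based matching (schematically the old approach): one pass per right row.
loop_matches = []
for _, row in right_df.iterrows():
    abs_diff = (left_df["temperature"] - row["temperature"]).abs()
    loop_matches.append(abs_diff.idxmin())

# Vectorized matching: broadcasting builds the full (n_right, n_left) difference
# matrix at once, and argmin along axis 1 picks the closest left row per right row.
abs_diff = np.abs(
    right_df["temperature"].values[:, None] - left_df["temperature"].values[None, :]
)
vectorized_matches = left_df.index[abs_diff.argmin(axis=1)].tolist()

assert loop_matches == vectorized_matches  # both yield [1, 3]
```

Avoiding the Python-level loop over `right_df` is where the reported 4x to 40x speedup comes from.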

File tree

4 files changed, +229 −43 lines changed


CHANGELOG.md

+2
```diff
@@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   `supports_batching` and `supports_pending_experiments`
 - `SHAPInsight` now allows explanation input that has additional columns compared to
   the background data (will be ignored)
+- `fuzzy_row_match` now uses vectorized operations, resulting in a speedup of matching
+  measurements to the search space between 4x and 40x
 
 ### Fixed
 - Incorrect optimization direction with `PSTD` with a single minimization target
```

baybe/exceptions.py

+22
```diff
@@ -1,5 +1,9 @@
 """Custom exceptions and warnings."""
 
+import pandas as pd
+from attr.validators import instance_of
+from attrs import define, field
+from typing_extensions import override
 
 ##### Warnings #####
 
@@ -11,6 +15,24 @@ class UnusedObjectWarning(UserWarning):
     """
 
 
+@define
+class SearchSpaceMatchWarning(UserWarning):
+    """
+    When trying to match data to entries in the search space, something unexpected
+    happened.
+    """
+
+    message: str = field(validator=instance_of(str))
+    data: pd.DataFrame = field(validator=instance_of(pd.DataFrame))
+
+    def __attrs_pre_init(self):
+        super().__init__(self.message)
+
+    @override
+    def __str__(self):
+        return self.message
+
+
 ##### Exceptions #####
 
 
```
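Because the new warning class carries the offending rows in its `data` attribute, downstream code can inspect exactly which inputs were problematic. A small sketch of that interface, using made-up data (how `fuzzy_row_match` emits the warning is shown in the next file):

```python
import warnings

import pandas as pd

from baybe.exceptions import SearchSpaceMatchWarning

# Hypothetical rows that failed to match, only to demonstrate the warning's interface.
unmatched = pd.DataFrame({"Temperature": [99.0]})

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    warnings.warn(SearchSpaceMatchWarning("1 row could not be matched.", unmatched))

w = caught[0].message   # the SearchSpaceMatchWarning instance
print(str(w))           # "1 row could not be matched." (via the __str__ override)
print(w.data)           # the offending rows as a DataFrame
```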

baybe/utils/dataframe.py

+75 −42
```diff
@@ -10,6 +10,7 @@
 import numpy as np
 import pandas as pd
 
+from baybe.exceptions import SearchSpaceMatchWarning
 from baybe.targets.base import Target
 from baybe.targets.binary import BinaryTarget
 from baybe.targets.enum import TargetMode
@@ -462,7 +463,7 @@ def fuzzy_row_match(
     right_df: pd.DataFrame,
     parameters: Sequence[Parameter],
 ) -> pd.Index:
-    """Match row of the right dataframe to the rows of the left dataframe.
+    """Match rows of the right dataframe to rows of the left dataframe.
 
     This is useful for matching measurements to entries in the search space, e.g. to
     detect which ones have been measured. For categorical parameters, there needs to be
@@ -476,57 +477,89 @@ def fuzzy_row_match(
 
     Args:
         left_df: The data that serves as lookup reference.
-        right_df: The data that should be checked for matching rows in the left
-            dataframe.
-        parameters: List of baybe parameter objects that are needed to identify
-            potential tolerances.
+        right_df: The data that is checked for matching rows in the left dataframe.
+        parameters: Parameter objects that identify the relevant column names and how
+            matching is performed.
 
     Returns:
         The index of the matching rows in ``left_df``.
 
     Raises:
-        ValueError: If some rows are present in the right but not in the left dataframe.
+        ValueError: If either ``left_df`` or ``right_df`` does not contain columns for
+            each entry in ``parameters``.
     """
-    # Assert that all parameters appear in the given dataframe
-    if not all(col in right_df.columns for col in left_df.columns):
+    # Separate column types
+    cat_cols = {p.name for p in parameters if (not p.is_numerical and p.is_discrete)}
+    num_cols = {p.name for p in parameters if (p.is_numerical and p.is_discrete)}
+    non_discrete_cols = {p.name for p in parameters if not p.is_discrete}
+
+    # Assert that all parameters appear in the given dataframes
+    if diff := (cat_cols | num_cols).difference(left_df.columns):
+        raise ValueError(
+            f"For fuzzy row matching, all discrete parameters need to have a "
+            f"corresponding column in the left dataframe. Parameters not found: {diff}"
+        )
+    if diff := (cat_cols | num_cols).difference(right_df.columns):
         raise ValueError(
-            "For fuzzy row matching all rows of the right dataframe need to be present"
-            " in the left dataframe."
+            f"For fuzzy row matching, all discrete parameters need to have a "
+            f"corresponding column in the right dataframe. Parameters not found: "
+            f"{diff}"
         )
 
-    # Iterate over all input rows
-    inds_matched = []
-    for ind, row in right_df.iterrows():
-        # Differentiate category-like and discrete numerical parameters
-        cat_cols = [p.name for p in parameters if not p.is_numerical]
-        num_cols = [p.name for p in parameters if (p.is_numerical and p.is_discrete)]
-
-        # Discrete parameters must match exactly
-        match = left_df[cat_cols].eq(row[cat_cols]).all(axis=1, skipna=False)
-
-        # For numeric parameters, match the entry with the smallest deviation
-        for col in num_cols:
-            abs_diff = (left_df[col] - row[col]).abs()
-            match &= abs_diff == abs_diff.min()
-
-        # We expect exactly one match. If that's not the case, print a warning.
-        inds_found = left_df.index[match].to_list()
-        if len(inds_found) == 0 and len(num_cols) > 0:
-            warnings.warn(
-                f"Input row with index {ind} could not be matched to the search space. "
-                f"This could indicate that something went wrong."
-            )
-        elif len(inds_found) > 1:
-            warnings.warn(
-                f"Input row with index {ind} has multiple matches with the search "
-                f"space. This could indicate that something went wrong. Matching only "
-                f"first occurrence."
-            )
-            inds_matched.append(inds_found[0])
-        else:
-            inds_matched.extend(inds_found)
+    provided_cols = {p.name for p in parameters}
+    allowed_cols = cat_cols | num_cols | non_discrete_cols
+    assert allowed_cols == provided_cols, (
+        f"There are parameter types that would be silently ignored: "
+        f"{provided_cols.difference(allowed_cols)}"
+    )
+
+    # Initialize the match matrix. We will later filter it down using other
+    # matrices (representing the matches for individual parameters) via logical 'and'.
+    match_matrix = pd.DataFrame(
+        True, index=right_df.index, columns=left_df.index, dtype=bool
+    )
+
+    # Match categorical parameters
+    for col in cat_cols:
+        # Per categorical parameter, this identifies matches between all elements of
+        # left and right and stores them in a matrix.
+        match_matrix &= right_df[col].values[:, None] == left_df[col].values[None, :]
+
+    # Match numerical parameters
+    for col in num_cols:
+        # Per numerical parameter, this identifies the rows with the smallest absolute
+        # difference and records them in a matrix.
+        abs_diff = np.abs(right_df[col].values[:, None] - left_df[col].values[None, :])
+        min_diff = abs_diff.min(axis=1, keepdims=True)
+        match_matrix &= abs_diff == min_diff
+
+    # Find the matching indices. If a right row is not matched to any of the rows in
+    # left, idxmax would return the first index of left_df. Hence, we remember these
+    # cases and drop them explicitly.
+    matched_indices = pd.Index(match_matrix.idxmax(axis=1).values)
+    mask_no_match = ~match_matrix.any(axis=1)
+    matched_indices = matched_indices[~mask_no_match]
+
+    # Warn if there are multiple or no matches
+    if no_match_indices := right_df.index[mask_no_match].tolist():
+        w = SearchSpaceMatchWarning(
+            f"Some input rows could not be matched to the search space. Indices with "
+            f"no matches: {no_match_indices}",
+            right_df.loc[no_match_indices],
+        )
+        warnings.warn(w)
+
+    mask_multiple_matches = match_matrix.sum(axis=1) > 1
+    if multiple_match_indices := right_df.index[mask_multiple_matches].tolist():
+        w = SearchSpaceMatchWarning(
+            f"Some input rows have multiple matches with the search space. "
+            f"Matching only first occurrence for these rows. Indices with multiple "
+            f"matches: {multiple_match_indices}",
+            right_df.loc[multiple_match_indices],
+        )
+        warnings.warn(w)
 
-    return pd.Index(inds_matched)
+    return matched_indices
 
 
 def pretty_print_df(
```
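For orientation, a sketch of how the vectorized `fuzzy_row_match` is typically called. The parameter classes and constructor arguments below follow the usual BayBE API but are assumptions, not part of this diff:

```python
import pandas as pd

from baybe.parameters import CategoricalParameter, NumericalDiscreteParameter
from baybe.utils.dataframe import fuzzy_row_match

# Assumed parameter definitions (names and values are made up for the example).
parameters = [
    CategoricalParameter(name="Solvent", values=["water", "ethanol"]),
    NumericalDiscreteParameter(name="Temperature", values=[10.0, 20.0, 30.0]),
]

# Search space entries (left) and two slightly off-grid measurements (right).
left_df = pd.DataFrame(
    {"Solvent": ["water", "water", "ethanol"], "Temperature": [10.0, 20.0, 30.0]}
)
right_df = pd.DataFrame({"Solvent": ["water", "ethanol"], "Temperature": [19.7, 30.2]})

# Categorical values must match exactly; numerical values snap to the closest grid
# point, so the measurements map to search space rows 1 and 2.
print(fuzzy_row_match(left_df, right_df, parameters))  # Index([1, 2], dtype='int64')
```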

tests/utils/test_dataframe.py

+130 −1
```diff
@@ -1,10 +1,24 @@
 """Tests for dataframe utilities."""
 
+from contextlib import nullcontext
+
 import numpy as np
 import pandas as pd
 import pytest
+from pandas.testing import assert_frame_equal
+from pytest import param
+
+from baybe.exceptions import SearchSpaceMatchWarning
+from baybe.utils.dataframe import (
+    add_noise_to_perturb_degenerate_rows,
+    add_parameter_noise,
+    fuzzy_row_match,
+)
 
-from baybe.utils.dataframe import add_noise_to_perturb_degenerate_rows
+
+@pytest.fixture()
+def n_grid_points():
+    return 5
 
 
 def test_degenerate_rows():
@@ -41,3 +55,118 @@ def test_degenerate_rows_invalid_input():
     # Add noise
     with pytest.raises(TypeError):
         add_noise_to_perturb_degenerate_rows(df)
+
+
+@pytest.mark.parametrize(
+    ("parameter_names", "noise", "duplicated"),
+    [
+        param(
+            ["Categorical_1", "Num_disc_1", "Some_Setting"],
+            False,
+            True,
+            id="discrete_num_noiseless_duplicated",
+        ),
+        param(
+            ["Categorical_1", "Num_disc_1", "Some_Setting"],
+            False,
+            False,
+            id="discrete_num_noiseless_unique",
+        ),
+        param(
+            ["Categorical_1", "Num_disc_1", "Some_Setting"],
+            True,
+            False,
+            id="discrete_num_noisy_unique",
+        ),
+        param(
+            ["Categorical_1", "Switch_1", "Some_Setting"],
+            False,
+            False,
+            id="discrete_cat",
+        ),
+        param(
+            ["Categorical_1", "Switch_1", "Conti_finite_1"],
+            False,
+            False,
+            id="hybrid_cat",
+        ),
+        param(
+            ["Categorical_1", "Num_disc_1", "Conti_finite_1"],
+            False,
+            False,
+            id="hybrid_num_noiseless_unique",
+        ),
+        param(
+            ["Categorical_1", "Num_disc_1", "Conti_finite_1"],
+            True,
+            False,
+            id="hybrid_num_noisy_unique",
+        ),
+        param(
+            ["Categorical_1", "Num_disc_1", "Conti_finite_1"],
+            False,
+            True,
+            id="hybrid_num_noiseless_duplicated",
+        ),
+    ],
+)
+def test_fuzzy_row_match(searchspace, noise, duplicated):
+    """Fuzzy row matching returns expected indices."""
+    left_df = searchspace.discrete.exp_rep.copy()
+    selected = np.random.choice(left_df.index, 4, replace=False)
+    right_df = left_df.loc[selected].reset_index(drop=True)
+
+    context = nullcontext()
+    if duplicated:
+        # Set one of the input values to exactly the midpoint between two values to
+        # cause a degenerate match
+        vals = searchspace.get_parameters_by_name(["Num_disc_1"])[0].values
+        right_df.loc[0, "Num_disc_1"] = vals[0] + (vals[1] - vals[0]) / 2.0
+        context = pytest.warns(SearchSpaceMatchWarning, match="multiple matches")
+
+    if noise:
+        add_parameter_noise(
+            right_df,
+            searchspace.discrete.parameters,
+            noise_type="relative_percent",
+            noise_level=0.1,
+        )
+
+    with context as c:
+        matched = fuzzy_row_match(left_df, right_df, searchspace.parameters)
+
+    if duplicated:
+        # Assert correct identification of problematic df parts
+        w = next(x for x in c if isinstance(x.message, SearchSpaceMatchWarning)).message
+        assert_frame_equal(right_df.loc[[0]], w.data)
+
+        # Ignore problematic indices for subsequent equality check
+        selected = selected[1:]
+        matched = matched[1:]
+
+    assert set(selected) == set(matched), (selected, matched)
+
+
+@pytest.mark.parametrize(
+    "parameter_names",
+    [
+        param(["Categorical_1", "Categorical_2", "Switch_1"], id="discrete"),
+        param(["Categorical_1", "Num_disc_1", "Conti_finite1"], id="hybrid"),
+    ],
+)
+@pytest.mark.parametrize("invalid", ["left", "right"])
+def test_invalid_fuzzy_row_match(searchspace, invalid):
+    """Returns expected errors when dataframes don't contain all expected columns."""
+    left_df = searchspace.discrete.exp_rep.copy()
+    selected = np.random.choice(left_df.index, 4, replace=False)
+    right_df = left_df.loc[selected].copy()
+
+    # Drop first column
+    if invalid == "left":
+        left_df = left_df.iloc[:, 1:]
+    else:
+        right_df = right_df.iloc[:, 1:]
+
+    match = f"corresponding column in the {invalid} dataframe."
+    with pytest.raises(ValueError, match=match):
+        fuzzy_row_match(left_df, right_df, searchspace.parameters)
```
