Merge: Minor SHAP improvements (#494)

Scienfitz · web-flow · commit 70b4978e50ac · 2025-02-26T13:05:43.000+01:00
- added support for the `waterfall` plot type
- I've loosened the restriction that the set of `data` and
`background_data` columns needs to exactly match. This makes no sense to
me as it prohibits e.g. patterns like
`insight.explain(campaign.measurements.sample(frac=0.1))` because there
are also 3 meta columns present. The returned column order is still rearranged
to match the one in `data`, and additional columns are ignored.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,10 +9,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - `BCUT2D` encoding for `SubstanceParameter`
 - Stored benchmarking results now include the Python environment and version
 - `qPSTD` acquisition function
+- `SHAPInsight` now supports the `waterfall` plot type
 
 ### Changed
 - Acquisition function indicator `is_mc` has been removed in favor of new indicators 
   `supports_batching` and `supports_pending_experiments`
+- `SHAPInsight` now allows explanation input that has additional columns compared to 
+  the background data (will be ignored)
 
 ### Fixed
 - Incorrect optimization direction with `PSTD` with a single minimization target
diff --git a/baybe/insights/shap.py b/baybe/insights/shap.py
@@ -37,7 +37,7 @@
 EXPLAINERS = SHAP_EXPLAINERS | NON_SHAP_EXPLAINERS
 """Supported explainer types for :class:`baybe.insights.shap.SHAPInsight`"""
 
-SHAP_PLOTS = {"bar", "beeswarm", "force", "heatmap", "scatter"}
+SHAP_PLOTS = {"bar", "beeswarm", "force", "heatmap", "scatter", "waterfall"}
 """Supported plot types for :meth:`baybe.insights.shap.SHAPInsight.plot`"""
 
 
@@ -265,15 +265,15 @@ def explain(self, data: pd.DataFrame | None = None, /) -> shap.Explanation:
             The computed Shapley explanation.
 
         Raises:
-            ValueError: If the columns of the given dataframe cannot be aligned with the
-                columns of the explainer background dataframe.
+            ValueError: If not all the columns of the explainer background dataframe
+                are present in the given data.
         """
         if data is None:
             data = self.background_data
-        elif set(self.background_data.columns) != set(data.columns):
+        elif not set(self.background_data.columns).issubset(data.columns):
             raise ValueError(
-                "The provided dataframe must have the same column names as used by "
-                "the explainer object."
+                "The provided dataframe must contain all columns that were used for "
+                "the background data."
             )
 
         # Align columns with background data
@@ -302,6 +302,7 @@ def explain(self, data: pd.DataFrame | None = None, /) -> shap.Explanation:
         # (`base_values` can be a scalar or vector)
         # TODO: https://github.com/shap/shap/issues/3958
         idx = self.background_data.columns.get_indexer(data.columns)
+        idx = idx[idx != -1]  # Additional columns in data are ignored.
         for attr in ["values", "data", "base_values"]:
             try:
                 setattr(explanations, attr, getattr(explanations, attr)[:, idx])
@@ -327,7 +328,9 @@ def explain(self, data: pd.DataFrame | None = None, /) -> shap.Explanation:
 
     def plot(
         self,
-        plot_type: Literal["bar", "beeswarm", "force", "heatmap", "scatter"],
+        plot_type: Literal[
+            "bar", "beeswarm", "force", "heatmap", "scatter", "waterfall"
+        ],
         data: pd.DataFrame | None = None,
         /,
         *,
@@ -367,16 +370,20 @@ def plot(
         plot_func = getattr(shap.plots, plot_type)
 
         # Handle plot types that only explain a single data point
-        if plot_type == "force":
+        if plot_type in ["force", "waterfall"]:
             if explanation_index is None:
                 warnings.warn(
                     f"When using plot type '{plot_type}', an 'explanation_index' must "
                     f"be chosen to identify a single data point that should be "
                     f"explained. Choosing the first entry at position 0."
                 )
                 explanation_index = 0
+
             toplot = self.explain(data.iloc[[explanation_index]])
-            kwargs["matplotlib"] = True
+            toplot = toplot[0]
+
+            if plot_type == "force":
+                kwargs["matplotlib"] = True
         else:
             toplot = self.explain(data)
 
diff --git a/tests/insights/test_shap.py b/tests/insights/test_shap.py
@@ -140,11 +140,7 @@ def test_invalid_explained_data(ongoing_campaign, explainer_cls, use_comp_rep):
         use_comp_rep=use_comp_rep,
     )
     df = pd.DataFrame({"Num_disc_1": [0, 2]})
-    with pytest.raises(
-        ValueError,
-        match="The provided dataframe must have the same column names as used by "
-        "the explainer object.",
-    ):
+    with pytest.raises(ValueError, match="must contain all columns that were used"):
         shap_insight.explain(df)