diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index f0dd908f81043..0075b7095aa93 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -216,4 +216,18 @@ def time_rolling_offset(self, method): getattr(self.groupby_roll_offset, method)() +class GroupbyLargeGroups: + # https://github.com/pandas-dev/pandas/issues/38038 + # specific example where the rolling operation on a larger dataframe + # is relatively cheap (few but large groups), but creation of + # MultiIndex of result can be expensive + + def setup(self): + N = 100000 + self.df = pd.DataFrame({"A": [1, 2] * int(N / 2), "B": np.random.randn(N)}) + + def time_rolling_multiindex_creation(self): + self.df.groupby("A").rolling(3).mean() + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index 4770ab37e08d2..0e2e510147603 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -24,6 +24,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`) - Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`) - Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`). +- Fixed performance regression in ``df.groupby(..).rolling(..)`` (:issue:`38038`) - Fixed regression in :meth:`MultiIndex.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`36915`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index ce7281988e105..237c29a9b0c26 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2216,22 +2216,29 @@ def _apply( # Our result will have still kept the column in the result result = result.drop(columns=column_keys, errors="ignore") - result_index_data = [] - for key, values in self._groupby.grouper.indices.items(): - for value in values: - data = [ - *com.maybe_make_list(key), - *com.maybe_make_list( - grouped_object_index[value] - if grouped_object_index is not None - else [] - ), - ] - result_index_data.append(tuple(data)) - - result_index = MultiIndex.from_tuples( - result_index_data, names=result_index_names + codes = self._groupby.grouper.codes + levels = self._groupby.grouper.levels + + group_indices = self._groupby.grouper.indices.values() + if group_indices: + indexer = np.concatenate(list(group_indices)) + else: + indexer = np.array([], dtype=np.intp) + codes = [c.take(indexer) for c in codes] + + # if the index of the original dataframe needs to be preserved, append + # this index (but reordered) to the codes/levels from the groupby + if grouped_object_index is not None: + idx = grouped_object_index.take(indexer) + if not isinstance(idx, MultiIndex): + idx = MultiIndex.from_arrays([idx]) + codes.extend(list(idx.codes)) + levels.extend(list(idx.levels)) + + result_index = MultiIndex( + levels, codes, names=result_index_names, verify_integrity=False ) + result.index = result_index return result diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index da31fbaddc6e4..a5f759fb90dab 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import DataFrame, MultiIndex, Series +from pandas import DataFrame, Index, MultiIndex, Series import pandas._testing as tm from pandas.core.groupby.groupby import get_groupby @@ -396,14 +396,25 @@ def test_groupby_rolling_index_changed(self, func): def test_groupby_rolling_empty_frame(self): # GH 36197 - expected = pd.DataFrame({"s1": []}) + expected = DataFrame({"s1": []}) result = expected.groupby("s1").rolling(window=1).sum() - expected.index = pd.MultiIndex.from_tuples([], names=["s1", None]) + # GH-38057 from_tuples gives empty object dtype, we now get float/int levels + # expected.index = MultiIndex.from_tuples([], names=["s1", None]) + expected.index = MultiIndex.from_product( + [Index([], dtype="float64"), Index([], dtype="int64")], names=["s1", None] + ) tm.assert_frame_equal(result, expected) - expected = pd.DataFrame({"s1": [], "s2": []}) + expected = DataFrame({"s1": [], "s2": []}) result = expected.groupby(["s1", "s2"]).rolling(window=1).sum() - expected.index = pd.MultiIndex.from_tuples([], names=["s1", "s2", None]) + expected.index = MultiIndex.from_product( + [ + Index([], dtype="float64"), + Index([], dtype="float64"), + Index([], dtype="int64"), + ], + names=["s1", "s2", None], + ) tm.assert_frame_equal(result, expected) def test_groupby_rolling_string_index(self): @@ -479,3 +490,57 @@ def test_groupby_rolling_index_level_and_column_label(self): ), ) tm.assert_frame_equal(result, expected) + + def test_groupby_rolling_resulting_multiindex(self): + # a few different cases checking the created MultiIndex of the result + # https://github.com/pandas-dev/pandas/pull/38057 + + # grouping by 1 columns -> 2-level MI as result + df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4}) + result = df.groupby("b").rolling(3).mean() + expected_index = MultiIndex.from_tuples( + [(1, 0), (1, 2), (1, 4), (1, 6), (2, 1), (2, 3), (2, 5), (2, 7)], + names=["b", None], + ) + tm.assert_index_equal(result.index, expected_index) + + # grouping by 2 columns -> 3-level MI as result + df = DataFrame({"a": np.arange(12.0), "b": [1, 2] * 6, "c": [1, 2, 3, 4] * 3}) + result = df.groupby(["b", "c"]).rolling(2).sum() + expected_index = MultiIndex.from_tuples( + [ + (1, 1, 0), + (1, 1, 4), + (1, 1, 8), + (1, 3, 2), + (1, 3, 6), + (1, 3, 10), + (2, 2, 1), + (2, 2, 5), + (2, 2, 9), + (2, 4, 3), + (2, 4, 7), + (2, 4, 11), + ], + names=["b", "c", None], + ) + tm.assert_index_equal(result.index, expected_index) + + # grouping with 1 level on dataframe with 2-level MI -> 3-level MI as result + df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4, "c": [1, 2, 3, 4] * 2}) + df = df.set_index("c", append=True) + result = df.groupby("b").rolling(3).mean() + expected_index = MultiIndex.from_tuples( + [ + (1, 0, 1), + (1, 2, 3), + (1, 4, 1), + (1, 6, 3), + (2, 1, 2), + (2, 3, 4), + (2, 5, 2), + (2, 7, 4), + ], + names=["b", None, "c"], + ) + tm.assert_index_equal(result.index, expected_index)