From 05470fce275800be45288f32bf7d2e8b56447061 Mon Sep 17 00:00:00 2001 From: heoh Date: Wed, 18 Jun 2025 01:02:34 +0900 Subject: [PATCH 1/4] BUG: Fix GroupBy aggregate coercion of outputs inconsistency for pyarrow dtypes (#61636) --- pandas/core/arrays/arrow/array.py | 13 +++++++++++++ pandas/core/arrays/string_arrow.py | 7 +++++++ 2 files changed, 20 insertions(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0b90bcea35100..7e7d7e8f83bc9 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -185,6 +185,7 @@ def floordiv_compat( ArrayLike, AxisInt, Dtype, + DtypeObj, FillnaOptions, InterpolateOptions, Iterator, @@ -313,6 +314,18 @@ def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: ) self._dtype = ArrowDtype(self._pa_array.type) + @classmethod + def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self: + try: + pa_array = cls._from_sequence(scalars, dtype=dtype) + except pa.ArrowNotImplementedError: + # _from_scalars should only raise ValueError or TypeError. 
+ raise ValueError + + if lib.infer_dtype(scalars, skipna=True) != lib.infer_dtype(pa_array, skipna=True): + raise ValueError + return pa_array + @classmethod def _from_sequence( cls, scalars, *, dtype: Dtype | None = None, copy: bool = False diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 9668981df827b..ac04d7eb1c751 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -53,6 +53,7 @@ from pandas._typing import ( ArrayLike, Dtype, + DtypeObj, NpDtype, Self, npt, @@ -180,6 +181,12 @@ def __len__(self) -> int: """ return len(self._pa_array) + @classmethod + def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self: + if lib.infer_dtype(scalars, skipna=True) not in ["string", "empty"]: + raise ValueError + return cls._from_sequence(scalars, dtype=dtype) + @classmethod def _from_sequence( cls, scalars, *, dtype: Dtype | None = None, copy: bool = False From 455435226878e479f7f7dbaf8563eb2554c9b903 Mon Sep 17 00:00:00 2001 From: heoh Date: Tue, 24 Jun 2025 03:38:51 +0900 Subject: [PATCH 2/4] Reformat code style --- pandas/core/arrays/arrow/array.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 7e7d7e8f83bc9..4680375db7dc5 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -316,13 +316,15 @@ def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: @classmethod def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self: + inferred_dtype = lib.infer_dtype(scalars, skipna=True) try: pa_array = cls._from_sequence(scalars, dtype=dtype) - except pa.ArrowNotImplementedError: + except pa.ArrowNotImplementedError as err: # _from_scalars should only raise ValueError or TypeError. 
- raise ValueError + raise ValueError from err - if lib.infer_dtype(scalars, skipna=True) != lib.infer_dtype(pa_array, skipna=True): + same_dtype = lib.infer_dtype(pa_array, skipna=True) == inferred_dtype + if not same_dtype: raise ValueError return pa_array From f4cd453c5d1c64feb9200b68c8a73aa8907ff023 Mon Sep 17 00:00:00 2001 From: heoh Date: Tue, 24 Jun 2025 03:39:09 +0900 Subject: [PATCH 3/4] Add test code --- pandas/tests/extension/test_arrow.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index fc5930ebcd8ac..1e96385c79d62 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3257,6 +3257,27 @@ def test_groupby_count_return_arrow_dtype(data_missing): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "func, func_dtype", + [ + [lambda x: x.to_dict(), "object"], + [lambda x: 1, "int64"], + [lambda x: "s", ArrowDtype(pa.string())], + ], +) +def test_groupby_aggregate_coersion(func, func_dtype): + # GH 61636 + df = pd.DataFrame( + { + "b": pd.array([0, 1]), + "c": pd.array(["X", "Y"], dtype=ArrowDtype(pa.string())), + }, + index=pd.Index(["A", "B"], name="a"), + ) + result = df.groupby("b").agg(func) + assert result["c"].dtype == func_dtype + + def test_fixed_size_list(): # GH#55000 ser = pd.Series( From f445ac57ad2663810290a582a1ef6d8feaa5a17f Mon Sep 17 00:00:00 2001 From: heoh Date: Tue, 24 Jun 2025 03:51:36 +0900 Subject: [PATCH 4/4] Update whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c04de6fd071b7..2de2785a5efb2 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -828,6 +828,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` were not keeping the index name when the index had :class:`ArrowDtype` timestamp 
dtype (:issue:`61222`) - Bug in :meth:`DataFrame.resample` changing index type to :class:`MultiIndex` when the dataframe is empty and using an upsample method (:issue:`55572`) - Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`) +- Bug in :meth:`DataFrameGroupBy.agg` when applied to columns with :class:`ArrowDtype`, where pandas attempted to cast the result back to the original dtype (:issue:`61636`) - Bug in :meth:`DataFrameGroupBy.agg` where applying a user-defined function to an empty DataFrame returned a Series instead of an empty DataFrame. (:issue:`61503`) - Bug in :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` for empty data frame with ``group_keys=False`` still creating output index using group keys. (:issue:`60471`) - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)