From d78d3bb56f48b82eb0756cf10a83df8dbee9bd85 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 13 Jan 2023 15:57:03 +0100 Subject: [PATCH 01/12] ENH: Add global nullable option --- pandas/_config/__init__.py | 5 +++++ pandas/core/config_init.py | 16 ++++++++++++++ pandas/core/tools/numeric.py | 10 ++++++++- pandas/io/clipboards.py | 13 ++++++++++- pandas/io/excel/_base.py | 18 +++++++++++---- pandas/io/html.py | 11 ++++++++- pandas/io/orc.py | 14 ++++++++++-- pandas/io/parquet.py | 11 ++++++++- pandas/io/parsers/readers.py | 43 ++++++++++++++++++++++++++---------- pandas/io/sql.py | 38 +++++++++++++++++++++++-------- pandas/io/xml.py | 11 ++++++++- 11 files changed, 158 insertions(+), 32 deletions(-) diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index 5219abc697dbd..d12dd3b4cb8aa 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -33,3 +33,8 @@ def using_copy_on_write(): _mode_options = _global_config["mode"] return _mode_options["copy_on_write"] and _mode_options["data_manager"] == "block" + + +def using_nullable_dtypes(): + _mode_options = _global_config["mode"] + return _mode_options["nullable_dtypes"] diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index da9e7de9821b1..51491d47403ae 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -560,6 +560,22 @@ def use_inf_as_na_cb(key) -> None: validator=is_one_of_factory(["pandas", "pyarrow"]), ) + +nullable_dtypes_doc = """ +: bool + If nullable dtypes should be returned. This is only applicable to functions + where ``use_nullable_dtypes`` is implemented. +""" + +with cf.config_prefix("mode"): + cf.register_option( + "nullable_dtypes", + False, + nullable_dtypes_doc, + validator=is_bool, + ) + + # Set up the io.excel specific reader configuration. reader_engine_doc = """ : string diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index a8ae8c47b0d19..e5d7d70e03c9f 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -4,6 +4,8 @@ import numpy as np +from pandas._config import using_nullable_dtypes + from pandas._libs import lib from pandas._typing import ( DateTimeErrorChoices, @@ -36,7 +38,7 @@ def to_numeric( arg, errors: DateTimeErrorChoices = "raise", downcast: Literal["integer", "signed", "unsigned", "float"] | None = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, ): """ Convert argument to a numeric type. @@ -155,6 +157,12 @@ def to_numeric( if errors not in ("ignore", "raise", "coerce"): raise ValueError("invalid error value specified") + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.NoDefault + else using_nullable_dtypes() + ) + is_series = False is_index = False is_scalars = False diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 44bee11518cd3..495451f8e6aae 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -4,6 +4,9 @@ from io import StringIO import warnings +from pandas._config import using_nullable_dtypes + +from pandas._libs import lib from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.generic import ABCDataFrame @@ -15,7 +18,9 @@ def read_clipboard( - sep: str = r"\s+", use_nullable_dtypes: bool = False, **kwargs + sep: str = r"\s+", + use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, + **kwargs, ): # pragma: no cover r""" Read text from clipboard and pass to read_csv. @@ -56,6 +61,12 @@ def read_clipboard( if encoding is not None and encoding.lower().replace("-", "") != "utf8": raise NotImplementedError("reading from clipboard only supports utf-8 encoding") + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.NoDefault + else using_nullable_dtypes() + ) + from pandas.io.clipboard import clipboard_get from pandas.io.parsers import read_csv diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 6f706a4554855..9eaa0886ed371 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -23,8 +23,12 @@ ) import zipfile -from pandas._config import config +from pandas._config import ( + config, + using_nullable_dtypes, +) +from pandas._libs import lib from pandas._libs.parsers import STR_NA_VALUES from pandas._typing import ( DtypeArg, @@ -380,7 +384,7 @@ def read_excel( comment: str | None = ..., skipfooter: int = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> DataFrame: ... @@ -419,7 +423,7 @@ def read_excel( comment: str | None = ..., skipfooter: int = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> dict[IntStrT, DataFrame]: ... @@ -458,7 +462,7 @@ def read_excel( comment: str | None = None, skipfooter: int = 0, storage_options: StorageOptions = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, ) -> DataFrame | dict[IntStrT, DataFrame]: should_close = False @@ -471,6 +475,12 @@ def read_excel( "an ExcelFile - ExcelFile already has the engine set" ) + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.NoDefault + else using_nullable_dtypes() + ) + try: data = io.parse( sheet_name=sheet_name, diff --git a/pandas/io/html.py b/pandas/io/html.py index 7dcbd76b77b28..fd40375b50102 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -18,6 +18,9 @@ cast, ) +from pandas._config import using_nullable_dtypes + +from pandas._libs import lib from pandas._typing import ( FilePath, ReadBuffer, @@ -1043,7 +1046,7 @@ def read_html( keep_default_na: bool = True, displayed_only: bool = True, extract_links: Literal[None, "header", "footer", "body", "all"] = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = False, ) -> list[DataFrame]: r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. @@ -1213,6 +1216,12 @@ def read_html( ) validate_header_arg(header) + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.NoDefault + else using_nullable_dtypes() + ) + io = stringify_path(io) return _parse( diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 169cb5d16da8d..efd1785aee112 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -8,8 +8,12 @@ Literal, ) -from pandas._config import get_option +from pandas._config import ( + get_option, + using_nullable_dtypes, +) +from pandas._libs import lib from pandas._typing import ( FilePath, ReadBuffer, @@ -33,7 +37,7 @@ def read_orc( path: FilePath | ReadBuffer[bytes], columns: list[str] | None = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, **kwargs, ) -> DataFrame: """ @@ -86,6 +90,12 @@ def read_orc( orc = import_optional_dependency("pyarrow.orc") + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.NoDefault + else using_nullable_dtypes() + ) + with get_handle(path, "rb", is_text=False) as handles: orc_file = orc.ORCFile(handles.handle) pa_table = orc_file.read(columns=columns, **kwargs) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index cb66d1a422811..19df1f8a8bd82 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -9,6 +9,9 @@ ) from warnings import catch_warnings +from pandas._config import using_nullable_dtypes + +from pandas._libs import lib from pandas._typing import ( FilePath, ReadBuffer, @@ -453,7 +456,7 @@ def read_parquet( engine: str = "auto", columns: list[str] | None = None, storage_options: StorageOptions = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, **kwargs, ) -> DataFrame: """ @@ -511,6 +514,12 @@ def read_parquet( """ impl = get_engine(engine) + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.NoDefault + else using_nullable_dtypes() + ) + return impl.read( path, columns=columns, diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index b97c0161958fa..ffa0309b089c1 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -24,7 +24,10 @@ import numpy as np -from pandas._config import get_option +from pandas._config import ( + get_option, + using_nullable_dtypes, +) from pandas._libs import lib from pandas._libs.parsers import STR_NA_VALUES @@ -639,7 +642,7 @@ def read_csv( memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> TextFileReader: ... @@ -695,7 +698,7 @@ def read_csv( memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> TextFileReader: ... @@ -751,7 +754,7 @@ def read_csv( memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> DataFrame: ... @@ -807,7 +810,7 @@ def read_csv( memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> DataFrame | TextFileReader: ... @@ -879,7 +882,7 @@ def read_csv( memory_map: bool = False, float_precision: Literal["high", "legacy"] | None = None, storage_options: StorageOptions = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, ) -> DataFrame | TextFileReader: if infer_datetime_format is not lib.no_default: warnings.warn( @@ -904,6 +907,7 @@ def read_csv( on_bad_lines, names, defaults={"delimiter": ","}, + use_nullable_dtypes=use_nullable_dtypes, ) kwds.update(kwds_defaults) @@ -961,7 +965,7 @@ def read_table( memory_map: bool = ..., float_precision: str | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> TextFileReader: ... @@ -1017,7 +1021,7 @@ def read_table( memory_map: bool = ..., float_precision: str | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> TextFileReader: ... @@ -1073,7 +1077,7 @@ def read_table( memory_map: bool = ..., float_precision: str | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> DataFrame: ... @@ -1129,7 +1133,7 @@ def read_table( memory_map: bool = ..., float_precision: str | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> DataFrame | TextFileReader: ... @@ -1201,7 +1205,7 @@ def read_table( memory_map: bool = False, float_precision: str | None = None, storage_options: StorageOptions = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, ) -> DataFrame | TextFileReader: # locals() should never be modified kwds = locals().copy() @@ -1217,6 +1221,7 @@ def read_table( on_bad_lines, names, defaults={"delimiter": "\t"}, + use_nullable_dtypes=use_nullable_dtypes, ) kwds.update(kwds_defaults) @@ -1229,7 +1234,7 @@ def read_fwf( colspecs: Sequence[tuple[int, int]] | str | None = "infer", widths: Sequence[int] | None = None, infer_nrows: int = 100, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, **kwds, ) -> DataFrame | TextFileReader: r""" @@ -1292,6 +1297,12 @@ def read_fwf( if colspecs not in (None, "infer") and widths is not None: raise ValueError("You must specify only one of 'widths' and 'colspecs'") + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.NoDefault + else using_nullable_dtypes() + ) + # Compute 'colspecs' from 'widths', if specified. if widths is not None: colspecs, col = [], 0 @@ -1858,6 +1869,7 @@ def _refine_defaults_read( on_bad_lines: str | Callable, names: Sequence[Hashable] | None | lib.NoDefault, defaults: dict[str, Any], + use_nullable_dtypes: bool | lib.NoDefault, ): """Validate/refine default values of input parameters of read_csv, read_table. @@ -1971,6 +1983,13 @@ def _refine_defaults_read( else: raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.NoDefault + else using_nullable_dtypes() + ) + kwds["use_nullable_dtypes"] = use_nullable_dtypes + return kwds diff --git a/pandas/io/sql.py b/pandas/io/sql.py index eea97fc0d9760..83a45d8cc8039 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -29,6 +29,8 @@ import numpy as np +from pandas._config import using_nullable_dtypes + from pandas._libs import lib from pandas._typing import ( DateTimeErrorChoices, @@ -230,7 +232,7 @@ def read_sql_table( parse_dates: list[str] | dict[str, str] | None = ..., columns: list[str] | None = ..., chunksize: None = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> DataFrame: ... @@ -245,7 +247,7 @@ def read_sql_table( parse_dates: list[str] | dict[str, str] | None = ..., columns: list[str] | None = ..., chunksize: int = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> Iterator[DataFrame]: ... @@ -259,7 +261,7 @@ def read_sql_table( parse_dates: list[str] | dict[str, str] | None = None, columns: list[str] | None = None, chunksize: int | None = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, ) -> DataFrame | Iterator[DataFrame]: """ Read SQL database table into a DataFrame. @@ -322,6 +324,12 @@ def read_sql_table( -------- >>> pd.read_sql_table('table_name', 'postgres:///db_name') # doctest:+SKIP """ + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.NoDefault + else using_nullable_dtypes() + ) + with pandasSQL_builder(con, schema=schema) as pandas_sql: if not pandas_sql.has_table(table_name): raise ValueError(f"Table {table_name} not found") @@ -352,7 +360,7 @@ def read_sql_query( parse_dates: list[str] | dict[str, str] | None = ..., chunksize: None = ..., dtype: DtypeArg | None = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> DataFrame: ... @@ -367,7 +375,7 @@ def read_sql_query( parse_dates: list[str] | dict[str, str] | None = ..., chunksize: int = ..., dtype: DtypeArg | None = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> Iterator[DataFrame]: ... @@ -381,7 +389,7 @@ def read_sql_query( parse_dates: list[str] | dict[str, str] | None = None, chunksize: int | None = None, dtype: DtypeArg | None = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, ) -> DataFrame | Iterator[DataFrame]: """ Read SQL query into a DataFrame. @@ -446,6 +454,12 @@ def read_sql_query( Any datetime values with time zone information parsed via the `parse_dates` parameter will be converted to UTC. """ + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.NoDefault + else using_nullable_dtypes() + ) + with pandasSQL_builder(con) as pandas_sql: return pandas_sql.read_query( sql, @@ -469,7 +483,7 @@ def read_sql( parse_dates=..., columns: list[str] = ..., chunksize: None = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> DataFrame: ... @@ -484,7 +498,7 @@ def read_sql( parse_dates=..., columns: list[str] = ..., chunksize: int = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> Iterator[DataFrame]: ... @@ -498,7 +512,7 @@ def read_sql( parse_dates=None, columns: list[str] | None = None, chunksize: int | None = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, ) -> DataFrame | Iterator[DataFrame]: """ Read SQL query or database table into a DataFrame. @@ -621,6 +635,12 @@ def read_sql( 0 0 2012-11-10 1 1 2010-11-12 """ + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.NoDefault + else using_nullable_dtypes() + ) + with pandasSQL_builder(con) as pandas_sql: if isinstance(pandas_sql, SQLiteDatabase): diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 6ffa3356cc9de..ec9bfa46e1064 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -11,6 +11,9 @@ Sequence, ) +from pandas._config import using_nullable_dtypes + +from pandas._libs import lib from pandas._typing import ( TYPE_CHECKING, CompressionOptions, @@ -868,7 +871,7 @@ def read_xml( iterparse: dict[str, list[str]] | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, ) -> DataFrame: r""" Read XML document into a ``DataFrame`` object. @@ -1110,6 +1113,12 @@ def read_xml( 2 triangle 180 3.0 """ + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.NoDefault + else using_nullable_dtypes() + ) + return _parse( path_or_buffer=path_or_buffer, xpath=xpath, From 2081d9c650387aa0cb2b224b8f4f85b01da8aef9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 13 Jan 2023 16:30:06 +0100 Subject: [PATCH 02/12] Add tests --- pandas/io/html.py | 2 +- pandas/tests/io/excel/test_readers.py | 13 +++++++---- .../io/parser/dtypes/test_dtypes_basic.py | 16 ++++++++++++++ pandas/tests/io/parser/test_read_fwf.py | 13 +++++++++++ pandas/tests/io/test_clipboard.py | 17 ++++++++++++++ pandas/tests/io/test_html.py | 11 ++++++++++ pandas/tests/io/test_orc.py | 13 +++++++++++ pandas/tests/io/test_parquet.py | 21 ++++++++++++++++++ pandas/tests/io/test_sql.py | 22 ++++++++++++++----- pandas/tests/io/xml/test_xml.py | 18 +++++++++++++++ pandas/tests/tools/test_to_numeric.py | 9 ++++++++ 11 files changed, 144 insertions(+), 11 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index fd40375b50102..98f88edac201a 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -1046,7 +1046,7 @@ def read_html( keep_default_na: bool = True, displayed_only: bool = True, extract_links: Literal[None, "header", "footer", "body", "all"] = None, - use_nullable_dtypes: bool | lib.NoDefault = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, ) -> list[DataFrame]: r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 5899125ca2904..f194cadbc73d8 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -540,7 +540,8 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): "dtype_backend", ["pandas", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow"))], ) - def test_use_nullable_dtypes(self, read_ext, dtype_backend): + @pytest.mark.parametrize("option", [True, False]) + def test_use_nullable_dtypes(self, read_ext, dtype_backend, option): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") @@ -562,9 +563,13 @@ def test_use_nullable_dtypes(self, read_ext, dtype_backend): with tm.ensure_clean(read_ext) as file_path: df.to_excel(file_path, "test", index=False) with pd.option_context("mode.dtype_backend", dtype_backend): - result = pd.read_excel( - file_path, sheet_name="test", use_nullable_dtypes=True - ) + if not option: + result = pd.read_excel( + file_path, sheet_name="test", use_nullable_dtypes=True + ) + else: + with pd.option_context("mode.nullable_dtypes", True): + result = pd.read_excel(file_path, sheet_name="test") if dtype_backend == "pyarrow": import pyarrow as pa diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 8fd08122f0834..daec204a0e040 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -527,3 +527,19 @@ def test_use_nullable_dtypes_pyarrow_backend(all_parsers, request): } ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.usefixtures("pyarrow_xfail") +def test_use_nullable_dtypes_option(all_parsers): + # GH#99999 + + parser = all_parsers + + data = """a +1 +3 +""" + with pd.option_context("mode.nullable_dtypes", True): + result = parser.read_csv(StringIO(data)) + expected = DataFrame({"a": pd.Series([1, 3], dtype="Int64")}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 0dc8ee81278dd..facd828ff15d1 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -980,3 +980,16 @@ def test_use_nullable_dtypes(string_storage): } ) tm.assert_frame_equal(result, expected) + + +def test_use_nullable_dtypes_option(): + # GH#99999 + + data = """a +1 +3""" + with pd.option_context("mode.nullable_dtypes", True): + result = read_fwf(StringIO(data)) + + expected = DataFrame({"a": pd.Series([1, 3], dtype="Int64")}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index ae9c5aacf6e6b..0c55fbb1d4bdd 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -466,3 +466,20 @@ def test_read_clipboard_nullable_dtypes( expected["g"] = ArrowExtensionArray(pa.array([None, None])) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_read_clipboard_nullable_dtypes_option( + self, request, mock_clipboard, engine + ): + # GH#99999 + + text = """a +1 +2""" + mock_clipboard[request.node.name] = text + + with pd.option_context("mode.nullable_dtypes", True): + result = read_clipboard(sep=",", engine=engine) + + expected = DataFrame({"a": Series([1, 2], dtype="Int64")}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index f8284b5ab1c65..0d674b3019eb2 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -196,6 +196,17 @@ def test_use_nullable_dtypes(self, storage, dtype_backend): tm.assert_frame_equal(result, expected) + def test_use_nullable_dtypes_option(self): + # GH#99999 + df = DataFrame({"a": Series([1, np.nan, 3], dtype="Int64")}) + + out = df.to_html(index=False) + with pd.option_context("mode.nullable_dtypes", True): + result = self.read_html(out)[0] + + expected = DataFrame({"a": Series([1, np.nan, 3], dtype="Int64")}) + tm.assert_frame_equal(result, expected) + @pytest.mark.network @tm.network( url=( diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index a519d9536eb32..2d55ef2a5d924 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -383,3 +383,16 @@ def test_orc_use_nullable_dtypes_pandas_backend(): ) tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("pyarrow", min_version="7.0.0") +def test_orc_use_nullable_dtypes_option(): + # GH#99999 + df = pd.DataFrame({"int": list(range(1, 4))}) + + bytes_data = df.copy().to_orc() + with pd.option_context("mode.nullable_dtypes", True): + result = read_orc(BytesIO(bytes_data)) + + expected = pd.DataFrame({"int": pd.Series([1, 2, 3], dtype="Int64")}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 12a3801ef1344..b0efa4a265785 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -640,6 +640,27 @@ def test_use_nullable_dtypes(self, engine, request): expected = expected.drop("c", axis=1) tm.assert_frame_equal(result2, expected) + def test_use_nullable_dtypes_option(self, engine, request): + import pyarrow.parquet as pq + + if engine == "fastparquet": + # We are manually disabling fastparquet's + # nullable dtype support pending discussion + mark = pytest.mark.xfail( + reason="Fastparquet nullable dtype support is disabled" + ) + request.node.add_marker(mark) + + table = pyarrow.table({"a": pyarrow.array([1, 2, 3, None], "int64")}) + with tm.ensure_clean() as path: + # write manually with pyarrow to write integers + pq.write_table(table, path) + with pd.option_context("mode.nullable_dtypes", True): + result2 = read_parquet(path, engine=engine) + + expected = pd.DataFrame({"a": pd.array([1, 2, 3, None], dtype="Int64")}) + tm.assert_frame_equal(result2, expected) + @pytest.mark.parametrize( "dtype", [ diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index f83b6b0373a87..f3a63535e06c6 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2286,17 +2286,22 @@ def test_get_engine_auto_error_message(self): pass # TODO(GH#36893) fill this in when we add more engines + @pytest.mark.parametrize("option", [True, False]) @pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) - def test_read_sql_nullable_dtypes(self, string_storage, func): + def test_read_sql_nullable_dtypes(self, string_storage, func, option): # GH#50048 table = "test" df = self.nullable_data() df.to_sql(table, self.conn, index=False, if_exists="replace") with pd.option_context("mode.string_storage", string_storage): - result = getattr(pd, func)( - f"Select * from {table}", self.conn, use_nullable_dtypes=True - ) + if option: + with pd.option_context("mode.nullable_dtypes", True): + result = getattr(pd, func)(f"Select * from {table}", self.conn) + else: + result = getattr(pd, func)( + f"Select * from {table}", self.conn, use_nullable_dtypes=True + ) expected = self.nullable_expected(string_storage) tm.assert_frame_equal(result, expected) @@ -2311,15 +2316,20 @@ def test_read_sql_nullable_dtypes(self, string_storage, func): for result in iterator: tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("option", [True, False]) @pytest.mark.parametrize("func", ["read_sql", "read_sql_table"]) - def test_read_sql_nullable_dtypes_table(self, string_storage, func): + def test_read_sql_nullable_dtypes_table(self, string_storage, func, option): # GH#50048 table = "test" df = self.nullable_data() df.to_sql(table, self.conn, index=False, if_exists="replace") with pd.option_context("mode.string_storage", string_storage): - result = getattr(pd, func)(table, self.conn, use_nullable_dtypes=True) + if option: + with pd.option_context("mode.nullable_dtypes", True): + result = getattr(pd, func)(table, self.conn) + else: + result = getattr(pd, func)(table, self.conn, use_nullable_dtypes=True) expected = self.nullable_expected(string_storage) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index e3724f2a40409..e184f32e815ac 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1779,3 +1779,21 @@ def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend): expected["g"] = ArrowExtensionArray(pa.array([None, None])) tm.assert_frame_equal(result, expected) + + +def test_use_nullable_dtypes_option(parser): + # GH#99999 + + data = """ + + + 1 + + + 3 + + """ + with pd.option_context("mode.nullable_dtypes", True): + result = read_xml(data, parser=parser) + expected = DataFrame({"a": Series([1, 3], dtype="Int64")}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 1c0a8301d65cc..9473f3a69b4aa 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -831,6 +831,15 @@ def test_to_numeric_use_nullable_dtypes_na(val, dtype): tm.assert_series_equal(result, expected) +def test_to_numeric_use_nullable_dtypes_option(): + # GH#99999 + ser = Series([1, None], dtype=object) + with pd.option_context("mode.nullable_dtypes", True): + result = to_numeric(ser) + expected = Series([1, pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "val, dtype, downcast", [(1, "Int8", "integer"), (1.5, "Float32", "float"), (1, "Int8", "signed")], From 62e64ab07aad156407790e55635d511cc1085c08 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 13 Jan 2023 16:34:22 +0100 Subject: [PATCH 03/12] Add docs --- doc/source/whatsnew/v2.0.0.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index b22590759ea3f..50205acba73a4 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -47,6 +47,15 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following * :func:`read_orc` * :func:`to_numeric` +To simplify a global opt-in a new option ``nullable_dtypes`` was added that allows to set +the keyword argument globally to ``True`` if not specified directly. The option can be enabled +through: + +.. ipython:: python + pd.options.mode.nullable_dtypes = True + +The option will work only in context with the keyword ``use_nullable_dtypes``. + Additionally a new global configuration, ``mode.dtype_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions to select the nullable dtypes implementation. From 1e40255db892e08811e0c243fc5626b43b1fbe34 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 14 Jan 2023 20:23:47 +0100 Subject: [PATCH 04/12] Fix whatsnew --- doc/source/whatsnew/v2.0.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 50205acba73a4..b72d2705d0908 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -47,14 +47,14 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following * :func:`read_orc` * :func:`to_numeric` -To simplify a global opt-in a new option ``nullable_dtypes`` was added that allows to set +To simplify the global opt-in a new option ``nullable_dtypes`` was added that allows to set the keyword argument globally to ``True`` if not specified directly. The option can be enabled through: .. ipython:: python pd.options.mode.nullable_dtypes = True -The option will work only in context with the keyword ``use_nullable_dtypes``. +The option will only work in context with the keyword ``use_nullable_dtypes``. Additionally a new global configuration, ``mode.dtype_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions to select the nullable dtypes implementation. From aaba68ffa20435a5cb6aef53e99f0251a3251cbd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 14 Jan 2023 22:13:21 +0100 Subject: [PATCH 05/12] Fix mypy --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/tools/numeric.py | 12 ++++++------ pandas/io/clipboards.py | 4 ++-- pandas/io/excel/_base.py | 4 ++-- pandas/io/html.py | 4 ++-- pandas/io/orc.py | 4 ++-- pandas/io/parquet.py | 4 ++-- pandas/io/parsers/readers.py | 10 +++++----- pandas/io/sql.py | 12 ++++++------ pandas/io/xml.py | 4 ++-- 10 files changed, 30 insertions(+), 29 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index b72d2705d0908..dc668ffef0aae 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -52,6 +52,7 @@ the keyword argument globally to ``True`` if not specified directly. The option through: .. ipython:: python + pd.options.mode.nullable_dtypes = True The option will only work in context with the keyword ``use_nullable_dtypes``. diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index e5d7d70e03c9f..dcecd1a33f4c8 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -38,7 +38,7 @@ def to_numeric( arg, errors: DateTimeErrorChoices = "raise", downcast: Literal["integer", "signed", "unsigned", "float"] | None = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ): """ Convert argument to a numeric type. @@ -157,9 +157,9 @@ def to_numeric( if errors not in ("ignore", "raise", "coerce"): raise ValueError("invalid error value specified") - use_nullable_dtypes = ( + _use_nullable_dtypes = ( use_nullable_dtypes - if use_nullable_dtypes is not lib.NoDefault + if use_nullable_dtypes is not lib.no_default else using_nullable_dtypes() ) @@ -207,11 +207,11 @@ def to_numeric( values = ensure_object(values) coerce_numeric = errors not in ("ignore", "raise") try: - values, new_mask = lib.maybe_convert_numeric( # type: ignore[call-overload] + values, new_mask = lib.maybe_convert_numeric( values, set(), coerce_numeric=coerce_numeric, - convert_to_masked_nullable=use_nullable_dtypes, + convert_to_masked_nullable=_use_nullable_dtypes, ) except (ValueError, TypeError): if errors == "raise": @@ -221,7 +221,7 @@ def to_numeric( # Remove unnecessary values, is expected later anyway and enables # downcasting values = values[~new_mask] - elif use_nullable_dtypes and new_mask is None: + elif _use_nullable_dtypes and new_mask is None: new_mask = np.zeros(values.shape, dtype=np.bool_) # attempt downcast only if the data has been successfully converted diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 495451f8e6aae..0ba3846f415ad 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -19,7 +19,7 @@ def read_clipboard( sep: str = r"\s+", - use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, **kwargs, ): # pragma: no cover r""" @@ -63,7 +63,7 @@ def read_clipboard( use_nullable_dtypes = ( use_nullable_dtypes - if use_nullable_dtypes is not lib.NoDefault + if use_nullable_dtypes is not lib.no_default else using_nullable_dtypes() ) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 9eaa0886ed371..d44bdc466aed9 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -462,7 +462,7 @@ def read_excel( comment: str | None = None, skipfooter: int = 0, storage_options: StorageOptions = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | dict[IntStrT, DataFrame]: should_close = False @@ -477,7 +477,7 @@ def read_excel( use_nullable_dtypes = ( use_nullable_dtypes - if use_nullable_dtypes is not lib.NoDefault + if use_nullable_dtypes is not lib.no_default else using_nullable_dtypes() ) diff --git a/pandas/io/html.py b/pandas/io/html.py index 98f88edac201a..8a9339bd5b56f 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -1046,7 +1046,7 @@ def read_html( keep_default_na: bool = True, displayed_only: bool = True, extract_links: Literal[None, "header", "footer", "body", "all"] = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ) -> list[DataFrame]: r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. @@ -1218,7 +1218,7 @@ def read_html( use_nullable_dtypes = ( use_nullable_dtypes - if use_nullable_dtypes is not lib.NoDefault + if use_nullable_dtypes is not lib.no_default else using_nullable_dtypes() ) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index efd1785aee112..ccc7afe7ee0f7 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -37,7 +37,7 @@ def read_orc( path: FilePath | ReadBuffer[bytes], columns: list[str] | None = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, **kwargs, ) -> DataFrame: """ @@ -92,7 +92,7 @@ def read_orc( use_nullable_dtypes = ( use_nullable_dtypes - if use_nullable_dtypes is not lib.NoDefault + if use_nullable_dtypes is not lib.no_default else using_nullable_dtypes() ) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 19df1f8a8bd82..2a33ec8969838 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -456,7 +456,7 @@ def read_parquet( engine: str = "auto", columns: list[str] | None = None, storage_options: StorageOptions = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, **kwargs, ) -> DataFrame: """ @@ -516,7 +516,7 @@ def read_parquet( use_nullable_dtypes = ( use_nullable_dtypes - if use_nullable_dtypes is not lib.NoDefault + if use_nullable_dtypes is not lib.no_default else using_nullable_dtypes() ) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index ffa0309b089c1..410b4fc0bf9c0 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -882,7 +882,7 @@ def read_csv( memory_map: bool = False, float_precision: Literal["high", "legacy"] | None = None, storage_options: StorageOptions = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: if infer_datetime_format is not lib.no_default: warnings.warn( @@ -1205,7 +1205,7 @@ def read_table( memory_map: bool = False, float_precision: str | None = None, storage_options: StorageOptions = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: # locals() should never be modified kwds = locals().copy() @@ -1234,7 +1234,7 @@ def read_fwf( colspecs: Sequence[tuple[int, int]] | str | None = "infer", widths: Sequence[int] | None = None, infer_nrows: int = 100, - use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, **kwds, ) -> DataFrame | TextFileReader: r""" @@ -1299,7 +1299,7 @@ def read_fwf( use_nullable_dtypes = ( use_nullable_dtypes - if use_nullable_dtypes is not lib.NoDefault + if use_nullable_dtypes is not lib.no_default else using_nullable_dtypes() ) @@ -1985,7 +1985,7 @@ def _refine_defaults_read( use_nullable_dtypes = ( use_nullable_dtypes - if use_nullable_dtypes is not lib.NoDefault + if use_nullable_dtypes is not lib.no_default else using_nullable_dtypes() ) kwds["use_nullable_dtypes"] = use_nullable_dtypes diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 83a45d8cc8039..59663aab73bfa 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -261,7 +261,7 @@ def read_sql_table( parse_dates: list[str] | dict[str, str] | None = None, columns: list[str] | None = None, chunksize: int | None = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | Iterator[DataFrame]: """ Read SQL database table into a DataFrame. @@ -326,7 +326,7 @@ def read_sql_table( """ use_nullable_dtypes = ( use_nullable_dtypes - if use_nullable_dtypes is not lib.NoDefault + if use_nullable_dtypes is not lib.no_default else using_nullable_dtypes() ) @@ -389,7 +389,7 @@ def read_sql_query( parse_dates: list[str] | dict[str, str] | None = None, chunksize: int | None = None, dtype: DtypeArg | None = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | Iterator[DataFrame]: """ Read SQL query into a DataFrame. @@ -456,7 +456,7 @@ def read_sql_query( """ use_nullable_dtypes = ( use_nullable_dtypes - if use_nullable_dtypes is not lib.NoDefault + if use_nullable_dtypes is not lib.no_default else using_nullable_dtypes() ) @@ -512,7 +512,7 @@ def read_sql( parse_dates=None, columns: list[str] | None = None, chunksize: int | None = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | Iterator[DataFrame]: """ Read SQL query or database table into a DataFrame. @@ -637,7 +637,7 @@ def read_sql( """ use_nullable_dtypes = ( use_nullable_dtypes - if use_nullable_dtypes is not lib.NoDefault + if use_nullable_dtypes is not lib.no_default else using_nullable_dtypes() ) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index ec9bfa46e1064..de6b1b8fcfbf8 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -871,7 +871,7 @@ def read_xml( iterparse: dict[str, list[str]] | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.NoDefault, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ) -> DataFrame: r""" Read XML document into a ``DataFrame`` object. @@ -1115,7 +1115,7 @@ def read_xml( use_nullable_dtypes = ( use_nullable_dtypes - if use_nullable_dtypes is not lib.NoDefault + if use_nullable_dtypes is not lib.no_default else using_nullable_dtypes() ) From 91afaa1296d30e2549aea6c2bfc0e79e543fa048 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 15 Jan 2023 12:44:37 +0100 Subject: [PATCH 06/12] Fix gh ref --- pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 2 +- pandas/tests/io/parser/test_read_fwf.py | 2 +- pandas/tests/io/test_clipboard.py | 2 +- pandas/tests/io/test_html.py | 2 +- pandas/tests/io/test_orc.py | 2 +- pandas/tests/io/xml/test_xml.py | 2 +- pandas/tests/tools/test_to_numeric.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index daec204a0e040..84932fd4f6bc5 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -531,7 +531,7 @@ def test_use_nullable_dtypes_pyarrow_backend(all_parsers, request): @pytest.mark.usefixtures("pyarrow_xfail") def test_use_nullable_dtypes_option(all_parsers): - # GH#99999 + # GH#50748 parser = all_parsers diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index facd828ff15d1..e995e6e9af2ff 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -983,7 +983,7 @@ def test_use_nullable_dtypes(string_storage): def test_use_nullable_dtypes_option(): - # GH#99999 + # GH#50748 data = """a 1 diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 0c55fbb1d4bdd..5e4b2c1ebad9d 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -471,7 +471,7 @@ def test_read_clipboard_nullable_dtypes( def test_read_clipboard_nullable_dtypes_option( self, request, mock_clipboard, engine ): - # GH#99999 + # GH#50748 text = """a 1 diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 0d674b3019eb2..de36548f08a12 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -197,7 +197,7 @@ def test_use_nullable_dtypes(self, storage, dtype_backend): tm.assert_frame_equal(result, expected) def test_use_nullable_dtypes_option(self): - # GH#99999 + # GH#50748 df = DataFrame({"a": Series([1, np.nan, 3], dtype="Int64")}) out = df.to_html(index=False) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 2d55ef2a5d924..2a95240a5f83d 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -387,7 +387,7 @@ def test_orc_use_nullable_dtypes_pandas_backend(): @td.skip_if_no("pyarrow", min_version="7.0.0") def test_orc_use_nullable_dtypes_option(): - # GH#99999 + # GH#50748 df = pd.DataFrame({"int": list(range(1, 4))}) bytes_data = df.copy().to_orc() diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index e184f32e815ac..4daa28b164ace 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1782,7 +1782,7 @@ def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend): def test_use_nullable_dtypes_option(parser): - # GH#99999 + # GH#50748 data = """ diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 9473f3a69b4aa..3bd6a40ab3d59 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -832,7 +832,7 @@ def test_to_numeric_use_nullable_dtypes_na(val, dtype): def test_to_numeric_use_nullable_dtypes_option(): - # GH#99999 + # GH#50748 ser = Series([1, None], dtype=object) with pd.option_context("mode.nullable_dtypes", True): result = to_numeric(ser) From 3b64a9ebf37876fedc1325da0c4b63d63ef9ba7e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 15 Jan 2023 12:46:07 +0100 Subject: [PATCH 07/12] Add gh ref --- pandas/tests/io/test_parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index b0efa4a265785..0dfe2a7e01b8c 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -641,6 +641,7 @@ def test_use_nullable_dtypes(self, engine, request): tm.assert_frame_equal(result2, expected) def test_use_nullable_dtypes_option(self, engine, request): + # GH#50748 import pyarrow.parquet as pq if engine == "fastparquet": From 51f8906b7ef69e5f634b307cb9259fbbdc3a1df6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 17 Jan 2023 23:31:35 +0100 Subject: [PATCH 08/12] Add to new functions --- pandas/io/feather_format.py | 11 ++++++++++- pandas/io/json/_json.py | 11 ++++++++++- pandas/tests/io/json/test_pandas.py | 9 +++++++-- pandas/tests/io/test_feather.py | 9 +++++++-- 4 files changed, 34 insertions(+), 6 deletions(-) diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index cb2890777621a..136f49fef156e 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -6,6 +6,9 @@ Sequence, ) +from pandas._config import using_nullable_dtypes + +from pandas._libs import lib from pandas._typing import ( FilePath, ReadBuffer, @@ -103,7 +106,7 @@ def read_feather( columns: Sequence[Hashable] | None = None, use_threads: bool = True, storage_options: StorageOptions = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ): """ Load a feather-format object from the file path. @@ -143,6 +146,12 @@ def read_feather( import_optional_dependency("pyarrow") from pyarrow import feather + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.no_default + else using_nullable_dtypes() + ) + with get_handle( path, "rb", storage_options=storage_options, is_text=False ) as handles: diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index aa1342d0f135f..afb0be0729344 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -21,6 +21,9 @@ import numpy as np +from pandas._config import using_nullable_dtypes + +from pandas._libs import lib from pandas._libs.json import ( dumps, loads, @@ -496,7 +499,7 @@ def read_json( compression: CompressionOptions = "infer", nrows: int | None = None, storage_options: StorageOptions = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | Series | JsonReader: """ Convert a JSON string to pandas object. @@ -732,6 +735,12 @@ def read_json( if orient == "table" and convert_axes: raise ValueError("cannot pass both convert_axes and orient='table'") + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.no_default + else using_nullable_dtypes() + ) + if dtype is None and orient != "table": # error: Incompatible types in assignment (expression has type "bool", variable # has type "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float], diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 31566f67bef2c..7b473a56aa200 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1873,7 +1873,8 @@ def test_json_uint64(self): @pytest.mark.parametrize( "orient", ["split", "records", "values", "index", "columns"] ) - def test_read_json_nullable(self, string_storage, dtype_backend, orient): + @pytest.mark.parametrize("option", [True, False]) + def test_read_json_nullable(self, string_storage, dtype_backend, orient, option): # GH#50750 pa = pytest.importorskip("pyarrow") df = DataFrame( @@ -1900,7 +1901,11 @@ def test_read_json_nullable(self, string_storage, dtype_backend, orient): out = df.to_json(orient=orient) with pd.option_context("mode.string_storage", string_storage): with pd.option_context("mode.dtype_backend", dtype_backend): - result = read_json(out, use_nullable_dtypes=True, orient=orient) + if option: + with pd.option_context("mode.nullable_dtypes", option): + result = read_json(out, orient=orient) + else: + result = read_json(out, use_nullable_dtypes=True, orient=orient) expected = DataFrame( { diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 28a6054098a6f..7e07ad0ec2ad3 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -200,7 +200,8 @@ def test_http_path(self, feather_file): tm.assert_frame_equal(expected, res) @pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"]) - def test_read_json_nullable(self, string_storage, dtype_backend): + @pytest.mark.parametrize("option", [True, False]) + def test_read_json_nullable(self, string_storage, dtype_backend, option): # GH#50765 pa = pytest.importorskip("pyarrow") df = pd.DataFrame( @@ -228,7 +229,11 @@ def test_read_json_nullable(self, string_storage, dtype_backend): to_feather(df, path) with pd.option_context("mode.string_storage", string_storage): with pd.option_context("mode.dtype_backend", dtype_backend): - result = read_feather(path, use_nullable_dtypes=True) + if option: + with pd.option_context("mode.nullable_dtypes", option): + result = read_feather(path) + else: + result = read_feather(path, use_nullable_dtypes=True) expected = pd.DataFrame( { From 495d20295c8f720c8cc764a9d7ead8694b492c33 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 18 Jan 2023 22:22:17 +0100 Subject: [PATCH 09/12] Update doc/source/whatsnew/v2.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 3cbc3d242049c..f2746f815c96b 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -49,7 +49,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following * :func:`read_feather` * :func:`to_numeric` -To simplify the global opt-in a new option ``nullable_dtypes`` was added that allows to set +To simplify opting-in to nullable dtypes for these functions, a new option ``nullable_dtypes`` was added that allows setting the keyword argument globally to ``True`` if not specified directly. The option can be enabled through: From fe24a5740acdaec8767671f014747487e7593c74 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 18 Jan 2023 22:22:24 +0100 Subject: [PATCH 10/12] Update doc/source/whatsnew/v2.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index f2746f815c96b..dd09f070c8fda 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -57,7 +57,7 @@ through: pd.options.mode.nullable_dtypes = True -The option will only work in context with the keyword ``use_nullable_dtypes``. +The option will only work for functions with the keyword ``use_nullable_dtypes``. Additionally a new global configuration, ``mode.dtype_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions to select the nullable dtypes implementation. From 9bf13cb0f1bf77f5caf3fd7c55c65a2da95449d0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 18 Jan 2023 22:22:31 +0100 Subject: [PATCH 11/12] Update pandas/core/config_init.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/config_init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 51491d47403ae..2e1ddb3c0a628 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -564,7 +564,7 @@ def use_inf_as_na_cb(key) -> None: nullable_dtypes_doc = """ : bool If nullable dtypes should be returned. This is only applicable to functions - where ``use_nullable_dtypes`` is implemented. + where the ``use_nullable_dtypes`` keyword is implemented. """ with cf.config_prefix("mode"): From c575123a05330018a39d8e028d7f3595bc4bacf9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 23 Jan 2023 20:07:46 -0500 Subject: [PATCH 12/12] Update test_to_numeric.py --- pandas/tests/tools/test_to_numeric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 3fe3c273b5ef5..8b57bbe03f9e7 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -841,7 +841,7 @@ def test_to_numeric_use_nullable_dtypes_na(val, dtype): def test_to_numeric_use_nullable_dtypes_option(): # GH#50748 ser = Series([1, None], dtype=object) - with pd.option_context("mode.nullable_dtypes", True): + with option_context("mode.nullable_dtypes", True): result = to_numeric(ser) expected = Series([1, pd.NA], dtype="Int64") tm.assert_series_equal(result, expected)