diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c3fbd3ee4853e..924cf360a35cc 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1299,6 +1299,7 @@ cdef class Seen: bint datetimetz_ # seen_datetimetz bint period_ # seen_period bint interval_ # seen_interval + bint str_ # seen_str def __cinit__(self, bint coerce_numeric=False): """ @@ -1325,6 +1326,7 @@ cdef class Seen: self.datetimetz_ = False self.period_ = False self.interval_ = False + self.str_ = False self.coerce_numeric = coerce_numeric cdef bint check_uint64_conflict(self) except -1: @@ -2615,6 +2617,13 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.object_ = True break + elif isinstance(val, str): + if convert_non_numeric: + seen.str_ = True + break + else: + seen.object_ = True + break else: seen.object_ = True break @@ -2669,6 +2678,19 @@ def maybe_convert_objects(ndarray[object] objects, return pi._data seen.object_ = True + elif seen.str_: + if is_string_array(objects, skipna=True): + from pandas._config import get_option + opt = get_option("future.infer_string") + if opt is True: + import pyarrow as pa + + from pandas.core.dtypes.dtypes import ArrowDtype + + dtype = ArrowDtype(pa.string()) + return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) + + seen.object_ = True elif seen.interval_: if is_interval_array(objects): from pandas import IntervalIndex diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 3f662073f0357..27e9bf8958ab0 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -889,3 +889,14 @@ def register_converter_cb(key) -> None: styler_environment, validator=is_instance_factory([type(None), str]), ) + + +with cf.config_prefix("future"): + cf.register_option( + "infer_string", + False, + "Whether to infer sequence of str objects as pyarrow string " + "dtype, which will be the default in pandas 3.0 " + "(at which point this option will be deprecated).", + validator=is_one_of_factory([True, False]), + ) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 09105bf49c050..9d2530ddc4e12 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -18,6 +18,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import lib from pandas._libs.missing import ( NA, @@ -796,6 +798,12 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: # coming out as np.str_! dtype = _dtype_obj + opt = get_option("future.infer_string") + if opt is True: + import pyarrow as pa + + pa_dtype = pa.string() + dtype = ArrowDtype(pa_dtype) elif isinstance(val, (np.datetime64, dt.datetime)): try: diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index c87f04efffcf4..b82dc98cd0210 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2670,6 +2670,41 @@ def test_construct_with_strings_and_none(self): expected = DataFrame({"a": ["1", "2", None]}, dtype="str") tm.assert_frame_equal(df, expected) + def test_frame_string_inference(self): + # GH#54430 + pa = pytest.importorskip("pyarrow") + dtype = pd.ArrowDtype(pa.string()) + expected = DataFrame( + {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) + ) + with pd.option_context("future.infer_string", True): + df = DataFrame({"a": ["a", "b"]}) + tm.assert_frame_equal(df, expected) + + expected = DataFrame( + {"a": ["a", "b"]}, + dtype=dtype, + columns=Index(["a"], dtype=dtype), + index=Index(["x", "y"], dtype=dtype), + ) + with pd.option_context("future.infer_string", True): + df = DataFrame({"a": ["a", "b"]}, index=["x", "y"]) + tm.assert_frame_equal(df, expected) + + expected = DataFrame( + {"a": ["a", 1]}, dtype="object", columns=Index(["a"], dtype=dtype) + ) + with pd.option_context("future.infer_string", True): + df = DataFrame({"a": ["a", 1]}) + tm.assert_frame_equal(df, expected) + + expected = DataFrame( + {"a": ["a", "b"]}, dtype="object", columns=Index(["a"], dtype=dtype) + ) + with pd.option_context("future.infer_string", True): + df = DataFrame({"a": ["a", "b"]}, dtype="object") + tm.assert_frame_equal(df, expected) + class TestDataFrameConstructorIndexInference: def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self): diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index cf8b7214f3b91..638124ac20e06 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -1,6 +1,7 @@ import numpy as np import pytest +import pandas as pd from pandas import ( Index, MultiIndex, @@ -42,3 +43,17 @@ def test_construct_empty_tuples(self, tuple_list): expected = MultiIndex.from_tuples(tuple_list) tm.assert_index_equal(result, expected) + + def test_index_string_inference(self): + # GH#54430 + pa = pytest.importorskip("pyarrow") + dtype = pd.ArrowDtype(pa.string()) + expected = Index(["a", "b"], dtype=dtype) + with pd.option_context("future.infer_string", True): + ser = Index(["a", "b"]) + tm.assert_index_equal(ser, expected) + + expected = Index(["a", 1], dtype="object") + with pd.option_context("future.infer_string", True): + ser = Index(["a", 1]) + tm.assert_index_equal(ser, expected) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 563f8005bfa72..ba6854c296841 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2094,3 +2094,20 @@ def test_pyarrow_engine_lines_false(): out = ser.to_json() with pytest.raises(ValueError, match="currently pyarrow engine only supports"): read_json(out, engine="pyarrow", lines=False) + + +def test_json_roundtrip_string_inference(orient): + pa = pytest.importorskip("pyarrow") + df = DataFrame( + [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"] + ) + out = df.to_json() + with pd.option_context("future.infer_string", True): + result = read_json(StringIO(out)) + expected = DataFrame( + [["a", "b"], ["c", "d"]], + dtype=pd.ArrowDtype(pa.string()), + index=pd.Index(["row 1", "row 2"], dtype=pd.ArrowDtype(pa.string())), + columns=pd.Index(["col 1", "col 2"], dtype=pd.ArrowDtype(pa.string())), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 915cc9a9a1f95..1a613c91880ea 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -538,3 +538,24 @@ def test_ea_int_avoid_overflow(all_parsers): } ) tm.assert_frame_equal(result, expected) + + +def test_string_inference(all_parsers): + # GH#54430 + pa = pytest.importorskip("pyarrow") + dtype = pd.ArrowDtype(pa.string()) + + data = """a,b +x,1 +y,2""" + parser = all_parsers + if parser.engine == "pyarrow": + pytest.skip("TODO: Follow up") + with pd.option_context("future.infer_string", True): + result = parser.read_csv(StringIO(data)) + + expected = DataFrame( + {"a": pd.Series(["x", "y"], dtype=dtype), "b": [1, 2]}, + columns=pd.Index(["a", "b"], dtype=dtype), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 6800e55396d7b..63ca91cc89ede 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2920,6 +2920,23 @@ def test_read_sql_dtype_backend_table(self, string_storage, func): # GH#50048 Not supported for sqlite pass + def test_read_sql_string_inference(self): + # GH#54430 + pa = pytest.importorskip("pyarrow") + table = "test" + df = DataFrame({"a": ["x", "y"]}) + df.to_sql(table, self.conn, index=False, if_exists="replace") + + with pd.option_context("future.infer_string", True): + result = read_sql_table(table, self.conn) + + dtype = pd.ArrowDtype(pa.string()) + expected = DataFrame( + {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) + ) + + tm.assert_frame_equal(result, expected) + @pytest.mark.db class TestMySQLAlchemy(_TestSQLAlchemy): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 9540d7a014409..b50b05faa523e 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2070,6 +2070,38 @@ def test_series_from_index_dtype_equal_does_not_copy(self): ser.iloc[0] = 100 tm.assert_index_equal(idx, expected) + def test_series_string_inference(self): + # GH#54430 + pa = pytest.importorskip("pyarrow") + dtype = pd.ArrowDtype(pa.string()) + expected = Series(["a", "b"], dtype=dtype) + with pd.option_context("future.infer_string", True): + ser = Series(["a", "b"]) + tm.assert_series_equal(ser, expected) + + expected = Series(["a", 1], dtype="object") + with pd.option_context("future.infer_string", True): + ser = Series(["a", 1]) + tm.assert_series_equal(ser, expected) + + @pytest.mark.parametrize("na_value", [None, np.nan, pd.NA]) + def test_series_string_with_na_inference(self, na_value): + # GH#54430 + pa = pytest.importorskip("pyarrow") + dtype = pd.ArrowDtype(pa.string()) + expected = Series(["a", na_value], dtype=dtype) + with pd.option_context("future.infer_string", True): + ser = Series(["a", na_value]) + tm.assert_series_equal(ser, expected) + + def test_series_string_inference_scalar(self): + # GH#54430 + pa = pytest.importorskip("pyarrow") + expected = Series("a", index=[1], dtype=pd.ArrowDtype(pa.string())) + with pd.option_context("future.infer_string", True): + ser = Series("a", index=[1]) + tm.assert_series_equal(ser, expected) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self):