diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py
index 7eb43dbd074c9..fea96d861f12c 100644
--- a/pandas/core/interchange/column.py
+++ b/pandas/core/interchange/column.py
@@ -11,6 +11,7 @@
 
 import pandas as pd
 from pandas.api.types import is_string_dtype
+from pandas.core.arrays.arrow.dtype import ArrowDtype
 from pandas.core.interchange.buffer import PandasBuffer
 from pandas.core.interchange.dataframe_protocol import (
     Column,
@@ -134,8 +135,12 @@ def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]:
         if kind is None:
             # Not a NumPy dtype. Check if it's a categorical maybe
             raise ValueError(f"Data type {dtype} not supported by interchange protocol")
+        if isinstance(dtype, ArrowDtype):
+            byteorder = dtype.numpy_dtype.byteorder
+        else:
+            byteorder = dtype.byteorder
 
-        return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), dtype.byteorder
+        return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), byteorder
 
     @property
     def describe_categorical(self):
diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py
index 89599818d6814..70485956c1b31 100644
--- a/pandas/core/interchange/utils.py
+++ b/pandas/core/interchange/utils.py
@@ -13,10 +13,47 @@
 
 from pandas.core.dtypes.dtypes import CategoricalDtype
 
+from pandas.core.arrays.arrow.dtype import ArrowDtype
+
 if typing.TYPE_CHECKING:
     from pandas._typing import DtypeObj
 
 
+# Maps str(pyarrow.DataType) -> Arrow C data interface format string
+# (case matters: "s" is int16, "S" is uint16). No pyarrow API exposes this.
+PYARROW_CTYPES = {
+    "null": "n",
+    "bool": "b",
+    "uint8": "C",
+    "uint16": "S",
+    "uint32": "I",
+    "uint64": "L",
+    "int8": "c",
+    "int16": "s",
+    "int32": "i",
+    "int64": "l",
+    "halffloat": "e",  # float16
+    "float": "f",  # float32
+    "double": "g",  # float64
+    "string": "u",
+    "binary": "z",
+    "time32[s]": "tts",
+    "time32[ms]": "ttm",
+    "time64[us]": "ttu",
+    "time64[ns]": "ttn",
+    "date32[day]": "tdD",
+    "date64[ms]": "tdm",
+    "timestamp[s]": "tss:",
+    "timestamp[ms]": "tsm:",
+    "timestamp[us]": "tsu:",
+    "timestamp[ns]": "tsn:",
+    "duration[s]": "tDs",
+    "duration[ms]": "tDm",
+    "duration[us]": "tDu",
+    "duration[ns]": "tDn",
+}
+
+
 class ArrowCTypes:
     """
     Enum for Apache Arrow C type format strings.
@@ -77,6 +114,17 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str:
         return ArrowCTypes.INT64
     elif dtype == np.dtype("O"):
         return ArrowCTypes.STRING
+    elif isinstance(dtype, ArrowDtype):
+        import pyarrow as pa
+
+        pa_type = dtype.pyarrow_dtype
+        if pa.types.is_decimal(pa_type):
+            return f"d:{pa_type.precision},{pa_type.scale}"
+        elif pa.types.is_timestamp(pa_type) and pa_type.tz is not None:
+            return f"ts{pa_type.unit[0]}:{pa_type.tz}"
+        format_str = PYARROW_CTYPES.get(str(pa_type), None)
+        if format_str is not None:
+            return format_str
 
     format_str = getattr(ArrowCTypes, dtype.name.upper(), None)
     if format_str is not None:
diff --git a/pandas/tests/interchange/test_utils.py b/pandas/tests/interchange/test_utils.py
index 4fd42abb7f3f1..a47bc2752ff32 100644
--- a/pandas/tests/interchange/test_utils.py
+++ b/pandas/tests/interchange/test_utils.py
@@ -38,3 +38,52 @@
 def test_dtype_to_arrow_c_fmt(pandas_dtype, c_string):  # PR01
     """Test ``dtype_to_arrow_c_fmt`` utility function."""
     assert dtype_to_arrow_c_fmt(pandas_dtype) == c_string
+
+
+@pytest.mark.parametrize(
+    "pa_dtype, args_kwargs, c_string",
+    [
+        ["null", {}, "n"],
+        ["bool_", {}, "b"],
+        ["uint8", {}, "C"],
+        ["uint16", {}, "S"],
+        ["uint32", {}, "I"],
+        ["uint64", {}, "L"],
+        ["int8", {}, "c"],
+        ["int16", {}, "s"],
+        ["int32", {}, "i"],
+        ["int64", {}, "l"],
+        ["float16", {}, "e"],
+        ["float32", {}, "f"],
+        ["float64", {}, "g"],
+        ["string", {}, "u"],
+        ["binary", {}, "z"],
+        ["time32", ("s",), "tts"],
+        ["time32", ("ms",), "ttm"],
+        ["time64", ("us",), "ttu"],
+        ["time64", ("ns",), "ttn"],
+        ["date32", {}, "tdD"],
+        ["date64", {}, "tdm"],
+        ["timestamp", {"unit": "s"}, "tss:"],
+        ["timestamp", {"unit": "ms"}, "tsm:"],
+        ["timestamp", {"unit": "us"}, "tsu:"],
+        ["timestamp", {"unit": "ns"}, "tsn:"],
+        ["timestamp", {"unit": "ns", "tz": "UTC"}, "tsn:UTC"],
+        ["duration", ("s",), "tDs"],
+        ["duration", ("ms",), "tDm"],
+        ["duration", ("us",), "tDu"],
+        ["duration", ("ns",), "tDn"],
+        ["decimal128", {"precision": 4, "scale": 2}, "d:4,2"],
+    ],
+)
+def test_dtype_to_arrow_c_fmt_arrowdtype(pa_dtype, args_kwargs, c_string):
+    # GH 52323
+    pa = pytest.importorskip("pyarrow")
+    if not args_kwargs:
+        pa_type = getattr(pa, pa_dtype)()
+    elif isinstance(args_kwargs, tuple):
+        pa_type = getattr(pa, pa_dtype)(*args_kwargs)
+    else:
+        pa_type = getattr(pa, pa_dtype)(**args_kwargs)
+    arrow_type = pd.ArrowDtype(pa_type)
+    assert dtype_to_arrow_c_fmt(arrow_type) == c_string