Skip to content

ENH: Support ArrowDtype in interchange Column.dtype #52792

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion pandas/core/interchange/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.core.arrays.arrow.dtype import ArrowDtype
from pandas.core.interchange.buffer import PandasBuffer
from pandas.core.interchange.dataframe_protocol import (
Column,
Expand Down Expand Up @@ -134,8 +135,12 @@ def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]:
if kind is None:
# Not a NumPy dtype. Check if it's a categorical maybe
raise ValueError(f"Data type {dtype} not supported by interchange protocol")
if isinstance(dtype, ArrowDtype):
byteorder = dtype.numpy_dtype.byteorder
else:
byteorder = dtype.byteorder

return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), dtype.byteorder
return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), byteorder

@property
def describe_categorical(self):
Expand Down
48 changes: 48 additions & 0 deletions pandas/core/interchange/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,47 @@

from pandas.core.dtypes.dtypes import CategoricalDtype

from pandas.core.arrays.arrow.dtype import ArrowDtype

if typing.TYPE_CHECKING:
from pandas._typing import DtypeObj


# Maps str(pyarrow.DataType) -> Arrow C data interface format string.
# Currently, no pyarrow API for this.
#
# Parametrised types whose format string depends on runtime values
# (decimals, timezone-aware timestamps) are not listed here.
#
# For the integer types the Arrow convention is: lowercase letter for the
# signed type, uppercase letter for the unsigned type of the same width.
PYARROW_CTYPES = {
    "null": "n",
    "bool": "b",
    "uint8": "C",
    "uint16": "S",
    "uint32": "I",
    "uint64": "L",
    "int8": "c",
    "int16": "s",  # lowercase: "S" is uint16
    "int32": "i",
    "int64": "l",
    "halffloat": "e",  # float16
    "float": "f",  # float32
    "double": "g",  # float64
    "string": "u",
    "binary": "z",
    "time32[s]": "tts",
    "time32[ms]": "ttm",
    "time64[us]": "ttu",
    "time64[ns]": "ttn",
    "date32[day]": "tdD",
    "date64[ms]": "tdm",
    "timestamp[s]": "tss:",
    "timestamp[ms]": "tsm:",
    "timestamp[us]": "tsu:",
    "timestamp[ns]": "tsn:",
    "duration[s]": "tDs",
    "duration[ms]": "tDm",
    "duration[us]": "tDu",
    "duration[ns]": "tDn",
}


class ArrowCTypes:
"""
Enum for Apache Arrow C type format strings.
Expand Down Expand Up @@ -77,6 +114,17 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str:
return ArrowCTypes.INT64
elif dtype == np.dtype("O"):
return ArrowCTypes.STRING
elif isinstance(dtype, ArrowDtype):
import pyarrow as pa

pa_type = dtype.pyarrow_dtype
if pa.types.is_decimal(pa_type):
return f"d:{pa_type.precision},{pa_type.scale}"
elif pa.types.is_timestamp(pa_type) and pa_type.tz is not None:
return f"ts{pa_type.unit[0]}:{pa_type.tz}"
format_str = PYARROW_CTYPES.get(str(pa_type), None)
if format_str is not None:
return format_str

format_str = getattr(ArrowCTypes, dtype.name.upper(), None)
if format_str is not None:
Expand Down
49 changes: 49 additions & 0 deletions pandas/tests/interchange/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,52 @@
def test_dtype_to_arrow_c_fmt(pandas_dtype, c_string): # PR01
"""Test ``dtype_to_arrow_c_fmt`` utility function."""
assert dtype_to_arrow_c_fmt(pandas_dtype) == c_string


@pytest.mark.parametrize(
    "pa_dtype, args_kwargs, c_string",
    [
        ["null", {}, "n"],
        ["bool_", {}, "b"],
        ["uint8", {}, "C"],
        ["uint16", {}, "S"],
        ["uint32", {}, "I"],
        ["uint64", {}, "L"],
        ["int8", {}, "c"],
        # Signed 16-bit is lowercase "s" in the Arrow C data interface;
        # uppercase "S" is reserved for uint16.
        ["int16", {}, "s"],
        ["int32", {}, "i"],
        ["int64", {}, "l"],
        ["float16", {}, "e"],
        ["float32", {}, "f"],
        ["float64", {}, "g"],
        ["string", {}, "u"],
        ["binary", {}, "z"],
        ["time32", ("s",), "tts"],
        ["time32", ("ms",), "ttm"],
        ["time64", ("us",), "ttu"],
        ["time64", ("ns",), "ttn"],
        ["date32", {}, "tdD"],
        ["date64", {}, "tdm"],
        ["timestamp", {"unit": "s"}, "tss:"],
        ["timestamp", {"unit": "ms"}, "tsm:"],
        ["timestamp", {"unit": "us"}, "tsu:"],
        ["timestamp", {"unit": "ns"}, "tsn:"],
        ["timestamp", {"unit": "ns", "tz": "UTC"}, "tsn:UTC"],
        ["duration", ("s",), "tDs"],
        ["duration", ("ms",), "tDm"],
        ["duration", ("us",), "tDu"],
        ["duration", ("ns",), "tDn"],
        ["decimal128", {"precision": 4, "scale": 2}, "d:4,2"],
    ],
)
def test_dtype_to_arrow_c_fmt_arrowdtype(pa_dtype, args_kwargs, c_string):
    """
    Test ``dtype_to_arrow_c_fmt`` with ``ArrowDtype`` inputs.

    ``pa_dtype`` is the name of a ``pyarrow`` type factory. ``args_kwargs``
    selects how the factory is called: an empty value means no arguments,
    a tuple is passed positionally and a dict is passed as keywords.
    """
    # GH 52323
    pa = pytest.importorskip("pyarrow")
    factory = getattr(pa, pa_dtype)
    if not args_kwargs:
        pa_type = factory()
    elif isinstance(args_kwargs, tuple):
        pa_type = factory(*args_kwargs)
    else:
        pa_type = factory(**args_kwargs)
    arrow_type = pd.ArrowDtype(pa_type)
    assert dtype_to_arrow_c_fmt(arrow_type) == c_string