Skip to content

BUG: ArrowDtype.construct_from_string round-trip #50689

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions pandas/core/arrays/arrow/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,13 @@ def construct_from_string(cls, string: str) -> ArrowDtype:
except ValueError as err:
has_parameters = re.search(r"\[.*\]", base_type)
if has_parameters:
# Fallback to try common temporal types
try:
return cls._parse_temporal_dtype_string(base_type)
except (NotImplementedError, ValueError):
# Fall through to raise with nice exception message below
pass

raise NotImplementedError(
"Passing pyarrow type specific parameters "
f"({has_parameters.group()}) in the string is not supported. "
Expand All @@ -212,6 +219,35 @@ def construct_from_string(cls, string: str) -> ArrowDtype:
raise TypeError(f"'{base_type}' is not a valid pyarrow data type.") from err
return cls(pa_dtype)

# TODO(arrow#33642): This can be removed once supported by pyarrow
@classmethod
def _parse_temporal_dtype_string(cls, string: str) -> ArrowDtype:
"""
Construct a temporal ArrowDtype from string.
"""
# we assume
# 1) "[pyarrow]" has already been stripped from the end of our string.
# 2) we know "[" is present
head, tail = string.split("[", 1)

if not tail.endswith("]"):
raise ValueError
tail = tail[:-1]

if head == "timestamp":
assert "," in tail # otherwise type_for_alias should work
unit, tz = tail.split(",", 1)
unit = unit.strip()
tz = tz.strip()
if tz.startswith("tz="):
tz = tz[3:]

pa_type = pa.timestamp(unit, tz=tz)
dtype = cls(pa_type)
return dtype

raise NotImplementedError(string)

@property
def _is_numeric(self) -> bool:
"""
Expand Down
18 changes: 7 additions & 11 deletions pandas/tests/extension/base/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,6 @@ def test_kind(self, dtype):
valid = set("biufcmMOSUV")
assert dtype.kind in valid

def test_construct_from_string_own_name(self, dtype):
result = dtype.construct_from_string(dtype.name)
assert type(result) is type(dtype)

# check OK as classmethod
result = type(dtype).construct_from_string(dtype.name)
assert type(result) is type(dtype)

def test_is_dtype_from_name(self, dtype):
result = type(dtype).is_dtype(dtype.name)
assert result is True
Expand Down Expand Up @@ -97,9 +89,13 @@ def test_eq(self, dtype):
assert dtype == dtype.name
assert dtype != "anonther_type"

def test_construct_from_string(self, dtype):
dtype_instance = type(dtype).construct_from_string(dtype.name)
assert isinstance(dtype_instance, type(dtype))
def test_construct_from_string_own_name(self, dtype):
result = dtype.construct_from_string(dtype.name)
assert type(result) is type(dtype)

# check OK as classmethod
result = type(dtype).construct_from_string(dtype.name)
assert type(result) is type(dtype)

def test_construct_from_string_another_type_raises(self, dtype):
msg = f"Cannot construct a '{type(dtype).__name__}' from 'another_type'"
Expand Down
93 changes: 23 additions & 70 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,13 +250,9 @@ def test_astype_str(self, data, request):
class TestConstructors(base.BaseConstructorsTests):
def test_from_dtype(self, data, request):
pa_dtype = data.dtype.pyarrow_dtype
if (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz) or pa.types.is_string(
pa_dtype
):
if pa.types.is_string(pa_dtype):
reason = "ArrowDtype(pa.string()) != StringDtype('pyarrow')"
else:
reason = f"pyarrow.type_for_alias cannot infer {pa_dtype}"

if pa.types.is_string(pa_dtype):
reason = "ArrowDtype(pa.string()) != StringDtype('pyarrow')"
request.node.add_marker(
pytest.mark.xfail(
reason=reason,
Expand Down Expand Up @@ -604,65 +600,24 @@ def test_in_numeric_groupby(self, data_for_grouping):
class TestBaseDtype(base.BaseDtypeTests):
def test_construct_from_string_own_name(self, dtype, request):
pa_dtype = dtype.pyarrow_dtype
if pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None:
request.node.add_marker(
pytest.mark.xfail(
raises=NotImplementedError,
reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}",
)
)
elif pa.types.is_string(pa_dtype):
request.node.add_marker(
pytest.mark.xfail(
raises=TypeError,
reason=(
"Still support StringDtype('pyarrow') "
"over ArrowDtype(pa.string())"
),
)
)

if pa.types.is_string(pa_dtype):
# We still support StringDtype('pyarrow') over ArrowDtype(pa.string())
msg = r"string\[pyarrow\] should be constructed by StringDtype"
with pytest.raises(TypeError, match=msg):
dtype.construct_from_string(dtype.name)

return

super().test_construct_from_string_own_name(dtype)

def test_is_dtype_from_name(self, dtype, request):
pa_dtype = dtype.pyarrow_dtype
if pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None:
request.node.add_marker(
pytest.mark.xfail(
raises=NotImplementedError,
reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}",
)
)
elif pa.types.is_string(pa_dtype):
request.node.add_marker(
pytest.mark.xfail(
reason=(
"Still support StringDtype('pyarrow') "
"over ArrowDtype(pa.string())"
),
)
)
super().test_is_dtype_from_name(dtype)

def test_construct_from_string(self, dtype, request):
pa_dtype = dtype.pyarrow_dtype
if pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None:
request.node.add_marker(
pytest.mark.xfail(
raises=NotImplementedError,
reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}",
)
)
elif pa.types.is_string(pa_dtype):
request.node.add_marker(
pytest.mark.xfail(
raises=TypeError,
reason=(
"Still support StringDtype('pyarrow') "
"over ArrowDtype(pa.string())"
),
)
)
super().test_construct_from_string(dtype)
if pa.types.is_string(pa_dtype):
# We still support StringDtype('pyarrow') over ArrowDtype(pa.string())
assert not type(dtype).is_dtype(dtype.name)
else:
super().test_is_dtype_from_name(dtype)

def test_construct_from_string_another_type_raises(self, dtype):
msg = r"'another_type' must end with '\[pyarrow\]'"
Expand Down Expand Up @@ -753,13 +708,6 @@ def test_EA_types(self, engine, data, request):
request.node.add_marker(
pytest.mark.xfail(raises=TypeError, reason="GH 47534")
)
elif pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None:
request.node.add_marker(
pytest.mark.xfail(
raises=NotImplementedError,
reason=f"Parameterized types with tz={pa_dtype.tz} not supported.",
)
)
elif pa.types.is_timestamp(pa_dtype) and pa_dtype.unit in ("us", "ns"):
request.node.add_marker(
pytest.mark.xfail(
Expand Down Expand Up @@ -1266,7 +1214,12 @@ def test_invalid_other_comp(self, data, comparison_op):

def test_arrowdtype_construct_from_string_type_with_unsupported_parameters():
with pytest.raises(NotImplementedError, match="Passing pyarrow type"):
ArrowDtype.construct_from_string("timestamp[s, tz=UTC][pyarrow]")
ArrowDtype.construct_from_string("not_a_real_dype[s, tz=UTC][pyarrow]")

# but as of GH#50689, timestamptz is supported
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In a follow up could you make this it's own test? (Nit: doesn't really fit the name of the test"

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fair enough, will do

dtype = ArrowDtype.construct_from_string("timestamp[s, tz=UTC][pyarrow]")
expected = ArrowDtype(pa.timestamp("s", "UTC"))
assert dtype == expected


@pytest.mark.parametrize(
Expand Down