-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
ENH: add Series.struct accessor for ArrowDtype[struct] #54977
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d47b6f9
f3d57c8
d3a7eb4
40103d5
6006e77
3fe9291
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
from pandas.core.arrays.arrow.accessors import StructAccessor | ||
from pandas.core.arrays.arrow.array import ArrowExtensionArray | ||
|
||
__all__ = ["ArrowExtensionArray"] | ||
__all__ = ["ArrowExtensionArray", "StructAccessor"] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,196 @@ | ||
"""Accessors for arrow-backed data.""" | ||
|
||
from __future__ import annotations | ||
|
||
from typing import TYPE_CHECKING | ||
|
||
from pandas.compat import pa_version_under7p0 | ||
|
||
if not pa_version_under7p0: | ||
import pyarrow as pa | ||
import pyarrow.compute as pc | ||
|
||
from pandas.core.dtypes.dtypes import ArrowDtype | ||
|
||
if TYPE_CHECKING: | ||
from pandas import ( | ||
DataFrame, | ||
Series, | ||
) | ||
|
||
|
||
class StructAccessor:
    """
    Accessor object for structured data properties of the Series values.

    Parameters
    ----------
    data : Series
        Series containing Arrow struct data.

    Raises
    ------
    AttributeError
        If ``data`` does not have a pyarrow struct dtype.
    """

    _validation_msg = (
        "Can only use the '.struct' accessor with 'struct[pyarrow]' dtype, not {dtype}."
    )

    def __init__(self, data=None) -> None:
        self._parent = data
        self._validate(data)

    def _validate(self, data) -> None:
        """
        Ensure ``data`` has an Arrow struct dtype.

        Raises
        ------
        AttributeError
            If the dtype is not an ``ArrowDtype`` wrapping a pyarrow struct
            type (this includes objects without a ``dtype`` at all).
        """
        # ``getattr`` keeps the error message meaningful when ``data`` is None
        # or otherwise lacks a ``dtype``; the exception type is unchanged.
        dtype = getattr(data, "dtype", None)
        # The ``and`` short-circuits, so ``pa`` is only touched for ArrowDtype.
        is_struct = isinstance(dtype, ArrowDtype) and pa.types.is_struct(
            dtype.pyarrow_dtype
        )
        if not is_struct:
            # Raise AttributeError so that inspect can handle non-struct Series.
            raise AttributeError(self._validation_msg.format(dtype=dtype))

    @property
    def dtypes(self) -> Series:
        """
        Return the dtype object of each child field of the struct.

        Returns
        -------
        pandas.Series
            The data type of each child field.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )
        >>> s.struct.dtypes
        version     int64[pyarrow]
        project    string[pyarrow]
        dtype: object
        """
        from pandas import (
            Index,
            Series,
        )

        # Iterating the struct type yields its child fields; walk it once.
        child_fields = list(self._parent.dtype.pyarrow_dtype)
        types = [ArrowDtype(child.type) for child in child_fields]
        names = [child.name for child in child_fields]
        return Series(types, index=Index(names))

    def field(self, name_or_index: str | int) -> Series:
        """
        Extract a child field of a struct as a Series.

        Parameters
        ----------
        name_or_index : str | int
            Name or index of the child field to extract.

        Returns
        -------
        pandas.Series
            The data corresponding to the selected child field.

        Raises
        ------
        ValueError
            If ``name_or_index`` is neither an int nor a str, or if it is a
            name that does not resolve to a single child field.

        See Also
        --------
        Series.struct.explode : Return all child fields as a DataFrame.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        Extract by field name.

        >>> s.struct.field("project")
        0    pandas
        1    pandas
        2     numpy
        Name: project, dtype: string[pyarrow]

        Extract by field index.

        >>> s.struct.field(0)
        0    1
        1    2
        2    1
        Name: version, dtype: int64[pyarrow]
        """
        from pandas import Series

        # NOTE(review): pyarrow.compute.struct_field reportedly accepts richer
        # selectors than a single name or index. Supporting them was deferred
        # to a follow-up because deriving the resulting Series name for such
        # inputs is non-trivial.

        # Validate the argument before touching the backing array.
        if not isinstance(name_or_index, (int, str)):
            raise ValueError(
                "name_or_index must be an int or str, "
                f"got {type(name_or_index).__name__}"
            )

        pa_arr = self._parent.array._pa_array
        if isinstance(name_or_index, str):
            index = pa_arr.type.get_field_index(name_or_index)
            if index < 0:
                # A negative result means the lookup failed. Without this
                # guard the -1 would be used as a position below and the
                # eventual error would not mention the requested name.
                raise ValueError(
                    f"no unique field named {name_or_index!r} "
                    f"in struct type {pa_arr.type}"
                )
        else:
            index = name_or_index

        pa_field = pa_arr.type[index]
        field_arr = pc.struct_field(pa_arr, [index])
        return Series(
            field_arr,
            dtype=ArrowDtype(field_arr.type),
            index=self._parent.index,
            name=pa_field.name,
        )

    def explode(self) -> DataFrame:
        """
        Extract all child fields of a struct as a DataFrame.

        Returns
        -------
        pandas.DataFrame
            The data corresponding to all child fields. A struct without any
            child fields yields a DataFrame with no columns that keeps the
            index of the original Series.

        See Also
        --------
        Series.struct.field : Return a single child field as a Series.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        >>> s.struct.explode()
           version project
        0        1  pandas
        1        2  pandas
        2        1   numpy
        """
        from pandas import (
            DataFrame,
            concat,
        )

        num_fields = self._parent.dtype.pyarrow_dtype.num_fields
        if num_fields == 0:
            # ``concat`` raises on an empty list; a field-less struct simply
            # explodes to zero columns on the same index.
            return DataFrame(index=self._parent.index)

        return concat(
            [self.field(i) for i in range(num_fields)], axis="columns"
        )
Uh oh!
There was an error while loading. Please reload this page.