Description
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of pandas.
- I have confirmed this bug exists on the master branch of pandas.
Reproducible Example
import pandas as pd
import pyarrow

def types_mapper(arrow_type):
    if pyarrow.types.is_boolean(arrow_type):
        return pd.BooleanDtype

pyarrow_array = pyarrow.array([True, None, False], type=pyarrow.bool_())
expected = pd.Series([True, None, False], dtype="boolean")
# Convert to RecordBatch because types_mapper argument is ignored when
# using a pyarrow.Array. https://issues.apache.org/jira/browse/ARROW-9664
record_batch = pyarrow.RecordBatch.from_arrays([pyarrow_array], ["test_col"])
dataframe = record_batch.to_pandas(date_as_object=False, types_mapper=types_mapper)
series = dataframe["test_col"]
pd.testing.assert_series_equal(series, expected, check_names=False)
Issue Description
_______________________________________________________________________ test_from_arrow _______________________________________________________________________
def test_from_arrow():
pyarrow = pytest.importorskip("pyarrow")
def types_mapper(arrow_type):
if pyarrow.types.is_boolean(arrow_type):
return pd.BooleanDtype
pyarrow_array = pyarrow.array([True, None, False], type=pyarrow.bool_())
expected = pd.Series([True, None, False], dtype="boolean")
# Convert to RecordBatch because types_mapper argument is ignored when
# using a pyarrow.Array. https://issues.apache.org/jira/browse/ARROW-9664
record_batch = pyarrow.RecordBatch.from_arrays([pyarrow_array], ["test_col"])
> dataframe = record_batch.to_pandas(date_as_object=False, types_mapper=types_mapper)
pandas/tests/extension/test_boolean.py:415:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
pyarrow/array.pxi:757: in pyarrow.lib._PandasConvertible.to_pandas
???
pyarrow/table.pxi:951: in pyarrow.lib.RecordBatch._to_pandas
???
pyarrow/table.pxi:1748: in pyarrow.lib.Table._to_pandas
???
/usr/local/Caskroom/miniconda/base/envs/pandas-dev/lib/python3.8/site-packages/pyarrow/pandas_compat.py:789: in table_to_blockmanager
blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes)
/usr/local/Caskroom/miniconda/base/envs/pandas-dev/lib/python3.8/site-packages/pyarrow/pandas_compat.py:1130: in _table_to_blocks
return [_reconstruct_block(item, columns, extension_columns)
/usr/local/Caskroom/miniconda/base/envs/pandas-dev/lib/python3.8/site-packages/pyarrow/pandas_compat.py:1130: in <listcomp>
return [_reconstruct_block(item, columns, extension_columns)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
item = {'placement': array([0]), 'py_array': <pyarrow.lib.ChunkedArray object at 0x7ffb60c6d6d0>
[
[
true,
null,
false
]
]}
columns = ['test_col'], extension_columns = {'test_col': <class 'pandas.core.arrays.boolean.BooleanDtype'>}
def _reconstruct_block(item, columns=None, extension_columns=None):
"""
Construct a pandas Block from the `item` dictionary coming from pyarrow's
serialization or returned by arrow::python::ConvertTableToPandas.
This function takes care of converting dictionary types to pandas
categorical, Timestamp-with-timezones to the proper pandas Block, and
conversion to pandas ExtensionBlock
Parameters
----------
item : dict
For basic types, this is a dictionary in the form of
{'block': np.ndarray of values, 'placement': pandas block placement}.
Additional keys are present for other types (dictionary, timezone,
object).
columns :
Column names of the table being constructed, used for extension types
extension_columns : dict
Dictionary of {column_name: pandas_dtype} that includes all columns
and corresponding dtypes that will be converted to a pandas
ExtensionBlock.
Returns
-------
pandas Block
"""
import pandas.core.internals as _int
block_arr = item.get('block', None)
placement = item['placement']
if 'dictionary' in item:
cat = _pandas_api.categorical_type.from_codes(
block_arr, categories=item['dictionary'],
ordered=item['ordered'])
block = _int.make_block(cat, placement=placement)
elif 'timezone' in item:
dtype = make_datetimetz(item['timezone'])
block = _int.make_block(block_arr, placement=placement,
klass=_int.DatetimeTZBlock,
dtype=dtype)
elif 'object' in item:
block = _int.make_block(builtin_pickle.loads(block_arr),
placement=placement)
elif 'py_array' in item:
# create ExtensionBlock
arr = item['py_array']
assert len(placement) == 1
name = columns[placement[0]]
pandas_dtype = extension_columns[name]
if not hasattr(pandas_dtype, '__from_arrow__'):
raise ValueError("This column does not support to be converted "
"to a pandas ExtensionArray")
> pd_ext_arr = pandas_dtype.__from_arrow__(arr)
E TypeError: __from_arrow__() missing 1 required positional argument: 'array'
/usr/local/Caskroom/miniconda/base/envs/pandas-dev/lib/python3.8/site-packages/pyarrow/pandas_compat.py:749: TypeError
Expected Behavior
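It should be possible to create a pandas Series with the "boolean" dtype from a boolean pyarrow array: `to_pandas` should succeed and the resulting column should equal `expected`.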
Installed Versions
INSTALLED VERSIONS
commit : 10e23e8
python : 3.8.10.final.0
python-bits : 64
OS : Darwin
OS-release : 20.6.0
Version : Darwin Kernel Version 20.6.0: Tue Oct 12 18:33:42 PDT 2021; root:xnu-7195.141.8~1/RELEASE_X86_64
machine : x86_64
processor : i386
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 1.4.0.dev0+1064.g10e23e882b.dirty
numpy : 1.21.4
pytz : 2021.3
dateutil : 2.8.2
pip : 21.3.1
setuptools : 57.4.0
Cython : 0.29.24
pytest : 6.2.5
hypothesis : 6.24.2
sphinx : 4.2.0
blosc : None
feather : None
xlsxwriter : 3.0.2
lxml.etree : 4.6.4
html5lib : 1.1
pymysql : None
psycopg2 : None
jinja2 : 3.0.2
IPython : 7.29.0
pandas_datareader: None
bs4 : 4.10.0
bottleneck : 1.3.2
fsspec : 2021.10.1
fastparquet : 0.7.1
gcsfs : 2021.10.1
matplotlib : 3.4.3
numexpr : 2.7.3
odfpy : None
openpyxl : 3.0.9
pandas_gbq : None
pyarrow : 5.0.0
pyxlsb : None
s3fs : 2021.10.1
scipy : 1.7.2
sqlalchemy : 1.4.26
tables : 3.6.1
tabulate : 0.8.9
xarray : 0.18.2
xlrd : 2.0.1
xlwt : 1.3.0
numba : 0.53.1