Pandas version checks
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of pandas.
- I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
import pandas as pd

df = pd.DataFrame(
    {
        "partition_col": [None, None, None],
    }
)
df.to_parquet("test.parquet", partition_cols=["partition_col"])
pd.read_parquet("test.parquet")  # This raises
Issue Description
Reading the dataset back with the default (pyarrow) engine raises:
---------------------------------------------------------------------------
ArrowInvalid Traceback (most recent call last)
Cell In[4], line 11
9 df.to_parquet("test.parquet", partition_cols=["partition_col"])
10 # This raises
---> 11 pd.read_parquet("test.parquet")
File /opt/conda/lib/python3.10/site-packages/pandas/io/parquet.py:670, in read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, dtype_backend, filesystem, filters, **kwargs)
667 use_nullable_dtypes = False
668 check_dtype_backend(dtype_backend)
--> 670 return impl.read(
671 path,
672 columns=columns,
673 filters=filters,
674 storage_options=storage_options,
675 use_nullable_dtypes=use_nullable_dtypes,
676 dtype_backend=dtype_backend,
677 filesystem=filesystem,
678 **kwargs,
679 )
File /opt/conda/lib/python3.10/site-packages/pandas/io/parquet.py:272, in PyArrowImpl.read(self, path, columns, filters, use_nullable_dtypes, dtype_backend, storage_options, filesystem, **kwargs)
265 path_or_handle, handles, filesystem = _get_path_or_handle(
266 path,
267 filesystem,
268 storage_options=storage_options,
269 mode="rb",
270 )
271 try:
--> 272 pa_table = self.api.parquet.read_table(
273 path_or_handle,
274 columns=columns,
275 filesystem=filesystem,
276 filters=filters,
277 **kwargs,
278 )
279 result = pa_table.to_pandas(**to_pandas_kwargs)
281 if manager == "array":
File /opt/conda/lib/python3.10/site-packages/pyarrow/parquet/core.py:2955, in read_table(source, columns, use_threads, metadata, schema, use_pandas_metadata, read_dictionary, memory_map, buffer_size, partitioning, filesystem, filters, use_legacy_dataset, ignore_prefixes, pre_buffer, coerce_int96_timestamp_unit, decryption_properties, thrift_string_size_limit, thrift_container_size_limit)
2948 raise ValueError(
2949 "The 'metadata' keyword is no longer supported with the new "
2950 "datasets-based implementation. Specify "
2951 "'use_legacy_dataset=True' to temporarily recover the old "
2952 "behaviour."
2953 )
2954 try:
-> 2955 dataset = _ParquetDatasetV2(
2956 source,
2957 schema=schema,
2958 filesystem=filesystem,
2959 partitioning=partitioning,
2960 memory_map=memory_map,
2961 read_dictionary=read_dictionary,
2962 buffer_size=buffer_size,
2963 filters=filters,
2964 ignore_prefixes=ignore_prefixes,
2965 pre_buffer=pre_buffer,
2966 coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
2967 thrift_string_size_limit=thrift_string_size_limit,
2968 thrift_container_size_limit=thrift_container_size_limit,
2969 )
2970 except ImportError:
2971 # fall back on ParquetFile for simple cases when pyarrow.dataset
2972 # module is not available
2973 if filters is not None:
File /opt/conda/lib/python3.10/site-packages/pyarrow/parquet/core.py:2506, in _ParquetDatasetV2.__init__(self, path_or_paths, filesystem, filters, partitioning, read_dictionary, buffer_size, memory_map, ignore_prefixes, pre_buffer, coerce_int96_timestamp_unit, schema, decryption_properties, thrift_string_size_limit, thrift_container_size_limit, **kwargs)
2502 if partitioning == "hive":
2503 partitioning = ds.HivePartitioning.discover(
2504 infer_dictionary=True)
-> 2506 self._dataset = ds.dataset(path_or_paths, filesystem=filesystem,
2507 schema=schema, format=parquet_format,
2508 partitioning=partitioning,
2509 ignore_prefixes=ignore_prefixes)
File /opt/conda/lib/python3.10/site-packages/pyarrow/dataset.py:773, in dataset(source, schema, format, filesystem, partitioning, partition_base_dir, exclude_invalid_files, ignore_prefixes)
762 kwargs = dict(
763 schema=schema,
764 filesystem=filesystem,
(...)
769 selector_ignore_prefixes=ignore_prefixes
770 )
772 if _is_path_like(source):
--> 773 return _filesystem_dataset(source, **kwargs)
774 elif isinstance(source, (tuple, list)):
775 if all(_is_path_like(elem) for elem in source):
File /opt/conda/lib/python3.10/site-packages/pyarrow/dataset.py:466, in _filesystem_dataset(source, schema, filesystem, partitioning, format, partition_base_dir, exclude_invalid_files, selector_ignore_prefixes)
458 options = FileSystemFactoryOptions(
459 partitioning=partitioning,
460 partition_base_dir=partition_base_dir,
461 exclude_invalid_files=exclude_invalid_files,
462 selector_ignore_prefixes=selector_ignore_prefixes
463 )
464 factory = FileSystemDatasetFactory(fs, paths_or_selector, format, options)
--> 466 return factory.finish(schema)
File /opt/conda/lib/python3.10/site-packages/pyarrow/_dataset.pyx:2941, in pyarrow._dataset.DatasetFactory.finish()
File /opt/conda/lib/python3.10/site-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()
File /opt/conda/lib/python3.10/site-packages/pyarrow/error.pxi:100, in pyarrow.lib.check_status()
ArrowInvalid: No non-null segments were available for field 'partition_col'; couldn't infer type
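
The failing frame is pyarrow's hive partitioning discovery (pyarrow/parquet/core.py:2503 in the traceback above). A minimal sketch that appears to reproduce the same error directly with pyarrow, bypassing pandas (assumes the test.parquet directory written by the example):

import pyarrow.dataset as ds

# Mirrors what read_table sets up internally for partitioning="hive":
# discovery with infer_dictionary=True finds no non-null segment to
# infer a type for 'partition_col' from, so factory.finish() raises.
partitioning = ds.HivePartitioning.discover(infer_dictionary=True)
ds.dataset("test.parquet", partitioning=partitioning)  # ArrowInvalid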
Expected Behavior
Considering that this works (with fastparquet installed):
pd.read_parquet("test.parquet", engine="fastparquet")
and that this also works (given import pyarrow.dataset as ds):
ds.dataset(source="test.parquet").to_table().to_pandas()
I would expect the default read_parquet (pyarrow engine) to be able to read back a dataset it wrote itself.
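
For what it's worth, here is a workaround sketch that sidesteps the inference (assuming pd.read_parquet forwards extra keywords through to pyarrow.parquet.read_table, as the traceback shows it does): pass an explicit partitioning schema so the partition field's type never has to be inferred from all-null segments. The string type here is an assumption, since the segments on disk carry no type information.

import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds

# Explicit hive partitioning schema; pa.string() is an assumed type,
# since nothing on disk records what 'partition_col' should be.
partitioning = ds.partitioning(
    pa.schema([("partition_col", pa.string())]), flavor="hive"
)
pd.read_parquet("test.parquet", partitioning=partitioning)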
Installed Versions
INSTALLED VERSIONS
commit : e86ed37
python : 3.11.5.final.0
python-bits : 64
OS : Linux
OS-release : 5.10.179-168.710.amzn2.x86_64
Version : #1 SMP Mon May 22 23:10:22 UTC 2023
machine : x86_64
processor :
byteorder : little
LC_ALL : None
LANG : C.UTF-8
LOCALE : en_US.UTF-8
pandas : 2.1.1
numpy : 1.24.3
pytz : 2023.3.post1
dateutil : 2.8.2
setuptools : 68.2.2
pip : 23.2.1
Cython : None
pytest : 7.4.2
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : 2.9.8
jinja2 : 3.1.2
IPython : 8.15.0
pandas_datareader : None
bs4 : 4.12.2
bottleneck : None
dataframe-api-compat: None
fastparquet : None
fsspec : 2023.9.2
gcsfs : None
matplotlib : 3.8.0
numba : 0.58.0
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : 12.0.1
pyreadstat : None
pyxlsb : None
s3fs : 2023.9.2
scipy : 1.11.3
sqlalchemy : 2.0.21
tables : None
tabulate : 0.9.0
xarray : None
xlrd : None
zstandard : None
tzdata : 2023.3
qtpy : None
pyqt5 : None