
BUG: To/read parquet with None values in partition cols fails #55400

Open
@TanguyLe

Description


Pandas version checks

  • I have checked that this issue has not already been reported.

  • I have confirmed this bug exists on the latest version of pandas.

  • I have confirmed this bug exists on the main branch of pandas.

Reproducible Example

import pandas as pd

df = pd.DataFrame(
    {
        "partition_col": [None, None, None],
    }
)
# Writing the partitioned dataset succeeds
df.to_parquet("test.parquet", partition_cols=["partition_col"])

# Reading it back with the default (pyarrow) engine raises
pd.read_parquet("test.parquet")
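
For reference, the to_parquet call above writes a directory rather than a single file. A minimal sketch of inspecting the layout, assuming pyarrow's Hive convention of writing null partition values under a __HIVE_DEFAULT_PARTITION__ directory (exact file names will differ):

import os

# Illustrative only: with an all-None partition column, every row should land
# under something like
#   test.parquet/partition_col=__HIVE_DEFAULT_PARTITION__/<some-file>.parquet
for root, dirs, files in os.walk("test.parquet"):
    print(root, files)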

Issue Description

The read_parquet call raises this:

---------------------------------------------------------------------------
ArrowInvalid                              Traceback (most recent call last)
Cell In[4], line 11
      9 df.to_parquet("test.parquet", partition_cols=["partition_col"])
     10 # This raises
---> 11 pd.read_parquet("test.parquet")

File /opt/conda/lib/python3.10/site-packages/pandas/io/parquet.py:670, in read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, dtype_backend, filesystem, filters, **kwargs)
    667     use_nullable_dtypes = False
    668 check_dtype_backend(dtype_backend)
--> 670 return impl.read(
    671     path,
    672     columns=columns,
    673     filters=filters,
    674     storage_options=storage_options,
    675     use_nullable_dtypes=use_nullable_dtypes,
    676     dtype_backend=dtype_backend,
    677     filesystem=filesystem,
    678     **kwargs,
    679 )

File /opt/conda/lib/python3.10/site-packages/pandas/io/parquet.py:272, in PyArrowImpl.read(self, path, columns, filters, use_nullable_dtypes, dtype_backend, storage_options, filesystem, **kwargs)
    265 path_or_handle, handles, filesystem = _get_path_or_handle(
    266     path,
    267     filesystem,
    268     storage_options=storage_options,
    269     mode="rb",
    270 )
    271 try:
--> 272     pa_table = self.api.parquet.read_table(
    273         path_or_handle,
    274         columns=columns,
    275         filesystem=filesystem,
    276         filters=filters,
    277         **kwargs,
    278     )
    279     result = pa_table.to_pandas(**to_pandas_kwargs)
    281     if manager == "array":

File /opt/conda/lib/python3.10/site-packages/pyarrow/parquet/core.py:2955, in read_table(source, columns, use_threads, metadata, schema, use_pandas_metadata, read_dictionary, memory_map, buffer_size, partitioning, filesystem, filters, use_legacy_dataset, ignore_prefixes, pre_buffer, coerce_int96_timestamp_unit, decryption_properties, thrift_string_size_limit, thrift_container_size_limit)
   2948     raise ValueError(
   2949         "The 'metadata' keyword is no longer supported with the new "
   2950         "datasets-based implementation. Specify "
   2951         "'use_legacy_dataset=True' to temporarily recover the old "
   2952         "behaviour."
   2953     )
   2954 try:
-> 2955     dataset = _ParquetDatasetV2(
   2956         source,
   2957         schema=schema,
   2958         filesystem=filesystem,
   2959         partitioning=partitioning,
   2960         memory_map=memory_map,
   2961         read_dictionary=read_dictionary,
   2962         buffer_size=buffer_size,
   2963         filters=filters,
   2964         ignore_prefixes=ignore_prefixes,
   2965         pre_buffer=pre_buffer,
   2966         coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
   2967         thrift_string_size_limit=thrift_string_size_limit,
   2968         thrift_container_size_limit=thrift_container_size_limit,
   2969     )
   2970 except ImportError:
   2971     # fall back on ParquetFile for simple cases when pyarrow.dataset
   2972     # module is not available
   2973     if filters is not None:

File /opt/conda/lib/python3.10/site-packages/pyarrow/parquet/core.py:2506, in _ParquetDatasetV2.__init__(self, path_or_paths, filesystem, filters, partitioning, read_dictionary, buffer_size, memory_map, ignore_prefixes, pre_buffer, coerce_int96_timestamp_unit, schema, decryption_properties, thrift_string_size_limit, thrift_container_size_limit, **kwargs)
   2502 if partitioning == "hive":
   2503     partitioning = ds.HivePartitioning.discover(
   2504         infer_dictionary=True)
-> 2506 self._dataset = ds.dataset(path_or_paths, filesystem=filesystem,
   2507                            schema=schema, format=parquet_format,
   2508                            partitioning=partitioning,
   2509                            ignore_prefixes=ignore_prefixes)

File /opt/conda/lib/python3.10/site-packages/pyarrow/dataset.py:773, in dataset(source, schema, format, filesystem, partitioning, partition_base_dir, exclude_invalid_files, ignore_prefixes)
    762 kwargs = dict(
    763     schema=schema,
    764     filesystem=filesystem,
   (...)
    769     selector_ignore_prefixes=ignore_prefixes
    770 )
    772 if _is_path_like(source):
--> 773     return _filesystem_dataset(source, **kwargs)
    774 elif isinstance(source, (tuple, list)):
    775     if all(_is_path_like(elem) for elem in source):

File /opt/conda/lib/python3.10/site-packages/pyarrow/dataset.py:466, in _filesystem_dataset(source, schema, filesystem, partitioning, format, partition_base_dir, exclude_invalid_files, selector_ignore_prefixes)
    458 options = FileSystemFactoryOptions(
    459     partitioning=partitioning,
    460     partition_base_dir=partition_base_dir,
    461     exclude_invalid_files=exclude_invalid_files,
    462     selector_ignore_prefixes=selector_ignore_prefixes
    463 )
    464 factory = FileSystemDatasetFactory(fs, paths_or_selector, format, options)
--> 466 return factory.finish(schema)

File /opt/conda/lib/python3.10/site-packages/pyarrow/_dataset.pyx:2941, in pyarrow._dataset.DatasetFactory.finish()

File /opt/conda/lib/python3.10/site-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()

File /opt/conda/lib/python3.10/site-packages/pyarrow/error.pxi:100, in pyarrow.lib.check_status()

ArrowInvalid: No non-null segments were available for field 'partition_col'; couldn't infer type
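
The failure comes out of pyarrow's Hive partitioning discovery (the HivePartitioning.discover(infer_dictionary=True) call visible in the traceback), which infers the partition field's type from the directory names and has no non-null segment to infer from here. A minimal sketch that appears to reproduce the same error without pandas, using the same discovery call:

import pyarrow.dataset as ds

# Same discovery step as in pyarrow/parquet/core.py above; this raises
# ArrowInvalid because every segment of 'partition_col' is the null placeholder.
ds.dataset(
    "test.parquet",
    format="parquet",
    partitioning=ds.HivePartitioning.discover(infer_dictionary=True),
)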

Expected Behavior

Considering that reading with the fastparquet engine works:

pd.read_parquet("test.parquet", engine="fastparquet")

and so does reading the dataset directly with pyarrow:

import pyarrow.dataset as ds

ds.dataset(source="test.parquet").to_table().to_pandas()

I would expect the default read_parquet to be able to read back a dataset that it wrote itself.
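
A possible workaround with the default pyarrow engine, assuming read_parquet forwards extra keyword arguments to pyarrow.parquet.read_table, is to pass an explicit partitioning schema so no type inference is needed (pa.string() here is only an assumption about the desired dtype of the partition column):

import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds

# Explicit Hive partitioning schema sidesteps the failing type inference.
partitioning = ds.partitioning(
    pa.schema([("partition_col", pa.string())]), flavor="hive"
)
pd.read_parquet("test.parquet", partitioning=partitioning)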

Installed Versions

INSTALLED VERSIONS

commit : e86ed37
python : 3.11.5.final.0
python-bits : 64
OS : Linux
OS-release : 5.10.179-168.710.amzn2.x86_64
Version : #1 SMP Mon May 22 23:10:22 UTC 2023
machine : x86_64
processor :
byteorder : little
LC_ALL : None
LANG : C.UTF-8
LOCALE : en_US.UTF-8

pandas : 2.1.1
numpy : 1.24.3
pytz : 2023.3.post1
dateutil : 2.8.2
setuptools : 68.2.2
pip : 23.2.1
Cython : None
pytest : 7.4.2
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : 2.9.8
jinja2 : 3.1.2
IPython : 8.15.0
pandas_datareader : None
bs4 : 4.12.2
bottleneck : None
dataframe-api-compat: None
fastparquet : None
fsspec : 2023.9.2
gcsfs : None
matplotlib : 3.8.0
numba : 0.58.0
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : 12.0.1
pyreadstat : None
pyxlsb : None
s3fs : 2023.9.2
scipy : 1.11.3
sqlalchemy : 2.0.21
tables : None
tabulate : 0.9.0
xarray : None
xlrd : None
zstandard : None
tzdata : 2023.3
qtpy : None
pyqt5 : None

    Labels

    Bug · IO Parquet (parquet, feather) · Needs Triage (Issue that has not been reviewed by a pandas team member)
