### Pandas version checks

- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of pandas.
- I have confirmed this bug exists on the main branch of pandas.
### Reproducible Example

```python
import pandas as pd

td = pd.timedelta_range("1 Day", periods=3)
df = pd.DataFrame(columns=td)  # timedeltas as *column labels*
df.to_parquet('test.parquet', engine='pyarrow')    # writing succeeds
pd.read_parquet('test.parquet', engine='pyarrow')  # raises ValueError
```
### Issue Description

`read_parquet` raises a `ValueError`: `Cannot convert from timedelta64[ns] to timedelta64. Supported resolutions are 's', 'ms', 'us', 'ns'`. Traceback:
```
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
File /path/to/script.py:6
4 df = pd.DataFrame(columns=td)
5 df.to_parquet('test.parquet', engine='pyarrow')
----> 6 pd.read_parquet('test.parquet', engine='pyarrow')
File ~/.../site-packages/pandas/io/parquet.py:667, in read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, dtype_backend, filesystem, filters, **kwargs)
664 use_nullable_dtypes = False
665 check_dtype_backend(dtype_backend)
--> 667 return impl.read(
668 path,
669 columns=columns,
670 filters=filters,
671 storage_options=storage_options,
672 use_nullable_dtypes=use_nullable_dtypes,
673 dtype_backend=dtype_backend,
674 filesystem=filesystem,
675 **kwargs,
676 )
File ~/.../site-packages/pandas/io/parquet.py:281, in PyArrowImpl.read(self, path, columns, filters, use_nullable_dtypes, dtype_backend, storage_options, filesystem, **kwargs)
273 try:
274 pa_table = self.api.parquet.read_table(
275 path_or_handle,
276 columns=columns,
(...)
279 **kwargs,
280 )
--> 281 result = pa_table.to_pandas(**to_pandas_kwargs)
283 if manager == "array":
284 result = result._as_manager("array", copy=False)
File ~/.../site-packages/pyarrow/array.pxi:885, in pyarrow.lib._PandasConvertible.to_pandas()
File ~/.../site-packages/pyarrow/table.pxi:5002, in pyarrow.lib.Table._to_pandas()
File ~/.../site-packages/pyarrow/pandas_compat.py:781, in table_to_dataframe(options, table, categories, ignore_metadata, types_mapper)
778 ext_columns_dtypes = _get_extension_dtypes(table, [], types_mapper)
780 _check_data_column_metadata_consistency(all_columns)
--> 781 columns = _deserialize_column_index(table, all_columns, column_indexes)
783 column_names = table.column_names
784 result = pa.lib.table_to_blocks(options, table, categories,
785 list(ext_columns_dtypes.keys()))
File ~/.../site-packages/pyarrow/pandas_compat.py:919, in _deserialize_column_index(block_table, all_columns, column_indexes)
917 # if we're reconstructing the index
918 if len(column_indexes) > 0:
--> 919 columns = _reconstruct_columns_from_metadata(columns, column_indexes)
921 return columns
File ~/.../site-packages/pyarrow/pandas_compat.py:1122, in _reconstruct_columns_from_metadata(columns, column_indexes)
1120 level = _pandas_api.pd.Index([decimal.Decimal(i) for i in level])
1121 elif level.dtype != dtype:
-> 1122 level = level.astype(dtype)
1123 # ARROW-9096: if original DataFrame was upcast we keep that
1124 if level.dtype != numpy_dtype and pandas_dtype != "datetimetz":
File ~/.../site-packages/pandas/core/indexes/base.py:1097, in Index.astype(self, dtype, copy)
1093 new_values = cls._from_sequence(self, dtype=dtype, copy=copy)
1095 else:
1096 # GH#13149 specifically use astype_array instead of astype
-> 1097 new_values = astype_array(values, dtype=dtype, copy=copy)
1099 # pass copy=False because any copying will be done in the astype above
1100 result = Index(new_values, name=self.name, dtype=new_values.dtype, copy=False)
File ~/.../site-packages/pandas/core/dtypes/astype.py:182, in astype_array(values, dtype, copy)
179 values = values.astype(dtype, copy=copy)
181 else:
--> 182 values = _astype_nansafe(values, dtype, copy=copy)
184 # in pandas we don't store numpy str dtypes, so convert to object
185 if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str):
File ~/.../site-packages/pandas/core/dtypes/astype.py:122, in _astype_nansafe(arr, dtype, copy, skipna)
119 tdvals = array_to_timedelta64(arr).view("m8[ns]")
121 tda = ensure_wrapped_if_datetimelike(tdvals)
--> 122 return tda.astype(dtype, copy=False)._ndarray
124 if dtype.name in ("datetime64", "timedelta64"):
125 msg = (
126 f"The '{dtype.name}' dtype has no unit. Please pass in "
127 f"'{dtype.name}[ns]' instead."
128 )
File ~/.../site-packages/pandas/core/arrays/timedeltas.py:358, in TimedeltaArray.astype(self, dtype, copy)
354 return type(self)._simple_new(
355 res_values, dtype=res_values.dtype, freq=self.freq
356 )
357 else:
--> 358 raise ValueError(
359 f"Cannot convert from {self.dtype} to {dtype}. "
360 "Supported resolutions are 's', 'ms', 'us', 'ns'"
361 )
363 return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy)
ValueError: Cannot convert from timedelta64[ns] to timedelta64. Supported resolutions are 's', 'ms', 'us', 'ns'
```
Now, the odd thing that makes me believe this is a bug (and not a feature request) is that the same round-trip works with either `df = pd.DataFrame(data=td)` or even `df = pd.DataFrame(index=td)` (both sketched below).

BTW: `df.to_parquet('test.parquet', engine='fastparquet')` fails for all three DataFrames.
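For reference, a minimal sketch of the two variants that do round-trip cleanly with pyarrow (the file names here are arbitrary):

```python
import pandas as pd

td = pd.timedelta_range("1 Day", periods=3)

# timedeltas as column *values*: round-trips fine
pd.DataFrame(data=td).to_parquet('test_data.parquet', engine='pyarrow')
pd.read_parquet('test_data.parquet', engine='pyarrow')

# timedeltas as the row *index*: also round-trips fine
pd.DataFrame(index=td).to_parquet('test_index.parquet', engine='pyarrow')
pd.read_parquet('test_index.parquet', engine='pyarrow')
```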
### Expected Behavior

Using `td` for `columns` should work the same way as using it for the `index`.
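In the meantime, a possible workaround (just a sketch, assuming stringified labels are acceptable inside the file) is to avoid timedelta64 column labels when writing and restore them after reading:

```python
import pandas as pd

td = pd.timedelta_range("1 Day", periods=3)
df = pd.DataFrame(columns=td)

# Stringify the timedelta column labels before writing ...
df.columns = df.columns.astype(str)
df.to_parquet('test.parquet', engine='pyarrow')

# ... and parse them back into a TimedeltaIndex after reading.
out = pd.read_parquet('test.parquet', engine='pyarrow')
out.columns = pd.to_timedelta(out.columns)
```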
### Installed Versions

```
INSTALLED VERSIONS
commit : d9cdd2e
python : 3.11.9.final.0
python-bits : 64
OS : Linux
OS-release : 6.8.0-101041-tuxedo
Version : #41~22.04.1tux1 SMP PREEMPT_DYNAMIC Wed Aug 21 22:16:53 UTC 2024
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : de_DE.UTF-8
LOCALE : de_DE.UTF-8
pandas : 2.2.2
numpy : 2.1.0
pytz : 2024.1
dateutil : 2.9.0
setuptools : 73.0.1
pip : 24.2
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : None
IPython : 8.27.0
pandas_datareader : None
adbc-driver-postgresql: None
adbc-driver-sqlite : None
bs4 : None
bottleneck : None
dataframe-api-compat : None
fastparquet : 2024.5.0
fsspec : 2024.6.1
gcsfs : None
matplotlib : None
numba : None
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : 17.0.0
pyreadstat : None
python-calamine : None
pyxlsb : None
s3fs : None
scipy : None
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
zstandard : None
tzdata : 2024.1
qtpy : None
pyqt5 : None
```