Skip to content

BUG: Summing empty DataFrame can convert to incompatible type #40669

Closed
@orbisvicis

Description

@orbisvicis

In DataFrame._reduce, is there any reason not to have a try/except clause around this conversion?

pandas/pandas/core/frame.py

Lines 9581 to 9584 in 029907c

if axis == 0 and len(self) == 0 and name in ["sum", "prod"]:
# Even if we are object dtype, follow numpy and return
# float64, see test_apply_funcs_over_empty
out = out.astype(np.float64)

Like you have here:

pandas/pandas/core/frame.py

Lines 9609 to 9613 in 029907c

try:
result = result.astype(np.float64)
except (ValueError, TypeError):
# try to coerce to the original dtypes item by item if we can
pass

Because then you have issues like this, which are very common in groupby (I mean empty DataFrames are common):

pd.DataFrame(columns=["Duration"], dtype="timedelta64[ns]").sum(skipna=False)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-2442-dd5151a2feed> in <module>
----> 1 pd.DataFrame(columns=["Duration"], dtype="timedelta64[ns]").sum(skipna=False)

.../lib/python3.8/site-packages/pandas/core/generic.py in sum(self, axis, skipna, level, numeric_only, min_count, **kwargs)
  11069             **kwargs,
  11070         ):
> 11071             return NDFrame.sum(
  11072                 self, axis, skipna, level, numeric_only, min_count, **kwargs
  11073             )

.../lib/python3.8/site-packages/pandas/core/generic.py in sum(self, axis, skipna, level, numeric_only, min_count, **kwargs)
  10789         **kwargs,
  10790     ):
> 10791         return self._min_count_stat_function(
  10792             "sum", nanops.nansum, axis, skipna, level, numeric_only, min_count, **kwargs
  10793         )

.../lib/python3.8/site-packages/pandas/core/generic.py in _min_count_stat_function(self, name, func, axis, skipna, level, numeric_only, min_count, **kwargs)
  10771                 name, axis=axis, level=level, skipna=skipna, min_count=min_count
  10772             )
> 10773         return self._reduce(
  10774             func,
  10775             name=name,

.../lib/python3.8/site-packages/pandas/core/frame.py in _reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
   8855                 # Even if we are object dtype, follow numpy and return
   8856                 #  float64, see test_apply_funcs_over_empty
-> 8857                 out = out.astype(np.float64)
   8858             return out
   8859

.../lib/python3.8/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors)
   5875         else:
   5876             # else, only a single dtype is given
-> 5877             new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
   5878             return self._constructor(new_data).__finalize__(self, method="astype")
   5879

.../lib/python3.8/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors)
    629         self, dtype, copy: bool = False, errors: str = "raise"
    630     ) -> "BlockManager":
--> 631         return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
    632
    633     def convert(

.../lib/python3.8/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs)
    425                     applied = b.apply(f, **kwargs)
    426                 else:
--> 427                     applied = getattr(b, f)(**kwargs)
    428             except (TypeError, NotImplementedError):
    429                 if not ignore_failures:

.../lib/python3.8/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors)
    671             vals1d = values.ravel()
    672             try:
--> 673                 values = astype_nansafe(vals1d, dtype, copy=True)
    674             except (ValueError, TypeError):
    675                 # e.g. astype_nansafe can fail on object-dtype of strings

.../lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
   1061             return arr.astype(TD64NS_DTYPE, copy=copy)
   1062
-> 1063         raise TypeError(f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]")
   1064
   1065     elif np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype, np.integer):

TypeError: cannot astype a timedelta from [timedelta64[ns]] to [float64]

And I don't think one should depend on the undocumented behavior of a negative min_count to bypass that coercion while guaranteeing the same output as if min_count were 0.

if (numeric_only is not None or axis == 0) and min_count == 0:

Especially with code like:

def _maybe_null_out(

This Stack Overflow question, "Sum Empty DataFrame without Converting to Incompatible Type", goes into more detail.

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions