Description
Code Sample, a copy-pastable example if possible
import pandas as pd
(
pd.DataFrame(
data=[[pd.Timestamp('2019-11-11 09:00:00+00:00'), 'x1', 'y1']],
columns=['timestamp', 'x', 'y']
)
.groupby(['x', 'y'])
.agg({'timestamp': lambda series: None})
)
Problem description
Potentially relates to #23683.
When upgrading from pandas 0.24.2 to 0.25.3, I noticed that some of my existing code suddenly started failing.
The above code isolates the issue: when grouping on more than one key and aggregating a timezone-aware Timestamp column, the `_try_cast()` method is called twice from inside the `_python_agg_general()` method. The first call transforms the object type from Series to DatetimeArray, and the second call fails because the new object doesn't have the `_values` attribute.
If the aggregated column is not of Timestamp dtype but, e.g., a string, the code runs just fine:
import pandas as pd
(
pd.DataFrame(
data=[['2019-11-11 09:00:00+00:00', 'x1', 'y1']],
columns=['timestamp', 'x', 'y']
)
.groupby(['x', 'y'])
.agg({'timestamp': lambda series: None})
)
If we keep the Timestamp dtype but remove the timezone, the code runs just fine:
import pandas as pd
(
pd.DataFrame(
data=[[pd.Timestamp('2019-11-11 09:00:00'), 'x1', 'y1']],
columns=['timestamp', 'x', 'y']
)
.groupby(['x', 'y'])
.agg({'timestamp': lambda series: None})
)
If we keep the Timestamp dtype and the timezone but group on a single key, the code runs just fine:
import pandas as pd
(
pd.DataFrame(
data=[[pd.Timestamp('2019-11-11 09:00:00+00:00'), 'x1', 'y1']],
columns=['timestamp', 'x', 'y']
)
.groupby(['x'])
.agg({'timestamp': lambda series: None})
)
Running the initial code sample produces the following exception:
AttributeError Traceback (most recent call last)
<ipython-input-130-fb69a372d092> in <module>()
8 )
9 .groupby(['x', 'y'])
---> 10 .agg({'timestamp': lambda series: None})
11 )
~/python/anaconda3/envs/env1/lib/python3.6/site-packages/pandas/core/groupby/generic.py in aggregate(self, arg, *args, **kwargs)
1453 @Appender(_shared_docs["aggregate"])
1454 def aggregate(self, arg=None, *args, **kwargs):
-> 1455 return super().aggregate(arg, *args, **kwargs)
1456
1457 agg = aggregate
~/python/anaconda3/envs/env1/lib/python3.6/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, *args, **kwargs)
227 func = _maybe_mangle_lambdas(func)
228
--> 229 result, how = self._aggregate(func, _level=_level, *args, **kwargs)
230 if how is None:
231 return result
~/python/anaconda3/envs/env1/lib/python3.6/site-packages/pandas/core/base.py in _aggregate(self, arg, *args, **kwargs)
504
505 try:
--> 506 result = _agg(arg, _agg_1dim)
507 except SpecificationError:
508
~/python/anaconda3/envs/env1/lib/python3.6/site-packages/pandas/core/base.py in _agg(arg, func)
454 result = OrderedDict()
455 for fname, agg_how in arg.items():
--> 456 result[fname] = func(fname, agg_how)
457 return result
458
~/python/anaconda3/envs/env1/lib/python3.6/site-packages/pandas/core/base.py in _agg_1dim(name, how, subset)
438 "nested dictionary is ambiguous " "in aggregation"
439 )
--> 440 return colg.aggregate(how, _level=(_level or 0) + 1)
441
442 def _agg_2dim(name, how):
~/python/anaconda3/envs/env1/lib/python3.6/site-packages/pandas/core/groupby/generic.py in aggregate(self, func_or_funcs, *args, **kwargs)
858
859 if self.grouper.nkeys > 1:
--> 860 return self._python_agg_general(func_or_funcs, *args, **kwargs)
861
862 try:
~/python/anaconda3/envs/env1/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _python_agg_general(self, func, *args, **kwargs)
916 values = ensure_float(values)
917
--> 918 output[name] = self._try_cast(values[mask], result)
919
920 return self._wrap_aggregated_output(output)
~/python/anaconda3/envs/env1/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _try_cast(self, result, obj, numeric_only)
805 # to the target timezone
806 try:
--> 807 result = obj._values._from_sequence(
808 result, dtype="datetime64[ns, UTC]"
809 )
AttributeError: 'DatetimeArray' object has no attribute '_values'
Expected Output
      timestamp
x  y
x1 y1       NaT
Output of pd.show_versions()
$ python -c 'import pandas as pd; pd.show_versions()'
INSTALLED VERSIONS
commit : None
python : 3.6.7.final.0
python-bits : 64
OS : Darwin
OS-release : 18.7.0
machine : x86_64
processor : i386
byteorder : little
LC_ALL : en_US.UTF-8
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 0.25.3
numpy : 1.17.2
pytz : 2018.5
dateutil : 2.7.3
pip : 18.0
setuptools : 40.4.0
Cython : None
pytest : 5.3.0
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : 1.1.2
lxml.etree : 4.3.2
html5lib : 1.0.1
pymysql : 0.9.2
psycopg2 : None
jinja2 : 2.10
IPython : 6.5.0
pandas_datareader: None
bs4 : 4.6.3
bottleneck : None
fastparquet : None
gcsfs : None
lxml.etree : 4.3.2
matplotlib : 3.0.0
numexpr : 2.7.0
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : 0.14.1
pytables : None
s3fs : None
scipy : 1.1.0
sqlalchemy : 1.2.11
tables : 3.5.2
xarray : None
xlrd : 1.1.0
xlwt : 1.3.0
xlsxwriter : 1.1.2