Description
Code Sample (copy-pastable)
import pandas as pd
df1 = pd.DataFrame([["a", 1, 2, "05/29/2019"], ["a", 4, 5, "05/28/2019"], ["b", 2, 3, "05/27/2019"]], columns=["type", "num1", "num2", "date"]).assign(date=lambda df: pd.to_datetime(df["date"]))
df2 = pd.DataFrame(columns=["type", "num1", "num2", "date"]).assign(date=lambda df: pd.to_datetime(df["date"]))
groupbys = ["type", pd.Grouper(key="date", freq="1D")]
df1.groupby(groupbys).head()
df2.groupby(groupbys).head()
Problem description
The above code sample throws the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-1-ae6b527f8edf> in <module>
7
8 df1.groupby(groupbys).head()
----> 9 df2.groupby(groupbys).head()
~/miniconda3/envs/mbc/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in head(self, n)
2062 """
2063 self._reset_group_selection()
-> 2064 mask = self._cumcount_array() < n
2065 return self._selected_obj[mask]
2066
~/miniconda3/envs/mbc/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _cumcount_array(self, ascending)
730 (though the default is sort=True) for groupby in general
731 """
--> 732 ids, _, ngroups = self.grouper.group_info
733 sorter = get_group_index_sorter(ids, ngroups)
734 ids, count = ids[sorter], len(ids)
pandas/_libs/properties.pyx in pandas._libs.properties.CachedProperty.__get__()
~/miniconda3/envs/mbc/lib/python3.6/site-packages/pandas/core/groupby/ops.py in group_info(self)
250 @cache_readonly
251 def group_info(self):
--> 252 comp_ids, obs_group_ids = self._get_compressed_labels()
253
254 ngroups = len(obs_group_ids)
~/miniconda3/envs/mbc/lib/python3.6/site-packages/pandas/core/groupby/ops.py in _get_compressed_labels(self)
266
267 def _get_compressed_labels(self):
--> 268 all_labels = [ping.labels for ping in self.groupings]
269 if len(all_labels) > 1:
270 group_index = get_group_index(all_labels, self.shape,
~/miniconda3/envs/mbc/lib/python3.6/site-packages/pandas/core/groupby/ops.py in <listcomp>(.0)
266
267 def _get_compressed_labels(self):
--> 268 all_labels = [ping.labels for ping in self.groupings]
269 if len(all_labels) > 1:
270 group_index = get_group_index(all_labels, self.shape,
~/miniconda3/envs/mbc/lib/python3.6/site-packages/pandas/core/groupby/grouper.py in labels(self)
365 def labels(self):
366 if self._labels is None:
--> 367 self._make_labels()
368 return self._labels
369
~/miniconda3/envs/mbc/lib/python3.6/site-packages/pandas/core/groupby/grouper.py in _make_labels(self)
386 # we have a list of groupers
387 if isinstance(self.grouper, BaseGrouper):
--> 388 labels = self.grouper.label_info
389 uniques = self.grouper.result_index
390 else:
pandas/_libs/properties.pyx in pandas._libs.properties.CachedProperty.__get__()
~/miniconda3/envs/mbc/lib/python3.6/site-packages/pandas/core/groupby/ops.py in label_info(self)
261 labels, _, _ = self.group_info
262 if self.indexer is not None:
--> 263 sorter = np.lexsort((labels, self.indexer))
264 labels = labels[sorter]
265 return labels
ValueError: all keys need to be the same shape
The `pd.Grouper` object inside the list seems to be modified. This is not the behavior I expect. I can work around this by making explicit (deep) copies of the list, so that new instances of `pd.Grouper` are passed into both `groupby` calls, like so:
In [11]: import copy
In [12]: groupbys = ["type", pd.Grouper(key="date", freq="1D")]
In [13]: df1.groupby(copy.deepcopy(groupbys)).head()
Out[13]:
type num1 num2 date
0 a 1 2 2019-05-29
1 a 4 5 2019-05-28
2 b 2 3 2019-05-27
In [14]: df2.groupby(copy.deepcopy(groupbys)).head()
Out[14]:
Empty DataFrame
Columns: [type, num1, num2, date]
Index: []
Expected Output
As shown above, the expected output is the empty DataFrame instead of the `ValueError`. Interestingly, if I reverse the order and run `df2.groupby` first, then run `df1.groupby`, it works fine. However, running `df2.groupby` again after that throws the `ValueError`. There's definitely something in `df1.groupby` that is modifying the `pd.Grouper`.
Output of pd.show_versions()
[paste the output of `pd.show_versions()` here below this line]
INSTALLED VERSIONS
commit: None
python: 3.6.7.final.0
python-bits: 64
OS: Linux
OS-release: 4.15.0-50-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: en_US.UTF-8
pandas: 0.24.2
pytest: 4.4.0
pip: 19.0.3
setuptools: 40.8.0
Cython: None
numpy: 1.16.2
scipy: 1.2.1
pyarrow: None
xarray: None
IPython: 7.4.0
sphinx: None
patsy: None
dateutil: 2.8.0
pytz: 2018.9
blosc: None
bottleneck: None
tables: None
numexpr: None
feather: None
matplotlib: 3.0.3
openpyxl: None
xlrd: 1.2.0
xlwt: None
xlsxwriter: 1.1.6
lxml.etree: None
bs4: None
html5lib: None
sqlalchemy: 1.2.18
pymysql: None
psycopg2: 2.8.1 (dt dec pq3 ext lo64)
jinja2: 2.10.1
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: None
gcsfs: None
If this is expected behavior please let me know (we can close the issue). If this is not expected behavior, I'd love to take a crack at resolving this (any insight into the issue would be appreciated).