Closed
Description
Code Sample
Consider the following three cases:
df1 = pd.DataFrame([
[1, 1, None, None, 30.0, None],
[2, 2, None, None, 30.0, None],
], columns=[u'ix1', u'ix2', u'col1', u'col2', u'col3', u'col4',]).set_index([u'ix1', 'ix2'])
df1.iloc[1:2].unstack('ix2')
df2 = pd.DataFrame([
[1, 1, None, None, 30.0],
[2, 2, None, None, 30.0],
], columns=[u'ix1', u'ix2', u'col1', u'col2', u'col3']).set_index([u'ix1', 'ix2'])
df2.iloc[1:2].unstack('ix2')
df3 = pd.DataFrame([
[1, 1, None, None, 30.0],
[2, None, None, None, 30.0],
], columns=[u'ix1', u'ix2', u'col1', u'col2', u'col3']).set_index([u'ix1', 'ix2'])
df3.iloc[1:2].unstack('ix2')
Problem description
When DataFrame().unstack()
is run on a subset of the DataFrame (i.e. the Index levels contain values that are not present in the subset), spurious errors are triggered.
We encountered the exception triggered by df1
in production while running groupby(A).apply(B),
where the B function performed an unstack.
While trying to build a reduced test case, we also ran into the issues shown by df2
and df3,
and found them worth reporting as well.
Note that none of these cases fail when the DataFrame has only 3 or fewer columns.
Expected Output
In all cases, I expect a pivoted DataFrame similar to this:
col1 col2 col3 col4
ix2 2 2 2 2
ix1
2 None None 30.0 None
Actual Output
# df1
IndexError Traceback (most recent call last)
<ipython-input-9-4c1b8dc12e05> in <module>()
3 [2, 2, None, None, 30.0, None],
4 ], columns=[u'ix1', u'ix2', u'col1', u'col2', u'col3', u'col4',]).set_index([u'ix1', 'ix2'])
----> 5 df1.iloc[1:2].unstack('ix2')
pandas/core/frame.pyc in unstack(self, level, fill_value)
4567 """
4568 from pandas.core.reshape.reshape import unstack
-> 4569 return unstack(self, level, fill_value)
4570
4571 _shared_docs['melt'] = ("""
pandas/core/reshape/reshape.pyc in unstack(obj, level, fill_value)
467 if isinstance(obj, DataFrame):
468 if isinstance(obj.index, MultiIndex):
--> 469 return _unstack_frame(obj, level, fill_value=fill_value)
470 else:
471 return obj.T.stack(dropna=False)
pandas/core/reshape/reshape.pyc in _unstack_frame(obj, level, fill_value)
480 unstacker = partial(_Unstacker, index=obj.index,
481 level=level, fill_value=fill_value)
--> 482 blocks = obj._data.unstack(unstacker)
483 klass = type(obj)
484 return klass(blocks)
pandas/core/internals.pyc in unstack(self, unstacker_func)
4349 new_columns = new_columns[columns_mask]
4350
-> 4351 bm = BlockManager(new_blocks, [new_columns, new_index])
4352 return bm
4353
pandas/core/internals.pyc in __init__(self, blocks, axes, do_integrity_check, fastpath)
3035 self._consolidate_check()
3036
-> 3037 self._rebuild_blknos_and_blklocs()
3038
3039 def make_empty(self, axes=None):
pandas/core/internals.pyc in _rebuild_blknos_and_blklocs(self)
3123 for blkno, blk in enumerate(self.blocks):
3124 rl = blk.mgr_locs
-> 3125 new_blknos[rl.indexer] = blkno
3126 new_blklocs[rl.indexer] = np.arange(len(rl))
3127
IndexError: index 7 is out of bounds for axis 1 with size 4
# df2
ValueError Traceback (most recent call last)
<ipython-input-4-b8998add9f5d> in <module>()
3 [2, 2, None, None, 30.0],
4 ], columns=[u'ix1', u'ix2', u'col1', u'col2', u'col3']).set_index([u'ix1', 'ix2'])
----> 5 df2.iloc[1:2].unstack('ix2')
pandas/core/frame.pyc in unstack(self, level, fill_value)
4567 """
4568 from pandas.core.reshape.reshape import unstack
-> 4569 return unstack(self, level, fill_value)
4570
4571 _shared_docs['melt'] = ("""
pandas/core/reshape/reshape.pyc in unstack(obj, level, fill_value)
467 if isinstance(obj, DataFrame):
468 if isinstance(obj.index, MultiIndex):
--> 469 return _unstack_frame(obj, level, fill_value=fill_value)
470 else:
471 return obj.T.stack(dropna=False)
pandas/core/reshape/reshape.pyc in _unstack_frame(obj, level, fill_value)
480 unstacker = partial(_Unstacker, index=obj.index,
481 level=level, fill_value=fill_value)
--> 482 blocks = obj._data.unstack(unstacker)
483 klass = type(obj)
484 return klass(blocks)
pandas/core/internals.pyc in unstack(self, unstacker_func)
4349 new_columns = new_columns[columns_mask]
4350
-> 4351 bm = BlockManager(new_blocks, [new_columns, new_index])
4352 return bm
4353
pandas/core/internals.pyc in __init__(self, blocks, axes, do_integrity_check, fastpath)
3035 self._consolidate_check()
3036
-> 3037 self._rebuild_blknos_and_blklocs()
3038
3039 def make_empty(self, axes=None):
pandas/core/internals.pyc in _rebuild_blknos_and_blklocs(self)
3124 rl = blk.mgr_locs
3125 new_blknos[rl.indexer] = blkno
-> 3126 new_blklocs[rl.indexer] = np.arange(len(rl))
3127
3128 if (new_blknos == -1).any():
ValueError: could not broadcast input array from shape (2) into shape (1)
# df3
AssertionError Traceback (most recent call last)
<ipython-input-5-20d7feb391ae> in <module>()
3 [2, None, None, None, 30.0],
4 ], columns=[u'ix1', u'ix2', u'col1', u'col2', u'col3']).set_index([u'ix1', 'ix2'])
----> 5 df3.iloc[1:2].unstack('ix2')
pandas/core/frame.pyc in unstack(self, level, fill_value)
4567 """
4568 from pandas.core.reshape.reshape import unstack
-> 4569 return unstack(self, level, fill_value)
4570
4571 _shared_docs['melt'] = ("""
pandas/core/reshape/reshape.pyc in unstack(obj, level, fill_value)
467 if isinstance(obj, DataFrame):
468 if isinstance(obj.index, MultiIndex):
--> 469 return _unstack_frame(obj, level, fill_value=fill_value)
470 else:
471 return obj.T.stack(dropna=False)
pandas/core/reshape/reshape.pyc in _unstack_frame(obj, level, fill_value)
480 unstacker = partial(_Unstacker, index=obj.index,
481 level=level, fill_value=fill_value)
--> 482 blocks = obj._data.unstack(unstacker)
483 klass = type(obj)
484 return klass(blocks)
pandas/core/internals.pyc in unstack(self, unstacker_func)
4349 new_columns = new_columns[columns_mask]
4350
-> 4351 bm = BlockManager(new_blocks, [new_columns, new_index])
4352 return bm
4353
pandas/core/internals.pyc in __init__(self, blocks, axes, do_integrity_check, fastpath)
3035 self._consolidate_check()
3036
-> 3037 self._rebuild_blknos_and_blklocs()
3038
3039 def make_empty(self, axes=None):
pandas/core/internals.pyc in _rebuild_blknos_and_blklocs(self)
3127
3128 if (new_blknos == -1).any():
-> 3129 raise AssertionError("Gaps in blk ref_locs")
3130
3131 self._blknos = new_blknos
AssertionError: Gaps in blk ref_locs
Output of pd.show_versions()
INSTALLED VERSIONS
------------------
commit: None
python: 2.7.14.final.0
python-bits: 64
OS: Linux
OS-release: 4.14.13-300.fc27.x86_64
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: None.None
pandas: 0.22.0
pytest: None
pip: 9.0.1
setuptools: 38.2.5
Cython: None
numpy: 1.14.0
scipy: 1.0.0
pyarrow: None
xarray: None
IPython: 5.5.0
sphinx: None
patsy: None
dateutil: 2.6.1
pytz: 2017.3
blosc: None
bottleneck: 1.2.1
tables: None
numexpr: 2.6.4
feather: None
matplotlib: 2.1.2
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: None
lxml: None
bs4: 4.6.0
html5lib: 1.0.1
sqlalchemy: 0.9.10
pymysql: 0.8.0
psycopg2: None
jinja2: 2.10
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: None