Skip to content

df.groupby('symbol')['status'].ffill() results in Exception: cannot handle a non-unique multi-index! #19437

Closed
@gregsifr

Description

@gregsifr

Code Sample, a copy-pastable example if possible

import pandas as pd

df = pd.DataFrame([['2018-01-01','MSFT','short'],
                  ['2018-01-01','MSFT','long'],
                  ['2018-01-01','MSFT',pd.np.NaN],
                  ['2018-01-01','AAPL','short'],
                  ['2018-01-01','AAPL',pd.np.NaN],
                  ['2018-01-02','AAPL','short'],
                  ['2018-01-01','TSLA','neutral'],
                  ['2018-01-02','TSLA',pd.np.NaN]],
                columns=['date','symbol','status'])

df['date'] = pd.to_datetime(df['date'])
df = df.set_index(['date','symbol'])
df.groupby('symbol')['status'].ffill()

Raised exception

---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-58-2fe0989150cf> in <module>()
     13 df['date'] = pd.to_datetime(df['date'])
     14 df = df.set_index(['date','symbol'])
---> 15 df.groupby('symbol')['status'].ffill()

/home/greg/anaconda2/lib/python2.7/site-packages/pandas/core/groupby.pyc in pad(self, limit)
   1362         DataFrame.fillna
   1363         """
-> 1364         return self.apply(lambda x: x.ffill(limit=limit))
   1365     ffill = pad
   1366 

/home/greg/anaconda2/lib/python2.7/site-packages/pandas/core/groupby.pyc in apply(self, func, *args, **kwargs)
   3110                       examples=_apply_docs['series_examples']))
   3111     def apply(self, func, *args, **kwargs):
-> 3112         return super(SeriesGroupBy, self).apply(func, *args, **kwargs)
   3113 
   3114     @Appender(_agg_doc)

/home/greg/anaconda2/lib/python2.7/site-packages/pandas/core/groupby.pyc in apply(self, func, *args, **kwargs)
    803         # ignore SettingWithCopy here in case the user mutates
    804         with option_context('mode.chained_assignment', None):
--> 805             return self._python_apply_general(f)
    806 
    807     def _python_apply_general(self, f):

/home/greg/anaconda2/lib/python2.7/site-packages/pandas/core/groupby.pyc in _python_apply_general(self, f)
    812             keys,
    813             values,
--> 814             not_indexed_same=mutated or self.mutated)
    815 
    816     def _iterate_slices(self):

/home/greg/anaconda2/lib/python2.7/site-packages/pandas/core/groupby.pyc in _wrap_applied_output(self, keys, values, not_indexed_same)
   3250         if isinstance(values[0], (Series, dict)):
   3251             return self._concat_objects(keys, values,
-> 3252                                         not_indexed_same=not_indexed_same)
   3253         elif isinstance(values[0], DataFrame):
   3254             # possible that Series -> DataFrame by applied function

/home/greg/anaconda2/lib/python2.7/site-packages/pandas/core/groupby.pyc in _concat_objects(self, keys, values, not_indexed_same)
    978 
    979             if isinstance(result, Series):
--> 980                 result = result.reindex(ax)
    981             else:
    982 

/home/greg/anaconda2/lib/python2.7/site-packages/pandas/core/series.pyc in reindex(self, index, **kwargs)
   2679     @Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs)
   2680     def reindex(self, index=None, **kwargs):
-> 2681         return super(Series, self).reindex(index=index, **kwargs)
   2682 
   2683     @Appender(generic._shared_docs['fillna'] % _shared_doc_kwargs)

/home/greg/anaconda2/lib/python2.7/site-packages/pandas/core/generic.pyc in reindex(self, *args, **kwargs)
   3021         # perform the reindex on the axes
   3022         return self._reindex_axes(axes, level, limit, tolerance, method,
-> 3023                                   fill_value, copy).__finalize__(self)
   3024 
   3025     def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,

/home/greg/anaconda2/lib/python2.7/site-packages/pandas/core/generic.pyc in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
   3034             ax = self._get_axis(a)
   3035             new_index, indexer = ax.reindex(labels, level=level, limit=limit,
-> 3036                                             tolerance=tolerance, method=method)
   3037 
   3038             axis = self._get_axis_number(a)

/home/greg/anaconda2/lib/python2.7/site-packages/pandas/core/indexes/multi.pyc in reindex(self, target, method, level, limit, tolerance)
   1901                                                tolerance=tolerance)
   1902                 else:
-> 1903                     raise Exception("cannot handle a non-unique multi-index!")
   1904 
   1905         if not isinstance(target, MultiIndex):

Exception: cannot handle a non-unique multi-index!

Problem description

Calling ffill() on the grouped column raises an exception instead of forward-filling the values. The index contains duplicate entries; the expectation is that missing values are filled within each group according to the order the rows are already in.

Duplicate index values are possible when using high frequency time intervals.

Expected Output

df = pd.DataFrame([['2018-01-01','MSFT','short'],
                  ['2018-01-01','MSFT','long'],
                  ['2018-01-01','MSFT','long'],
                  ['2018-01-01','AAPL','short'],
                  ['2018-01-01','AAPL','short'],
                  ['2018-01-02','AAPL','short'],
                  ['2018-01-01','TSLA','neutral'],
                  ['2018-01-02','TSLA','neutral']],
                columns=['date','symbol','status'])

df['date'] = pd.to_datetime(df['date'])

Output of pd.show_versions()

INSTALLED VERSIONS

commit: None
python: 2.7.14.final.0
python-bits: 64
OS: Linux
OS-release: 4.13.0-31-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_AU.UTF-8
LOCALE: None.None

pandas: 0.22.0
pytest: 3.2.1
pip: 9.0.1
setuptools: 36.5.0.post20170921
Cython: 0.26.1
numpy: 1.14.0
scipy: 0.19.1
pyarrow: None
xarray: None
IPython: 5.4.1
sphinx: 1.6.3
patsy: 0.4.1
dateutil: 2.6.1
pytz: 2017.3
blosc: None
bottleneck: 1.2.1
tables: 3.4.2
numexpr: 2.6.2
feather: None
matplotlib: 2.1.0
openpyxl: 2.4.8
xlrd: 1.1.0
xlwt: 1.3.0
xlsxwriter: 1.0.2
lxml: 4.1.0
bs4: 4.6.0
html5lib: 0.999999999
sqlalchemy: 1.1.13
pymysql: None
psycopg2: 2.7.3.2 (dt dec pq3 ext lo64)
jinja2: 2.9.6
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: None

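A similar failure also happens with axis=1: in the example below, groupby(level=0, axis=1).apply(...) raises "ValueError: cannot reindex from a duplicate axis".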

import pandas as pd
import numpy as np

df = pd.DataFrame(
    np.ones([6, 4], dtype=int),
    columns=pd.MultiIndex.from_product([['A', 'B'], [1, 2]])
)

(
    df
    .groupby(level=0, axis=1)
    .apply(
        lambda df: 2*df.xs(df.name, axis=1)
    )
)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-12-e7c022399f89> in <module>()
      3     .groupby(level=0, axis=1)
      4     .apply(
----> 5         lambda df: 2*df.xs(df.name, axis=1)
      6     )
      7 )

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/groupby.py in apply(self, func, *args, **kwargs)
    882         # ignore SettingWithCopy here in case the user mutates
    883         with option_context('mode.chained_assignment', None):
--> 884             return self._python_apply_general(f)
    885
    886     def _python_apply_general(self, f):

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/groupby.py in _python_apply_general(self, f)
    891             keys,
    892             values,
--> 893             not_indexed_same=mutated or self.mutated)
    894
    895     def _iterate_slices(self):

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/groupby.py in _wrap_applied_output(self, keys, values, not_indexed_same)
   3920         elif isinstance(v, DataFrame):
   3921             return self._concat_objects(keys, values,
-> 3922                                         not_indexed_same=not_indexed_same)
   3923         elif self.grouper.groupings is not None:
   3924             if len(self.grouper.groupings) > 1:

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/groupby.py in _concat_objects(self, keys, values, not_indexed_same)
   1085                     result = result.take(indexer, axis=self.axis)
   1086                 else:
-> 1087                     result = result.reindex(ax, axis=self.axis)
   1088
   1089         elif self.group_keys:

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    145         @wraps(func)
    146         def wrapper(*args, **kwargs):
--> 147             return func(*args, **kwargs)
    148
    149         if not PY2:

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/frame.py in reindex(self, *args, **kwargs)
   2979         kwargs.pop('axis', None)
   2980         kwargs.pop('labels', None)
-> 2981         return super(DataFrame, self).reindex(**kwargs)
   2982
   2983     @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/generic.py in reindex(self, *args, **kwargs)
   3356         # perform the reindex on the axes
   3357         return self._reindex_axes(axes, level, limit, tolerance, method,
-> 3358                                   fill_value, copy).__finalize__(self)
   3359
   3360     def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
   2909         if columns is not None:
   2910             frame = frame._reindex_columns(columns, method, copy, level,
-> 2911                                            fill_value, limit, tolerance)
   2912
   2913         index = axes['index']

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/frame.py in _reindex_columns(self, new_columns, method, copy, level, fill_value, limit, tolerance)
   2934         return self._reindex_with_indexers({1: [new_columns, indexer]},
   2935                                            copy=copy, fill_value=fill_value,
-> 2936                                            allow_dups=False)
   2937
   2938     def _reindex_multi(self, axes, copy, fill_value):

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
   3477                                                 fill_value=fill_value,
   3478                                                 allow_dups=allow_dups,
-> 3479                                                 copy=copy)
   3480
   3481         if copy and new_data is self._data:

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
   4125         # some axes don't allow reindexing with dups
   4126         if not allow_dups:
-> 4127             self.axes[axis]._can_reindex(indexer)
   4128
   4129         if axis >= self.ndim:

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/indexes/base.py in _can_reindex(self, indexer)
   2940         # trying to reindex on an axis with duplicates
   2941         if not self.is_unique and len(indexer):
-> 2942             raise ValueError("cannot reindex from a duplicate axis")
   2943
   2944     def reindex(self, target, method=None, level=None, limit=None,

ValueError: cannot reindex from a duplicate axis

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions