Description
Code Sample, a copy-pastable example if possible
import pandas as pd
df = pd.DataFrame([['2018-01-01','MSFT','short'],
['2018-01-01','MSFT','long'],
['2018-01-01','MSFT',pd.np.NaN],
['2018-01-01','AAPL','short'],
['2018-01-01','AAPL',pd.np.NaN],
['2018-01-02','AAPL','short'],
['2018-01-01','TSLA','neutral'],
['2018-01-02','TSLA',pd.np.NaN]],
columns=['date','symbol','status'])
df['date'] = pd.to_datetime(df['date'])
df = df.set_index(['date','symbol'])
df.groupby('symbol')['status'].ffill()
Raised exception
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-58-2fe0989150cf> in <module>()
13 df['date'] = pd.to_datetime(df['date'])
14 df = df.set_index(['date','symbol'])
---> 15 df.groupby('symbol')['status'].ffill()
/home/greg/anaconda2/lib/python2.7/site-packages/pandas/core/groupby.pyc in pad(self, limit)
1362 DataFrame.fillna
1363 """
-> 1364 return self.apply(lambda x: x.ffill(limit=limit))
1365 ffill = pad
1366
/home/greg/anaconda2/lib/python2.7/site-packages/pandas/core/groupby.pyc in apply(self, func, *args, **kwargs)
3110 examples=_apply_docs['series_examples']))
3111 def apply(self, func, *args, **kwargs):
-> 3112 return super(SeriesGroupBy, self).apply(func, *args, **kwargs)
3113
3114 @Appender(_agg_doc)
/home/greg/anaconda2/lib/python2.7/site-packages/pandas/core/groupby.pyc in apply(self, func, *args, **kwargs)
803 # ignore SettingWithCopy here in case the user mutates
804 with option_context('mode.chained_assignment', None):
--> 805 return self._python_apply_general(f)
806
807 def _python_apply_general(self, f):
/home/greg/anaconda2/lib/python2.7/site-packages/pandas/core/groupby.pyc in _python_apply_general(self, f)
812 keys,
813 values,
--> 814 not_indexed_same=mutated or self.mutated)
815
816 def _iterate_slices(self):
/home/greg/anaconda2/lib/python2.7/site-packages/pandas/core/groupby.pyc in _wrap_applied_output(self, keys, values, not_indexed_same)
3250 if isinstance(values[0], (Series, dict)):
3251 return self._concat_objects(keys, values,
-> 3252 not_indexed_same=not_indexed_same)
3253 elif isinstance(values[0], DataFrame):
3254 # possible that Series -> DataFrame by applied function
/home/greg/anaconda2/lib/python2.7/site-packages/pandas/core/groupby.pyc in _concat_objects(self, keys, values, not_indexed_same)
978
979 if isinstance(result, Series):
--> 980 result = result.reindex(ax)
981 else:
982
/home/greg/anaconda2/lib/python2.7/site-packages/pandas/core/series.pyc in reindex(self, index, **kwargs)
2679 @Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs)
2680 def reindex(self, index=None, **kwargs):
-> 2681 return super(Series, self).reindex(index=index, **kwargs)
2682
2683 @Appender(generic._shared_docs['fillna'] % _shared_doc_kwargs)
/home/greg/anaconda2/lib/python2.7/site-packages/pandas/core/generic.pyc in reindex(self, *args, **kwargs)
3021 # perform the reindex on the axes
3022 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 3023 fill_value, copy).__finalize__(self)
3024
3025 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
/home/greg/anaconda2/lib/python2.7/site-packages/pandas/core/generic.pyc in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
3034 ax = self._get_axis(a)
3035 new_index, indexer = ax.reindex(labels, level=level, limit=limit,
-> 3036 tolerance=tolerance, method=method)
3037
3038 axis = self._get_axis_number(a)
/home/greg/anaconda2/lib/python2.7/site-packages/pandas/core/indexes/multi.pyc in reindex(self, target, method, level, limit, tolerance)
1901 tolerance=tolerance)
1902 else:
-> 1903 raise Exception("cannot handle a non-unique multi-index!")
1904
1905 if not isinstance(target, MultiIndex):
Exception: cannot handle a non-unique multi-index!
Problem description
Calling `ffill()` on the grouped `status` column raises `Exception: cannot handle a non-unique multi-index!` instead of forward filling the values as intended.
The index has duplicate values; the expectation is that missing values will be filled within each group according to the order the rows are already in.
Duplicate index values are possible when using high-frequency time intervals.
Expected Output
df = pd.DataFrame([['2018-01-01','MSFT','short'],
['2018-01-01','MSFT','long'],
['2018-01-01','MSFT','long'],
['2018-01-01','AAPL','short'],
['2018-01-01','AAPL','short'],
['2018-01-02','AAPL','short'],
['2018-01-01','TSLA','neutral'],
['2018-01-02','TSLA','neutral']],
columns=['date','symbol','status'])
df['date'] = pd.to_datetime(df['date'])
Output of pd.show_versions()
INSTALLED VERSIONS
commit: None
python: 2.7.14.final.0
python-bits: 64
OS: Linux
OS-release: 4.13.0-31-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_AU.UTF-8
LOCALE: None.None
pandas: 0.22.0
pytest: 3.2.1
pip: 9.0.1
setuptools: 36.5.0.post20170921
Cython: 0.26.1
numpy: 1.14.0
scipy: 0.19.1
pyarrow: None
xarray: None
IPython: 5.4.1
sphinx: 1.6.3
patsy: 0.4.1
dateutil: 2.6.1
pytz: 2017.3
blosc: None
bottleneck: 1.2.1
tables: 3.4.2
numexpr: 2.6.2
feather: None
matplotlib: 2.1.0
openpyxl: 2.4.8
xlrd: 1.1.0
xlwt: 1.3.0
xlsxwriter: 1.0.2
lxml: 4.1.0
bs4: 4.6.0
html5lib: 0.999999999
sqlalchemy: 1.1.13
pymysql: None
psycopg2: 2.7.3.2 (dt dec pq3 ext lo64)
jinja2: 2.9.6
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: None
This also happens with `axis=1`, where the failure surfaces as `ValueError: cannot reindex from a duplicate axis`:
import pandas as pd
import numpy as np
df = pd.DataFrame(
np.ones([6, 4], dtype=int),
columns=pd.MultiIndex.from_product([['A', 'B'], [1, 2]])
)
(
df
.groupby(level=0, axis=1)
.apply(
lambda df: 2*df.xs(df.name, axis=1)
)
)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-12-e7c022399f89> in <module>()
3 .groupby(level=0, axis=1)
4 .apply(
----> 5 lambda df: 2*df.xs(df.name, axis=1)
6 )
7 )
~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/groupby.py in apply(self, func, *args, **kwargs)
882 # ignore SettingWithCopy here in case the user mutates
883 with option_context('mode.chained_assignment', None):
--> 884 return self._python_apply_general(f)
885
886 def _python_apply_general(self, f):
~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/groupby.py in _python_apply_general(self, f)
891 keys,
892 values,
--> 893 not_indexed_same=mutated or self.mutated)
894
895 def _iterate_slices(self):
~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/groupby.py in _wrap_applied_output(self, keys, values, not_indexed_same)
3920 elif isinstance(v, DataFrame):
3921 return self._concat_objects(keys, values,
-> 3922 not_indexed_same=not_indexed_same)
3923 elif self.grouper.groupings is not None:
3924 if len(self.grouper.groupings) > 1:
~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/groupby.py in _concat_objects(self, keys, values, not_indexed_same)
1085 result = result.take(indexer, axis=self.axis)
1086 else:
-> 1087 result = result.reindex(ax, axis=self.axis)
1088
1089 elif self.group_keys:
~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/util/_decorators.py in wrapper(*args, **kwargs)
145 @wraps(func)
146 def wrapper(*args, **kwargs):
--> 147 return func(*args, **kwargs)
148
149 if not PY2:
~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/frame.py in reindex(self, *args, **kwargs)
2979 kwargs.pop('axis', None)
2980 kwargs.pop('labels', None)
-> 2981 return super(DataFrame, self).reindex(**kwargs)
2982
2983 @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)
~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/generic.py in reindex(self, *args, **kwargs)
3356 # perform the reindex on the axes
3357 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 3358 fill_value, copy).__finalize__(self)
3359
3360 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
2909 if columns is not None:
2910 frame = frame._reindex_columns(columns, method, copy, level,
-> 2911 fill_value, limit, tolerance)
2912
2913 index = axes['index']
~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/frame.py in _reindex_columns(self, new_columns, method, copy, level, fill_value, limit, tolerance)
2934 return self._reindex_with_indexers({1: [new_columns, indexer]},
2935 copy=copy, fill_value=fill_value,
-> 2936 allow_dups=False)
2937
2938 def _reindex_multi(self, axes, copy, fill_value):
~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
3477 fill_value=fill_value,
3478 allow_dups=allow_dups,
-> 3479 copy=copy)
3480
3481 if copy and new_data is self._data:
~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
4125 # some axes don't allow reindexing with dups
4126 if not allow_dups:
-> 4127 self.axes[axis]._can_reindex(indexer)
4128
4129 if axis >= self.ndim:
~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/indexes/base.py in _can_reindex(self, indexer)
2940 # trying to reindex on an axis with duplicates
2941 if not self.is_unique and len(indexer):
-> 2942 raise ValueError("cannot reindex from a duplicate axis")
2943
2944 def reindex(self, target, method=None, level=None, limit=None,
ValueError: cannot reindex from a duplicate axis