Skip to content

ValueError on assigning multiple NaN/None to Categorical Series when all categories are Intervals #27937

Closed
@GanymedeZero

Description

@GanymedeZero

Reproducing the error:

import numpy as np, pandas as pd

# providing CategoricalDtype via e.g. pd.cut will also produce the error
s = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype=pd.CategoricalDtype(categories=[pd.Interval(0,1)]))
s.iloc[:3] = [np.nan, pd.Interval(0,1), np.nan]
# ValueError: Reindexing only valid with uniquely valued Index objects

# these work:
s.iloc[:3] = [np.nan, np.nan, np.nan]
s.iloc[:3] = [np.nan, pd.Interval(0,1), pd.Interval(0,1)]

# as does:
s = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype=pd.CategoricalDtype(categories=['not-an-interval', pd.Interval(0,1)]))
s.iloc[:3] = [np.nan, pd.Interval(0,1), np.nan]

# as does:
d = pd.DataFrame({'s': pd.Series([np.nan]*4, dtype=pd.cut(range(3), bins=2).dtype), 'a': range(4)})
d['s'] = [np.nan, pd.Interval(1.0, 2.0), np.nan, np.nan]
d.loc[slice(0,2,1), 's'] = [pd.Interval(1.0, 2.0), np.nan, np.nan]

# but NOT:
d = pd.DataFrame({'s': pd.Series([np.nan]*4, dtype=pd.cut(range(3), bins=2).dtype), 'a': range(4)})
d.loc[slice(0,2,1), 's'] = [pd.Interval(1.0, 2.0), np.nan, np.nan]

The full traceback:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-30-ca2aabf517a0> in <module>
      1 s = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype=pd.CategoricalDtype(categories=[pd.Interval(0,1)]))
----> 2 s.iloc[:3] = [np.nan, pd.Interval(0,1), np.nan]

c:\python36\lib\site-packages\pandas\core\indexing.py in __setitem__(self, key, value)
    200             key = com.apply_if_callable(key, self.obj)
    201         indexer = self._get_setitem_indexer(key)
--> 202         self._setitem_with_indexer(indexer, value)
    203
    204     def _validate_key(self, key, axis: int):

c:\python36\lib\site-packages\pandas\core\indexing.py in _setitem_with_indexer(self, indexer, value)
    577             # actually do the set
    578             self.obj._consolidate_inplace()
--> 579             self.obj._data = self.obj._data.setitem(indexer=indexer, value=value)
    580             self.obj._maybe_update_cacher(clear=True)
    581

c:\python36\lib\site-packages\pandas\core\internals\managers.py in setitem(self, **kwargs)
    558
    559     def setitem(self, **kwargs):
--> 560         return self.apply("setitem", **kwargs)
    561
    562     def putmask(self, **kwargs):

c:\python36\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
    436                     kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)
    437
--> 438             applied = getattr(b, f)(**kwargs)
    439             result_blocks = _extend_blocks(applied, result_blocks)
    440

c:\python36\lib\site-packages\pandas\core\internals\blocks.py in setitem(self, indexer, value)
   1834
   1835         check_setitem_lengths(indexer, value, self.values)
-> 1836         self.values[indexer] = value
   1837         return self
   1838

c:\python36\lib\site-packages\pandas\core\arrays\categorical.py in __setitem__(self, key, value)
   2182         from pandas import Index
   2183
-> 2184         to_add = Index(rvalue).difference(self.categories)
   2185
   2186         # no assignments of values not in categories, but it's always ok to set

c:\python36\lib\site-packages\pandas\core\indexes\interval.py in func(intvidx_self, other, sort)
    163                 raise TypeError(msg.format(op=self.op_name))
    164
--> 165             return setop(intvidx_self, other, sort)
    166
    167         return func

c:\python36\lib\site-packages\pandas\core\indexes\interval.py in func(self, other, sort)
   1267         @SetopCheck(op_name=op_name)
   1268         def func(self, other, sort=sort):
-> 1269             result = getattr(self._multiindex, op_name)(other._multiindex, sort=sort)
   1270             result_name = get_op_result_name(self, other)
   1271

c:\python36\lib\site-packages\pandas\core\indexes\multi.py in difference(self, other, sort)
   3299         this = self._get_unique_index()
   3300
-> 3301         indexer = this.get_indexer(other)
   3302         indexer = indexer.take((indexer != -1).nonzero()[0])
   3303

c:\python36\lib\site-packages\pandas\core\indexes\multi.py in get_indexer(self, target, method, limit, tolerance)
   2423         if not self.is_unique:
   2424             raise ValueError(
-> 2425                 "Reindexing only valid with uniquely valued " "Index objects"
   2426             )
   2427

ValueError: Reindexing only valid with uniquely valued Index objects

Problem description

The problem is quite specific. All of the following must hold: the assignment is to a slice; the Categorical Series/DataFrame column has only Intervals as categories; and the right-hand side contains at least two np.nan (or None) values but does not consist entirely of np.nan (if every element is np.nan, the assignment works).

My guess is that this has something to do with the fact that when a Series or DataFrame column has a CategoricalDtype where the categories are all Intervals, inspection of the object shows that the categories are stored as an IntervalIndex:

In [1]: s.dtype.categories

Out[1]:
IntervalIndex([(0, 1]],
              closed='right',
              dtype='interval[int64]')

whereas a categorical object with non-Interval categories mixed in stores them as just a vanilla Index, and in the case of setting values to the DataFrame column it changes the column's dtype to dtype('O').

Expected Output

import numpy as np, pandas as pd

s = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype=pd.CategoricalDtype(categories=[pd.Interval(0,1)]))
s.iloc[:3] = [np.nan, pd.Interval(0,1), np.nan]
print(s)

#0       NaN
#1    (0, 1]
#2       NaN
#3       NaN
#Length: 4, dtype: category
#Categories (1, interval[int64]): [(0, 1]]

Output of pd.show_versions()

INSTALLED VERSIONS

commit : None
python : 3.6.1.final.0
python-bits : 64
OS : Windows
OS-release : 10
machine : AMD64
processor : Intel64 Family 6 Model 94 Stepping 3, GenuineIntel
byteorder : little
LC_ALL : None
LANG : None
LOCALE : None.None

pandas : 0.25.0
numpy : 1.17.0
pytz : 2018.3
dateutil : 2.8.0
pip : 19.2.1
setuptools : 41.0.1
Cython : 0.25.2
pytest : 3.7.2
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : 0.9999999
pymysql : 0.7.11.None
psycopg2 : None
jinja2 : 2.10.1
IPython : 7.5.0
pandas_datareader: None
bs4 : None
bottleneck : 1.2.1
fastparquet : None
gcsfs : None
lxml.etree : None
matplotlib : 3.0.2
numexpr : 2.7.0
odfpy : None
openpyxl : 2.4.8
pandas_gbq : None
pyarrow : None
pytables : None
s3fs : None
scipy : 1.1.0
sqlalchemy : None
tables : 3.5.2
xarray : None
xlrd : 1.0.0
xlwt : None
xlsxwriter : None

Metadata

Metadata

Assignees

No one assigned

    Labels

    Categorical (Categorical Data Type); Indexing (Related to indexing on series/frames, not to indexes themselves); Interval (Interval data type); Needs Tests (Unit test(s) needed to prevent regressions); good first issue

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions