Description
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of pandas.
- (optional) I have confirmed this bug exists on the master branch of pandas.
Code Sample, a copy-pastable example
In [79]: x = ['b', 'b', 'c', 'a', 'b', np.nan]
...: y = ['a', 'b', 'c', 'a', 'b', 'd']
...: mi1 = pd.MultiIndex.from_arrays(
...: [x, [1, 2, 3, 4, 5, 6]],
...: names=['a', 'b']
...: )
...: df = pd.DataFrame({'c': [1, 1, 1, 1, 1, 1]}, index=mi1)
...: mi2 = pd.MultiIndex.from_arrays(
...: [y, [1, 1, 1, 1, 1, 1]],
...: names=['a', 'b']
...: )
...: s = pd.Series([1, 2, 3, 4, 5, 6], index=mi2)
...: df.combine_first(pd.DataFrame({'some_col': s}))
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~/envs/pandas-test/lib/python3.8/site-packages/pandas/core/algorithms.py in safe_sort(values, codes, na_sentinel, assume_unique, verify)
2060 try:
-> 2061 sorter = values.argsort()
2062 ordered = values.take(sorter)
TypeError: '<' not supported between instances of 'float' and 'str'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-79-de018ddfae29> in <module>
11 )
12 s = pd.Series([1, 2, 3, 4, 5, 6], index=mi2)
---> 13 df.combine_first(pd.DataFrame({'some_col': s}))
~/envs/pandas-test/lib/python3.8/site-packages/pandas/core/frame.py in combine_first(self, other)
6239 return expressions.where(mask, y_values, x_values)
6240
-> 6241 return self.combine(other, combiner, overwrite=False)
6242
6243 def update(
~/envs/pandas-test/lib/python3.8/site-packages/pandas/core/frame.py in combine(self, other, func, fill_value, overwrite)
6104 other_idxlen = len(other.index) # save for compare
6105
-> 6106 this, other = self.align(other, copy=False)
6107 new_index = this.index
6108
~/envs/pandas-test/lib/python3.8/site-packages/pandas/core/frame.py in align(self, other, join, axis, level, copy, fill_value, method, limit, fill_axis, broadcast_axis)
3955 broadcast_axis=None,
3956 ) -> "DataFrame":
-> 3957 return super().align(
3958 other,
3959 join=join,
~/envs/pandas-test/lib/python3.8/site-packages/pandas/core/generic.py in align(self, other, join, axis, level, copy, fill_value, method, limit, fill_axis, broadcast_axis)
8542 axis = self._get_axis_number(axis)
8543 if isinstance(other, ABCDataFrame):
-> 8544 return self._align_frame(
8545 other,
8546 join=join,
~/envs/pandas-test/lib/python3.8/site-packages/pandas/core/generic.py in _align_frame(self, other, join, axis, level, copy, fill_value, method, limit, fill_axis)
8589 if axis is None or axis == 0:
8590 if not self.index.equals(other.index):
-> 8591 join_index, ilidx, iridx = self.index.join(
8592 other.index, how=join, level=level, return_indexers=True
8593 )
~/envs/pandas-test/lib/python3.8/site-packages/pandas/core/indexes/base.py in join(self, other, how, level, return_indexers, sort)
3491 )
3492 else:
-> 3493 return self._join_non_unique(
3494 other, how=how, return_indexers=return_indexers
3495 )
~/envs/pandas-test/lib/python3.8/site-packages/pandas/core/indexes/base.py in _join_non_unique(self, other, how, return_indexers)
3618 rvalues = other._get_engine_target()
3619
-> 3620 left_idx, right_idx = _get_join_indexers(
3621 [lvalues], [rvalues], how=how, sort=True
3622 )
~/envs/pandas-test/lib/python3.8/site-packages/pandas/core/reshape/merge.py in _get_join_indexers(left_keys, right_keys, sort, how, **kwargs)
1326 for n in range(len(left_keys))
1327 )
-> 1328 zipped = zip(*mapped)
1329 llab, rlab, shape = [list(x) for x in zipped]
1330
~/envs/pandas-test/lib/python3.8/site-packages/pandas/core/reshape/merge.py in <genexpr>(.0)
1323 # get left & right join labels and num. of levels at each location
1324 mapped = (
-> 1325 _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
1326 for n in range(len(left_keys))
1327 )
~/envs/pandas-test/lib/python3.8/site-packages/pandas/core/reshape/merge.py in _factorize_keys(lk, rk, sort, how)
1978 if sort:
1979 uniques = rizer.uniques.to_array()
-> 1980 llab, rlab = _sort_labels(uniques, llab, rlab)
1981
1982 # NA group
~/envs/pandas-test/lib/python3.8/site-packages/pandas/core/reshape/merge.py in _sort_labels(uniques, left, right)
2003 labels = np.concatenate([left, right])
2004
-> 2005 _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1)
2006 new_labels = ensure_int64(new_labels)
2007 new_left, new_right = new_labels[:llength], new_labels[llength:]
~/envs/pandas-test/lib/python3.8/site-packages/pandas/core/algorithms.py in safe_sort(values, codes, na_sentinel, assume_unique, verify)
2063 except TypeError:
2064 # try this anyway
-> 2065 ordered = sort_mixed(values)
2066
2067 # codes:
~/envs/pandas-test/lib/python3.8/site-packages/pandas/core/algorithms.py in sort_mixed(values)
2046 # order ints before strings, safe in py3
2047 str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
-> 2048 nums = np.sort(values[~str_pos])
2049 strs = np.sort(values[str_pos])
2050 return np.concatenate([nums, np.asarray(strs, dtype=object)])
<__array_function__ internals> in sort(*args, **kwargs)
~/envs/pandas-test/lib/python3.8/site-packages/numpy/core/fromnumeric.py in sort(a, axis, kind, order)
989 else:
990 a = asanyarray(a).copy(order="K")
--> 991 a.sort(axis=axis, kind=kind, order=order)
992 return a
993
TypeError: '<' not supported between instances of 'float' and 'str'
Problem description
I use `df.combine_first(...)` to add a column to a DataFrame while extending the index in case an index value does not exist in the target DataFrame. However, if a level of the MultiIndex of the DataFrame/Series contains a mix of `np.nan` and `str` values, the above `TypeError` is raised. I originally noticed this for categorical types (`x` and `y`), but the problem can be simplified to plain string types.
Reproducing this error also seems to depend on the length/order of `x` and `y`: I tried to reduce the example to fewer elements while keeping the `np.nan`, but that did not reproduce the error.
Expected Output
No exception is raised, and the resulting DataFrame contains the new column taken from the Series.
Output of pd.show_versions()
In [84]: pd.show_versions()
INSTALLED VERSIONS
commit : 2a7d332
python : 3.8.5.final.0
python-bits : 64
OS : Darwin
OS-release : 19.6.0
Version : Darwin Kernel Version 19.6.0: Thu Jun 18 20:49:00 PDT 2020; root:xnu-6153.141.1~1/RELEASE_X86_64
machine : x86_64
processor : i386
byteorder : little
LC_ALL : None
LANG : en_AU.UTF-8
LOCALE : en_AU.UTF-8
pandas : 1.1.2
numpy : 1.19.2
pytz : 2020.1
dateutil : 2.8.1
pip : 20.1.1
setuptools : 46.4.0
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : None
IPython : 7.18.1
pandas_datareader: None
bs4 : None
bottleneck : None
fsspec : None
fastparquet : None
gcsfs : None
matplotlib : None
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pytables : None
pyxlsb : None
s3fs : None
scipy : None
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
xlwt : None
numba : None