diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 6ace245a4bae1..f42cfc4d4c27f 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -1215,6 +1215,8 @@ Indexing - Bug in `scalar in Index` if scalar is a float while the ``Index`` is of integer dtype (:issue:`22085`) - Bug in `MultiIndex.set_levels` when levels value is not subscriptable (:issue:`23273`) - Bug where setting a timedelta column by ``Index`` causes it to be casted to double, and therefore lose precision (:issue:`23511`) +- Bug in :meth:`Index.union` and :meth:`Index.intersection` where the name of the resulting ``Index`` was not computed correctly in certain cases (:issue:`9943`, :issue:`9862`) + Missing ^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ae64179b36485..7434a02043d65 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -61,7 +61,7 @@ import pandas.core.sorting as sorting from pandas.io.formats.printing import ( pprint_thing, default_pprint, format_object_summary, format_object_attrs) -from pandas.core.ops import make_invalid_op +from pandas.core.ops import make_invalid_op, get_op_result_name from pandas.core.strings import StringMethods __all__ = ['Index'] @@ -1253,7 +1253,7 @@ def _convert_can_do_setop(self, other): other = Index(other, name=self.name) result_name = self.name else: - result_name = self.name if self.name == other.name else None + result_name = get_op_result_name(self, other) return other, result_name def _convert_for_op(self, value): @@ -2745,19 +2745,15 @@ def __or__(self, other): def __xor__(self, other): return self.symmetric_difference(other) - def _get_consensus_name(self, other): + def _get_reconciled_name_object(self, other): """ - Given 2 indexes, give a consensus name meaning - we take the not None one, or None if the names differ. 
- Return a new object if we are resetting the name + If the result of a set operation will be self, + return self, unless the name changes, in which + case make a shallow copy of self. """ - if self.name != other.name: - if self.name is None or other.name is None: - name = self.name or other.name - else: - name = None - if self.name != name: - return self._shallow_copy(name=name) + name = get_op_result_name(self, other) + if self.name != name: + return self._shallow_copy(name=name) return self def union(self, other): @@ -2785,10 +2781,10 @@ def union(self, other): other = ensure_index(other) if len(other) == 0 or self.equals(other): - return self._get_consensus_name(other) + return self._get_reconciled_name_object(other) if len(self) == 0: - return other._get_consensus_name(self) + return other._get_reconciled_name_object(self) # TODO: is_dtype_union_equal is a hack around # 1. buggy set ops with duplicates (GH #13432) @@ -2851,11 +2847,10 @@ def union(self, other): stacklevel=3) # for subclasses - return self._wrap_union_result(other, result) + return self._wrap_setop_result(other, result) - def _wrap_union_result(self, other, result): - name = self.name if self.name == other.name else None - return self.__class__(result, name=name) + def _wrap_setop_result(self, other, result): + return self._constructor(result, name=get_op_result_name(self, other)) def intersection(self, other): """ @@ -2885,7 +2880,7 @@ def intersection(self, other): other = ensure_index(other) if self.equals(other): - return self._get_consensus_name(other) + return self._get_reconciled_name_object(other) if not is_dtype_equal(self.dtype, other.dtype): this = self.astype('O') @@ -2905,7 +2900,7 @@ def intersection(self, other): if self.is_monotonic and other.is_monotonic: try: result = self._inner_indexer(lvals, rvals)[0] - return self._wrap_union_result(other, result) + return self._wrap_setop_result(other, result) except TypeError: pass @@ -4175,7 +4170,7 @@ def _join_monotonic(self, other, 
how='left', return_indexers=False): return join_index def _wrap_joined_index(self, joined, other): - name = self.name if self.name == other.name else None + name = get_op_result_name(self, other) return Index(joined, name=name) def _get_string_slice(self, key, use_lhs=True, use_rhs=True): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 278e395d65014..6e2f0b00fcd6e 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -25,6 +25,7 @@ import pandas.core.common as com import pandas.core.missing as missing import pandas.core.indexes.base as ibase +from pandas.core.ops import get_op_result_name from pandas.core.arrays.categorical import Categorical, contains _index_doc_kwargs = dict(ibase._index_doc_kwargs) @@ -324,6 +325,10 @@ def itemsize(self): # Size of the items in categories, not codes. return self.values.itemsize + def _wrap_setop_result(self, other, result): + name = get_op_result_name(self, other) + return self._shallow_copy(result, name=name) + def get_values(self): """ return the underlying data as an ndarray """ return self._data.get_values() diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index bd6f0c68a9aa5..3a2f9986760d3 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -34,6 +34,7 @@ from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.numeric import Int64Index +from pandas.core.ops import get_op_result_name import pandas.compat as compat from pandas.tseries.frequencies import to_offset, Resolution from pandas.core.indexes.datetimelike import ( @@ -592,6 +593,10 @@ def union(self, other): y : Index or DatetimeIndex """ self._assert_can_do_setop(other) + + if len(other) == 0 or self.equals(other) or len(self) == 0: + return super(DatetimeIndex, self).union(other) + if not isinstance(other, DatetimeIndex): try: other = DatetimeIndex(other) @@ -674,7 +679,7 @@ def 
_maybe_utc_convert(self, other): return this, other def _wrap_joined_index(self, joined, other): - name = self.name if self.name == other.name else None + name = get_op_result_name(self, other) if (isinstance(other, DatetimeIndex) and self.freq == other.freq and self._can_fast_union(other)): @@ -745,11 +750,11 @@ def _fast_union(self, other): else: return left - def _wrap_union_result(self, other, result): - name = self.name if self.name == other.name else None + def _wrap_setop_result(self, other, result): + name = get_op_result_name(self, other) if not timezones.tz_compare(self.tz, other.tz): raise ValueError('Passed item and index have different timezone') - return self._simple_new(result, name=name, freq=None, tz=self.tz) + return self._shallow_copy(result, name=name, freq=None, tz=self.tz) def intersection(self, other): """ @@ -765,6 +770,10 @@ def intersection(self, other): y : Index or DatetimeIndex """ self._assert_can_do_setop(other) + + if self.equals(other): + return self._get_reconciled_name_object(other) + if not isinstance(other, DatetimeIndex): try: other = DatetimeIndex(other) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index b86921b5579ed..79239ec90ac80 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -25,6 +25,7 @@ from pandas.core.indexes.base import ( Index, ensure_index, default_pprint, _index_shared_docs) +from pandas.core.ops import get_op_result_name from pandas._libs import Timestamp, Timedelta from pandas._libs.interval import ( @@ -1048,7 +1049,7 @@ def func(self, other): raise TypeError(msg.format(op=op_name)) result = getattr(self._multiindex, op_name)(other._multiindex) - result_name = self.name if self.name == other.name else None + result_name = get_op_result_name(self, other) # GH 19101: ensure empty results have correct dtype if result.empty: diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 420b862ae16a4..795ffeefa1794 100644 
--- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -21,7 +21,7 @@ from pandas.util._decorators import Appender, cache_readonly import pandas.core.dtypes.concat as _concat import pandas.core.indexes.base as ibase - +from pandas.core.ops import get_op_result_name _num_index_shared_docs = dict() @@ -215,7 +215,7 @@ def _convert_scalar_indexer(self, key, kind=None): ._convert_scalar_indexer(key, kind=kind)) def _wrap_joined_index(self, joined, other): - name = self.name if self.name == other.name else None + name = get_op_result_name(self, other) return Int64Index(joined, name=name) @classmethod @@ -288,7 +288,7 @@ def _convert_index_indexer(self, keyarr): return keyarr def _wrap_joined_index(self, joined, other): - name = self.name if self.name == other.name else None + name = get_op_result_name(self, other) return UInt64Index(joined, name=name) @classmethod diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 7e11ca5dbfcef..92ffaea521d7f 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -12,9 +12,9 @@ is_integer_dtype, is_datetime64_any_dtype, is_bool_dtype, - pandas_dtype, + pandas_dtype ) - +from pandas.core.ops import get_op_result_name from pandas.core.accessor import PandasDelegate, delegate_names from pandas.core.indexes.datetimes import DatetimeIndex, Int64Index, Index from pandas.core.indexes.datetimelike import ( @@ -848,8 +848,8 @@ def _assert_can_do_setop(self, other): msg = DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - def _wrap_union_result(self, other, result): - name = self.name if self.name == other.name else None + def _wrap_setop_result(self, other, result): + name = get_op_result_name(self, other) result = self._apply_meta(result) result.name = name return result diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 673ab9f2118a4..d1b5645928921 100644 --- a/pandas/core/indexes/range.py +++ 
b/pandas/core/indexes/range.py @@ -5,7 +5,7 @@ import numpy as np -from pandas._libs import index as libindex +from pandas._libs import index as libindex, lib import pandas.compat as compat from pandas.compat import get_range_parameters, lrange, range from pandas.compat.numpy import function as nv @@ -263,8 +263,9 @@ def tolist(self): @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, values=None, **kwargs): if values is None: + name = kwargs.get("name", self.name) return RangeIndex._simple_new( - name=self.name, **dict(self._get_data_as_items())) + name=name, **dict(self._get_data_as_items())) else: kwargs.setdefault('name', self.name) return self._int64index._shallow_copy(values, **kwargs) @@ -344,6 +345,10 @@ def intersection(self, other): ------- intersection : Index """ + + if self.equals(other): + return self._get_reconciled_name_object(other) + if not isinstance(other, RangeIndex): return super(RangeIndex, self).intersection(other) @@ -424,10 +429,9 @@ def union(self, other): union : Index """ self._assert_can_do_setop(other) - if len(other) == 0 or self.equals(other): - return self - if len(self) == 0: - return other + if len(other) == 0 or self.equals(other) or len(self) == 0: + return super(RangeIndex, self).union(other) + if isinstance(other, RangeIndex): start_s, step_s = self._start, self._step end_s = self._start + self._step * (len(self) - 1) @@ -498,7 +502,12 @@ def __getitem__(self, key): super_getitem = super(RangeIndex, self).__getitem__ if is_scalar(key): - n = int(key) + if not lib.is_integer(key): + raise IndexError("only integers, slices (`:`), " + "ellipsis (`...`), numpy.newaxis (`None`) " + "and integer or boolean " + "arrays are valid indices") + n = com.cast_scalar_indexer(key) if n != key: return super_getitem(key) if n < 0: @@ -649,7 +658,8 @@ def _evaluate_numeric_binop(self, other): return op(self._int64index, other) # TODO: Do attrs get handled reliably? 
- return _evaluate_numeric_binop + name = '__{name}__'.format(name=op.__name__) + return compat.set_function_name(_evaluate_numeric_binop, name, cls) cls.__add__ = _make_evaluate_binop(operator.add) cls.__radd__ = _make_evaluate_binop(ops.radd) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 33361c851a4c5..5b077a6984114 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -26,6 +26,7 @@ from pandas.core.base import _shared_docs from pandas.core.indexes.base import _index_shared_docs import pandas.core.common as com +from pandas.core.ops import get_op_result_name import pandas.core.dtypes.concat as _concat from pandas.util._decorators import Appender, Substitution from pandas.core.indexes.datetimelike import ( @@ -281,6 +282,10 @@ def union(self, other): y : Index or TimedeltaIndex """ self._assert_can_do_setop(other) + + if len(other) == 0 or self.equals(other) or len(self) == 0: + return super(TimedeltaIndex, self).union(other) + if not isinstance(other, TimedeltaIndex): try: other = TimedeltaIndex(other) @@ -313,7 +318,7 @@ def join(self, other, how='left', level=None, return_indexers=False, sort=sort) def _wrap_joined_index(self, joined, other): - name = self.name if self.name == other.name else None + name = get_op_result_name(self, other) if (isinstance(other, TimedeltaIndex) and self.freq == other.freq and self._can_fast_union(other)): joined = self._shallow_copy(joined, name=name) @@ -373,10 +378,6 @@ def _fast_union(self, other): else: return left - def _wrap_union_result(self, other, result): - name = self.name if self.name == other.name else None - return self._simple_new(result, name=name, freq=None) - def intersection(self, other): """ Specialized intersection for TimedeltaIndex objects. 
May be much faster @@ -391,6 +392,10 @@ def intersection(self, other): y : Index or TimedeltaIndex """ self._assert_can_do_setop(other) + + if self.equals(other): + return self._get_reconciled_name_object(other) + if not isinstance(other, TimedeltaIndex): try: other = TimedeltaIndex(other) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 468b1610a9142..c5cbaea23df76 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -360,10 +360,10 @@ def test_has_duplicates(self, indices): def test_duplicated(self, indices, keep): if type(indices) is not self._holder: pytest.skip('Can only check if we know the index type') - if not len(indices) or isinstance(indices, MultiIndex): + if not len(indices) or isinstance(indices, (MultiIndex, RangeIndex)): # MultiIndex tested separately in: # tests/indexes/multi/test_unique_and_duplicates - pytest.skip('Skip check for empty Index and MultiIndex') + pytest.skip('Skip check for empty Index, MultiIndex, RangeIndex') idx = self._holder(indices) if idx.has_duplicates: diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index 8cfed33a96ac5..e82cce873e75c 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -15,6 +15,7 @@ tm.makeTimedeltaIndex(100), tm.makeIntIndex(100), tm.makeUIntIndex(100), + tm.makeRangeIndex(100), tm.makeFloatIndex(100), Index([True, False]), tm.makeCategoricalIndex(100), diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index daebc6e95aac4..724dffc49dd3b 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -786,6 +786,67 @@ def test_intersect_str_dates(self): assert len(result) == 0 + @pytest.mark.parametrize( + 'fname, sname, expected_name', + [ + ('A', 'A', 'A'), + ('A', 'B', None), + ('A', None, None), + (None, 'B', None), + (None, None, None), + ]) + def test_corner_union(self, indices, fname, sname, expected_name): + # 
GH 9943 9862 + # Test unions with various name combinations + # Do not test MultiIndex or repeats + + if isinstance(indices, MultiIndex) or not indices.is_unique: + pytest.skip("Not for MultiIndex or repeated indices") + + # Test copy.union(copy) + first = indices.copy().set_names(fname) + second = indices.copy().set_names(sname) + union = first.union(second) + expected = indices.copy().set_names(expected_name) + tm.assert_index_equal(union, expected) + + # Test copy.union(empty) + first = indices.copy().set_names(fname) + second = indices.drop(indices).set_names(sname) + union = first.union(second) + expected = indices.copy().set_names(expected_name) + tm.assert_index_equal(union, expected) + + # Test empty.union(copy) + first = indices.drop(indices).set_names(fname) + second = indices.copy().set_names(sname) + union = first.union(second) + expected = indices.copy().set_names(expected_name) + tm.assert_index_equal(union, expected) + + # Test empty.union(empty) + first = indices.drop(indices).set_names(fname) + second = indices.drop(indices).set_names(sname) + union = first.union(second) + expected = indices.drop(indices).set_names(expected_name) + tm.assert_index_equal(union, expected) + + def test_chained_union(self): + # Chained unions handle names correctly + i1 = Index([1, 2], name='i1') + i2 = Index([3, 4], name='i2') + i3 = Index([5, 6], name='i3') + union = i1.union(i2.union(i3)) + expected = i1.union(i2).union(i3) + tm.assert_index_equal(union, expected) + + j1 = Index([1, 2], name='j1') + j2 = Index([], name='j2') + j3 = Index([], name='j3') + union = j1.union(j2.union(j3)) + expected = j1.union(j2).union(j3) + tm.assert_index_equal(union, expected) + def test_union(self): # TODO: Replace with fixturesult first = self.strIndex[5:20] @@ -824,7 +885,7 @@ def test_union_identity(self): @pytest.mark.parametrize("first_list", [list('ab'), list()]) @pytest.mark.parametrize("second_list", [list('ab'), list()]) @pytest.mark.parametrize("first_name, second_name, 
expected_name", [ - ('A', 'B', None), (None, 'B', 'B'), ('A', None, 'A')]) + ('A', 'B', None), (None, 'B', None), ('A', None, None)]) def test_union_name_preservation(self, first_list, second_list, first_name, second_name, expected_name): first = Index(first_list, name=first_name) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 23bf8896409c9..c632a9fe31faa 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2290,10 +2290,10 @@ def test_concat_categoricalindex(self): result = pd.concat([a, b, c], axis=1) - exp_idx = pd.CategoricalIndex([0, 1, 2, 9]) - exp = pd.DataFrame({0: [1, np.nan, np.nan, 1], - 1: [2, 2, np.nan, np.nan], - 2: [np.nan, 3, 3, np.nan]}, + exp_idx = pd.CategoricalIndex([9, 0, 1, 2], categories=categories) + exp = pd.DataFrame({0: [1, 1, np.nan, np.nan], + 1: [np.nan, 2, 2, np.nan], + 2: [np.nan, np.nan, 3, 3]}, columns=[0, 1, 2], index=exp_idx) tm.assert_frame_equal(result, exp)