-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH - Index set operation modifications to address issue #23525 #23538
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 10 commits
c2cf269
435e50f
4922fd3
5e528a1
cdaa5b0
40d57ec
11fd041
8f0ace3
8364c2e
ab329a9
93486ad
e435e4c
b9787b8
2241b65
4daf360
6e5a52b
d344e11
b339bd1
85e2db7
cf34960
7150c22
fbb3743
5aa41f6
02d7a3b
558e182
706f973
2ccab59
c70f1c0
edb7e9c
84bfbda
aba75fe
fc9f138
69cce99
fdfc7d7
42ca70e
5b25645
9b1ee7f
fdf9b71
1de3cc8
8ed1093
77ca3a3
5921038
3b94e3b
fd4510e
345eec1
265a7ee
5de3d57
6d82621
0af8a24
5a87715
a4f9e78
c3c0caa
0bcbdf4
c410625
6bb054f
aea731c
b5938fc
25452fc
0b97a79
bf11c6f
6fd941d
32037b5
8870006
1d12bc9
92f6707
fbf3242
38d9f74
b9e7b18
69aaa93
b57160a
daa1287
54898c1
fa839a9
a36f475
b840f49
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2756,6 +2756,16 @@ def _get_reconciled_name_object(self, other): | |
return self._shallow_copy(name=name) | ||
return self | ||
|
||
def _union_inconsistent_dtypes(self, other): | ||
this = self.astype('O') | ||
# call Index for when `other` is list-like | ||
other = Index(other).astype('O') | ||
return Index.union(this, other).astype('O') | ||
|
||
def _is_inconsistent(self, other): | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return (type(self) is not type(other) | ||
or not is_dtype_equal(self.dtype, other.dtype)) | ||
|
||
def union(self, other): | ||
""" | ||
Form the union of two Index objects and sorts if possible. | ||
|
@@ -2778,22 +2788,36 @@ def union(self, other): | |
|
||
""" | ||
self._assert_can_do_setop(other) | ||
|
||
if self._is_inconsistent(other): | ||
return self._union_inconsistent_dtypes(other) | ||
|
||
other = ensure_index(other) | ||
return self._union(other) | ||
|
||
if len(other) == 0 or self.equals(other): | ||
return self._get_reconciled_name_object(other) | ||
def _union(self, other): | ||
|
||
# if is_dtype_equal(self.dtype, other.dtype): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you put some comments here on the decisions here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All subclass implementations started with this check, so thought it would make sense to pull this out to be common among all, and implement index specific overriden behavior in the |
||
if len(self) == 0: | ||
return other._get_reconciled_name_object(self) | ||
elif len(other) == 0: | ||
return self._get_reconciled_name_object(other) | ||
|
||
if self.equals(other): | ||
return self._get_reconciled_name_object(other) | ||
|
||
|
||
# TODO: is_dtype_union_equal is a hack around | ||
# 1. buggy set ops with duplicates (GH #13432) | ||
# 2. CategoricalIndex lacking setops (GH #10186) | ||
# Once those are fixed, this workaround can be removed | ||
if not is_dtype_union_equal(self.dtype, other.dtype): | ||
this = self.astype('O') | ||
other = other.astype('O') | ||
return this.union(other) | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# remove? this might be too lenient | ||
# if not is_dtype_union_equal(self.dtype, other.dtype): | ||
# this = self.astype('O') | ||
# other = other.astype('O') | ||
# # force object dtype, for empty index cases | ||
# return this.union(other).astype('O') | ||
|
||
# TODO(EA): setops-refactor, clean all this up | ||
if is_period_dtype(self) or is_datetime64tz_dtype(self): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -578,6 +578,17 @@ def unique(self, level=None): | |
result = super(DatetimeIndex, naive).unique(level=level) | ||
return self._shallow_copy(result.values) | ||
|
||
def _is_inconsistent(self, other): | ||
is_inconsistent = super(DatetimeIndex, self)._is_inconsistent(other) | ||
if is_inconsistent: | ||
if hasattr(other, 'dtype'): | ||
# If same base, consider consistent, let UTC logic takeover | ||
return self.dtype.base != other.dtype.base | ||
else: | ||
return True | ||
else: | ||
return is_inconsistent | ||
|
||
def union(self, other): | ||
""" | ||
Specialized union for DatetimeIndex objects. If combine | ||
|
@@ -592,10 +603,11 @@ def union(self, other): | |
------- | ||
y : Index or DatetimeIndex | ||
""" | ||
self._assert_can_do_setop(other) | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return super(DatetimeIndex, self).union(other) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since all the logic for custom unions is in the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you can remove this, no? |
||
|
||
def _union(self, other): | ||
if len(other) == 0 or self.equals(other) or len(self) == 0: | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return super(DatetimeIndex, self).union(other) | ||
return super(DatetimeIndex, self)._union(other) | ||
|
||
if not isinstance(other, DatetimeIndex): | ||
try: | ||
|
@@ -608,7 +620,7 @@ def union(self, other): | |
if this._can_fast_union(other): | ||
return this._fast_union(other) | ||
else: | ||
result = Index.union(this, other) | ||
result = Index._union(this, other) | ||
if isinstance(result, DatetimeIndex): | ||
result._tz = timezones.tz_standardize(this.tz) | ||
if (result.freq is None and | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1038,7 +1038,16 @@ def overlaps(self, other): | |
|
||
def _setop(op_name): | ||
def func(self, other): | ||
other = self._as_like_interval_index(other) | ||
try: | ||
jschendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
other = self._as_like_interval_index(other) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. agreed and done, also allows me to get rid of the try except that really should have just been an if condition. |
||
# allow ValueError from this method to raise to catch mixed closed | ||
# except only Non-Interval index mismatches. | ||
except TypeError: | ||
# Currently this will cause difference operations to return | ||
# object dtype as opposed to IntervalIndex, unlike other Index | ||
# objects that return the same type when using `difference` on | ||
# mismatched types | ||
return getattr(self.astype('O'), op_name)(other) | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# GH 19016: ensure set op will not return a prohibited dtype | ||
subtypes = [self.dtype.subtype, other.dtype.subtype] | ||
|
@@ -1059,6 +1068,7 @@ def func(self, other): | |
|
||
return type(self).from_tuples(result, closed=self.closed, | ||
name=result_name) | ||
|
||
return func | ||
|
||
union = _setop('union') | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -228,6 +228,17 @@ def _assert_safe_casting(cls, data, subarr): | |
raise TypeError('Unsafe NumPy casting, you must ' | ||
'explicitly cast') | ||
|
||
def _is_inconsistent(self, other): | ||
from pandas.core.indexes.range import RangeIndex | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use ABCRangeIndex |
||
is_inconsistent = super(Int64Index, self)._is_inconsistent(other) | ||
if is_inconsistent: | ||
if type(self) is Int64Index and isinstance(other, RangeIndex): | ||
return False | ||
else: | ||
return True | ||
else: | ||
return is_inconsistent | ||
|
||
|
||
Int64Index._add_numeric_methods() | ||
Int64Index._add_logical_methods() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -416,6 +416,13 @@ def _extended_gcd(self, a, b): | |
old_t, t = t, old_t - quotient * t | ||
return old_r, old_s, old_t | ||
|
||
def _is_inconsistent(self, other): | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
is_inconsistent = super(RangeIndex, self)._is_inconsistent(other) | ||
if is_inconsistent: | ||
return not type(other) is Int64Index | ||
else: | ||
return is_inconsistent | ||
|
||
def union(self, other): | ||
""" | ||
Form the union of two Index objects and sorts if possible | ||
|
@@ -428,9 +435,11 @@ def union(self, other): | |
------- | ||
union : Index | ||
""" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this needed any longer? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. see related comment in DatetimeIndex module |
||
self._assert_can_do_setop(other) | ||
return super(RangeIndex, self).union(other) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is this needed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is another case where I was preserving the original public docstring, which differed from the super docstring. Should I just go ahead and remove all the overriden public union methods and allow the base docstring to propagate for all instances? |
||
|
||
def _union(self, other): | ||
if len(other) == 0 or self.equals(other) or len(self) == 0: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same comment as above |
||
return super(RangeIndex, self).union(other) | ||
return super(RangeIndex, self)._union(other) | ||
|
||
if isinstance(other, RangeIndex): | ||
start_s, step_s = self._start, self._step | ||
|
@@ -469,7 +478,7 @@ def union(self, other): | |
(end_s - step_o <= end_o)): | ||
return RangeIndex(start_r, end_r + step_o, step_o) | ||
|
||
return self._int64index.union(other) | ||
return self._int64index._union(other) | ||
|
||
@Appender(_index_shared_docs['join']) | ||
def join(self, other, how='left', level=None, return_indexers=False, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,10 +29,13 @@ def test_union2(self): | |
assert tm.equalContents(union, everything) | ||
|
||
# GH 10149 | ||
expected = first.astype('O').union( | ||
pd.Index(second.values, dtype='O') | ||
).astype('O') | ||
cases = [klass(second.values) for klass in [np.array, Series, list]] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ideally can parametrize this here There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
for case in cases: | ||
result = first.union(case) | ||
assert tm.equalContents(result, everything) | ||
assert tm.equalContents(result, expected) | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
@pytest.mark.parametrize("tz", tz) | ||
def test_union(self, tz): | ||
|
@@ -256,11 +259,11 @@ def test_datetimeindex_union_join_empty(self): | |
empty = Index([]) | ||
|
||
result = dti.union(empty) | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
assert isinstance(result, DatetimeIndex) | ||
assert result is result | ||
tm.assert_index_equal(result, dti.astype('O')) | ||
|
||
result = dti.join(empty) | ||
assert isinstance(result, DatetimeIndex) | ||
tm.assert_index_equal(result, dti) | ||
|
||
def test_join_nonunique(self): | ||
idx1 = to_datetime(['2012-11-06 16:00:11.477563', | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -105,9 +105,9 @@ def test_union_misc(self): | |
with pytest.raises(period.IncompatibleFrequency): | ||
index.union(index2) | ||
|
||
msg = 'can only call with other PeriodIndex-ed objects' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we have cases where we would raise ever? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I do not believe so. Only the incompatible frequency error which is covered in the line above. |
||
with tm.assert_raises_regex(ValueError, msg): | ||
index.join(index.to_timestamp()) | ||
# msg = 'can only call with other PeriodIndex-ed objects' | ||
# with tm.assert_raises_regex(ValueError, msg): | ||
# index.join(index.to_timestamp()) | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
index3 = period_range('1/1/2000', '1/20/2000', freq='2D') | ||
with pytest.raises(period.IncompatibleFrequency): | ||
|
Uh oh!
There was an error while loading. Please reload this page.