Description
Code Sample, a copy-pastable example if possible
dr = pd.date_range('20160101', '20161130', freq='4H', tz='America/New_York')
df = pd.DataFrame({'a':np.arange(len(dr)), 'b':np.arange(len(dr))}, index=dr)
dfnov_view = df.loc['2016-11']
drnov = pd.date_range('20161101', '20161130', freq='4H', tz='America/New_York')
dfnov = pd.DataFrame({'a':np.arange(len(drnov)), 'b':np.arange(len(drnov))}, index=drnov)
df == df # works
dfnov_view == dfnov_view # works
dfnov == dfnov # works
dfnov.T == dfnov.T # works
df.T == df.T # raises KeyError on master; works on 0.22
dfnov_view.T == dfnov_view.T # raises KeyError on master; works on 0.22
# KeyError stacktrace:
In [7]: dfnov_view.T == dfnov_view.T # raises KeyError
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
C:\projects\pandas-dk\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc()
457 try:
--> 458 return self.mapping.get_item(val.value)
459 except KeyError:
C:\projects\pandas-dk\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
933
--> 934 cpdef get_item(self, int64_t val):
935 cdef khiter_t k
C:\projects\pandas-dk\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
939 else:
--> 940 raise KeyError(val)
941
KeyError: 1478412000000000000
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
C:\projects\pandas-dk\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2635 try:
-> 2636 return self._engine.get_loc(key)
2637 except KeyError:
C:\projects\pandas-dk\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc()
429
--> 430 cpdef get_loc(self, object val):
431 if is_definitely_invalid_key(val):
C:\projects\pandas-dk\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc()
459 except KeyError:
--> 460 raise KeyError(val)
461 except AttributeError:
KeyError: Timestamp('2016-11-06 01:00:00-0500', tz='America/New_York')
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
C:\projects\pandas-dk\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc()
457 try:
--> 458 return self.mapping.get_item(val.value)
459 except KeyError:
C:\projects\pandas-dk\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
933
--> 934 cpdef get_item(self, int64_t val):
935 cdef khiter_t k
C:\projects\pandas-dk\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
939 else:
--> 940 raise KeyError(val)
941
KeyError: 1478412000000000000
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-7-495715640ebe> in <module>()
----> 1 dfnov_view.T == dfnov_view.T # raises KeyError
C:\projects\pandas-dk\pandas\core\ops.py in f(self, other)
1558 raise ValueError('Can only compare identically-labeled '
1559 'DataFrame objects')
-> 1560 return self._compare_frame(other, func, str_rep)
1561
1562 elif isinstance(other, ABCSeries):
C:\projects\pandas-dk\pandas\core\frame.py in _compare_frame(self, other, func, str_rep)
4032 return {col: func(a[col], b[col]) for col in a.columns}
4033
-> 4034 new_data = expressions.evaluate(_compare, str_rep, self, other)
4035 return self._constructor(data=new_data, index=self.index,
4036 columns=self.columns, copy=False)
C:\projects\pandas-dk\pandas\core\computation\expressions.py in evaluate(op, op_str, a, b, use_numexpr, **eval_kwargs)
203 use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b)
204 if use_numexpr:
--> 205 return _evaluate(op, op_str, a, b, **eval_kwargs)
206 return _evaluate_standard(op, op_str, a, b)
207
C:\projects\pandas-dk\pandas\core\computation\expressions.py in _evaluate_numexpr(op, op_str, a, b, truediv, reversed, **eval_kwargs)
118
119 if result is None:
--> 120 result = _evaluate_standard(op, op_str, a, b)
121
122 return result
C:\projects\pandas-dk\pandas\core\computation\expressions.py in _evaluate_standard(op, op_str, a, b, **eval_kwargs)
63 _store_test_result(False)
64 with np.errstate(all='ignore'):
---> 65 return op(a, b)
66
67
C:\projects\pandas-dk\pandas\core\frame.py in _compare(a, b)
4030
4031 def _compare(a, b):
-> 4032 return {col: func(a[col], b[col]) for col in a.columns}
4033
4034 new_data = expressions.evaluate(_compare, str_rep, self, other)
C:\projects\pandas-dk\pandas\core\frame.py in <dictcomp>(.0)
4030
4031 def _compare(a, b):
-> 4032 return {col: func(a[col], b[col]) for col in a.columns}
4033
4034 new_data = expressions.evaluate(_compare, str_rep, self, other)
C:\projects\pandas-dk\pandas\core\frame.py in __getitem__(self, key)
2202 return self._getitem_multilevel(key)
2203 else:
-> 2204 return self._getitem_column(key)
2205
2206 def _getitem_column(self, key):
C:\projects\pandas-dk\pandas\core\frame.py in _getitem_column(self, key)
2209 # get column
2210 if self.columns.is_unique:
-> 2211 return self._get_item_cache(key)
2212
2213 # duplicate columns & possible reduce dimensionality
C:\projects\pandas-dk\pandas\core\generic.py in _get_item_cache(self, item)
2193 res = cache.get(item)
2194 if res is None:
-> 2195 values = self._data.get(item)
2196 res = self._box_item_values(item, values)
2197 cache[item] = res
C:\projects\pandas-dk\pandas\core\internals.py in get(self, item, fastpath)
4070
4071 if not isna(item):
-> 4072 loc = self.items.get_loc(item)
4073 else:
4074 indexer = np.arange(len(self.items))[isna(self.items)]
C:\projects\pandas-dk\pandas\core\indexes\datetimes.py in get_loc(self, key, method, tolerance)
1555 # needed to localize naive datetimes
1556 key = Timestamp(key, tz=self.tz)
-> 1557 return Index.get_loc(self, key, method, tolerance)
1558
1559 if isinstance(key, time):
C:\projects\pandas-dk\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2636 return self._engine.get_loc(key)
2637 except KeyError:
-> 2638 return self._engine.get_loc(self._maybe_cast_indexer(key))
2639
2640 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
C:\projects\pandas-dk\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc()
428 return algos.is_monotonic_int64(values, timelike=True)
429
--> 430 cpdef get_loc(self, object val):
431 if is_definitely_invalid_key(val):
432 raise TypeError
C:\projects\pandas-dk\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc()
458 return self.mapping.get_item(val.value)
459 except KeyError:
--> 460 raise KeyError(val)
461 except AttributeError:
462 pass
KeyError: Timestamp('2016-11-06 01:00:00-0500', tz='America/New_York')
Problem description
On current master, comparing a DataFrame that has a tz-aware DatetimeIndex as its columns can fail with a KeyError. Oddly, this appears to occur only when the DatetimeIndex is used as the columns, not when it is used as the index.
Based on the KeyError, it appears that for some reason .loc is looking for the pre-DST change value when it should be looking at the post-DST value. Reproducing the bug requires at least two DST switches in the original DataFrame. DataFrames with just a single DST switch do not seem to exhibit the behavior unless they are views on a larger DataFrame with two switches.
This is a regression on master; the behavior does not occur in 0.22.
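Expected Output
No exception is raised: as in 0.22, `df.T == df.T` and `dfnov_view.T == dfnov_view.T` each return an all-True boolean DataFrame.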
Output of pd.show_versions()
INSTALLED VERSIONS
commit: e3b87c1
python: 3.6.4.final.0
python-bits: 64
OS: Windows
OS-release: 7
machine: AMD64
processor: Intel64 Family 6 Model 62 Stepping 4, GenuineIntel
byteorder: little
LC_ALL: None
LANG: None
LOCALE: None.None
pandas: 0.23.0.dev0+422.ge3b87c1
pytest: 3.3.2
pip: 9.0.1
setuptools: 38.4.0
Cython: 0.27.3
numpy: 1.14.0
scipy: 1.0.0
pyarrow: 0.8.0
xarray: 0.10.0
IPython: 6.2.1
sphinx: 1.6.6
patsy: 0.5.0
dateutil: 2.6.1
pytz: 2017.3
blosc: None
bottleneck: 1.2.1
tables: 3.4.2
numexpr: 2.6.4
feather: 0.4.0
matplotlib: 2.1.2
openpyxl: 2.4.10
xlrd: 1.1.0
xlwt: 1.3.0
xlsxwriter: 1.0.2
lxml: 4.1.1
bs4: 4.6.0
html5lib: 1.0.1
sqlalchemy: 1.2.1
pymysql: 0.7.11.None
psycopg2: None
jinja2: 2.10
s3fs: 0.1.2
fastparquet: 0.1.4
pandas_gbq: None
pandas_datareader: None