Skip to content

KeyError when comparing DataFrame with tz-aware DatetimeIndex on columns with DST change #19970

Closed
@Liam3851

Description

@Liam3851

Code Sample, a copy-pastable example if possible

dr = pd.date_range('20160101', '20161130', freq='4H', tz='America/New_York')
df = pd.DataFrame({'a':np.arange(len(dr)), 'b':np.arange(len(dr))}, index=dr)
dfnov_view = df.loc['2016-11']
drnov = pd.date_range('20161101', '20161130', freq='4H', tz='America/New_York')
dfnov = pd.DataFrame({'a':np.arange(len(drnov)), 'b':np.arange(len(drnov))}, index=drnov)

df == df # works
dfnov_view == dfnov_view # works
dfnov == dfnov # works

dfnov.T == dfnov.T # works
df.T == df.T # raises KeyError on master; works on 0.22
dfnov_view.T == dfnov_view.T # raises KeyError on master; works on 0.22

# KeyError stacktrace:
In [7]: dfnov_view.T == dfnov_view.T # raises KeyError
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
C:\projects\pandas-dk\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc()
    457         try:
--> 458             return self.mapping.get_item(val.value)
    459         except KeyError:

C:\projects\pandas-dk\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
    933
--> 934     cpdef get_item(self, int64_t val):
    935         cdef khiter_t k

C:\projects\pandas-dk\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
    939         else:
--> 940             raise KeyError(val)
    941

KeyError: 1478412000000000000

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
C:\projects\pandas-dk\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2635             try:
-> 2636                 return self._engine.get_loc(key)
   2637             except KeyError:

C:\projects\pandas-dk\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc()
    429
--> 430     cpdef get_loc(self, object val):
    431         if is_definitely_invalid_key(val):

C:\projects\pandas-dk\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc()
    459         except KeyError:
--> 460             raise KeyError(val)
    461         except AttributeError:

KeyError: Timestamp('2016-11-06 01:00:00-0500', tz='America/New_York')

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
C:\projects\pandas-dk\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc()
    457         try:
--> 458             return self.mapping.get_item(val.value)
    459         except KeyError:

C:\projects\pandas-dk\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
    933
--> 934     cpdef get_item(self, int64_t val):
    935         cdef khiter_t k

C:\projects\pandas-dk\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
    939         else:
--> 940             raise KeyError(val)
    941

KeyError: 1478412000000000000

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-7-495715640ebe> in <module>()
----> 1 dfnov_view.T == dfnov_view.T # raises KeyError

C:\projects\pandas-dk\pandas\core\ops.py in f(self, other)
   1558                 raise ValueError('Can only compare identically-labeled '
   1559                                  'DataFrame objects')
-> 1560             return self._compare_frame(other, func, str_rep)
   1561
   1562         elif isinstance(other, ABCSeries):

C:\projects\pandas-dk\pandas\core\frame.py in _compare_frame(self, other, func, str_rep)
   4032                 return {col: func(a[col], b[col]) for col in a.columns}
   4033
-> 4034             new_data = expressions.evaluate(_compare, str_rep, self, other)
   4035             return self._constructor(data=new_data, index=self.index,
   4036                                      columns=self.columns, copy=False)

C:\projects\pandas-dk\pandas\core\computation\expressions.py in evaluate(op, op_str, a, b, use_numexpr, **eval_kwargs)
    203     use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b)
    204     if use_numexpr:
--> 205         return _evaluate(op, op_str, a, b, **eval_kwargs)
    206     return _evaluate_standard(op, op_str, a, b)
    207

C:\projects\pandas-dk\pandas\core\computation\expressions.py in _evaluate_numexpr(op, op_str, a, b, truediv, reversed, **eval_kwargs)
    118
    119     if result is None:
--> 120         result = _evaluate_standard(op, op_str, a, b)
    121
    122     return result

C:\projects\pandas-dk\pandas\core\computation\expressions.py in _evaluate_standard(op, op_str, a, b, **eval_kwargs)
     63         _store_test_result(False)
     64     with np.errstate(all='ignore'):
---> 65         return op(a, b)
     66
     67

C:\projects\pandas-dk\pandas\core\frame.py in _compare(a, b)
   4030
   4031             def _compare(a, b):
-> 4032                 return {col: func(a[col], b[col]) for col in a.columns}
   4033
   4034             new_data = expressions.evaluate(_compare, str_rep, self, other)

C:\projects\pandas-dk\pandas\core\frame.py in <dictcomp>(.0)
   4030
   4031             def _compare(a, b):
-> 4032                 return {col: func(a[col], b[col]) for col in a.columns}
   4033
   4034             new_data = expressions.evaluate(_compare, str_rep, self, other)

C:\projects\pandas-dk\pandas\core\frame.py in __getitem__(self, key)
   2202             return self._getitem_multilevel(key)
   2203         else:
-> 2204             return self._getitem_column(key)
   2205
   2206     def _getitem_column(self, key):

C:\projects\pandas-dk\pandas\core\frame.py in _getitem_column(self, key)
   2209         # get column
   2210         if self.columns.is_unique:
-> 2211             return self._get_item_cache(key)
   2212
   2213         # duplicate columns & possible reduce dimensionality

C:\projects\pandas-dk\pandas\core\generic.py in _get_item_cache(self, item)
   2193         res = cache.get(item)
   2194         if res is None:
-> 2195             values = self._data.get(item)
   2196             res = self._box_item_values(item, values)
   2197             cache[item] = res

C:\projects\pandas-dk\pandas\core\internals.py in get(self, item, fastpath)
   4070
   4071             if not isna(item):
-> 4072                 loc = self.items.get_loc(item)
   4073             else:
   4074                 indexer = np.arange(len(self.items))[isna(self.items)]

C:\projects\pandas-dk\pandas\core\indexes\datetimes.py in get_loc(self, key, method, tolerance)
   1555             # needed to localize naive datetimes
   1556             key = Timestamp(key, tz=self.tz)
-> 1557             return Index.get_loc(self, key, method, tolerance)
   1558
   1559         if isinstance(key, time):

C:\projects\pandas-dk\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2636                 return self._engine.get_loc(key)
   2637             except KeyError:
-> 2638                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2639
   2640         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

C:\projects\pandas-dk\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc()
    428         return algos.is_monotonic_int64(values, timelike=True)
    429
--> 430     cpdef get_loc(self, object val):
    431         if is_definitely_invalid_key(val):
    432             raise TypeError

C:\projects\pandas-dk\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc()
    458             return self.mapping.get_item(val.value)
    459         except KeyError:
--> 460             raise KeyError(val)
    461         except AttributeError:
    462             pass

KeyError: Timestamp('2016-11-06 01:00:00-0500', tz='America/New_York')

Problem description

On current master, comparing a DataFrame that has a tz-aware DatetimeIndex as its columns can fail with a KeyError. Oddly, this appears to occur only when the DatetimeIndex is on the columns, not when it is on the index.

Based on the KeyError it appears that for some reason .loc is looking for the pre-DST change value when it should be looking at the post-DST value. Reproducing the bug requires at least 2 DST switches in the original DataFrame. DataFrames with just a single DST switch do not seem to exhibit the behavior unless they are views on a larger DataFrame with two switches.

This is new on master; the behavior does not occur in 0.22.

Expected Output

Output of pd.show_versions()

INSTALLED VERSIONS

commit: e3b87c1
python: 3.6.4.final.0
python-bits: 64
OS: Windows
OS-release: 7
machine: AMD64
processor: Intel64 Family 6 Model 62 Stepping 4, GenuineIntel
byteorder: little
LC_ALL: None
LANG: None
LOCALE: None.None

pandas: 0.23.0.dev0+422.ge3b87c1
pytest: 3.3.2
pip: 9.0.1
setuptools: 38.4.0
Cython: 0.27.3
numpy: 1.14.0
scipy: 1.0.0
pyarrow: 0.8.0
xarray: 0.10.0
IPython: 6.2.1
sphinx: 1.6.6
patsy: 0.5.0
dateutil: 2.6.1
pytz: 2017.3
blosc: None
bottleneck: 1.2.1
tables: 3.4.2
numexpr: 2.6.4
feather: 0.4.0
matplotlib: 2.1.2
openpyxl: 2.4.10
xlrd: 1.1.0
xlwt: 1.3.0
xlsxwriter: 1.0.2
lxml: 4.1.1
bs4: 4.6.0
html5lib: 1.0.1
sqlalchemy: 1.2.1
pymysql: 0.7.11.None
psycopg2: None
jinja2: 2.10
s3fs: 0.1.2
fastparquet: 0.1.4
pandas_gbq: None
pandas_datareader: None

Metadata

Metadata

Assignees

No one assigned

    Labels

    Regression (Functionality that used to work in a prior pandas version); Timezones (Timezone data dtype)

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions