diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index f65f7d57d5d08..69200d7142b9f 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -566,6 +566,7 @@ Performance Improvements - Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`) - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) - Improved performance of ``Index.difference`` (:issue:`12044`) +- Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`) .. _whatsnew_0190.bug_fixes: @@ -631,6 +632,7 @@ Bug Fixes - Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`) + - Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`) @@ -654,6 +656,8 @@ Bug Fixes - Bug in ``pd.to_numeric`` when ``errors='coerce'`` and input contains non-hashable objects (:issue:`13324`) - Bug in invalid ``Timedelta`` arithmetic and comparison may raise ``ValueError`` rather than ``TypeError`` (:issue:`13624`) +- Bug in invalid datetime parsing in ``to_datetime`` and ``DatetimeIndex`` may raise ``TypeError`` rather than ``ValueError`` (:issue:`11169`, :issue:`11287`) +- Bug in ``Index`` created with tz-aware ``Timestamp`` and mismatched ``tz`` option incorrectly coerces timezone (:issue:`13692`) - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) - Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 84ea2a92b8026..f6a84ea9debaa 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2440,7 +2440,7 @@ def converter(*date_cols): strs = _concat_date_cols(date_cols) try: - return tools._to_datetime( + 
return tools.to_datetime( _ensure_object(strs), utc=None, box=False, diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index af44767ae5be5..378e8c545ec83 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -170,16 +170,6 @@ def test_construction_index_with_mixed_timezones(self): self.assert_index_equal(result, exp, exact=True) self.assertFalse(isinstance(result, DatetimeIndex)) - # passing tz results in DatetimeIndex - result = Index([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - tz='Asia/Tokyo', name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01 19:00'), - Timestamp('2011-01-03 00:00')], - tz='Asia/Tokyo', name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - # length = 1 result = Index([Timestamp('2011-01-01')], name='idx') exp = DatetimeIndex([Timestamp('2011-01-01')], name='idx') @@ -253,17 +243,6 @@ def test_construction_index_with_mixed_timezones_with_NaT(self): self.assert_index_equal(result, exp, exact=True) self.assertFalse(isinstance(result, DatetimeIndex)) - # passing tz results in DatetimeIndex - result = Index([pd.NaT, Timestamp('2011-01-01 10:00'), - pd.NaT, Timestamp('2011-01-02 10:00', - tz='US/Eastern')], - tz='Asia/Tokyo', name='idx') - exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01 19:00'), - pd.NaT, Timestamp('2011-01-03 00:00')], - tz='Asia/Tokyo', name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - # all NaT result = Index([pd.NaT, pd.NaT], name='idx') exp = DatetimeIndex([pd.NaT, pd.NaT], name='idx') @@ -323,12 +302,13 @@ def test_construction_dti_with_mixed_timezones(self): self.assertTrue(isinstance(result, DatetimeIndex)) # tz mismatch affecting to tz-aware raises TypeError/ValueError + with tm.assertRaises(ValueError): DatetimeIndex([Timestamp('2011-01-01 10:00', 
tz='Asia/Tokyo'), Timestamp('2011-01-02 10:00', tz='US/Eastern')], name='idx') - with tm.assertRaises(TypeError): + with tm.assertRaisesRegexp(TypeError, 'data is already tz-aware'): DatetimeIndex([Timestamp('2011-01-01 10:00'), Timestamp('2011-01-02 10:00', tz='US/Eastern')], tz='Asia/Tokyo', name='idx') @@ -338,6 +318,13 @@ def test_construction_dti_with_mixed_timezones(self): Timestamp('2011-01-02 10:00', tz='US/Eastern')], tz='US/Eastern', name='idx') + with tm.assertRaisesRegexp(TypeError, 'data is already tz-aware'): + # passing tz should result in DatetimeIndex, then mismatch raises + # TypeError + Index([pd.NaT, Timestamp('2011-01-01 10:00'), + pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], + tz='Asia/Tokyo', name='idx') + def test_construction_base_constructor(self): arr = [pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')] tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 47bb69b8d7ad6..d448ca9878b99 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -292,55 +292,32 @@ def __new__(cls, data=None, raise ValueError('DatetimeIndex() must be called with a ' 'collection of some kind, %s was passed' % repr(data)) - # other iterable of some kind if not isinstance(data, (list, tuple)): data = list(data) - data = np.asarray(data, dtype='O') + elif isinstance(data, ABCSeries): + data = data._values - # try a few ways to make it datetime64 - if lib.is_string_array(data): - data = tslib.parse_str_array_to_datetime(data, freq=freq, - dayfirst=dayfirst, - yearfirst=yearfirst) - else: - data = tools.to_datetime(data, errors='raise') - data.offset = freq - if isinstance(data, DatetimeIndex): - if name is not None: - data.name = name - - if tz is not None: - - # we might already be localized to this tz - # so passing the same tz is ok - # however any other tz is a no-no - if data.tz is None: - return data.tz_localize(tz, ambiguous=ambiguous) - elif 
str(tz) != str(data.tz): - raise TypeError("Already tz-aware, use tz_convert " - "to convert.") - - return data._deepcopy_if_needed(ref_to_data, copy) - - if issubclass(data.dtype.type, compat.string_types): - data = tslib.parse_str_array_to_datetime(data, freq=freq, - dayfirst=dayfirst, - yearfirst=yearfirst) + # data must be Index or np.ndarray here + if not (is_datetime64_dtype(data) or is_datetimetz(data) or + is_integer_dtype(data)): + data = tools.to_datetime(data, dayfirst=dayfirst, + yearfirst=yearfirst) if issubclass(data.dtype.type, np.datetime64) or is_datetimetz(data): - if isinstance(data, ABCSeries): - data = data._values + if isinstance(data, DatetimeIndex): if tz is None: tz = data.tz - + elif data.tz is None: + data = data.tz_localize(tz, ambiguous=ambiguous) else: # the tz's must match if str(tz) != str(data.tz): - raise TypeError("Already tz-aware, use tz_convert " - "to convert.") + msg = ('data is already tz-aware {0}, unable to ' + 'set specified tz: {1}') + raise TypeError(msg.format(data.tz, tz)) subarr = data.values @@ -356,35 +333,6 @@ def __new__(cls, data=None, if isinstance(data, Int64Index): raise TypeError('cannot convert Int64Index->DatetimeIndex') subarr = data.view(_NS_DTYPE) - else: - if isinstance(data, (ABCSeries, Index)): - values = data._values - else: - values = data - - if lib.is_string_array(values): - subarr = tslib.parse_str_array_to_datetime( - values, freq=freq, dayfirst=dayfirst, yearfirst=yearfirst) - else: - try: - subarr = tools.to_datetime(data, box=False) - - # make sure that we have a index/ndarray like (and not a - # Series) - if isinstance(subarr, ABCSeries): - subarr = subarr._values - if subarr.dtype == np.object_: - subarr = tools._to_datetime(subarr, box=False) - - except ValueError: - # tz aware - subarr = tools._to_datetime(data, box=False, utc=True) - - # we may not have been able to convert - if not (is_datetimetz(subarr) or - np.issubdtype(subarr.dtype, np.datetime64)): - raise ValueError('Unable to 
convert %s to datetime dtype' - % str(data)) if isinstance(subarr, DatetimeIndex): if tz is None: @@ -399,27 +347,21 @@ def __new__(cls, data=None, ints = subarr.view('i8') subarr = tslib.tz_localize_to_utc(ints, tz, ambiguous=ambiguous) - subarr = subarr.view(_NS_DTYPE) subarr = cls._simple_new(subarr, name=name, freq=freq, tz=tz) - - # if dtype is provided, coerce here if dtype is not None: - if not is_dtype_equal(subarr.dtype, dtype): - + # dtype must be coerced to DatetimeTZDtype above if subarr.tz is not None: raise ValueError("cannot localize from non-UTC data") - dtype = DatetimeTZDtype.construct_from_string(dtype) - subarr = subarr.tz_localize(dtype.tz) if verify_integrity and len(subarr) > 0: if freq is not None and not freq_infer: inferred = subarr.inferred_freq if inferred != freq.freqstr: - on_freq = cls._generate(subarr[0], None, len( - subarr), None, freq, tz=tz, ambiguous=ambiguous) + on_freq = cls._generate(subarr[0], None, len(subarr), None, + freq, tz=tz, ambiguous=ambiguous) if not np.array_equal(subarr.asi8, on_freq.asi8): raise ValueError('Inferred frequency {0} from passed ' 'dates does not conform to passed ' @@ -563,7 +505,6 @@ def _generate(cls, start, end, periods, name, offset, index = index[1:] if not right_closed and len(index) and index[-1] == end: index = index[:-1] - index = cls._simple_new(index, name=name, freq=offset, tz=tz) return index @@ -669,7 +610,7 @@ def _cached_range(cls, start=None, end=None, periods=None, offset=None, xdr = generate_range(offset=offset, start=_CACHE_START, end=_CACHE_END) - arr = tools._to_datetime(list(xdr), box=False) + arr = tools.to_datetime(list(xdr), box=False) cachedRange = DatetimeIndex._simple_new(arr) cachedRange.offset = offset diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 8d6955ab43711..e493e9d936b02 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -1046,7 +1046,12 @@ def _get_binner_for_grouping(self, obj): l = [] for key, group in 
grouper.get_iterator(self.ax): l.extend([key] * len(group)) - grouper = binner.__class__(l, freq=binner.freq, name=binner.name) + + if isinstance(self.ax, PeriodIndex): + grouper = binner.__class__(l, freq=binner.freq, name=binner.name) + else: + # resampling causes duplicated values, specifying freq is invalid + grouper = binner.__class__(l, name=binner.name) # since we may have had to sort # may need to reorder groups here diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 299ec374567e7..59fc147ead4eb 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -4087,8 +4087,9 @@ def test_dti_set_index_reindex(self): # 11314 # with tz - index = date_range(datetime(2015, 10, 1), datetime( - 2015, 10, 1, 23), freq='H', tz='US/Eastern') + index = date_range(datetime(2015, 10, 1), + datetime(2015, 10, 1, 23), + freq='H', tz='US/Eastern') df = DataFrame(np.random.randn(24, 1), columns=['a'], index=index) new_index = date_range(datetime(2015, 10, 2), datetime(2015, 10, 2, 23), diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index f30f01e66cb0b..22bb3bddbc742 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -7,7 +7,8 @@ import datetime import pandas as pd -from pandas.core.api import Timestamp, Series, Timedelta, Period, to_datetime +from pandas.core.api import (Timestamp, Index, Series, Timedelta, Period, + to_datetime) from pandas.tslib import get_timezone from pandas._period import period_asfreq, period_ordinal from pandas.tseries.index import date_range, DatetimeIndex @@ -698,14 +699,19 @@ def test_parsers(self): yearfirst=yearfirst) result2 = to_datetime(date_str, yearfirst=yearfirst) result3 = to_datetime([date_str], yearfirst=yearfirst) + # result5 is used below result4 = to_datetime(np.array([date_str], dtype=object), yearfirst=yearfirst) - result6 = DatetimeIndex([date_str], 
yearfirst=yearfirst)[0] - self.assertEqual(result1, expected) - self.assertEqual(result2, expected) - self.assertEqual(result3, expected) - self.assertEqual(result4, expected) - self.assertEqual(result6, expected) + result6 = DatetimeIndex([date_str], yearfirst=yearfirst) + # result7 is used below + result8 = DatetimeIndex(Index([date_str]), yearfirst=yearfirst) + result9 = DatetimeIndex(Series([date_str]), yearfirst=yearfirst) + + for res in [result1, result2]: + self.assertEqual(res, expected) + for res in [result3, result4, result6, result8, result9]: + exp = DatetimeIndex([pd.Timestamp(expected)]) + tm.assert_index_equal(res, exp) # these really need to have yearfist, but we don't support if not yearfirst: @@ -893,9 +899,7 @@ def test_parsers_monthfreq(self): for date_str, expected in compat.iteritems(cases): result1, _, _ = tools.parse_time_string(date_str, freq='M') - result2 = tools._to_datetime(date_str, freq='M') self.assertEqual(result1, expected) - self.assertEqual(result2, expected) def test_parsers_quarterly_with_freq(self): msg = ('Incorrect quarterly string is given, quarter ' diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index 067e8ec19f644..93d35ff964e69 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -295,22 +295,12 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, 1 loop, best of 3: 471 ms per loop """ - return _to_datetime(arg, errors=errors, dayfirst=dayfirst, - yearfirst=yearfirst, - utc=utc, box=box, format=format, exact=exact, - unit=unit, infer_datetime_format=infer_datetime_format) - -def _to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, - utc=None, box=True, format=None, exact=True, - unit=None, freq=None, infer_datetime_format=False): - """ - Same as to_datetime, but accept freq for - DatetimeIndex internal construction - """ from pandas.tseries.index import DatetimeIndex - def _convert_listlike(arg, box, format, name=None): + tz = 'utc' if utc else None + + 
def _convert_listlike(arg, box, format, name=None, tz=tz): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') @@ -319,8 +309,7 @@ def _convert_listlike(arg, box, format, name=None): if is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: - return DatetimeIndex(arg, tz='utc' if utc else None, - name=name) + return DatetimeIndex(arg, tz=tz, name=name) except ValueError: pass @@ -328,7 +317,7 @@ def _convert_listlike(arg, box, format, name=None): elif is_datetime64tz_dtype(arg): if not isinstance(arg, DatetimeIndex): - return DatetimeIndex(arg, tz='utc' if utc else None) + return DatetimeIndex(arg, tz=tz, name=name) if utc: arg = arg.tz_convert(None).tz_localize('UTC') return arg @@ -344,8 +333,7 @@ def _convert_listlike(arg, box, format, name=None): from pandas import Index return Index(result) - return DatetimeIndex(result, tz='utc' if utc else None, - name=name) + return DatetimeIndex(result, tz=tz, name=name) return result elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, datetime, list, tuple, ' @@ -382,8 +370,8 @@ def _convert_listlike(arg, box, format, name=None): # fallback if result is None: try: - result = tslib.array_strptime( - arg, format, exact=exact, errors=errors) + result = tslib.array_strptime(arg, format, exact=exact, + errors=errors) except tslib.OutOfBoundsDatetime: if errors == 'raise': raise @@ -404,14 +392,11 @@ def _convert_listlike(arg, box, format, name=None): utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, - freq=freq, require_iso8601=require_iso8601 ) if is_datetime64_dtype(result) and box: - result = DatetimeIndex(result, - tz='utc' if utc else None, - name=name) + result = DatetimeIndex(result, tz=tz, name=name) return result except ValueError as e: diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 5624b84523705..016c49ea2b859 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -852,13 +852,6 @@ cdef inline bint _cmp_nat_dt(_NaT lhs, _Timestamp rhs, int op) 
except -1: return _nat_scalar_rules[op] -cdef _tz_format(object obj, object zone): - try: - return obj.strftime(' %%Z, tz=%s' % zone) - except: - return ', tz=%s' % zone - - cpdef object get_value_box(ndarray arr, object loc): cdef: Py_ssize_t i, sz @@ -1642,14 +1635,6 @@ cdef inline _check_dts_bounds(pandas_datetimestruct *dts): raise OutOfBoundsDatetime('Out of bounds nanosecond timestamp: %s' % fmt) -# elif isinstance(ts, _Timestamp): -# tmp = ts -# obj.value = (<_Timestamp> ts).value -# obj.dtval = -# elif isinstance(ts, object): -# # If all else fails -# obj.value = _dtlike_to_datetime64(ts, &obj.dts) -# obj.dtval = _dts_to_pydatetime(&obj.dts) def datetime_to_datetime64(ndarray[object] values): cdef: @@ -1689,7 +1674,7 @@ def datetime_to_datetime64(ndarray[object] values): cdef: set _not_datelike_strings = set(['a','A','m','M','p','P','t','T']) -cpdef object _does_string_look_like_datetime(object date_string): +cpdef bint _does_string_look_like_datetime(object date_string): if date_string.startswith('0'): # Strings starting with 0 are more consistent with a # date-like string than a number @@ -1827,8 +1812,14 @@ def parse_datetime_string(object date_string, object freq=None, except ValueError: pass - dt = parse_date(date_string, default=_DEFAULT_DATETIME, - dayfirst=dayfirst, yearfirst=yearfirst, **kwargs) + try: + dt = parse_date(date_string, default=_DEFAULT_DATETIME, + dayfirst=dayfirst, yearfirst=yearfirst, **kwargs) + except TypeError: + # following may be raised from dateutil + # TypeError: 'NoneType' object is not iterable + raise ValueError('Given date string not likely a datetime.') + return dt @@ -2214,7 +2205,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): cpdef array_to_datetime(ndarray[object] values, errors='raise', - dayfirst=False, yearfirst=False, freq=None, + dayfirst=False, yearfirst=False, format=None, utc=None, require_iso8601=False): cdef: @@ -2343,7 +2334,7 @@ cpdef array_to_datetime(ndarray[object] 
values, errors='raise', try: py_dt = parse_datetime_string(val, dayfirst=dayfirst, - yearfirst=yearfirst, freq=freq) + yearfirst=yearfirst) except Exception: if is_coerce: iresult[i] = NPY_NAT @@ -2423,7 +2414,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', try: oresult[i] = parse_datetime_string(val, dayfirst=dayfirst, - yearfirst=yearfirst, freq=freq) + yearfirst=yearfirst) _pydatetime_to_dts(oresult[i], &dts) _check_dts_bounds(&dts) except Exception: @@ -2438,28 +2429,6 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', return oresult -def parse_str_array_to_datetime(ndarray values, dayfirst=False, - yearfirst=False, object freq=None): - """Shortcut to parse str array for quicker DatetimeIndex construction""" - cdef: - Py_ssize_t i, n = len(values) - object val, py_dt - ndarray[int64_t] iresult - _TSObject _ts - - iresult = np.empty(n, dtype='i8') - - for i in range(n): - val = values[i] - try: - py_dt = parse_datetime_string(val, dayfirst=dayfirst, - yearfirst=yearfirst, freq=freq) - except Exception: - raise ValueError - _ts = convert_to_tsobject(py_dt, None, None, 0, 0) - iresult[i] = _ts.value - - return iresult # Similar to Timestamp/datetime, this is a construction requirement for timedeltas # we need to do object instantiation in python