From f3d6d2078e7b367f3500e7af1f362953de1d2eed Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sat, 14 Oct 2017 16:52:25 -0700 Subject: [PATCH 1/4] Separate conversion functions out from tslib --- pandas/_libs/tslib.pyx | 455 +------------------------- pandas/_libs/tslibs/conversion.pxd | 17 + pandas/_libs/tslibs/conversion.pyx | 500 +++++++++++++++++++++++++++++ setup.py | 2 + 4 files changed, 526 insertions(+), 448 deletions(-) create mode 100644 pandas/_libs/tslibs/conversion.pxd create mode 100644 pandas/_libs/tslibs/conversion.pyx diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index f58aaa0ce3234..1976e42650bc2 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -72,6 +72,9 @@ from pandas.compat import iteritems, callable import collections import warnings +import pytz +UTC = pytz.utc + # initialize numpy import_array() #import_ufunc() @@ -91,6 +94,10 @@ from tslibs.timezones cimport ( from tslibs.fields import ( get_date_name_field, get_start_end_field, get_date_field, build_field_sarray) +from tslibs.conversion cimport tz_convert_single, _TSObject, _localize_tso +from tslibs.conversion import ( + tz_localize_to_utc, tz_convert, + tz_convert_single) cdef inline object create_timestamp_from_ts( @@ -1501,16 +1508,6 @@ cdef class _NaT(_Timestamp): return NotImplemented -# lightweight C object to hold datetime & int64 pair -cdef class _TSObject: - cdef: - pandas_datetimestruct dts # pandas_datetimestruct - int64_t value # numpy dt64 - object tzinfo - - property value: - def __get__(self): - return self.value # helper to extract datetime and int64 from several different possibilities @@ -1749,64 +1746,6 @@ def _test_parse_iso8601(object ts): else: return Timestamp(obj.value) -cdef inline void _localize_tso(_TSObject obj, object tz): - """ - Take a TSObject in UTC and localizes to timezone tz. - """ - cdef: - ndarray[int64_t] trans, deltas - Py_ssize_t delta, posn - - if is_utc(tz): - obj.tzinfo = tz - elif is_tzlocal(tz): - pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts) - dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, obj.dts.hour, - obj.dts.min, obj.dts.sec, obj.dts.us, tz) - delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 - if obj.value != NPY_NAT: - pandas_datetime_to_datetimestruct(obj.value + delta, - PANDAS_FR_ns, &obj.dts) - else: - pandas_datetime_to_datetimestruct(obj.value, - PANDAS_FR_ns, &obj.dts) - obj.tzinfo = tz - else: - # Adjust datetime64 timestamp, recompute datetimestruct - trans, deltas, typ = get_dst_info(tz) - - pos = trans.searchsorted(obj.value, side='right') - 1 - - # static/pytz/dateutil specific code - if is_fixed_offset(tz): - # statictzinfo - if len(deltas) > 0 and obj.value != NPY_NAT: - pandas_datetime_to_datetimestruct(obj.value + deltas[0], - PANDAS_FR_ns, &obj.dts) - else: - pandas_datetime_to_datetimestruct( - obj.value, PANDAS_FR_ns, &obj.dts) - obj.tzinfo = tz - elif treat_tz_as_pytz(tz): - inf = tz._transition_info[pos] - if obj.value != NPY_NAT: - pandas_datetime_to_datetimestruct(obj.value + deltas[pos], - PANDAS_FR_ns, &obj.dts) - else: - pandas_datetime_to_datetimestruct(obj.value, - PANDAS_FR_ns, &obj.dts) - obj.tzinfo = tz._tzinfos[inf] - elif treat_tz_as_dateutil(tz): - if obj.value != NPY_NAT: - pandas_datetime_to_datetimestruct(obj.value + deltas[pos], - PANDAS_FR_ns, &obj.dts) - else: - pandas_datetime_to_datetimestruct(obj.value, - PANDAS_FR_ns, &obj.dts) - obj.tzinfo = tz - else: - obj.tzinfo = tz - cpdef inline object _localize_pydatetime(object dt, object tz): """ @@ -3510,386 +3449,6 @@ def i8_to_pydt(int64_t i8, object tzinfo = None): """ return Timestamp(i8) -#---------------------------------------------------------------------- -# time zone conversion helpers - -import pytz -UTC = pytz.utc - - -@cython.boundscheck(False) -@cython.wraparound(False) -def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): - """ - Convert the values (in i8) from timezone1 to timezone2 - - Parameters - ---------- - vals : int64 ndarray - tz1 : string / timezone object - tz2 : string / timezone object - - Returns - ------- - int64 ndarray of converted - """ - - cdef: - ndarray[int64_t] utc_dates, tt, result, trans, deltas - Py_ssize_t i, j, pos, n = len(vals) - ndarray[Py_ssize_t] posn - int64_t v, offset, delta - pandas_datetimestruct dts - - if len(vals) == 0: - return np.array([], dtype=np.int64) - - # Convert to UTC - if get_timezone(tz1) != 'UTC': - utc_dates = np.empty(n, dtype=np.int64) - if is_tzlocal(tz1): - for i in range(n): - v = vals[i] - if v == NPY_NAT: - utc_dates[i] = NPY_NAT - else: - pandas_datetime_to_datetimestruct(v, PANDAS_FR_ns, &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz1) - delta = (int(get_utcoffset(tz1, dt).total_seconds()) - * 1000000000) - utc_dates[i] = v - delta - else: - trans, deltas, typ = get_dst_info(tz1) - - # all-NaT - tt = vals[vals!=NPY_NAT] - if not len(tt): - return vals - - posn = trans.searchsorted(tt, side='right') - j = 0 - for i in range(n): - v = vals[i] - if v == NPY_NAT: - utc_dates[i] = NPY_NAT - else: - pos = posn[j] - 1 - j = j + 1 - if pos < 0: - raise ValueError('First time before start of DST info') - offset = deltas[pos] - utc_dates[i] = v - offset - else: - utc_dates = vals - - if get_timezone(tz2) == 'UTC': - return utc_dates - - result = np.zeros(n, dtype=np.int64) - if is_tzlocal(tz2): - for i in range(n): - v = utc_dates[i] - if v == NPY_NAT: - result[i] = NPY_NAT - else: - pandas_datetime_to_datetimestruct(v, PANDAS_FR_ns, &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz2) - delta = (int(get_utcoffset(tz2, dt).total_seconds()) - * 1000000000) - result[i] = v + delta - return result - - # Convert UTC to other timezone - trans, deltas, typ = get_dst_info(tz2) - - # use first non-NaT element - # if all-NaT, return all-NaT - if (result==NPY_NAT).all(): - return result - - # if all NaT, return all NaT - tt = utc_dates[utc_dates!=NPY_NAT] - if not len(tt): - return utc_dates - - posn = trans.searchsorted(tt, side='right') - - j = 0 - for i in range(n): - v = utc_dates[i] - if vals[i] == NPY_NAT: - result[i] = vals[i] - else: - pos = posn[j] - 1 - j = j + 1 - if pos < 0: - raise ValueError('First time before start of DST info') - offset = deltas[pos] - result[i] = v + offset - return result - - -cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2): - """ - Convert the val (in i8) from timezone1 to timezone2 - - This is a single timezone versoin of tz_convert - - Parameters - ---------- - val : int64 - tz1 : string / timezone object - tz2 : string / timezone object - - Returns - ------- - int64 converted - - """ - - cdef: - ndarray[int64_t] trans, deltas - Py_ssize_t pos - int64_t v, offset, utc_date - pandas_datetimestruct dts - - if val == NPY_NAT: - return val - - # Convert to UTC - if is_tzlocal(tz1): - pandas_datetime_to_datetimestruct(val, PANDAS_FR_ns, &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz1) - delta = int(get_utcoffset(tz1, dt).total_seconds()) * 1000000000 - utc_date = val - delta - elif get_timezone(tz1) != 'UTC': - trans, deltas, typ = get_dst_info(tz1) - pos = trans.searchsorted(val, side='right') - 1 - if pos < 0: - raise ValueError('First time before start of DST info') - offset = deltas[pos] - utc_date = val - offset - else: - utc_date = val - - if get_timezone(tz2) == 'UTC': - return utc_date - if is_tzlocal(tz2): - pandas_datetime_to_datetimestruct(val, PANDAS_FR_ns, &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz2) - delta = int(get_utcoffset(tz2, dt).total_seconds()) * 1000000000 - return utc_date + delta - - # Convert UTC to other timezone - trans, deltas, typ = get_dst_info(tz2) - - pos = trans.searchsorted(utc_date, side='right') - 1 - if pos < 0: - raise ValueError('First time before start of DST info') - - offset = deltas[pos] - return utc_date + offset - - -@cython.boundscheck(False) -@cython.wraparound(False) -def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, - object errors='raise'): - """ - Localize tzinfo-naive i8 to given time zone (using pytz). If - there are ambiguities in the values, raise AmbiguousTimeError. - - Returns - ------- - localized : DatetimeIndex - """ - cdef: - ndarray[int64_t] trans, deltas, idx_shifted - ndarray ambiguous_array - Py_ssize_t i, idx, pos, ntrans, n = len(vals) - int64_t *tdata - int64_t v, left, right - ndarray[int64_t] result, result_a, result_b, dst_hours - pandas_datetimestruct dts - bint infer_dst = False, is_dst = False, fill = False - bint is_coerce = errors == 'coerce', is_raise = errors == 'raise' - - # Vectorized version of DstTzInfo.localize - - assert is_coerce or is_raise - - if tz == UTC or tz is None: - return vals - - result = np.empty(n, dtype=np.int64) - - if is_tzlocal(tz): - for i in range(n): - v = vals[i] - pandas_datetime_to_datetimestruct(v, PANDAS_FR_ns, &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz) - delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 - result[i] = v - delta - return result - - if util.is_string_object(ambiguous): - if ambiguous == 'infer': - infer_dst = True - elif ambiguous == 'NaT': - fill = True - elif isinstance(ambiguous, bool): - is_dst = True - if ambiguous: - ambiguous_array = np.ones(len(vals), dtype=bool) - else: - ambiguous_array = np.zeros(len(vals), dtype=bool) - elif hasattr(ambiguous, '__iter__'): - is_dst = True - if len(ambiguous) != len(vals): - raise ValueError( - "Length of ambiguous bool-array must be the same size as vals") - ambiguous_array = np.asarray(ambiguous) - - trans, deltas, typ = get_dst_info(tz) - - tdata = trans.data - ntrans = len(trans) - - result_a = np.empty(n, dtype=np.int64) - result_b = np.empty(n, dtype=np.int64) - result_a.fill(NPY_NAT) - result_b.fill(NPY_NAT) - - # left side - idx_shifted = (np.maximum(0, trans.searchsorted( - vals - DAY_NS, side='right') - 1)).astype(np.int64) - - for i in range(n): - v = vals[i] - deltas[idx_shifted[i]] - pos = bisect_right_i8(tdata, v, ntrans) - 1 - - # timestamp falls to the left side of the DST transition - if v + deltas[pos] == vals[i]: - result_a[i] = v - - # right side - idx_shifted = (np.maximum(0, trans.searchsorted( - vals + DAY_NS, side='right') - 1)).astype(np.int64) - - for i in range(n): - v = vals[i] - deltas[idx_shifted[i]] - pos = bisect_right_i8(tdata, v, ntrans) - 1 - - # timestamp falls to the right side of the DST transition - if v + deltas[pos] == vals[i]: - result_b[i] = v - - if infer_dst: - dst_hours = np.empty(n, dtype=np.int64) - dst_hours.fill(NPY_NAT) - - # Get the ambiguous hours (given the above, these are the hours - # where result_a != result_b and neither of them are NAT) - both_nat = np.logical_and(result_a != NPY_NAT, result_b != NPY_NAT) - both_eq = result_a == result_b - trans_idx = np.squeeze(np.nonzero(np.logical_and(both_nat, ~both_eq))) - if trans_idx.size == 1: - stamp = Timestamp(vals[trans_idx]) - raise pytz.AmbiguousTimeError( - "Cannot infer dst time from %s as there " - "are no repeated times" % stamp) - # Split the array into contiguous chunks (where the difference between - # indices is 1). These are effectively dst transitions in different - # years which is useful for checking that there is not an ambiguous - # transition in an individual year. - if trans_idx.size > 0: - one_diff = np.where(np.diff(trans_idx) != 1)[0] +1 - trans_grp = np.array_split(trans_idx, one_diff) - - # Iterate through each day, if there are no hours where the - # delta is negative (indicates a repeat of hour) the switch - # cannot be inferred - for grp in trans_grp: - - delta = np.diff(result_a[grp]) - if grp.size == 1 or np.all(delta > 0): - stamp = Timestamp(vals[grp[0]]) - raise pytz.AmbiguousTimeError(stamp) - - # Find the index for the switch and pull from a for dst and b - # for standard - switch_idx = (delta <= 0).nonzero()[0] - if switch_idx.size > 1: - raise pytz.AmbiguousTimeError( - "There are %i dst switches when " - "there should only be 1." % switch_idx.size) - switch_idx = switch_idx[0] + 1 # Pull the only index and adjust - a_idx = grp[:switch_idx] - b_idx = grp[switch_idx:] - dst_hours[grp] = np.hstack((result_a[a_idx], result_b[b_idx])) - - for i in range(n): - left = result_a[i] - right = result_b[i] - if vals[i] == NPY_NAT: - result[i] = vals[i] - elif left != NPY_NAT and right != NPY_NAT: - if left == right: - result[i] = left - else: - if infer_dst and dst_hours[i] != NPY_NAT: - result[i] = dst_hours[i] - elif is_dst: - if ambiguous_array[i]: - result[i] = left - else: - result[i] = right - elif fill: - result[i] = NPY_NAT - else: - stamp = Timestamp(vals[i]) - raise pytz.AmbiguousTimeError( - "Cannot infer dst time from %r, try using the " - "'ambiguous' argument" % stamp) - elif left != NPY_NAT: - result[i] = left - elif right != NPY_NAT: - result[i] = right - else: - if is_coerce: - result[i] = NPY_NAT - else: - stamp = Timestamp(vals[i]) - raise pytz.NonExistentTimeError(stamp) - - return result - - -cdef inline bisect_right_i8(int64_t *data, int64_t val, Py_ssize_t n): - cdef Py_ssize_t pivot, left = 0, right = n - - # edge cases - if val > data[n - 1]: - return n - - if val < data[0]: - return 0 - - while left < right: - pivot = left + (right - left) // 2 - - if data[pivot] <= val: - left = pivot + 1 - else: - right = pivot - - return left - # Accessors #---------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd new file mode 100644 index 0000000000000..a042ee8949192 --- /dev/null +++ b/pandas/_libs/tslibs/conversion.pxd @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +# cython: profile=False + +from numpy cimport int64_t + +from datetime cimport pandas_datetimestruct + + +cdef class _TSObject: + cdef: + pandas_datetimestruct dts # pandas_datetimestruct + int64_t value # numpy dt64 + object tzinfo + +cdef void _localize_tso(_TSObject obj, object tz) + +cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx new file mode 100644 index 0000000000000..13dd2091f9725 --- /dev/null +++ b/pandas/_libs/tslibs/conversion.pyx @@ -0,0 +1,500 @@ +# -*- coding: utf-8 -*- +# cython: profile=False + +cimport cython +from cython cimport Py_ssize_t + +import numpy as np +cimport numpy as np +from numpy cimport int64_t, ndarray +np.import_array() + +import pytz + +from cpython.datetime cimport datetime + +from datetime cimport ( + PANDAS_FR_ns, + pandas_datetimestruct, + pandas_datetime_to_datetimestruct, + ) + +cimport util + +from timezones cimport ( + is_utc, is_tzlocal, is_fixed_offset, + treat_tz_as_dateutil, treat_tz_as_pytz, + get_utcoffset, get_dst_info, get_timezone) + +# ---------------------------------------------------------------------- +# Constants +cdef int64_t NPY_NAT = util.get_nat() + +cdef int64_t DAY_NS = 86400000000000LL + +UTC = pytz.UTC + + +# ---------------------------------------------------------------------- +# _TSObject Conversion + +# lightweight C object to hold datetime & int64 pair +cdef class _TSObject: + #cdef: + # pandas_datetimestruct dts # pandas_datetimestruct + # int64_t value # numpy dt64 + # object tzinfo + + property value: + def __get__(self): + return self.value + + +cdef inline void _localize_tso(_TSObject obj, object tz): + """ + Take a TSObject in UTC and localizes to timezone tz. + """ + cdef: + ndarray[int64_t] trans, deltas + Py_ssize_t delta, posn + + if is_utc(tz): + obj.tzinfo = tz + elif is_tzlocal(tz): + pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts) + dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, obj.dts.hour, + obj.dts.min, obj.dts.sec, obj.dts.us, tz) + delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 + if obj.value != NPY_NAT: + pandas_datetime_to_datetimestruct(obj.value + delta, + PANDAS_FR_ns, &obj.dts) + else: + pandas_datetime_to_datetimestruct(obj.value, + PANDAS_FR_ns, &obj.dts) + obj.tzinfo = tz + else: + # Adjust datetime64 timestamp, recompute datetimestruct + trans, deltas, typ = get_dst_info(tz) + + pos = trans.searchsorted(obj.value, side='right') - 1 + + # static/pytz/dateutil specific code + if is_fixed_offset(tz): + # statictzinfo + if len(deltas) > 0 and obj.value != NPY_NAT: + pandas_datetime_to_datetimestruct(obj.value + deltas[0], + PANDAS_FR_ns, &obj.dts) + else: + pandas_datetime_to_datetimestruct( + obj.value, PANDAS_FR_ns, &obj.dts) + obj.tzinfo = tz + elif treat_tz_as_pytz(tz): + inf = tz._transition_info[pos] + if obj.value != NPY_NAT: + pandas_datetime_to_datetimestruct(obj.value + deltas[pos], + PANDAS_FR_ns, &obj.dts) + else: + pandas_datetime_to_datetimestruct(obj.value, + PANDAS_FR_ns, &obj.dts) + obj.tzinfo = tz._tzinfos[inf] + elif treat_tz_as_dateutil(tz): + if obj.value != NPY_NAT: + pandas_datetime_to_datetimestruct(obj.value + deltas[pos], + PANDAS_FR_ns, &obj.dts) + else: + pandas_datetime_to_datetimestruct(obj.value, + PANDAS_FR_ns, &obj.dts) + obj.tzinfo = tz + else: + obj.tzinfo = tz + + +# ---------------------------------------------------------------------- +# Localization / Timezone Conversion + + +cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2): + """ + Convert the val (in i8) from timezone1 to timezone2 + + This is a single timezone versoin of tz_convert + + Parameters + ---------- + val : int64 + tz1 : string / timezone object + tz2 : string / timezone object + + Returns + ------- + int64 converted + + """ + + cdef: + ndarray[int64_t] trans, deltas + Py_ssize_t pos + int64_t v, offset, utc_date + pandas_datetimestruct dts + + if val == NPY_NAT: + return val + + # Convert to UTC + if is_tzlocal(tz1): + pandas_datetime_to_datetimestruct(val, PANDAS_FR_ns, &dts) + dt = datetime(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us, tz1) + delta = int(get_utcoffset(tz1, dt).total_seconds()) * 1000000000 + utc_date = val - delta + elif get_timezone(tz1) != 'UTC': + trans, deltas, typ = get_dst_info(tz1) + pos = trans.searchsorted(val, side='right') - 1 + if pos < 0: + raise ValueError('First time before start of DST info') + offset = deltas[pos] + utc_date = val - offset + else: + utc_date = val + + if get_timezone(tz2) == 'UTC': + return utc_date + if is_tzlocal(tz2): + pandas_datetime_to_datetimestruct(val, PANDAS_FR_ns, &dts) + dt = datetime(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us, tz2) + delta = int(get_utcoffset(tz2, dt).total_seconds()) * 1000000000 + return utc_date + delta + + # Convert UTC to other timezone + trans, deltas, typ = get_dst_info(tz2) + + pos = trans.searchsorted(utc_date, side='right') - 1 + if pos < 0: + raise ValueError('First time before start of DST info') + + offset = deltas[pos] + return utc_date + offset + + +@cython.boundscheck(False) +@cython.wraparound(False) +def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): + """ + Convert the values (in i8) from timezone1 to timezone2 + + Parameters + ---------- + vals : int64 ndarray + tz1 : string / timezone object + tz2 : string / timezone object + + Returns + ------- + int64 ndarray of converted + """ + + cdef: + ndarray[int64_t] utc_dates, tt, result, trans, deltas + Py_ssize_t i, j, pos, n = len(vals) + ndarray[Py_ssize_t] posn + int64_t v, offset, delta + pandas_datetimestruct dts + + if len(vals) == 0: + return np.array([], dtype=np.int64) + + # Convert to UTC + if get_timezone(tz1) != 'UTC': + utc_dates = np.empty(n, dtype=np.int64) + if is_tzlocal(tz1): + for i in range(n): + v = vals[i] + if v == NPY_NAT: + utc_dates[i] = NPY_NAT + else: + pandas_datetime_to_datetimestruct(v, PANDAS_FR_ns, &dts) + dt = datetime(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us, tz1) + delta = (int(get_utcoffset(tz1, dt).total_seconds()) + * 1000000000) + utc_dates[i] = v - delta + else: + trans, deltas, typ = get_dst_info(tz1) + + # all-NaT + tt = vals[vals!=NPY_NAT] + if not len(tt): + return vals + + posn = trans.searchsorted(tt, side='right') + j = 0 + for i in range(n): + v = vals[i] + if v == NPY_NAT: + utc_dates[i] = NPY_NAT + else: + pos = posn[j] - 1 + j = j + 1 + if pos < 0: + raise ValueError('First time before start of DST info') + offset = deltas[pos] + utc_dates[i] = v - offset + else: + utc_dates = vals + + if get_timezone(tz2) == 'UTC': + return utc_dates + + result = np.zeros(n, dtype=np.int64) + if is_tzlocal(tz2): + for i in range(n): + v = utc_dates[i] + if v == NPY_NAT: + result[i] = NPY_NAT + else: + pandas_datetime_to_datetimestruct(v, PANDAS_FR_ns, &dts) + dt = datetime(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us, tz2) + delta = (int(get_utcoffset(tz2, dt).total_seconds()) + * 1000000000) + result[i] = v + delta + return result + + # Convert UTC to other timezone + trans, deltas, typ = get_dst_info(tz2) + + # use first non-NaT element + # if all-NaT, return all-NaT + if (result==NPY_NAT).all(): + return result + + # if all NaT, return all NaT + tt = utc_dates[utc_dates!=NPY_NAT] + if not len(tt): + return utc_dates + + posn = trans.searchsorted(tt, side='right') + + j = 0 + for i in range(n): + v = utc_dates[i] + if vals[i] == NPY_NAT: + result[i] = vals[i] + else: + pos = posn[j] - 1 + j = j + 1 + if pos < 0: + raise ValueError('First time before start of DST info') + offset = deltas[pos] + result[i] = v + offset + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, + object errors='raise'): + """ + Localize tzinfo-naive i8 to given time zone (using pytz). If + there are ambiguities in the values, raise AmbiguousTimeError. + + Returns + ------- + localized : DatetimeIndex + """ + cdef: + ndarray[int64_t] trans, deltas, idx_shifted + ndarray ambiguous_array + Py_ssize_t i, idx, pos, ntrans, n = len(vals) + int64_t *tdata + int64_t v, left, right + ndarray[int64_t] result, result_a, result_b, dst_hours + pandas_datetimestruct dts + bint infer_dst = False, is_dst = False, fill = False + bint is_coerce = errors == 'coerce', is_raise = errors == 'raise' + + # Vectorized version of DstTzInfo.localize + + assert is_coerce or is_raise + + if tz == UTC or tz is None: + return vals + + result = np.empty(n, dtype=np.int64) + + if is_tzlocal(tz): + for i in range(n): + v = vals[i] + pandas_datetime_to_datetimestruct(v, PANDAS_FR_ns, &dts) + dt = datetime(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us, tz) + delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 + result[i] = v - delta + return result + + if util.is_string_object(ambiguous): + if ambiguous == 'infer': + infer_dst = True + elif ambiguous == 'NaT': + fill = True + elif isinstance(ambiguous, bool): + is_dst = True + if ambiguous: + ambiguous_array = np.ones(len(vals), dtype=bool) + else: + ambiguous_array = np.zeros(len(vals), dtype=bool) + elif hasattr(ambiguous, '__iter__'): + is_dst = True + if len(ambiguous) != len(vals): + raise ValueError( + "Length of ambiguous bool-array must be the same size as vals") + ambiguous_array = np.asarray(ambiguous) + + trans, deltas, typ = get_dst_info(tz) + + tdata = trans.data + ntrans = len(trans) + + result_a = np.empty(n, dtype=np.int64) + result_b = np.empty(n, dtype=np.int64) + result_a.fill(NPY_NAT) + result_b.fill(NPY_NAT) + + # left side + idx_shifted = (np.maximum(0, trans.searchsorted( + vals - DAY_NS, side='right') - 1)).astype(np.int64) + + for i in range(n): + v = vals[i] - deltas[idx_shifted[i]] + pos = bisect_right_i8(tdata, v, ntrans) - 1 + + # timestamp falls to the left side of the DST transition + if v + deltas[pos] == vals[i]: + result_a[i] = v + + # right side + idx_shifted = (np.maximum(0, trans.searchsorted( + vals + DAY_NS, side='right') - 1)).astype(np.int64) + + for i in range(n): + v = vals[i] - deltas[idx_shifted[i]] + pos = bisect_right_i8(tdata, v, ntrans) - 1 + + # timestamp falls to the right side of the DST transition + if v + deltas[pos] == vals[i]: + result_b[i] = v + + if infer_dst: + dst_hours = np.empty(n, dtype=np.int64) + dst_hours.fill(NPY_NAT) + + # Get the ambiguous hours (given the above, these are the hours + # where result_a != result_b and neither of them are NAT) + both_nat = np.logical_and(result_a != NPY_NAT, result_b != NPY_NAT) + both_eq = result_a == result_b + trans_idx = np.squeeze(np.nonzero(np.logical_and(both_nat, ~both_eq))) + if trans_idx.size == 1: + stamp = _render_tstamp(vals[trans_idx]) + raise pytz.AmbiguousTimeError( + "Cannot infer dst time from %s as there " + "are no repeated times" % stamp) + # Split the array into contiguous chunks (where the difference between + # indices is 1). These are effectively dst transitions in different + # years which is useful for checking that there is not an ambiguous + # transition in an individual year. + if trans_idx.size > 0: + one_diff = np.where(np.diff(trans_idx) != 1)[0] +1 + trans_grp = np.array_split(trans_idx, one_diff) + + # Iterate through each day, if there are no hours where the + # delta is negative (indicates a repeat of hour) the switch + # cannot be inferred + for grp in trans_grp: + + delta = np.diff(result_a[grp]) + if grp.size == 1 or np.all(delta > 0): + stamp = _render_tstamp(vals[grp[0]]) + raise pytz.AmbiguousTimeError(stamp) + + # Find the index for the switch and pull from a for dst and b + # for standard + switch_idx = (delta <= 0).nonzero()[0] + if switch_idx.size > 1: + raise pytz.AmbiguousTimeError( + "There are %i dst switches when " + "there should only be 1." % switch_idx.size) + switch_idx = switch_idx[0] + 1 # Pull the only index and adjust + a_idx = grp[:switch_idx] + b_idx = grp[switch_idx:] + dst_hours[grp] = np.hstack((result_a[a_idx], result_b[b_idx])) + + for i in range(n): + left = result_a[i] + right = result_b[i] + if vals[i] == NPY_NAT: + result[i] = vals[i] + elif left != NPY_NAT and right != NPY_NAT: + if left == right: + result[i] = left + else: + if infer_dst and dst_hours[i] != NPY_NAT: + result[i] = dst_hours[i] + elif is_dst: + if ambiguous_array[i]: + result[i] = left + else: + result[i] = right + elif fill: + result[i] = NPY_NAT + else: + stamp = _render_tstamp(vals[i]) + raise pytz.AmbiguousTimeError( + "Cannot infer dst time from %r, try using the " + "'ambiguous' argument" % stamp) + elif left != NPY_NAT: + result[i] = left + elif right != NPY_NAT: + result[i] = right + else: + if is_coerce: + result[i] = NPY_NAT + else: + stamp = _render_tstamp(vals[i]) + raise pytz.NonExistentTimeError(stamp) + + return result + + +cdef inline bisect_right_i8(int64_t *data, int64_t val, Py_ssize_t n): + cdef Py_ssize_t pivot, left = 0, right = n + + # edge cases + if val > data[n - 1]: + return n + + if val < data[0]: + return 0 + + while left < right: + pivot = left + (right - left) // 2 + + if data[pivot] <= val: + left = pivot + 1 + else: + right = pivot + + return left + + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef inline str _render_tstamp(int64_t val): + """ Helper function to equivalent to `str(Timestamp(val))` """ + stamp = np.int64(val).astype('datetime64[ns]') + # Render `stamp` as e.g. '2017-08-30 07:59:23.123456' + # as opposed to str(stamp) which would + # be '2017-08-30T07:59:23.123456789' + stamp = str(stamp).replace('T', ' ')[:26] + # Note: cython complains if we try to slice [:-3] + return stamp diff --git a/setup.py b/setup.py index 365d387dc54d6..6835bcc51bcee 100755 --- a/setup.py +++ b/setup.py @@ -343,6 +343,7 @@ class CheckSDist(sdist_class): 'pandas/_libs/parsers.pyx', 'pandas/_libs/tslibs/strptime.pyx', 'pandas/_libs/tslibs/timezones.pyx', + 'pandas/_libs/tslibs/conversion.pyx', 'pandas/_libs/tslibs/fields.pyx', 'pandas/_libs/tslibs/frequencies.pyx', 'pandas/_libs/tslibs/parsing.pyx', @@ -486,6 +487,7 @@ def pxd(name): 'depends': tseries_depends, 'sources': ['pandas/_libs/src/datetime/np_datetime.c', 'pandas/_libs/src/datetime/np_datetime_strings.c']}, + '_libs.tslibs.conversion': {'pyxfile': '_libs/tslibs/conversion'}, '_libs.tslibs.timezones': {'pyxfile': '_libs/tslibs/timezones'}, '_libs.tslibs.fields': {'pyxfile': '_libs/tslibs/fields', 'depends': tseries_depends, From 34e2bad08a1d456236c81263fcedf745c9275bbe Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sat, 14 Oct 2017 18:30:11 -0700 Subject: [PATCH 2/4] add tseries_depends --- pandas/_libs/tslibs/conversion.pyx | 3 +-- setup.py | 5 ++++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 13dd2091f9725..648eed2cf4d32 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -16,8 +16,7 @@ from cpython.datetime cimport datetime from datetime cimport ( PANDAS_FR_ns, pandas_datetimestruct, - pandas_datetime_to_datetimestruct, - ) + pandas_datetime_to_datetimestruct) cimport util diff --git a/setup.py b/setup.py index 6835bcc51bcee..6cff8b3332548 100755 --- a/setup.py +++ b/setup.py @@ -487,7 +487,10 @@ def pxd(name): 'depends': tseries_depends, 'sources': ['pandas/_libs/src/datetime/np_datetime.c', 'pandas/_libs/src/datetime/np_datetime_strings.c']}, - '_libs.tslibs.conversion': {'pyxfile': '_libs/tslibs/conversion'}, + '_libs.tslibs.conversion': {'pyxfile': '_libs/tslibs/conversion', + 'depends': tseries_depends, + 'sources': ['pandas/_libs/src/datetime/np_datetime.c', + 'pandas/_libs/src/datetime/np_datetime_strings.c']}, '_libs.tslibs.timezones': {'pyxfile': '_libs/tslibs/timezones'}, '_libs.tslibs.fields': {'pyxfile': '_libs/tslibs/fields', 'depends': tseries_depends, From dfd49796acb074173b4c85ad1c0af4553f765bfb Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 17 Oct 2017 16:50:53 -0700 Subject: [PATCH 3/4] whitespace fixup --- pandas/_libs/tslib.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 1976e42650bc2..6aefde73a5f65 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -1508,8 +1508,6 @@ cdef class _NaT(_Timestamp): return NotImplemented - - # helper to extract datetime and int64 from several different possibilities cdef convert_to_tsobject(object ts, object tz, object unit, bint dayfirst, bint yearfirst): From d3ffff70d3dee597c2230467a88e68570d6074a0 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sat, 28 Oct 2017 11:08:55 -0700 Subject: [PATCH 4/4] edits per reviewer suggestions --- pandas/_libs/tslibs/conversion.pyx | 15 +++++---------- setup.py | 18 +++++++++--------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 648eed2cf4d32..69204e2ca4e1a 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -468,6 +468,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, cdef inline bisect_right_i8(int64_t *data, int64_t val, Py_ssize_t n): cdef Py_ssize_t pivot, left = 0, right = n + assert n >= 1 + # edge cases if val > data[n - 1]: return n @@ -486,14 +488,7 @@ cdef inline bisect_right_i8(int64_t *data, int64_t val, Py_ssize_t n): return left -@cython.boundscheck(False) -@cython.wraparound(False) cdef inline str _render_tstamp(int64_t val): - """ Helper function to equivalent to `str(Timestamp(val))` """ - stamp = np.int64(val).astype('datetime64[ns]') - # Render `stamp` as e.g. '2017-08-30 07:59:23.123456' - # as opposed to str(stamp) which would - # be '2017-08-30T07:59:23.123456789' - stamp = str(stamp).replace('T', ' ')[:26] - # Note: cython complains if we try to slice [:-3] - return stamp + """ Helper function to render exception messages""" + from pandas._libs.tslib import Timestamp + return str(Timestamp(val)) diff --git a/setup.py b/setup.py index 78bbeefab7379..c421eb4394e8d 100755 --- a/setup.py +++ b/setup.py @@ -473,8 +473,8 @@ def pxd(name): tseries_depends = ['pandas/_libs/src/datetime/np_datetime.h', 'pandas/_libs/src/datetime/np_datetime_strings.h', 'pandas/_libs/src/datetime.pxd'] -npdt_srces = ['pandas/_libs/src/datetime/np_datetime.c', - 'pandas/_libs/src/datetime/np_datetime_strings.c'] +np_datetime_sources = ['pandas/_libs/src/datetime/np_datetime.c', + 'pandas/_libs/src/datetime/np_datetime_strings.c'] # some linux distros require it libraries = ['m'] if not is_platform_windows() else [] @@ -489,30 +489,30 @@ def pxd(name): _pxi_dep['hashtable'])}, '_libs.tslibs.strptime': {'pyxfile': '_libs/tslibs/strptime', 'depends': tseries_depends, - 'sources': npdt_srces}, + 'sources': np_datetime_sources}, '_libs.tslib': {'pyxfile': '_libs/tslib', 'pxdfiles': ['_libs/src/util', '_libs/lib'], 'depends': tseries_depends, - 'sources': npdt_srces}, + 'sources': np_datetime_sources}, '_libs.tslibs.conversion': {'pyxfile': '_libs/tslibs/conversion', 'depends': tseries_depends, - 'sources': npdt_srces}, + 'sources': np_datetime_sources}, '_libs.tslibs.timedeltas': {'pyxfile': '_libs/tslibs/timedeltas'}, '_libs.tslibs.timezones': {'pyxfile': '_libs/tslibs/timezones'}, '_libs.tslibs.fields': {'pyxfile': '_libs/tslibs/fields', 'depends': tseries_depends, - 'sources': npdt_srces}, + 'sources': np_datetime_sources}, '_libs.period': {'pyxfile': '_libs/period', 'depends': (tseries_depends + ['pandas/_libs/src/period_helper.h']), - 'sources': npdt_srces + [ + 'sources': np_datetime_sources + [ 'pandas/_libs/src/period_helper.c']}, '_libs.tslibs.parsing': {'pyxfile': '_libs/tslibs/parsing', 'pxdfiles': ['_libs/src/util']}, '_libs.tslibs.frequencies': {'pyxfile': '_libs/tslibs/frequencies', 'pxdfiles': ['_libs/src/util']}, '_libs.index': {'pyxfile': '_libs/index', - 'sources': npdt_srces, + 'sources': np_datetime_sources, 'pxdfiles': ['_libs/src/util', '_libs/hashtable'], 'depends': _pxi_dep['index']}, '_libs.algos': {'pyxfile': '_libs/algos', @@ -625,7 +625,7 @@ def pxd(name): 'pandas/_libs/src/ujson/python/JSONtoObj.c', 'pandas/_libs/src/ujson/lib/ultrajsonenc.c', 'pandas/_libs/src/ujson/lib/ultrajsondec.c'] + - npdt_srces), + np_datetime_sources), include_dirs=(['pandas/_libs/src/ujson/python', 'pandas/_libs/src/ujson/lib', 'pandas/_libs/src/datetime'] +