diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index cfb697b3c357a..73e799f9e0a36 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -606,7 +606,7 @@ def _concat_same_type(cls, to_concat): def copy(self, deep=False): values = self.asi8.copy() - return type(self)(values, dtype=self.dtype, freq=self.freq) + return type(self)._simple_new(values, dtype=self.dtype, freq=self.freq) def _values_for_factorize(self): return self.asi8, iNaT diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index efa1757a989fc..2f7cd3768b6ab 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -50,6 +50,7 @@ def tz_to_dtype(tz): if tz is None: return _NS_DTYPE else: + tz = timezones.tz_standardize(tz) return DatetimeTZDtype(tz=tz) @@ -254,77 +255,53 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin, _dtype = None # type: Union[np.dtype, DatetimeTZDtype] _freq = None - def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): - if isinstance(values, (ABCSeries, ABCIndexClass)): - values = values._values - - if isinstance(values, type(self)): - # validation - dtz = getattr(dtype, 'tz', None) - if dtz and values.tz is None: - dtype = DatetimeTZDtype(tz=dtype.tz) - elif dtz and values.tz: - if not timezones.tz_compare(dtz, values.tz): - msg = ( - "Timezone of the array and 'dtype' do not match. " - "'{}' != '{}'" - ) - raise TypeError(msg.format(dtz, values.tz)) - elif values.tz: - dtype = values.dtype - # freq = validate_values_freq(values, freq) - if freq is None: - freq = values.freq - values = values._data + def __init__(self, values, dtype=None, freq=None, copy=False): + if freq == "infer": + raise ValueError( + "Frequency inference not allowed in DatetimeArray.__init__. " + "Use 'pd.array()' instead.") - if not isinstance(values, np.ndarray): - msg = ( - "Unexpected type '{}'. 'values' must be a DatetimeArray " + if not hasattr(values, "dtype"): + # e.g. list + raise ValueError( + "Unexpected type '{vals}'. 'values' must be a DatetimeArray " "ndarray, or Series or Index containing one of those." - ) - raise ValueError(msg.format(type(values).__name__)) - - if values.dtype == 'i8': - # for compat with datetime/timedelta/period shared methods, - # we can sometimes get here with int64 values. These represent - # nanosecond UTC (or tz-naive) unix timestamps - values = values.view(_NS_DTYPE) - - if values.dtype != _NS_DTYPE: - msg = ( + .format(vals=type(values).__name__)) + + if is_datetime64_dtype(values.dtype) and hasattr(dtype, "tz"): + # cast to make _from_sequence treat as unix instead of wall-times; + # see GH#24559 + values = type(self)._simple_new( + np.asarray(values), + freq=getattr(values, "freq", None), + dtype=tz_to_dtype(utc)).tz_convert(dtype.tz) + + elif not (is_datetime64tz_dtype(values.dtype) or + is_datetime64_dtype(values.dtype) or + values.dtype == 'i8'): + raise ValueError( "The dtype of 'values' is incorrect. Must be 'datetime64[ns]'." - " Got {} instead." - ) - raise ValueError(msg.format(values.dtype)) + " Got {dtype} instead." .format(dtype=values.dtype)) - dtype = _validate_dt64_dtype(dtype) - - if freq == "infer": - msg = ( - "Frequency inference not allowed in DatetimeArray.__init__. " - "Use 'pd.array()' instead." - ) - raise ValueError(msg) - - if copy: - values = values.copy() - if freq: - freq = to_offset(freq) - if getattr(dtype, 'tz', None): - # https://github.com/pandas-dev/pandas/issues/18595 - # Ensure that we have a standard timezone for pytz objects. - # Without this, things like adding an array of timedeltas and - # a tz-aware Timestamp (with a tz specific to its datetime) will - # be incorrect(ish?) for the array as a whole - dtype = DatetimeTZDtype(tz=timezones.tz_standardize(dtype.tz)) - - self._data = values - self._dtype = dtype - self._freq = freq + arr = type(self)._from_sequence(values, dtype=dtype, + freq=freq, copy=copy) + self._data = arr._data + self._freq = arr._freq + self._dtype = arr._dtype @classmethod - def _simple_new(cls, values, freq=None, dtype=None): - return cls(values, freq=freq, dtype=dtype) + def _simple_new(cls, values, freq=None, dtype=_NS_DTYPE): + """ + we require the we have a dtype compat for the values + if we are passed a non-dtype compat, then coerce using the constructor + """ + assert isinstance(values, np.ndarray), type(values) + + result = object.__new__(cls) + result._data = values.view('datetime64[ns]') + result._freq = freq + result._dtype = dtype + return result @classmethod def _from_sequence(cls, data, dtype=None, copy=False, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7bbbdd70e062e..6302e31510d2f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4377,7 +4377,8 @@ def _maybe_casted_values(index, labels=None): values, mask, np.nan) if issubclass(values_type, DatetimeLikeArray): - values = values_type(values, dtype=values_dtype) + values = values_type._simple_new(values, + dtype=values_dtype) return values diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 664ca9c5d2f05..aa61632441906 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -314,18 +314,20 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None): if isinstance(values, DatetimeArray): if tz: tz = validate_tz_from_dtype(dtype, tz) + tz = timezones.tz_standardize(tz) dtype = DatetimeTZDtype(tz=tz) elif dtype is None: - dtype = _NS_DTYPE + dtype = values.dtype values = DatetimeArray(values, freq=freq, dtype=dtype) tz = values.tz freq = values.freq values = values._data + else: + tz = tz or getattr(dtype, 'tz', None) # DatetimeArray._simple_new will accept either i8 or M8[ns] dtypes - if isinstance(values, DatetimeIndex): - values = values._data + assert isinstance(values, np.ndarray) dtype = tz_to_dtype(tz) dtarr = DatetimeArray._simple_new(values, freq=freq, dtype=dtype) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 70e4f44cb5de8..dcc28ef9dec1d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -3078,7 +3078,7 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None, elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values): # TODO: This is no longer hit internally; does it need to be retained # for e.g. pyarrow? - values = DatetimeArray(values, dtype) + values = DatetimeArray(values.view('i8'), dtype) return klass(values, ndim=ndim, placement=placement) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index b83eab7d0eba0..c2a495cf6eaf2 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -656,7 +656,7 @@ def create_block(b): if is_datetime64tz_dtype(b[u'dtype']): assert isinstance(values, np.ndarray), type(values) assert values.dtype == 'M8[ns]', values.dtype - values = DatetimeArray(values, dtype=b[u'dtype']) + values = DatetimeArray(values.view('i8'), dtype=b[u'dtype']) return make_block(values=values, klass=getattr(internals, b[u'klass']), diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index f234e4fadec61..997e664d484d0 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -340,7 +340,7 @@ def test_from_array_keeps_base(self): arr = np.array(['2000-01-01', '2000-01-02'], dtype='M8[ns]') dta = DatetimeArray(arr) - assert dta._data is arr + assert dta._data.base is arr dta = DatetimeArray(arr[:0]) assert dta._data.base is arr diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 8228ed7652fea..32cafad584d01 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -16,6 +16,69 @@ class TestDatetimeArrayConstructor(object): + + @pytest.mark.parametrize('tz', [None, 'Asia/Singapore']) + def test_constructor_equivalence(self, tz): + # GH#24623 check that DatetimeArray.__init__ behavior matches: + # Timestamp.__new__ for int64 + # DatetimeArray._from_sequence for int64, datetime64[ns] + # DatetimeArray._simple_new for int64 + # DatetimeIndex.__new__ for int64, datetime64[ns] + # DatetimeIndex._simple_new for int64, datetime64[ns] + # + # and that DatetimeArray._simple_new behaves like + # DatetimeIndex._simple_new for both int64 and datetime64[ns] inputs + arr = np.random.randint(-10**9, 10**9, size=5, dtype=np.int64) + dt64arr = arr.view('datetime64[ns]') + dti = pd.date_range('1960-01-01', periods=1, tz=tz) + + v1 = DatetimeArray._simple_new(arr.view('i8'), dtype=dti.dtype) + v2 = DatetimeArray(arr.view('i8'), dtype=dti.dtype) + v3 = DatetimeArray._from_sequence(arr.view('i8'), dtype=dti.dtype) + v4 = pd.DatetimeIndex._simple_new(arr.view('i8'), tz=dti.tz) + v5 = pd.DatetimeIndex(arr.view('i8'), tz=dti.tz) + v6 = pd.to_datetime(arr, utc=True).tz_convert(dti.tz) + + # when dealing with _simple_new, i8 and M8[ns] are interchangeable + v7 = DatetimeArray._simple_new(arr.view('M8[ns]'), dtype=dti.dtype) + v8 = pd.DatetimeIndex._simple_new(arr.view('M8[ns]'), dtype=dti.dtype) + + # GH#24623 DatetimeArray.__init__ treats M8[ns] as unix timestamps, + # unlike DatetimeIndex.__new__. + v9 = DatetimeArray(dt64arr, dtype=dti.dtype) + + tm.assert_datetime_array_equal(v1, v2) + tm.assert_datetime_array_equal(v1, v3) + tm.assert_datetime_array_equal(v1, v4._data) + tm.assert_datetime_array_equal(v1, v5._data) + tm.assert_datetime_array_equal(v1, v6._data) + tm.assert_datetime_array_equal(v1, v7) + tm.assert_datetime_array_equal(v1, v8._data) + tm.assert_datetime_array_equal(v1, v9) + + expected = [pd.Timestamp(i8, tz=dti.tz) for i8 in arr] + assert list(v1) == expected + + # The guarantees for datetime64 data are fewer + v1 = DatetimeArray._from_sequence(dt64arr, dtype=dti.dtype) + v2 = DatetimeArray._from_sequence(dt64arr, tz=dti.tz) + v3 = pd.DatetimeIndex(dt64arr, dtype=dti.dtype) + v4 = pd.DatetimeIndex(dt64arr, tz=dti.tz) + + tm.assert_datetime_array_equal(v1, v2) + tm.assert_datetime_array_equal(v1, v3._data) + tm.assert_datetime_array_equal(v1, v4._data) + + def test_freq_validation(self): + # GH#24623 check that invalid instances cannot be created with the + # public constructor + arr = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10**9 + + msg = ("Inferred frequency H from passed values does not " + "conform to passed frequency W-SUN") + with pytest.raises(ValueError, match=msg): + DatetimeArray(arr, freq="W") + @pytest.mark.parametrize('meth', [DatetimeArray._from_sequence, sequence_to_dt64ns, pd.to_datetime, @@ -35,6 +98,7 @@ def test_mixing_naive_tzaware_raises(self, meth): meth(obj) def test_from_pandas_array(self): + # GH#24623, GH#24615 arr = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10**9 result = DatetimeArray._from_sequence(arr, freq='infer') @@ -46,7 +110,8 @@ def test_mismatched_timezone_raises(self): arr = DatetimeArray(np.array(['2000-01-01T06:00:00'], dtype='M8[ns]'), dtype=DatetimeTZDtype(tz='US/Central')) dtype = DatetimeTZDtype(tz='US/Eastern') - with pytest.raises(TypeError, match='Timezone of the array'): + with pytest.raises(TypeError, + match='data is already tz-aware US/Central'): DatetimeArray(arr, dtype=dtype) def test_non_array_raises(self): @@ -69,10 +134,11 @@ def test_freq_infer_raises(self): def test_copy(self): data = np.array([1, 2, 3], dtype='M8[ns]') arr = DatetimeArray(data, copy=False) - assert arr._data is data + assert arr._data.base is data arr = DatetimeArray(data, copy=True) assert arr._data is not data + assert arr._data.base is not data class TestDatetimeArrayComparisons(object): diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 657f5f193c85e..f3e78a3157399 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1255,7 +1255,7 @@ def test_array(array, attr, box): array = getattr(array, attr) result = getattr(result, attr) - assert result is array + assert result is array or result.base is array.base def test_array_multiindex_raises():