From 2bf9fb510082bda7c460e51dd8d9490f5ed0d172 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 8 Dec 2015 07:02:44 -0500 Subject: [PATCH 1/2] BUG: bug in .copy of datetime tz-aware objects, #11794 Not always deep-copying the underlying impl, which is a DatetimeIndex where shallow copies are views --- doc/source/whatsnew/v0.18.0.txt | 2 +- pandas/core/dtypes.py | 3 +++ pandas/core/internals.py | 36 ++++++++++++++++++--------------- pandas/tests/test_internals.py | 36 +++++++++++++++++++++++---------- pandas/tests/test_series.py | 23 +++++++++++++++++---- 5 files changed, 68 insertions(+), 32 deletions(-) diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index c1b7ff82f4c76..94ca376df8436 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -170,7 +170,7 @@ Bug Fixes - Bug in ``Timedelta.round`` with negative values (:issue:`11690`) - Bug in ``.loc`` against ``CategoricalIndex`` may result in normal ``Index`` (:issue:`11586`) - Bug in ``DataFrame.info`` when duplicated column names exist (:issue:`11761`) - +- Bug in ``.copy`` of datetime tz-aware objects (:issue:`11794`) diff --git a/pandas/core/dtypes.py b/pandas/core/dtypes.py index 0b13471aadcfb..69957299aa9bb 100644 --- a/pandas/core/dtypes.py +++ b/pandas/core/dtypes.py @@ -65,6 +65,9 @@ def __hash__(self): def __eq__(self, other): raise NotImplementedError("sub-classes should implement an __eq__ method") + def __ne__(self, other): + return not self.__eq__(other) + @classmethod def is_dtype(cls, dtype): """ Return a boolean if we if the passed type is an actual dtype that we can match (via string or type) """ diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 28c845f63f9d4..cb8d0986489cc 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -168,17 +168,11 @@ def make_block(self, values, placement=None, ndim=None, **kwargs): return make_block(values, placement=placement, ndim=ndim, **kwargs) - def make_block_same_class(self, values, placement, copy=False, fastpath=True, - **kwargs): - """ - Wrap given values in a block of same type as self. - - `kwargs` are used in SparseBlock override. - - """ - if copy: - values = values.copy() - return make_block(values, placement, klass=self.__class__, + def make_block_same_class(self, values, placement=None, fastpath=True, **kwargs): + """ Wrap given values in a block of same type as self. """ + if placement is None: + placement = self.mgr_locs + return make_block(values, placement=placement, klass=self.__class__, fastpath=fastpath, **kwargs) @mgr_locs.setter @@ -573,12 +567,11 @@ def to_native_types(self, slicer=None, na_rep='nan', quoting=None, **kwargs): # block actions #### def copy(self, deep=True, mgr=None): + """ copy constructor """ values = self.values if deep: values = values.copy() - return self.make_block(values, - klass=self.__class__, - fastpath=True) + return self.make_block_same_class(values) def replace(self, to_replace, value, inplace=False, filter=None, regex=False, convert=True, mgr=None): @@ -2140,6 +2133,13 @@ def __init__(self, values, placement, ndim=2, placement=placement, ndim=ndim, **kwargs) + def copy(self, deep=True, mgr=None): + """ copy constructor """ + values = self.values + if deep: + values = values.copy(deep=True) + return self.make_block_same_class(values) + def external_values(self): """ we internally represent the data as a DatetimeIndex, but for external compat with ndarray, export as a ndarray of Timestamps """ @@ -3257,10 +3257,14 @@ def get_scalar(self, tup): full_loc = list(ax.get_loc(x) for ax, x in zip(self.axes, tup)) blk = self.blocks[self._blknos[full_loc[0]]] - full_loc[0] = self._blklocs[full_loc[0]] + values = blk.values # FIXME: this may return non-upcasted types? - return blk.values[tuple(full_loc)] + if values.ndim == 1: + return values[full_loc[1]] + + full_loc[0] = self._blklocs[full_loc[0]] + return values[tuple(full_loc)] def delete(self, item): """ diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index fbab0d2a92203..0f46c1106ed08 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -147,6 +147,8 @@ def create_mgr(descr, item_shape=None): block_placements = OrderedDict() for d in descr.split(';'): d = d.strip() + if not len(d): + continue names, blockstr = d.partition(':')[::2] blockstr = blockstr.strip() names = names.strip().split(',') @@ -324,7 +326,8 @@ class TestBlockManager(tm.TestCase): def setUp(self): self.mgr = create_mgr('a: f8; b: object; c: f8; d: object; e: f8;' - 'f: bool; g: i8; h: complex') + 'f: bool; g: i8; h: complex; i: datetime-1; j: datetime-2;' + 'k: M8[ns, US/Eastern]; l: M8[ns, CET];') def test_constructor_corner(self): pass @@ -476,16 +479,24 @@ def test_set_change_dtype_slice(self): # GH8850 DataFrame([[3], [6]], columns=cols[2:])) def test_copy(self): - shallow = self.mgr.copy(deep=False) - - # we don't guaranteee block ordering - for blk in self.mgr.blocks: - found = False - for cp_blk in shallow.blocks: - if cp_blk.values is blk.values: - found = True - break - self.assertTrue(found) + cp = self.mgr.copy(deep=False) + for blk, cp_blk in zip(self.mgr.blocks, cp.blocks): + + # view assertion + self.assertTrue(cp_blk.equals(blk)) + self.assertTrue(cp_blk.values.base is blk.values.base) + + cp = self.mgr.copy(deep=True) + for blk, cp_blk in zip(self.mgr.blocks, cp.blocks): + + # copy assertion + # we either have a None for a base or in case of some blocks it is an array (e.g. datetimetz), + # but was copied + self.assertTrue(cp_blk.equals(blk)) + if cp_blk.values.base is not None and blk.values.base is not None: + self.assertFalse(cp_blk.values.base is blk.values.base) + else: + self.assertTrue(cp_blk.values.base is None and blk.values.base is None) def test_sparse(self): mgr = create_mgr('a: sparse-1; b: sparse-2') @@ -688,7 +699,10 @@ def test_consolidate_ordering_issues(self): self.mgr.set('g', randn(N)) self.mgr.set('h', randn(N)) + # we have datetime/tz blocks in self.mgr cons = self.mgr.consolidate() + self.assertEqual(cons.nblocks, 4) + cons = self.mgr.consolidate().get_numeric_data() self.assertEqual(cons.nblocks, 1) assert_almost_equal(cons.blocks[0].mgr_locs, np.arange(len(cons.items))) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 0fb66ee2dfa7c..e49aef2778c1e 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -5111,12 +5111,27 @@ def test_cov(self): self.assertTrue(isnull(ts1.cov(ts2, min_periods=12))) def test_copy(self): - ts = self.ts.copy() - ts[::2] = np.NaN + for deep in [False, True]: + s = Series(np.arange(10),dtype='float64') + s2 = s.copy(deep=deep) + s2[::2] = np.NaN + + # Did not modify original Series + self.assertTrue(np.isnan(s2[0])) + self.assertFalse(np.isnan(s[0])) - # Did not modify original Series - self.assertFalse(np.isnan(self.ts[0])) + # GH 11794 + # copy of tz-aware + expected = Series([Timestamp('2012/01/01', tz='UTC')]) + expected2 = Series([Timestamp('1999/01/01', tz='UTC')]) + + for deep in [False, True]: + s = Series([Timestamp('2012/01/01', tz='UTC')]) + s2 = s.copy() + s2[0] = pd.Timestamp('1999/01/01', tz='UTC') + assert_series_equal(s, expected) + assert_series_equal(s2, expected2) def test_count(self): self.assertEqual(self.ts.count(), len(self.ts)) From d526a4f7d9e87190c7ad4a84ff0cf34861c7ca58 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 8 Dec 2015 08:30:34 -0500 Subject: [PATCH 2/2] COMPAT: avoid warnings from numeric/string-like comparisons --- pandas/core/common.py | 23 ++++++++++++++++----- pandas/core/internals.py | 13 +++++++----- pandas/tests/test_series.py | 41 +++++++++++++++++++++++++++++-------- 3 files changed, 58 insertions(+), 19 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index b3a42335e14da..e81b58a3f7eef 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -232,7 +232,7 @@ def _isnull_ndarraylike(obj): values = getattr(obj, 'values', obj) dtype = values.dtype - if dtype.kind in ('O', 'S', 'U'): + if is_string_dtype(dtype): if is_categorical_dtype(values): from pandas import Categorical if not isinstance(values, Categorical): @@ -243,7 +243,7 @@ def _isnull_ndarraylike(obj): # Working around NumPy ticket 1542 shape = values.shape - if dtype.kind in ('S', 'U'): + if is_string_like_dtype(dtype): result = np.zeros(values.shape, dtype=bool) else: result = np.empty(shape, dtype=bool) @@ -267,11 +267,11 @@ def _isnull_ndarraylike_old(obj): values = getattr(obj, 'values', obj) dtype = values.dtype - if dtype.kind in ('O', 'S', 'U'): + if is_string_dtype(dtype): # Working around NumPy ticket 1542 shape = values.shape - if values.dtype.kind in ('S', 'U'): + if is_string_like_dtype(dtype): result = np.zeros(values.shape, dtype=bool) else: result = np.empty(shape, dtype=bool) @@ -2208,13 +2208,17 @@ def is_numeric_v_string_like(a, b): is_a_numeric_array = is_a_array and is_numeric_dtype(a) is_b_numeric_array = is_b_array and is_numeric_dtype(b) + is_a_string_array = is_a_array and is_string_like_dtype(a) + is_b_string_array = is_b_array and is_string_like_dtype(b) is_a_scalar_string_like = not is_a_array and is_string_like(a) is_b_scalar_string_like = not is_b_array and is_string_like(b) return ( is_a_numeric_array and is_b_scalar_string_like) or ( - is_b_numeric_array and is_a_scalar_string_like + is_b_numeric_array and is_a_scalar_string_like) or ( + is_a_numeric_array and is_b_string_array) or ( + is_b_numeric_array and is_a_string_array ) def is_datetimelike_v_numeric(a, b): @@ -2257,6 +2261,15 @@ def is_numeric_dtype(arr_or_dtype): and not issubclass(tipo, (np.datetime64, np.timedelta64))) +def is_string_dtype(arr_or_dtype): + dtype = _get_dtype(arr_or_dtype) + return dtype.kind in ('O', 'S', 'U') + +def is_string_like_dtype(arr_or_dtype): + # exclude object as its a mixed dtype + dtype = _get_dtype(arr_or_dtype) + return dtype.kind in ('S', 'U') + def is_float_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, np.floating) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index cb8d0986489cc..efcb6eb818087 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4419,11 +4419,14 @@ def _putmask_smart(v, m, n): try: nn = n[m] nn_at = nn.astype(v.dtype) - comp = (nn == nn_at) - if is_list_like(comp) and comp.all(): - nv = v.copy() - nv[m] = nn_at - return nv + + # avoid invalid dtype comparisons + if not is_numeric_v_string_like(nn, nn_at): + comp = (nn == nn_at) + if is_list_like(comp) and comp.all(): + nv = v.copy() + nv[m] = nn_at + return nv except (ValueError, IndexError, TypeError): pass diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index e49aef2778c1e..bbdd7c3637981 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -5112,26 +5112,49 @@ def test_cov(self): def test_copy(self): - for deep in [False, True]: + for deep in [None, False, True]: s = Series(np.arange(10),dtype='float64') - s2 = s.copy(deep=deep) + + # default deep is True + if deep is None: + s2 = s.copy() + else: + s2 = s.copy(deep=deep) + s2[::2] = np.NaN - # Did not modify original Series - self.assertTrue(np.isnan(s2[0])) - self.assertFalse(np.isnan(s[0])) + if deep is None or deep is True: + # Did not modify original Series + self.assertTrue(np.isnan(s2[0])) + self.assertFalse(np.isnan(s[0])) + else: + + # we DID modify the original Series + self.assertTrue(np.isnan(s2[0])) + self.assertTrue(np.isnan(s[0])) # GH 11794 # copy of tz-aware expected = Series([Timestamp('2012/01/01', tz='UTC')]) expected2 = Series([Timestamp('1999/01/01', tz='UTC')]) - for deep in [False, True]: + for deep in [None, False, True]: s = Series([Timestamp('2012/01/01', tz='UTC')]) - s2 = s.copy() + + if deep is None: + s2 = s.copy() + else: + s2 = s.copy(deep=deep) + s2[0] = pd.Timestamp('1999/01/01', tz='UTC') - assert_series_equal(s, expected) - assert_series_equal(s2, expected2) + + # default deep is True + if deep is None or deep is True: + assert_series_equal(s, expected) + assert_series_equal(s2, expected2) + else: + assert_series_equal(s, expected2) + assert_series_equal(s2, expected2) def test_count(self): self.assertEqual(self.ts.count(), len(self.ts))