From 76f51d6ba0def51cf173875b966a3317b1cc04a6 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 22 Oct 2015 10:10:41 -0400 Subject: [PATCH] BUG: Bug in merging datetime64[ns, tz] dtypes, #11405 --- doc/source/whatsnew/v0.17.1.txt | 2 +- pandas/core/common.py | 8 ++- pandas/core/internals.py | 15 ++--- pandas/tools/merge.py | 39 ++++++------- pandas/tools/tests/test_merge.py | 97 ++++++++++++++++++++------------ pandas/tseries/base.py | 8 ++- 6 files changed, 103 insertions(+), 66 deletions(-) diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt index 989b05003d76f..06ab19b6e4a40 100755 --- a/doc/source/whatsnew/v0.17.1.txt +++ b/doc/source/whatsnew/v0.17.1.txt @@ -74,7 +74,7 @@ Bug Fixes - Bug in ``.to_latex()`` output broken when the index has a name (:issue: `10660`) - Bug in ``HDFStore.append`` with strings whose encoded length exceded the max unencoded length (:issue:`11234`) - +- Bug in merging ``datetime64[ns, tz]`` dtypes (:issue:`11405`) - Bug in ``HDFStore.select`` when comparing with a numpy scalar in a where clause (:issue:`11283`) diff --git a/pandas/core/common.py b/pandas/core/common.py index c2c50bce04309..ac3e61a500bb6 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1081,6 +1081,9 @@ def _maybe_promote(dtype, fill_value=np.nan): fill_value = tslib.iNaT else: fill_value = tslib.iNaT + elif is_datetimetz(dtype): + if isnull(fill_value): + fill_value = tslib.iNaT elif is_float(fill_value): if issubclass(dtype.type, np.bool_): dtype = np.object_ @@ -1107,7 +1110,9 @@ def _maybe_promote(dtype, fill_value=np.nan): # in case we have a string that looked like a number if is_categorical_dtype(dtype): - dtype = dtype + pass + elif is_datetimetz(dtype): + pass elif issubclass(np.dtype(dtype).type, compat.string_types): dtype = np.object_ @@ -2497,7 +2502,6 @@ def is_int64_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, np.int64) - def is_int_or_datetime_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return (issubclass(tipo, np.integer) or diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f1d82ec1f3b2e..b3e7e82b5feb7 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4114,7 +4114,7 @@ def _interleaved_dtype(blocks): if not len(blocks): return None - counts = defaultdict(lambda: []) + counts = defaultdict(list) for x in blocks: counts[type(x)].append(x) @@ -4482,9 +4482,8 @@ def get_empty_dtype_and_na(join_units): else: dtypes[i] = unit.dtype - # dtypes = set() - upcast_classes = set() - null_upcast_classes = set() + upcast_classes = defaultdict(list) + null_upcast_classes = defaultdict(list) for dtype, unit in zip(dtypes, join_units): if dtype is None: continue @@ -4508,9 +4507,9 @@ def get_empty_dtype_and_na(join_units): # are only null blocks, when same upcasting rules must be applied to # null upcast classes. if unit.is_null: - null_upcast_classes.add(upcast_cls) + null_upcast_classes[upcast_cls].append(dtype) else: - upcast_classes.add(upcast_cls) + upcast_classes[upcast_cls].append(dtype) if not upcast_classes: upcast_classes = null_upcast_classes @@ -4528,7 +4527,8 @@ def get_empty_dtype_and_na(join_units): elif 'float' in upcast_classes: return np.dtype(np.float64), np.nan elif 'datetimetz' in upcast_classes: - return np.dtype('M8[ns]'), tslib.iNaT + dtype = upcast_classes['datetimetz'] + return dtype[0], tslib.iNaT elif 'datetime' in upcast_classes: return np.dtype('M8[ns]'), tslib.iNaT elif 'timedelta' in upcast_classes: @@ -4788,6 +4788,7 @@ def is_null(self): return True def get_reindexed_values(self, empty_dtype, upcasted_na): + if upcasted_na is None: # No upcasting is necessary fill_value = self.block.fill_value diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 95c68aaa00b18..722ce439722c9 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -220,8 +220,8 @@ def get_result(self): return result def _indicator_pre_merge(self, left, right): - - columns = left.columns.union(right.columns) + + columns = left.columns.union(right.columns) for i in ['_left_indicator', '_right_indicator']: if i in columns: @@ -232,12 +232,12 @@ def _indicator_pre_merge(self, left, right): left = left.copy() right = right.copy() - left['_left_indicator'] = 1 - left['_left_indicator'] = left['_left_indicator'].astype('int8') - - right['_right_indicator'] = 2 - right['_right_indicator'] = right['_right_indicator'].astype('int8') - + left['_left_indicator'] = 1 + left['_left_indicator'] = left['_left_indicator'].astype('int8') + + right['_right_indicator'] = 2 + right['_right_indicator'] = right['_right_indicator'].astype('int8') + return left, right def _indicator_post_merge(self, result): @@ -246,8 +246,8 @@ def _indicator_post_merge(self, result): result['_right_indicator'] = result['_right_indicator'].fillna(0) result[self.indicator_name] = Categorical((result['_left_indicator'] + result['_right_indicator']), categories=[1,2,3]) - result[self.indicator_name] = result[self.indicator_name].cat.rename_categories(['left_only', 'right_only', 'both']) - + result[self.indicator_name] = result[self.indicator_name].cat.rename_categories(['left_only', 'right_only', 'both']) + result = result.drop(labels=['_left_indicator', '_right_indicator'], axis=1) return result @@ -261,7 +261,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): continue if name in result: - key_col = result[name] + key_indexer = result.columns.get_loc(name) if left_indexer is not None and right_indexer is not None: @@ -274,9 +274,8 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): continue right_na_indexer = right_indexer.take(na_indexer) - key_col.put( - na_indexer, com.take_1d(self.right_join_keys[i], - right_na_indexer)) + result.iloc[na_indexer,key_indexer] = com.take_1d(self.right_join_keys[i], + right_na_indexer) elif name in self.right: if len(self.right) == 0: continue @@ -286,9 +285,8 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): continue left_na_indexer = left_indexer.take(na_indexer) - key_col.put(na_indexer, com.take_1d(self.left_join_keys[i], - left_na_indexer)) - + result.iloc[na_indexer,key_indexer] = com.take_1d(self.left_join_keys[i], + left_na_indexer) elif left_indexer is not None \ and isinstance(self.left_join_keys[i], np.ndarray): @@ -664,10 +662,13 @@ def _right_outer_join(x, y, max_groups): def _factorize_keys(lk, rk, sort=True): + if com.is_datetime64tz_dtype(lk) and com.is_datetime64tz_dtype(rk): + lk = lk.values + rk = rk.values if com.is_int_or_datetime_dtype(lk) and com.is_int_or_datetime_dtype(rk): klass = _hash.Int64Factorizer - lk = com._ensure_int64(lk) - rk = com._ensure_int64(rk) + lk = com._ensure_int64(com._values_from_object(lk)) + rk = com._ensure_int64(com._values_from_object(rk)) else: klass = _hash.Factorizer lk = com._ensure_object(lk) diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index b555a7dc2b3a1..3a77cfec5fbc3 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -13,6 +13,7 @@ from pandas import compat from pandas.tseries.index import DatetimeIndex from pandas.tools.merge import merge, concat, ordered_merge, MergeError +from pandas import Categorical, Timestamp from pandas.util.testing import (assert_frame_equal, assert_series_equal, assert_almost_equal, makeCustomDataframe as mkdf, @@ -947,29 +948,53 @@ def test_overlapping_columns_error_message(self): df2.columns = ['key1', 'foo', 'foo'] self.assertRaises(ValueError, merge, df, df2) + def test_merge_on_datetime64tz(self): + + # GH11405 + left = pd.DataFrame({'key' : pd.date_range('20151010',periods=2,tz='US/Eastern'), + 'value' : [1,2]}) + right = pd.DataFrame({'key' : pd.date_range('20151011',periods=3,tz='US/Eastern'), + 'value' : [1,2,3]}) + + expected = DataFrame({'key' : pd.date_range('20151010',periods=4,tz='US/Eastern'), + 'value_x' : [1,2,np.nan,np.nan], + 'value_y' : [np.nan,1,2,3]}) + result = pd.merge(left, right, on='key', how='outer') + assert_frame_equal(result, expected) + + left = pd.DataFrame({'value' : pd.date_range('20151010',periods=2,tz='US/Eastern'), + 'key' : [1,2]}) + right = pd.DataFrame({'value' : pd.date_range('20151011',periods=2,tz='US/Eastern'), + 'key' : [2,3]}) + expected = DataFrame({'value_x' : list(pd.date_range('20151010',periods=2,tz='US/Eastern')) + [pd.NaT], + 'value_y' : [pd.NaT] + list(pd.date_range('20151011',periods=2,tz='US/Eastern')), + 'key' : [1.,2,3]}) + result = pd.merge(left, right, on='key', how='outer') + assert_frame_equal(result, expected) + def test_indicator(self): # PR #10054. xref #7412 and closes #8790. - df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b'], 'col_conflict':[1,2]}) + df1 = DataFrame({'col1':[0,1], 'col_left':['a','b'], 'col_conflict':[1,2]}) df1_copy = df1.copy() - df2 = pd.DataFrame({'col1':[1,2,3,4,5],'col_right':[2,2,2,2,2], - 'col_conflict':[1,2,3,4,5]}) + df2 = DataFrame({'col1':[1,2,3,4,5],'col_right':[2,2,2,2,2], + 'col_conflict':[1,2,3,4,5]}) df2_copy = df2.copy() - df_result = pd.DataFrame({'col1':[0,1,2,3,4,5], + df_result = DataFrame({'col1':[0,1,2,3,4,5], 'col_conflict_x':[1,2,np.nan,np.nan,np.nan,np.nan], 'col_left':['a','b', np.nan,np.nan,np.nan,np.nan], 'col_conflict_y':[np.nan,1,2,3,4,5], 'col_right':[np.nan, 2,2,2,2,2]}, dtype='float64') - df_result['_merge'] = pd.Categorical(['left_only','both','right_only', + df_result['_merge'] = Categorical(['left_only','both','right_only', 'right_only','right_only','right_only'] , categories=['left_only', 'right_only', 'both']) df_result = df_result[['col1', 'col_conflict_x', 'col_left', 'col_conflict_y', 'col_right', '_merge' ]] - test = pd.merge(df1, df2, on='col1', how='outer', indicator=True) + test = merge(df1, df2, on='col1', how='outer', indicator=True) assert_frame_equal(test, df_result) test = df1.merge(df2, on='col1', how='outer', indicator=True) assert_frame_equal(test, df_result) @@ -982,63 +1007,63 @@ def test_indicator(self): df_result_custom_name = df_result df_result_custom_name = df_result_custom_name.rename(columns={'_merge':'custom_name'}) - test_custom_name = pd.merge(df1, df2, on='col1', how='outer', indicator='custom_name') + test_custom_name = merge(df1, df2, on='col1', how='outer', indicator='custom_name') assert_frame_equal(test_custom_name, df_result_custom_name) test_custom_name = df1.merge(df2, on='col1', how='outer', indicator='custom_name') assert_frame_equal(test_custom_name, df_result_custom_name) # Check only accepts strings and booleans with tm.assertRaises(ValueError): - pd.merge(df1, df2, on='col1', how='outer', indicator=5) + merge(df1, df2, on='col1', how='outer', indicator=5) with tm.assertRaises(ValueError): df1.merge(df2, on='col1', how='outer', indicator=5) # Check result integrity - test2 = pd.merge(df1, df2, on='col1', how='left', indicator=True) + test2 = merge(df1, df2, on='col1', how='left', indicator=True) self.assertTrue((test2._merge != 'right_only').all()) test2 = df1.merge(df2, on='col1', how='left', indicator=True) self.assertTrue((test2._merge != 'right_only').all()) - test3 = pd.merge(df1, df2, on='col1', how='right', indicator=True) + test3 = merge(df1, df2, on='col1', how='right', indicator=True) self.assertTrue((test3._merge != 'left_only').all()) test3 = df1.merge(df2, on='col1', how='right', indicator=True) self.assertTrue((test3._merge != 'left_only').all()) - test4 = pd.merge(df1, df2, on='col1', how='inner', indicator=True) + test4 = merge(df1, df2, on='col1', how='inner', indicator=True) self.assertTrue((test4._merge == 'both').all()) test4 = df1.merge(df2, on='col1', how='inner', indicator=True) self.assertTrue((test4._merge == 'both').all()) # Check if working name in df for i in ['_right_indicator', '_left_indicator', '_merge']: - df_badcolumn = pd.DataFrame({'col1':[1,2], i:[2,2]}) + df_badcolumn = DataFrame({'col1':[1,2], i:[2,2]}) with tm.assertRaises(ValueError): - pd.merge(df1, df_badcolumn, on='col1', how='outer', indicator=True) + merge(df1, df_badcolumn, on='col1', how='outer', indicator=True) with tm.assertRaises(ValueError): df1.merge(df_badcolumn, on='col1', how='outer', indicator=True) # Check for name conflict with custom name - df_badcolumn = pd.DataFrame({'col1':[1,2], 'custom_column_name':[2,2]}) + df_badcolumn = DataFrame({'col1':[1,2], 'custom_column_name':[2,2]}) with tm.assertRaises(ValueError): - pd.merge(df1, df_badcolumn, on='col1', how='outer', indicator='custom_column_name') + merge(df1, df_badcolumn, on='col1', how='outer', indicator='custom_column_name') with tm.assertRaises(ValueError): df1.merge(df_badcolumn, on='col1', how='outer', indicator='custom_column_name') # Merge on multiple columns - df3 = pd.DataFrame({'col1':[0,1], 'col2':['a','b']}) + df3 = DataFrame({'col1':[0,1], 'col2':['a','b']}) - df4 = pd.DataFrame({'col1':[1,1,3], 'col2':['b','x','y']}) + df4 = DataFrame({'col1':[1,1,3], 'col2':['b','x','y']}) - hand_coded_result = pd.DataFrame({'col1':[0,1,1,3.0], + hand_coded_result = DataFrame({'col1':[0,1,1,3.0], 'col2':['a','b','x','y']}) - hand_coded_result['_merge'] = pd.Categorical( + hand_coded_result['_merge'] = Categorical( ['left_only','both','right_only','right_only'] , categories=['left_only', 'right_only', 'both']) - test5 = pd.merge(df3, df4, on=['col1', 'col2'], how='outer', indicator=True) + test5 = merge(df3, df4, on=['col1', 'col2'], how='outer', indicator=True) assert_frame_equal(test5, hand_coded_result) test5 = df3.merge(df4, on=['col1', 'col2'], how='outer', indicator=True) assert_frame_equal(test5, hand_coded_result) @@ -1464,18 +1489,18 @@ def test_int64_overflow_issues(self): columns=list('ABCDEFG')) # confirm that this is checking what it is supposed to check - shape = left.apply(pd.Series.nunique).values + shape = left.apply(Series.nunique).values self.assertTrue(_int64_overflow_possible(shape)) # add duplicates to left frame - left = pd.concat([left, left], ignore_index=True) + left = concat([left, left], ignore_index=True) right = DataFrame(np.random.randint(low, high, (n // 2, 7)).astype('int64'), columns=list('ABCDEFG')) # add duplicates & overlap with left to the right frame i = np.random.choice(len(left), n) - right = pd.concat([right, right, left.iloc[i]], ignore_index=True) + right = concat([right, right, left.iloc[i]], ignore_index=True) left['left'] = np.random.randn(len(left)) right['right'] = np.random.randn(len(right)) @@ -1980,19 +2005,19 @@ def test_concat_dataframe_keys_bug(self): def test_concat_series_partial_columns_names(self): # GH10698 - foo = pd.Series([1,2], name='foo') - bar = pd.Series([1,2]) - baz = pd.Series([4,5]) + foo = Series([1,2], name='foo') + bar = Series([1,2]) + baz = Series([4,5]) - result = pd.concat([foo, bar, baz], axis=1) + result = concat([foo, bar, baz], axis=1) expected = DataFrame({'foo' : [1,2], 0 : [1,2], 1 : [4,5]}, columns=['foo',0,1]) tm.assert_frame_equal(result, expected) - result = pd.concat([foo, bar, baz], axis=1, keys=['red','blue','yellow']) + result = concat([foo, bar, baz], axis=1, keys=['red','blue','yellow']) expected = DataFrame({'red' : [1,2], 'blue' : [1,2], 'yellow' : [4,5]}, columns=['red','blue','yellow']) tm.assert_frame_equal(result, expected) - result = pd.concat([foo, bar, baz], axis=1, ignore_index=True) + result = concat([foo, bar, baz], axis=1, ignore_index=True) expected = DataFrame({0 : [1,2], 1 : [1,2], 2 : [4,5]}) tm.assert_frame_equal(result, expected) @@ -2059,13 +2084,13 @@ def test_concat_multiindex_with_tz(self): datetime(2014, 1, 3)], 'b': ['A', 'B', 'C'], 'c': [1, 2, 3], 'd': [4, 5, 6]}) - df['dt'] = df['dt'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific')) + df['dt'] = df['dt'].apply(lambda d: Timestamp(d, tz='US/Pacific')) df = df.set_index(['dt', 'b']) - exp_idx1 = pd.DatetimeIndex(['2014-01-01', '2014-01-02', '2014-01-03'] * 2, + exp_idx1 = DatetimeIndex(['2014-01-01', '2014-01-02', '2014-01-03'] * 2, tz='US/Pacific', name='dt') exp_idx2 = Index(['A', 'B', 'C'] * 2, name='b') - exp_idx = pd.MultiIndex.from_arrays([exp_idx1, exp_idx2]) + exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) expected = DataFrame({'c': [1, 2, 3] * 2, 'd': [4, 5, 6] * 2}, index=exp_idx, columns=['c', 'd']) @@ -2640,10 +2665,10 @@ def test_concat_iterables(self): df1 = DataFrame([1, 2, 3]) df2 = DataFrame([4, 5, 6]) expected = DataFrame([1, 2, 3, 4, 5, 6]) - assert_frame_equal(pd.concat((df1, df2), ignore_index=True), expected) - assert_frame_equal(pd.concat([df1, df2], ignore_index=True), expected) - assert_frame_equal(pd.concat((df for df in (df1, df2)), ignore_index=True), expected) - assert_frame_equal(pd.concat(deque((df1, df2)), ignore_index=True), expected) + assert_frame_equal(concat((df1, df2), ignore_index=True), expected) + assert_frame_equal(concat([df1, df2], ignore_index=True), expected) + assert_frame_equal(concat((df for df in (df1, df2)), ignore_index=True), expected) + assert_frame_equal(concat(deque((df1, df2)), ignore_index=True), expected) class CustomIterator1(object): def __len__(self): return 2 diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 2f4858300293e..50137493e6b01 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -180,7 +180,7 @@ def sort_values(self, return_indexer=False, ascending=True): return self._simple_new(sorted_values, **attribs) - def take(self, indices, axis=0, **kwargs): + def take(self, indices, axis=0, allow_fill=True, fill_value=None): """ Analogous to ndarray.take """ @@ -189,6 +189,12 @@ def take(self, indices, axis=0, **kwargs): if isinstance(maybe_slice, slice): return self[maybe_slice] taken = self.asi8.take(com._ensure_platform_int(indices)) + + # only fill if we are passing a non-None fill_value + if allow_fill and fill_value is not None: + mask = indices == -1 + if mask.any(): + taken[mask] = tslib.iNaT return self._shallow_copy(taken, freq=None) def get_duplicates(self):