From 76f51d6ba0def51cf173875b966a3317b1cc04a6 Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Thu, 22 Oct 2015 10:10:41 -0400
Subject: [PATCH] BUG: Bug in merging datetime64[ns, tz] dtypes, #11405

---
 doc/source/whatsnew/v0.17.1.txt  |  2 +-
 pandas/core/common.py            |  8 ++-
 pandas/core/internals.py         | 15 ++---
 pandas/tools/merge.py            | 39 ++++++-------
 pandas/tools/tests/test_merge.py | 97 ++++++++++++++++++++------------
 pandas/tseries/base.py           |  8 ++-
 6 files changed, 103 insertions(+), 66 deletions(-)

diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt
index 989b05003d76f..06ab19b6e4a40 100755
--- a/doc/source/whatsnew/v0.17.1.txt
+++ b/doc/source/whatsnew/v0.17.1.txt
@@ -74,7 +74,7 @@ Bug Fixes
 
 - Bug in ``.to_latex()`` output broken when the index has a name (:issue: `10660`)
 - Bug in ``HDFStore.append`` with strings whose encoded length exceded the max unencoded length (:issue:`11234`)
-
+- Bug in merging ``datetime64[ns, tz]`` dtypes (:issue:`11405`)
 - Bug in ``HDFStore.select`` when comparing with a numpy scalar in a where clause (:issue:`11283`)
 
 
diff --git a/pandas/core/common.py b/pandas/core/common.py
index c2c50bce04309..ac3e61a500bb6 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -1081,6 +1081,9 @@ def _maybe_promote(dtype, fill_value=np.nan):
                     fill_value = tslib.iNaT
             else:
                 fill_value = tslib.iNaT
+    elif is_datetimetz(dtype):
+        if isnull(fill_value):
+            fill_value = tslib.iNaT
     elif is_float(fill_value):
         if issubclass(dtype.type, np.bool_):
             dtype = np.object_
@@ -1107,7 +1110,9 @@ def _maybe_promote(dtype, fill_value=np.nan):
 
     # in case we have a string that looked like a number
     if is_categorical_dtype(dtype):
-        dtype = dtype
+        pass
+    elif is_datetimetz(dtype):
+        pass
     elif issubclass(np.dtype(dtype).type, compat.string_types):
         dtype = np.object_
 
@@ -2497,7 +2502,6 @@ def is_int64_dtype(arr_or_dtype):
     tipo = _get_dtype_type(arr_or_dtype)
     return issubclass(tipo, np.int64)
 
-
 def is_int_or_datetime_dtype(arr_or_dtype):
     tipo = _get_dtype_type(arr_or_dtype)
     return (issubclass(tipo, np.integer) or
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index f1d82ec1f3b2e..b3e7e82b5feb7 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -4114,7 +4114,7 @@ def _interleaved_dtype(blocks):
     if not len(blocks):
         return None
 
-    counts = defaultdict(lambda: [])
+    counts = defaultdict(list)
     for x in blocks:
         counts[type(x)].append(x)
 
@@ -4482,9 +4482,8 @@ def get_empty_dtype_and_na(join_units):
         else:
             dtypes[i] = unit.dtype
 
-    # dtypes = set()
-    upcast_classes = set()
-    null_upcast_classes = set()
+    upcast_classes = defaultdict(list)
+    null_upcast_classes = defaultdict(list)
     for dtype, unit in zip(dtypes, join_units):
         if dtype is None:
             continue
@@ -4508,9 +4507,9 @@ def get_empty_dtype_and_na(join_units):
         # are only null blocks, when same upcasting rules must be applied to
         # null upcast classes.
         if unit.is_null:
-            null_upcast_classes.add(upcast_cls)
+            null_upcast_classes[upcast_cls].append(dtype)
         else:
-            upcast_classes.add(upcast_cls)
+            upcast_classes[upcast_cls].append(dtype)
 
     if not upcast_classes:
         upcast_classes = null_upcast_classes
@@ -4528,7 +4527,8 @@ def get_empty_dtype_and_na(join_units):
     elif 'float' in upcast_classes:
         return np.dtype(np.float64), np.nan
     elif 'datetimetz' in upcast_classes:
-        return np.dtype('M8[ns]'), tslib.iNaT
+        dtype = upcast_classes['datetimetz']
+        return dtype[0], tslib.iNaT
     elif 'datetime' in upcast_classes:
         return np.dtype('M8[ns]'), tslib.iNaT
     elif 'timedelta' in upcast_classes:
@@ -4788,6 +4788,7 @@ def is_null(self):
         return True
 
     def get_reindexed_values(self, empty_dtype, upcasted_na):
+
         if upcasted_na is None:
             # No upcasting is necessary
             fill_value = self.block.fill_value
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
index 95c68aaa00b18..722ce439722c9 100644
--- a/pandas/tools/merge.py
+++ b/pandas/tools/merge.py
@@ -220,8 +220,8 @@ def get_result(self):
         return result
 
     def _indicator_pre_merge(self, left, right):
-                
-        columns = left.columns.union(right.columns)  
+
+        columns = left.columns.union(right.columns)
 
         for i in ['_left_indicator', '_right_indicator']:
             if i in columns:
@@ -232,12 +232,12 @@ def _indicator_pre_merge(self, left, right):
         left = left.copy()
         right = right.copy()
 
-        left['_left_indicator'] = 1  
-        left['_left_indicator'] = left['_left_indicator'].astype('int8')  
-        
-        right['_right_indicator'] = 2     
-        right['_right_indicator'] = right['_right_indicator'].astype('int8') 
-        
+        left['_left_indicator'] = 1
+        left['_left_indicator'] = left['_left_indicator'].astype('int8')
+
+        right['_right_indicator'] = 2
+        right['_right_indicator'] = right['_right_indicator'].astype('int8')
+
         return left, right
 
     def _indicator_post_merge(self, result):
@@ -246,8 +246,8 @@ def _indicator_post_merge(self, result):
         result['_right_indicator'] = result['_right_indicator'].fillna(0)
 
         result[self.indicator_name] = Categorical((result['_left_indicator'] + result['_right_indicator']), categories=[1,2,3])
-        result[self.indicator_name] = result[self.indicator_name].cat.rename_categories(['left_only', 'right_only', 'both'])        
- 
+        result[self.indicator_name] = result[self.indicator_name].cat.rename_categories(['left_only', 'right_only', 'both'])
+
         result = result.drop(labels=['_left_indicator', '_right_indicator'], axis=1)
 
         return result
@@ -261,7 +261,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
                 continue
 
             if name in result:
-                key_col = result[name]
+                key_indexer = result.columns.get_loc(name)
 
                 if left_indexer is not None and right_indexer is not None:
 
@@ -274,9 +274,8 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
                             continue
 
                         right_na_indexer = right_indexer.take(na_indexer)
-                        key_col.put(
-                            na_indexer, com.take_1d(self.right_join_keys[i],
-                                                    right_na_indexer))
+                        result.iloc[na_indexer,key_indexer] = com.take_1d(self.right_join_keys[i],
+                                                                          right_na_indexer)
                     elif name in self.right:
                         if len(self.right) == 0:
                             continue
@@ -286,9 +285,8 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
                             continue
 
                         left_na_indexer = left_indexer.take(na_indexer)
-                        key_col.put(na_indexer, com.take_1d(self.left_join_keys[i],
-                                                            left_na_indexer))
-
+                        result.iloc[na_indexer,key_indexer] = com.take_1d(self.left_join_keys[i],
+                                                                          left_na_indexer)
             elif left_indexer is not None \
                     and isinstance(self.left_join_keys[i], np.ndarray):
 
@@ -664,10 +662,13 @@ def _right_outer_join(x, y, max_groups):
 
 
 def _factorize_keys(lk, rk, sort=True):
+    if com.is_datetime64tz_dtype(lk) and com.is_datetime64tz_dtype(rk):
+        lk = lk.values
+        rk = rk.values
     if com.is_int_or_datetime_dtype(lk) and com.is_int_or_datetime_dtype(rk):
         klass = _hash.Int64Factorizer
-        lk = com._ensure_int64(lk)
-        rk = com._ensure_int64(rk)
+        lk = com._ensure_int64(com._values_from_object(lk))
+        rk = com._ensure_int64(com._values_from_object(rk))
     else:
         klass = _hash.Factorizer
         lk = com._ensure_object(lk)
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
index b555a7dc2b3a1..3a77cfec5fbc3 100644
--- a/pandas/tools/tests/test_merge.py
+++ b/pandas/tools/tests/test_merge.py
@@ -13,6 +13,7 @@
 from pandas import compat
 from pandas.tseries.index import DatetimeIndex
 from pandas.tools.merge import merge, concat, ordered_merge, MergeError
+from pandas import Categorical, Timestamp
 from pandas.util.testing import (assert_frame_equal, assert_series_equal,
                                  assert_almost_equal,
                                  makeCustomDataframe as mkdf,
@@ -947,29 +948,53 @@ def test_overlapping_columns_error_message(self):
         df2.columns = ['key1', 'foo', 'foo']
         self.assertRaises(ValueError, merge, df, df2)
 
+    def test_merge_on_datetime64tz(self):
+
+        # GH11405: outer merge with a datetime64[ns, tz] join key, then with tz-aware value columns
+        left = pd.DataFrame({'key' : pd.date_range('20151010',periods=2,tz='US/Eastern'),
+                             'value' : [1,2]})
+        right = pd.DataFrame({'key' : pd.date_range('20151011',periods=3,tz='US/Eastern'),
+                              'value' : [1,2,3]})
+
+        expected = DataFrame({'key' : pd.date_range('20151010',periods=4,tz='US/Eastern'),
+                              'value_x' : [1,2,np.nan,np.nan],
+                              'value_y' : [np.nan,1,2,3]})
+        result = pd.merge(left, right, on='key', how='outer')
+        assert_frame_equal(result, expected)
+
+        left = pd.DataFrame({'value' : pd.date_range('20151010',periods=2,tz='US/Eastern'),
+                             'key' : [1,2]})
+        right = pd.DataFrame({'value' : pd.date_range('20151011',periods=2,tz='US/Eastern'),
+                              'key' : [2,3]})
+        expected = DataFrame({'value_x' : list(pd.date_range('20151010',periods=2,tz='US/Eastern')) + [pd.NaT],
+                              'value_y' : [pd.NaT] + list(pd.date_range('20151011',periods=2,tz='US/Eastern')),
+                              'key' : [1.,2,3]})
+        result = pd.merge(left, right, on='key', how='outer')
+        assert_frame_equal(result, expected)
+
     def test_indicator(self):
         # PR #10054. xref #7412 and closes #8790.
-        df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b'], 'col_conflict':[1,2]})
+        df1 = DataFrame({'col1':[0,1], 'col_left':['a','b'], 'col_conflict':[1,2]})
         df1_copy = df1.copy()
 
-        df2 = pd.DataFrame({'col1':[1,2,3,4,5],'col_right':[2,2,2,2,2],
-                            'col_conflict':[1,2,3,4,5]})
+        df2 = DataFrame({'col1':[1,2,3,4,5],'col_right':[2,2,2,2,2],
+                         'col_conflict':[1,2,3,4,5]})
         df2_copy = df2.copy()
 
-        df_result = pd.DataFrame({'col1':[0,1,2,3,4,5],
+        df_result = DataFrame({'col1':[0,1,2,3,4,5],
                 'col_conflict_x':[1,2,np.nan,np.nan,np.nan,np.nan],
                 'col_left':['a','b', np.nan,np.nan,np.nan,np.nan],
                 'col_conflict_y':[np.nan,1,2,3,4,5],
                 'col_right':[np.nan, 2,2,2,2,2]},
                 dtype='float64')
-        df_result['_merge'] = pd.Categorical(['left_only','both','right_only',
+        df_result['_merge'] = Categorical(['left_only','both','right_only',
             'right_only','right_only','right_only']
             , categories=['left_only', 'right_only', 'both'])
 
         df_result = df_result[['col1', 'col_conflict_x', 'col_left',
                                'col_conflict_y', 'col_right', '_merge' ]]
 
-        test = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
+        test = merge(df1, df2, on='col1', how='outer', indicator=True)
         assert_frame_equal(test, df_result)
         test = df1.merge(df2, on='col1', how='outer', indicator=True)
         assert_frame_equal(test, df_result)
@@ -982,63 +1007,63 @@ def test_indicator(self):
         df_result_custom_name = df_result
         df_result_custom_name = df_result_custom_name.rename(columns={'_merge':'custom_name'})
 
-        test_custom_name = pd.merge(df1, df2, on='col1', how='outer', indicator='custom_name')
+        test_custom_name = merge(df1, df2, on='col1', how='outer', indicator='custom_name')
         assert_frame_equal(test_custom_name, df_result_custom_name)
         test_custom_name = df1.merge(df2, on='col1', how='outer', indicator='custom_name')
         assert_frame_equal(test_custom_name, df_result_custom_name)
 
         # Check only accepts strings and booleans
         with tm.assertRaises(ValueError):
-            pd.merge(df1, df2, on='col1', how='outer', indicator=5)
+            merge(df1, df2, on='col1', how='outer', indicator=5)
         with tm.assertRaises(ValueError):
             df1.merge(df2, on='col1', how='outer', indicator=5)
 
         # Check result integrity
 
-        test2 = pd.merge(df1, df2, on='col1', how='left', indicator=True)
+        test2 = merge(df1, df2, on='col1', how='left', indicator=True)
         self.assertTrue((test2._merge != 'right_only').all())
         test2 = df1.merge(df2, on='col1', how='left', indicator=True)
         self.assertTrue((test2._merge != 'right_only').all())
 
-        test3 = pd.merge(df1, df2, on='col1', how='right', indicator=True)
+        test3 = merge(df1, df2, on='col1', how='right', indicator=True)
         self.assertTrue((test3._merge != 'left_only').all())
         test3 = df1.merge(df2, on='col1', how='right', indicator=True)
         self.assertTrue((test3._merge != 'left_only').all())
 
-        test4 = pd.merge(df1, df2, on='col1', how='inner', indicator=True)
+        test4 = merge(df1, df2, on='col1', how='inner', indicator=True)
         self.assertTrue((test4._merge == 'both').all())
         test4 = df1.merge(df2, on='col1', how='inner', indicator=True)
         self.assertTrue((test4._merge == 'both').all())
 
         # Check if working name in df
         for i in ['_right_indicator', '_left_indicator', '_merge']:
-            df_badcolumn = pd.DataFrame({'col1':[1,2], i:[2,2]})
+            df_badcolumn = DataFrame({'col1':[1,2], i:[2,2]})
 
             with tm.assertRaises(ValueError):
-                pd.merge(df1, df_badcolumn, on='col1', how='outer', indicator=True)
+                merge(df1, df_badcolumn, on='col1', how='outer', indicator=True)
             with tm.assertRaises(ValueError):
                 df1.merge(df_badcolumn, on='col1', how='outer', indicator=True)
 
         # Check for name conflict with custom name
-        df_badcolumn = pd.DataFrame({'col1':[1,2], 'custom_column_name':[2,2]})
+        df_badcolumn = DataFrame({'col1':[1,2], 'custom_column_name':[2,2]})
 
         with tm.assertRaises(ValueError):
-            pd.merge(df1, df_badcolumn, on='col1', how='outer', indicator='custom_column_name')
+            merge(df1, df_badcolumn, on='col1', how='outer', indicator='custom_column_name')
         with tm.assertRaises(ValueError):
             df1.merge(df_badcolumn, on='col1', how='outer', indicator='custom_column_name')
 
         # Merge on multiple columns
-        df3 = pd.DataFrame({'col1':[0,1], 'col2':['a','b']})
+        df3 = DataFrame({'col1':[0,1], 'col2':['a','b']})
 
-        df4 = pd.DataFrame({'col1':[1,1,3], 'col2':['b','x','y']})
+        df4 = DataFrame({'col1':[1,1,3], 'col2':['b','x','y']})
 
-        hand_coded_result = pd.DataFrame({'col1':[0,1,1,3.0],
+        hand_coded_result = DataFrame({'col1':[0,1,1,3.0],
                                          'col2':['a','b','x','y']})
-        hand_coded_result['_merge'] = pd.Categorical(
+        hand_coded_result['_merge'] = Categorical(
             ['left_only','both','right_only','right_only']
             , categories=['left_only', 'right_only', 'both'])
 
-        test5 = pd.merge(df3, df4, on=['col1', 'col2'], how='outer', indicator=True)
+        test5 = merge(df3, df4, on=['col1', 'col2'], how='outer', indicator=True)
         assert_frame_equal(test5, hand_coded_result)
         test5 = df3.merge(df4, on=['col1', 'col2'], how='outer', indicator=True)
         assert_frame_equal(test5, hand_coded_result)
@@ -1464,18 +1489,18 @@ def test_int64_overflow_issues(self):
                          columns=list('ABCDEFG'))
 
         # confirm that this is checking what it is supposed to check
-        shape = left.apply(pd.Series.nunique).values
+        shape = left.apply(Series.nunique).values
         self.assertTrue(_int64_overflow_possible(shape))
 
         # add duplicates to left frame
-        left = pd.concat([left, left], ignore_index=True)
+        left = concat([left, left], ignore_index=True)
 
         right = DataFrame(np.random.randint(low, high, (n // 2, 7)).astype('int64'),
                           columns=list('ABCDEFG'))
 
         # add duplicates & overlap with left to the right frame
         i = np.random.choice(len(left), n)
-        right = pd.concat([right, right, left.iloc[i]], ignore_index=True)
+        right = concat([right, right, left.iloc[i]], ignore_index=True)
 
         left['left'] = np.random.randn(len(left))
         right['right'] = np.random.randn(len(right))
@@ -1980,19 +2005,19 @@ def test_concat_dataframe_keys_bug(self):
 
     def test_concat_series_partial_columns_names(self):
         # GH10698
-        foo = pd.Series([1,2], name='foo')
-        bar = pd.Series([1,2])
-        baz = pd.Series([4,5])
+        foo = Series([1,2], name='foo')
+        bar = Series([1,2])
+        baz = Series([4,5])
 
-        result = pd.concat([foo, bar, baz], axis=1)
+        result = concat([foo, bar, baz], axis=1)
         expected = DataFrame({'foo' : [1,2], 0 : [1,2], 1 : [4,5]}, columns=['foo',0,1])
         tm.assert_frame_equal(result, expected)
 
-        result = pd.concat([foo, bar, baz], axis=1, keys=['red','blue','yellow'])
+        result = concat([foo, bar, baz], axis=1, keys=['red','blue','yellow'])
         expected = DataFrame({'red' : [1,2], 'blue' : [1,2], 'yellow' : [4,5]}, columns=['red','blue','yellow'])
         tm.assert_frame_equal(result, expected)
 
-        result = pd.concat([foo, bar, baz], axis=1, ignore_index=True)
+        result = concat([foo, bar, baz], axis=1, ignore_index=True)
         expected = DataFrame({0 : [1,2], 1 : [1,2], 2 : [4,5]})
         tm.assert_frame_equal(result, expected)
 
@@ -2059,13 +2084,13 @@ def test_concat_multiindex_with_tz(self):
                                datetime(2014, 1, 3)],
                         'b': ['A', 'B', 'C'],
                         'c': [1, 2, 3], 'd': [4, 5, 6]})
-        df['dt'] = df['dt'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific'))
+        df['dt'] = df['dt'].apply(lambda d: Timestamp(d, tz='US/Pacific'))
         df = df.set_index(['dt', 'b'])
 
-        exp_idx1 = pd.DatetimeIndex(['2014-01-01', '2014-01-02', '2014-01-03'] * 2,
+        exp_idx1 = DatetimeIndex(['2014-01-01', '2014-01-02', '2014-01-03'] * 2,
                                     tz='US/Pacific', name='dt')
         exp_idx2 = Index(['A', 'B', 'C'] * 2, name='b')
-        exp_idx = pd.MultiIndex.from_arrays([exp_idx1, exp_idx2])
+        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
         expected = DataFrame({'c': [1, 2, 3] * 2, 'd': [4, 5, 6] * 2},
                              index=exp_idx, columns=['c', 'd'])
 
@@ -2640,10 +2665,10 @@ def test_concat_iterables(self):
         df1 = DataFrame([1, 2, 3])
         df2 = DataFrame([4, 5, 6])
         expected = DataFrame([1, 2, 3, 4, 5, 6])
-        assert_frame_equal(pd.concat((df1, df2), ignore_index=True), expected)
-        assert_frame_equal(pd.concat([df1, df2], ignore_index=True), expected)
-        assert_frame_equal(pd.concat((df for df in (df1, df2)), ignore_index=True), expected)
-        assert_frame_equal(pd.concat(deque((df1, df2)), ignore_index=True), expected)
+        assert_frame_equal(concat((df1, df2), ignore_index=True), expected)
+        assert_frame_equal(concat([df1, df2], ignore_index=True), expected)
+        assert_frame_equal(concat((df for df in (df1, df2)), ignore_index=True), expected)
+        assert_frame_equal(concat(deque((df1, df2)), ignore_index=True), expected)
         class CustomIterator1(object):
             def __len__(self):
                 return 2
diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py
index 2f4858300293e..50137493e6b01 100644
--- a/pandas/tseries/base.py
+++ b/pandas/tseries/base.py
@@ -180,7 +180,7 @@ def sort_values(self, return_indexer=False, ascending=True):
 
             return self._simple_new(sorted_values, **attribs)
 
-    def take(self, indices, axis=0, **kwargs):
+    def take(self, indices, axis=0, allow_fill=True, fill_value=None):
         """
         Analogous to ndarray.take
         """
@@ -189,6 +189,12 @@ def take(self, indices, axis=0, **kwargs):
         if isinstance(maybe_slice, slice):
             return self[maybe_slice]
         taken = self.asi8.take(com._ensure_platform_int(indices))
+
+        # only fill when a non-None fill_value is passed; its value is unused, -1 positions get iNaT
+        if allow_fill and fill_value is not None:
+            mask = indices == -1
+            if mask.any():
+                taken[mask] = tslib.iNaT
         return self._shallow_copy(taken, freq=None)
 
     def get_duplicates(self):