From 27b8470546ae6328d2252910ce2546446b31ced0 Mon Sep 17 00:00:00 2001
From: jreback <jeff@reback.net>
Date: Fri, 6 Sep 2013 21:49:57 -0400
Subject: [PATCH 1/2] BUG: ensure column ordering in HDFStore with dup columns
 and when columns are specified

---
 pandas/io/pytables.py            | 25 +++++++++++++++++--------
 pandas/io/tests/test_pytables.py | 14 +++++++++++++-
 2 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index bcf2345913f1e..3242c7c1a702d 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -2910,9 +2910,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
 
         # reindex by our non_index_axes & compute data_columns
         for a in self.non_index_axes:
-            labels = _ensure_index(a[1])
-            if not labels.equals(obj._get_axis(a[0])):
-                obj = obj.reindex_axis(labels, axis=a[0])
+            obj = _reindex_axis(obj, a[0], a[1])
 
         # figure out data_columns and get out blocks
         block_obj = self.get_object(obj).consolidate()
@@ -3000,11 +2998,7 @@ def process_axes(self, obj, columns=None):
 
         # reorder by any non_index_axes & limit to the select columns
         for axis, labels in self.non_index_axes:
-            if columns is not None:
-                labels = Index(labels) & Index(columns)
-            labels = _ensure_index(labels)
-            if not labels.equals(obj._get_axis(axis)):
-                obj = obj.reindex_axis(labels, axis=axis)
+            obj = _reindex_axis(obj, axis, labels, columns)
 
         # apply the selection filters (but keep in the same order)
         if self.selection.filter:
@@ -3683,6 +3677,21 @@ class AppendableNDimTable(AppendablePanelTable):
     obj_type = Panel4D
 
 
+def _reindex_axis(obj, axis, labels, other=None):
+    ax = obj._get_axis(axis)
+    labels = _ensure_index(labels)
+    if other is None and labels.equals(ax):
+        return obj
+
+    labels = _ensure_index(labels.unique())
+    if other is not None:
+        labels = labels & _ensure_index(other)
+    if not labels.equals(ax):
+        slicer = [ slice(None, None) ] * obj.ndim
+        slicer[axis] = labels
+        obj = obj.loc[tuple(slicer)]
+    return obj
+
 def _get_info(info, name):
     """ get/create the info for this name """
     try:
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index 2ef4a9287a664..81ea48be23d57 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -2298,7 +2298,6 @@ def test_wide_table(self):
 
     def test_select_with_dups(self):
 
-
         # single dtypes
         df = DataFrame(np.random.randn(10,4),columns=['A','A','B','B'])
         df.index = date_range('20130101 9:30',periods=10,freq='T')
@@ -2308,6 +2307,10 @@ def test_select_with_dups(self):
             result = store.select('df')
             assert_frame_equal(result,df)
 
+            result = store.select('df',columns=['A'])
+            expected = df.loc[:,['A']]
+            assert_frame_equal(result,expected)
+
         # dups accross dtypes
         df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']),
                      DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])],
@@ -2316,9 +2319,18 @@ def test_select_with_dups(self):
 
         with ensure_clean(self.path) as store:
             store.append('df',df)
+
             result = store.select('df')
             assert_frame_equal(result,df)
 
+            expected = df.loc[:,['A']]
+            result = store.select('df',columns=['A'])
+            assert_frame_equal(result,expected)
+
+            expected = df.loc[:,['B','A']]
+            result = store.select('df',columns=['B','A'])
+            assert_frame_equal(result,expected)
+
     def test_wide_table_dups(self):
         wp = tm.makePanel()
         with ensure_clean(self.path) as store:

From b07e0203e6428620391d668adb93fe5857479c27 Mon Sep 17 00:00:00 2001
From: jreback <jeff@reback.net>
Date: Fri, 6 Sep 2013 22:23:33 -0400
Subject: [PATCH 2/2] TST: more robust testing for HDFStore dups

---
 pandas/io/pytables.py            | 15 ++++++++-----
 pandas/io/tests/test_pytables.py | 19 +++++++++++++----
 pandas/util/testing.py           | 36 ++++++++++++++++++++++----------
 3 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 3242c7c1a702d..0a9e6855f094a 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -667,7 +667,7 @@ def func(_start, _stop):
             axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0]
 
             # concat and return
-            return concat(objs, axis=axis, verify_integrity=True)
+            return concat(objs, axis=axis, verify_integrity=True).consolidate()
 
         if iterator or chunksize is not None:
             return TableIterator(self, func, nrows=nrows, start=start, stop=stop, chunksize=chunksize, auto_close=auto_close)
@@ -3213,7 +3213,7 @@ def read(self, where=None, columns=None, **kwargs):
         if len(objs) == 1:
             wp = objs[0]
         else:
-            wp = concat(objs, axis=0, verify_integrity=False)
+            wp = concat(objs, axis=0, verify_integrity=False).consolidate()
 
         # apply the selection filters & axis orderings
         wp = self.process_axes(wp, columns=columns)
@@ -3504,7 +3504,7 @@ def read(self, where=None, columns=None, **kwargs):
         if len(frames) == 1:
             df = frames[0]
         else:
-            df = concat(frames, axis=1, verify_integrity=False)
+            df = concat(frames, axis=1, verify_integrity=False).consolidate()
 
         # apply the selection filters & axis orderings
         df = self.process_axes(df, columns=columns)
@@ -3680,12 +3680,17 @@ class AppendableNDimTable(AppendablePanelTable):
 def _reindex_axis(obj, axis, labels, other=None):
     ax = obj._get_axis(axis)
     labels = _ensure_index(labels)
-    if other is None and labels.equals(ax):
+
+    # skip the reindex when labels already match the current axis and
+    # other, if provided, matches labels as well
+    if other is not None:
+        other = _ensure_index(other)
+    if (other is None or labels.equals(other)) and labels.equals(ax):
         return obj
 
     labels = _ensure_index(labels.unique())
     if other is not None:
-        labels = labels & _ensure_index(other)
+        labels = labels & _ensure_index(other.unique())
     if not labels.equals(ax):
         slicer = [ slice(None, None) ] * obj.ndim
         slicer[axis] = labels
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index 81ea48be23d57..e9f4cf7d0f96f 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -2304,8 +2304,14 @@ def test_select_with_dups(self):
 
         with ensure_clean(self.path) as store:
             store.append('df',df)
+
             result = store.select('df')
-            assert_frame_equal(result,df)
+            expected = df
+            assert_frame_equal(result,expected,by_blocks=True)
+
+            result = store.select('df',columns=df.columns)
+            expected = df
+            assert_frame_equal(result,expected,by_blocks=True)
 
             result = store.select('df',columns=['A'])
             expected = df.loc[:,['A']]
@@ -2321,15 +2327,20 @@ def test_select_with_dups(self):
             store.append('df',df)
 
             result = store.select('df')
-            assert_frame_equal(result,df)
+            expected = df
+            assert_frame_equal(result,expected,by_blocks=True)
+
+            result = store.select('df',columns=df.columns)
+            expected = df
+            assert_frame_equal(result,expected,by_blocks=True)
 
             expected = df.loc[:,['A']]
             result = store.select('df',columns=['A'])
-            assert_frame_equal(result,expected)
+            assert_frame_equal(result,expected,by_blocks=True)
 
             expected = df.loc[:,['B','A']]
             result = store.select('df',columns=['B','A'])
-            assert_frame_equal(result,expected)
+            assert_frame_equal(result,expected,by_blocks=True)
 
     def test_wide_table_dups(self):
         wp = tm.makePanel()
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index c652c2da3214c..abc13fb2ad9ee 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -258,27 +258,41 @@ def assert_frame_equal(left, right, check_dtype=True,
                        check_column_type=False,
                        check_frame_type=False,
                        check_less_precise=False,
-                       check_names=True):
+                       check_names=True,
+                       by_blocks=False):
     if check_frame_type:
         assert_isinstance(left, type(right))
     assert_isinstance(left, DataFrame)
     assert_isinstance(right, DataFrame)
 
     if check_less_precise:
-        assert_almost_equal(left.columns, right.columns)
+        if not by_blocks:
+            assert_almost_equal(left.columns, right.columns)
         assert_almost_equal(left.index, right.index)
     else:
-        assert_index_equal(left.columns, right.columns)
+        if not by_blocks:
+            assert_index_equal(left.columns, right.columns)
         assert_index_equal(left.index, right.index)
 
-    for i, col in enumerate(left.columns):
-        assert col in right
-        lcol = left.icol(i)
-        rcol = right.icol(i)
-        assert_series_equal(lcol, rcol,
-                            check_dtype=check_dtype,
-                            check_index_type=check_index_type,
-                            check_less_precise=check_less_precise)
+    # compare by blocks
+    if by_blocks:
+        rblocks = right.blocks
+        lblocks = left.blocks
+        for dtype in list(set(list(lblocks.keys()) + list(rblocks.keys()))):
+            assert dtype in lblocks
+            assert dtype in rblocks
+            assert_frame_equal(lblocks[dtype],rblocks[dtype],check_dtype=check_dtype)
+
+    # compare by columns
+    else:
+        for i, col in enumerate(left.columns):
+            assert col in right
+            lcol = left.icol(i)
+            rcol = right.icol(i)
+            assert_series_equal(lcol, rcol,
+                                check_dtype=check_dtype,
+                                check_index_type=check_index_type,
+                                check_less_precise=check_less_precise)
 
     if check_index_type:
         assert_isinstance(left.index, type(right.index))