From 27b8470546ae6328d2252910ce2546446b31ced0 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 6 Sep 2013 21:49:57 -0400 Subject: [PATCH 1/2] BUG: ensure column ordering in HDFStore with dup columns and column where specified --- pandas/io/pytables.py | 25 +++++++++++++++++-------- pandas/io/tests/test_pytables.py | 14 +++++++++++++- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index bcf2345913f1e..3242c7c1a702d 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2910,9 +2910,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, # reindex by our non_index_axes & compute data_columns for a in self.non_index_axes: - labels = _ensure_index(a[1]) - if not labels.equals(obj._get_axis(a[0])): - obj = obj.reindex_axis(labels, axis=a[0]) + obj = _reindex_axis(obj, a[0], a[1]) # figure out data_columns and get out blocks block_obj = self.get_object(obj).consolidate() @@ -3000,11 +2998,7 @@ def process_axes(self, obj, columns=None): # reorder by any non_index_axes & limit to the select columns for axis, labels in self.non_index_axes: - if columns is not None: - labels = Index(labels) & Index(columns) - labels = _ensure_index(labels) - if not labels.equals(obj._get_axis(axis)): - obj = obj.reindex_axis(labels, axis=axis) + obj = _reindex_axis(obj, axis, labels, columns) # apply the selection filters (but keep in the same order) if self.selection.filter: @@ -3683,6 +3677,21 @@ class AppendableNDimTable(AppendablePanelTable): obj_type = Panel4D +def _reindex_axis(obj, axis, labels, other=None): + ax = obj._get_axis(axis) + labels = _ensure_index(labels) + if other is None and labels.equals(ax): + return obj + + labels = _ensure_index(labels.unique()) + if other is not None: + labels = labels & _ensure_index(other) + if not labels.equals(ax): + slicer = [ slice(None, None) ] * obj.ndim + slicer[axis] = labels + obj = obj.loc[tuple(slicer)] + return obj + def _get_info(info, name): """ get/create the info for this name """ try: diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 2ef4a9287a664..81ea48be23d57 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -2298,7 +2298,6 @@ def test_wide_table(self): def test_select_with_dups(self): - # single dtypes df = DataFrame(np.random.randn(10,4),columns=['A','A','B','B']) df.index = date_range('20130101 9:30',periods=10,freq='T') @@ -2308,6 +2307,10 @@ def test_select_with_dups(self): result = store.select('df') assert_frame_equal(result,df) + result = store.select('df',columns=['A']) + expected = df.loc[:,['A']] + assert_frame_equal(result,expected) + # dups accross dtypes df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']), DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])], @@ -2316,9 +2319,18 @@ def test_select_with_dups(self): with ensure_clean(self.path) as store: store.append('df',df) + result = store.select('df') assert_frame_equal(result,df) + expected = df.loc[:,['A']] + result = store.select('df',columns=['A']) + assert_frame_equal(result,expected) + + expected = df.loc[:,['B','A']] + result = store.select('df',columns=['B','A']) + assert_frame_equal(result,expected) + def test_wide_table_dups(self): wp = tm.makePanel() with ensure_clean(self.path) as store: From b07e0203e6428620391d668adb93fe5857479c27 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 6 Sep 2013 22:23:33 -0400 Subject: [PATCH 2/2] TST: more robust testing for HDFStore dups --- pandas/io/pytables.py | 15 ++++++++----- pandas/io/tests/test_pytables.py | 19 +++++++++++++---- pandas/util/testing.py | 36 ++++++++++++++++++++++---------- 3 files changed, 50 insertions(+), 20 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 3242c7c1a702d..0a9e6855f094a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -667,7 +667,7 @@ def func(_start, _stop): axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0] # concat and return - return concat(objs, axis=axis, verify_integrity=True) + return concat(objs, axis=axis, verify_integrity=True).consolidate() if iterator or chunksize is not None: return TableIterator(self, func, nrows=nrows, start=start, stop=stop, chunksize=chunksize, auto_close=auto_close) @@ -3213,7 +3213,7 @@ def read(self, where=None, columns=None, **kwargs): if len(objs) == 1: wp = objs[0] else: - wp = concat(objs, axis=0, verify_integrity=False) + wp = concat(objs, axis=0, verify_integrity=False).consolidate() # apply the selection filters & axis orderings wp = self.process_axes(wp, columns=columns) @@ -3504,7 +3504,7 @@ def read(self, where=None, columns=None, **kwargs): if len(frames) == 1: df = frames[0] else: - df = concat(frames, axis=1, verify_integrity=False) + df = concat(frames, axis=1, verify_integrity=False).consolidate() # apply the selection filters & axis orderings df = self.process_axes(df, columns=columns) @@ -3680,12 +3680,17 @@ class AppendableNDimTable(AppendablePanelTable): def _reindex_axis(obj, axis, labels, other=None): ax = obj._get_axis(axis) labels = _ensure_index(labels) - if other is None and labels.equals(ax): + + # try not to reindex even if other is provided + # if it equals our current index + if other is not None: + other = _ensure_index(other) + if (other is None or labels.equals(other)) and labels.equals(ax): return obj labels = _ensure_index(labels.unique()) if other is not None: - labels = labels & _ensure_index(other) + labels = labels & _ensure_index(other.unique()) if not labels.equals(ax): slicer = [ slice(None, None) ] * obj.ndim slicer[axis] = labels diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 81ea48be23d57..e9f4cf7d0f96f 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -2304,8 +2304,14 @@ def test_select_with_dups(self): with ensure_clean(self.path) as store: store.append('df',df) + result = store.select('df') - assert_frame_equal(result,df) + expected = df + assert_frame_equal(result,expected,by_blocks=True) + + result = store.select('df',columns=df.columns) + expected = df + assert_frame_equal(result,expected,by_blocks=True) result = store.select('df',columns=['A']) expected = df.loc[:,['A']] @@ -2321,15 +2327,20 @@ def test_select_with_dups(self): store.append('df',df) result = store.select('df') - assert_frame_equal(result,df) + expected = df + assert_frame_equal(result,expected,by_blocks=True) + + result = store.select('df',columns=df.columns) + expected = df + assert_frame_equal(result,expected,by_blocks=True) expected = df.loc[:,['A']] result = store.select('df',columns=['A']) - assert_frame_equal(result,expected) + assert_frame_equal(result,expected,by_blocks=True) expected = df.loc[:,['B','A']] result = store.select('df',columns=['B','A']) - assert_frame_equal(result,expected) + assert_frame_equal(result,expected,by_blocks=True) def test_wide_table_dups(self): wp = tm.makePanel() diff --git a/pandas/util/testing.py b/pandas/util/testing.py index c652c2da3214c..abc13fb2ad9ee 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -258,27 +258,41 @@ def assert_frame_equal(left, right, check_dtype=True, check_column_type=False, check_frame_type=False, check_less_precise=False, - check_names=True): + check_names=True, + by_blocks=False): if check_frame_type: assert_isinstance(left, type(right)) assert_isinstance(left, DataFrame) assert_isinstance(right, DataFrame) if check_less_precise: - assert_almost_equal(left.columns, right.columns) + if not by_blocks: + assert_almost_equal(left.columns, right.columns) assert_almost_equal(left.index, right.index) else: - assert_index_equal(left.columns, right.columns) + if not by_blocks: + assert_index_equal(left.columns, right.columns) assert_index_equal(left.index, right.index) - for i, col in enumerate(left.columns): - assert col in right - lcol = left.icol(i) - rcol = right.icol(i) - assert_series_equal(lcol, rcol, - check_dtype=check_dtype, - check_index_type=check_index_type, - check_less_precise=check_less_precise) + # compare by blocks + if by_blocks: + rblocks = right.blocks + lblocks = left.blocks + for dtype in list(set(list(lblocks.keys()) + list(rblocks.keys()))): + assert dtype in lblocks + assert dtype in rblocks + assert_frame_equal(lblocks[dtype],rblocks[dtype],check_dtype=check_dtype) + + # compare by columns + else: + for i, col in enumerate(left.columns): + assert col in right + lcol = left.icol(i) + rcol = right.icol(i) + assert_series_equal(lcol, rcol, + check_dtype=check_dtype, + check_index_type=check_index_type, + check_less_precise=check_less_precise) if check_index_type: assert_isinstance(left.index, type(right.index))