diff --git a/doc/source/release.rst b/doc/source/release.rst index 9a0854494a897..8179c710b7a8a 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -104,6 +104,8 @@ Bug Fixes - Fixed string-representation of ``NaT`` to be "NaT" (:issue:`5708`) - Fixed string-representation for Timestamp to show nanoseconds if present (:issue:`5912`) - ``pd.match`` not returning passed sentinel + - ``Panel.to_frame()`` no longer fails when ``major_axis`` is a + ``MultiIndex`` (:issue:`5402`). pandas 0.13.0 ------------- diff --git a/pandas/core/index.py b/pandas/core/index.py index 5c77c1e5e9516..ed964e76dd470 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -2396,6 +2396,44 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False, else: return result_levels + def to_hierarchical(self, n_repeat, n_shuffle=1): + """ + Return a MultiIndex reshaped to conform to the + shapes given by n_repeat and n_shuffle. + + Useful to replicate and rearrange a MultiIndex for combination + with another Index with n_repeat items. + + Parameters + ---------- + n_repeat : int + Number of times to repeat the labels on self + n_shuffle : int + Controls the reordering of the labels. If the result is going + to be an inner level in a MultiIndex, n_shuffle will need to be + greater than one. The size of each label must be divisible by + n_shuffle. 
+ + Returns + ------- + MultiIndex + + Examples + -------- + >>> idx = MultiIndex.from_tuples([(1, u'one'), (1, u'two'), + (2, u'one'), (2, u'two')]) + >>> idx.to_hierarchical(3) + MultiIndex(levels=[[1, 2], [u'one', u'two']], + labels=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], + [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]]) + """ + levels = self.levels + labels = [np.repeat(x, n_repeat) for x in self.labels] + # Assumes that each label is divisible by n_shuffle + labels = [x.reshape(n_shuffle, -1).ravel(1) for x in labels] + names = self.names + return MultiIndex(levels=levels, labels=labels, names=names) + @property def is_all_dates(self): return False diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 8c50396c503a0..832874f08561b 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -796,7 +796,9 @@ def groupby(self, function, axis='major'): def to_frame(self, filter_observations=True): """ - Transform wide format into long (stacked) format as DataFrame + Transform wide format into long (stacked) format as DataFrame whose + columns are the Panel's items and whose index is a MultiIndex formed + of the Panel's major and minor axes. 
Parameters ---------- @@ -811,6 +813,7 @@ def to_frame(self, filter_observations=True): _, N, K = self.shape if filter_observations: + # shaped like the return DataFrame mask = com.notnull(self.values).all(axis=0) # size = mask.sum() selector = mask.ravel() @@ -822,19 +825,45 @@ def to_frame(self, filter_observations=True): for item in self.items: data[item] = self[item].values.ravel()[selector] - major_labels = np.arange(N).repeat(K)[selector] + def construct_multi_parts(idx, n_repeat, n_shuffle=1): + axis_idx = idx.to_hierarchical(n_repeat, n_shuffle) + labels = [x[selector] for x in axis_idx.labels] + levels = axis_idx.levels + names = axis_idx.names + return labels, levels, names + + def construct_index_parts(idx, major=True): + levels = [idx] + if major: + labels = [np.arange(N).repeat(K)[selector]] + names = idx.name or 'major' + else: + labels = np.arange(K).reshape(1, K)[np.zeros(N, dtype=int)] + labels = [labels.ravel()[selector]] + names = idx.name or 'minor' + names = [names] + return labels, levels, names + + if isinstance(self.major_axis, MultiIndex): + major_labels, major_levels, major_names = construct_multi_parts( + self.major_axis, n_repeat=K) + else: + major_labels, major_levels, major_names = construct_index_parts( + self.major_axis) - # Anyone think of a better way to do this? 
np.repeat does not - # do what I want - minor_labels = np.arange(K).reshape(1, K)[np.zeros(N, dtype=int)] - minor_labels = minor_labels.ravel()[selector] + if isinstance(self.minor_axis, MultiIndex): + minor_labels, minor_levels, minor_names = construct_multi_parts( + self.minor_axis, n_repeat=N, n_shuffle=K) + else: + minor_labels, minor_levels, minor_names = construct_index_parts( + self.minor_axis, major=False) - maj_name = self.major_axis.name or 'major' - min_name = self.minor_axis.name or 'minor' + levels = major_levels + minor_levels + labels = major_labels + minor_labels + names = major_names + minor_names - index = MultiIndex(levels=[self.major_axis, self.minor_axis], - labels=[major_labels, minor_labels], - names=[maj_name, min_name], verify_integrity=False) + index = MultiIndex(levels=levels, labels=labels, + names=names, verify_integrity=False) return DataFrame(data, index=index, columns=self.items) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 1afabc8d4c882..7daf95ac15a95 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -1990,6 +1990,36 @@ def test_format_sparse_config(self): warnings.filters = warn_filters + def test_to_hierarchical(self): + index = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), + (2, 'one'), (2, 'two')]) + result = index.to_hierarchical(3) + expected = MultiIndex(levels=[[1, 2], ['one', 'two']], + labels=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], + [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]]) + tm.assert_index_equal(result, expected) + self.assertEqual(result.names, index.names) + + # K > 1 + result = index.to_hierarchical(3, 2) + expected = MultiIndex(levels=[[1, 2], ['one', 'two']], + labels=[[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1], + [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]) + tm.assert_index_equal(result, expected) + self.assertEqual(result.names, index.names) + + # non-sorted + index = MultiIndex.from_tuples([(2, 'c'), (1, 'b'), + (2, 'a'), (2, 'b')], + names=['N1', 'N2']) + + result = 
index.to_hierarchical(2) + expected = MultiIndex.from_tuples([(2, 'c'), (2, 'c'), (1, 'b'), (1, 'b'), + (2, 'a'), (2, 'a'), (2, 'b'), (2, 'b')], + names=['N1', 'N2']) + tm.assert_index_equal(result, expected) + self.assertEqual(result.names, index.names) + def test_bounds(self): self.index._bounds diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 08d3afe63ec86..2589f7b82aedb 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -1450,6 +1450,86 @@ def test_to_frame_mixed(self): # Previously, this was mutating the underlying index and changing its name assert_frame_equal(wp['bool'], panel['bool'], check_names=False) + def test_to_frame_multi_major(self): + idx = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), + (2, 'two')]) + df = DataFrame([[1, 'a', 1], [2, 'b', 1], [3, 'c', 1], [4, 'd', 1]], + columns=['A', 'B', 'C'], index=idx) + wp = Panel({'i1': df, 'i2': df}) + expected_idx = MultiIndex.from_tuples([(1, 'one', 'A'), (1, 'one', 'B'), + (1, 'one', 'C'), (1, 'two', 'A'), + (1, 'two', 'B'), (1, 'two', 'C'), + (2, 'one', 'A'), (2, 'one', 'B'), + (2, 'one', 'C'), (2, 'two', 'A'), + (2, 'two', 'B'), (2, 'two', 'C')], + names=[None, None, 'minor']) + expected = DataFrame({'i1': [1, 'a', 1, 2, 'b', 1, 3, 'c', 1, 4, 'd', 1], + 'i2': [1, 'a', 1, 2, 'b', 1, 3, 'c', 1, 4, 'd', 1]}, + index=expected_idx) + result = wp.to_frame() + assert_frame_equal(result, expected) + + wp.iloc[0, 0].iloc[0] = np.nan # BUG on setting. 
GH #5773 + result = wp.to_frame() + assert_frame_equal(result, expected[1:]) + + idx = MultiIndex.from_tuples([(1, 'two'), (1, 'one'), (2, 'one'), + (np.nan, 'two')]) + df = DataFrame([[1, 'a', 1], [2, 'b', 1], [3, 'c', 1], [4, 'd', 1]], + columns=['A', 'B', 'C'], index=idx) + wp = Panel({'i1': df, 'i2': df}) + ex_idx = MultiIndex.from_tuples([(1, 'two', 'A'), (1, 'two', 'B'), (1, 'two', 'C'), + (1, 'one', 'A'), (1, 'one', 'B'), (1, 'one', 'C'), + (2, 'one', 'A'), (2, 'one', 'B'), (2, 'one', 'C'), + (np.nan, 'two', 'A'), (np.nan, 'two', 'B'), + (np.nan, 'two', 'C')], + names=[None, None, 'minor']) + expected.index = ex_idx + result = wp.to_frame() + assert_frame_equal(result, expected) + + def test_to_frame_multi_major_minor(self): + cols = MultiIndex(levels=[['C_A', 'C_B'], ['C_1', 'C_2']], + labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) + idx = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), + (2, 'two'), (3, 'three'), (4, 'four')]) + df = DataFrame([[1, 2, 11, 12], [3, 4, 13, 14], ['a', 'b', 'w', 'x'], + ['c', 'd', 'y', 'z'], [-1, -2, -3, -4], [-5, -6, -7, -8] + ], columns=cols, index=idx) + wp = Panel({'i1': df, 'i2': df}) + + exp_idx = MultiIndex.from_tuples([(1, 'one', 'C_A', 'C_1'), (1, 'one', 'C_A', 'C_2'), + (1, 'one', 'C_B', 'C_1'), (1, 'one', 'C_B', 'C_2'), + (1, 'two', 'C_A', 'C_1'), (1, 'two', 'C_A', 'C_2'), + (1, 'two', 'C_B', 'C_1'), (1, 'two', 'C_B', 'C_2'), + (2, 'one', 'C_A', 'C_1'), (2, 'one', 'C_A', 'C_2'), + (2, 'one', 'C_B', 'C_1'), (2, 'one', 'C_B', 'C_2'), + (2, 'two', 'C_A', 'C_1'), (2, 'two', 'C_A', 'C_2'), + (2, 'two', 'C_B', 'C_1'), (2, 'two', 'C_B', 'C_2'), + (3, 'three', 'C_A', 'C_1'), (3, 'three', 'C_A', 'C_2'), + (3, 'three', 'C_B', 'C_1'), (3, 'three', 'C_B', 'C_2'), + (4, 'four', 'C_A', 'C_1'), (4, 'four', 'C_A', 'C_2'), + (4, 'four', 'C_B', 'C_1'), (4, 'four', 'C_B', 'C_2')], + names=[None, None, None, None]) + exp_val = [[1, 1], [2, 2], [11, 11], [12, 12], [3, 3], [4, 4], [13, 13], + [14, 14], ['a', 'a'], ['b', 'b'], ['w', 
'w'], ['x', 'x'], + ['c', 'c'], ['d', 'd'], ['y', 'y'], ['z', 'z'], [-1, -1], + [-2, -2], [-3, -3], [-4, -4], [-5, -5], [-6, -6], [-7, -7], + [-8, -8]] + result = wp.to_frame() + expected = DataFrame(exp_val, columns=['i1', 'i2'], index=exp_idx) + assert_frame_equal(result, expected) + + def test_to_frame_multi_drop_level(self): + idx = MultiIndex.from_tuples([(1, 'one'), (2, 'one'), (2, 'two')]) + df = DataFrame({'A': [np.nan, 1, 2]}, index=idx) + wp = Panel({'i1': df, 'i2': df}) + result = wp.to_frame() + exp_idx = MultiIndex.from_tuples([(2, 'one', 'A'), (2, 'two', 'A')], + names=[None, None, 'minor']) + expected = DataFrame({'i1': [1., 2], 'i2': [1., 2]}, index=exp_idx) + assert_frame_equal(result, expected) + def test_to_panel_na_handling(self): df = DataFrame(np.random.randint(0, 10, size=20).reshape((10, 2)), index=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1],