Skip to content

BUG: Panel.to_frame() with MultiIndex major axis #5417

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 15, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ Bug Fixes
- Fixed string-representation of ``NaT`` to be "NaT" (:issue:`5708`)
- Fixed string-representation for Timestamp to show nanoseconds if present (:issue:`5912`)
- ``pd.match`` not returning passed sentinel
- ``Panel.to_frame()`` no longer fails when ``major_axis`` is a
``MultiIndex`` (:issue:`5402`).

pandas 0.13.0
-------------
Expand Down
38 changes: 38 additions & 0 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2396,6 +2396,44 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False,
else:
return result_levels

def to_hierarchical(self, n_repeat, n_shuffle=1):
    """
    Return a MultiIndex reshaped to conform to the
    shapes given by n_repeat and n_shuffle.

    Useful to replicate and rearrange a MultiIndex for combination
    with another Index with n_repeat items.

    Parameters
    ----------
    n_repeat : int
        Number of times to repeat the labels on self
    n_shuffle : int
        Controls the reordering of the labels. If the result is going
        to be an inner level in a MultiIndex, n_shuffle will need to be
        greater than one. The length of each label array after
        repetition must be divisible by n_shuffle, otherwise the
        reshape raises ``ValueError``.

    Returns
    -------
    MultiIndex
        Same levels and names as self, with the repeated (and possibly
        interleaved) labels.

    Examples
    --------
    >>> idx = MultiIndex.from_tuples([(1, u'one'), (1, u'two'),
    ...                               (2, u'one'), (2, u'two')])
    >>> idx.to_hierarchical(3)
    MultiIndex(levels=[[1, 2], [u'one', u'two']],
               labels=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
                       [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]])
    """
    levels = self.levels
    labels = [np.repeat(x, n_repeat) for x in self.labels]
    # Split each repeated label array into n_shuffle rows and read it back
    # column by column, which interleaves the rows. The order is spelled
    # 'F' explicitly: passing an integer as ``order`` is not accepted by
    # current NumPy releases.
    labels = [x.reshape(n_shuffle, -1).ravel(order='F') for x in labels]
    names = self.names
    return MultiIndex(levels=levels, labels=labels, names=names)

@property
def is_all_dates(self):
return False
Expand Down
51 changes: 40 additions & 11 deletions pandas/core/panel.py
Original file line number Diff line number Diff line change
Expand Up @@ -796,7 +796,9 @@ def groupby(self, function, axis='major'):

def to_frame(self, filter_observations=True):
"""
Transform wide format into long (stacked) format as DataFrame
Transform wide format into long (stacked) format as DataFrame whose
columns are the Panel's items and whose index is a MultiIndex formed
of the Panel's major and minor axes.

Parameters
----------
Expand All @@ -811,6 +813,7 @@ def to_frame(self, filter_observations=True):
_, N, K = self.shape

if filter_observations:
# shaped like the return DataFrame
mask = com.notnull(self.values).all(axis=0)
# size = mask.sum()
selector = mask.ravel()
Expand All @@ -822,19 +825,45 @@ def to_frame(self, filter_observations=True):
for item in self.items:
data[item] = self[item].values.ravel()[selector]

major_labels = np.arange(N).repeat(K)[selector]
def construct_multi_parts(idx, n_repeat, n_shuffle=1):
    # Expand the MultiIndex axis via to_hierarchical, then keep only the
    # positions picked out by ``selector`` in every label array.
    expanded = idx.to_hierarchical(n_repeat, n_shuffle)
    kept_labels = []
    for level_labels in expanded.labels:
        kept_labels.append(level_labels[selector])
    return kept_labels, expanded.levels, expanded.names

def construct_index_parts(idx, major=True):
    # Build one-element labels/levels/names lists for a flat (non-Multi)
    # axis so they can be concatenated with the parts of the other axis.
    if major:
        # every major position is repeated K times in a row
        positions = np.arange(N).repeat(K)
        fallback_name = 'major'
    else:
        # the positions 0..K-1 are cycled N times
        positions = np.tile(np.arange(K), N)
        fallback_name = 'minor'
    axis_name = idx.name or fallback_name
    return [positions[selector]], [idx], [axis_name]

if isinstance(self.major_axis, MultiIndex):
major_labels, major_levels, major_names = construct_multi_parts(
self.major_axis, n_repeat=K)
else:
major_labels, major_levels, major_names = construct_index_parts(
self.major_axis)

# Anyone think of a better way to do this? np.repeat does not
# do what I want
minor_labels = np.arange(K).reshape(1, K)[np.zeros(N, dtype=int)]
minor_labels = minor_labels.ravel()[selector]
if isinstance(self.minor_axis, MultiIndex):
minor_labels, minor_levels, minor_names = construct_multi_parts(
self.minor_axis, n_repeat=N, n_shuffle=K)
else:
minor_labels, minor_levels, minor_names = construct_index_parts(
self.minor_axis, major=False)

maj_name = self.major_axis.name or 'major'
min_name = self.minor_axis.name or 'minor'
levels = major_levels + minor_levels
labels = major_labels + minor_labels
names = major_names + minor_names

index = MultiIndex(levels=[self.major_axis, self.minor_axis],
labels=[major_labels, minor_labels],
names=[maj_name, min_name], verify_integrity=False)
index = MultiIndex(levels=levels, labels=labels,
names=names, verify_integrity=False)

return DataFrame(data, index=index, columns=self.items)

Expand Down
30 changes: 30 additions & 0 deletions pandas/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1990,6 +1990,36 @@ def test_format_sparse_config(self):

warnings.filters = warn_filters

def test_to_hierarchical(self):
    # Check to_hierarchical for plain repetition, for interleaving via
    # n_shuffle, and for an index that is not sorted and carries names.
    base = MultiIndex.from_tuples([(1, 'one'), (1, 'two'),
                                   (2, 'one'), (2, 'two')])

    cases = [
        # n_repeat only: each entry is repeated three times in a row
        ((3,), [[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
                [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]]),
        # n_shuffle > 1: the repeated labels are interleaved
        ((3, 2), [[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
                  [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]),
    ]
    for args, exp_labels in cases:
        result = base.to_hierarchical(*args)
        expected = MultiIndex(levels=[[1, 2], ['one', 'two']],
                              labels=exp_labels)
        tm.assert_index_equal(result, expected)
        self.assertEqual(result.names, base.names)

    # non-sorted
    raw = [(2, 'c'), (1, 'b'), (2, 'a'), (2, 'b')]
    unsorted = MultiIndex.from_tuples(raw, names=['N1', 'N2'])

    result = unsorted.to_hierarchical(2)
    doubled = [tup for tup in raw for _ in range(2)]
    expected = MultiIndex.from_tuples(doubled, names=['N1', 'N2'])
    tm.assert_index_equal(result, expected)
    self.assertEqual(result.names, unsorted.names)

def test_bounds(self):
self.index._bounds

Expand Down
80 changes: 80 additions & 0 deletions pandas/tests/test_panel.py
Original file line number Diff line number Diff line change
Expand Up @@ -1450,6 +1450,86 @@ def test_to_frame_mixed(self):
# Previously, this was mutating the underlying index and changing its name
assert_frame_equal(wp['bool'], panel['bool'], check_names=False)

def test_to_frame_multi_major(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Standard notes on testing a MultiIndex (MI): make sure you have one example that is not lexsorted and one that has NaN in some of the levels.

idx = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'),
(2, 'two')])
df = DataFrame([[1, 'a', 1], [2, 'b', 1], [3, 'c', 1], [4, 'd', 1]],
columns=['A', 'B', 'C'], index=idx)
wp = Panel({'i1': df, 'i2': df})
expected_idx = MultiIndex.from_tuples([(1, 'one', 'A'), (1, 'one', 'B'),
(1, 'one', 'C'), (1, 'two', 'A'),
(1, 'two', 'B'), (1, 'two', 'C'),
(2, 'one', 'A'), (2, 'one', 'B'),
(2, 'one', 'C'), (2, 'two', 'A'),
(2, 'two', 'B'), (2, 'two', 'C')],
names=[None, None, 'minor'])
expected = DataFrame({'i1': [1, 'a', 1, 2, 'b', 1, 3, 'c', 1, 4, 'd', 1],
'i2': [1, 'a', 1, 2, 'b', 1, 3, 'c', 1, 4, 'd', 1]},
index=expected_idx)
result = wp.to_frame()
assert_frame_equal(result, expected)

wp.iloc[0, 0].iloc[0] = np.nan # BUG on setting. GH #5773
result = wp.to_frame()
assert_frame_equal(result, expected[1:])

idx = MultiIndex.from_tuples([(1, 'two'), (1, 'one'), (2, 'one'),
(np.nan, 'two')])
df = DataFrame([[1, 'a', 1], [2, 'b', 1], [3, 'c', 1], [4, 'd', 1]],
columns=['A', 'B', 'C'], index=idx)
wp = Panel({'i1': df, 'i2': df})
ex_idx = MultiIndex.from_tuples([(1, 'two', 'A'), (1, 'two', 'B'), (1, 'two', 'C'),
(1, 'one', 'A'), (1, 'one', 'B'), (1, 'one', 'C'),
(2, 'one', 'A'), (2, 'one', 'B'), (2, 'one', 'C'),
(np.nan, 'two', 'A'), (np.nan, 'two', 'B'),
(np.nan, 'two', 'C')],
names=[None, None, 'minor'])
expected.index = ex_idx
result = wp.to_frame()
assert_frame_equal(result, expected)

def test_to_frame_multi_major_minor(self):
    # Both the major and the minor axis are MultiIndexes; the stacked
    # frame's index is expected to hold every (major, minor) combination
    # with the major entries varying slowest.
    row_keys = [(1, 'one'), (1, 'two'), (2, 'one'),
                (2, 'two'), (3, 'three'), (4, 'four')]
    col_keys = [('C_A', 'C_1'), ('C_A', 'C_2'),
                ('C_B', 'C_1'), ('C_B', 'C_2')]
    rows = [[1, 2, 11, 12], [3, 4, 13, 14], ['a', 'b', 'w', 'x'],
            ['c', 'd', 'y', 'z'], [-1, -2, -3, -4], [-5, -6, -7, -8]]

    cols = MultiIndex(levels=[['C_A', 'C_B'], ['C_1', 'C_2']],
                      labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
    idx = MultiIndex.from_tuples(row_keys)
    df = DataFrame(rows, columns=cols, index=idx)
    wp = Panel({'i1': df, 'i2': df})

    # one 4-tuple per (row, column) pair, rows outermost
    exp_idx = MultiIndex.from_tuples(
        [row + col for row in row_keys for col in col_keys],
        names=[None, None, None, None])
    # both items hold the same frame, so every value appears twice
    exp_val = [[value, value] for row in rows for value in row]

    result = wp.to_frame()
    expected = DataFrame(exp_val, columns=['i1', 'i2'], index=exp_idx)
    assert_frame_equal(result, expected)

def test_to_frame_multi_drop_level(self):
    # The (1, 'one') row holds NaN and is absent from the expected
    # result, so only the two rows under major key 2 remain.
    major = MultiIndex.from_tuples([(1, 'one'), (2, 'one'), (2, 'two')])
    frame = DataFrame({'A': [np.nan, 1, 2]}, index=major)
    panel = Panel({'i1': frame, 'i2': frame})

    remaining = MultiIndex.from_tuples([(2, 'one', 'A'), (2, 'two', 'A')],
                                       names=[None, None, 'minor'])
    expected = DataFrame({'i1': [1., 2], 'i2': [1., 2]}, index=remaining)

    assert_frame_equal(panel.to_frame(), expected)

def test_to_panel_na_handling(self):
df = DataFrame(np.random.randint(0, 10, size=20).reshape((10, 2)),
index=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
Expand Down