Skip to content

ENH: Melt with MultiIndex columns #4150

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 12, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ pandas 0.12
to specify custom column names of the returned DataFrame (:issue:`3649`),
thanks @hoechenberger. If ``var_name`` is not specified and ``dataframe.columns.name``
is not None, then this will be used as the ``var_name`` (:issue:`4144`).
Also adds support for MultiIndex columns.
- clipboard functions use pyperclip (no dependencies on Windows, alternative
dependencies offered for Linux) (:issue:`3837`).
- Plotting functions now raise a ``TypeError`` before trying to plot anything
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1653,7 +1653,9 @@ def get_level_values(self, level):
num = self._get_level_number(level)
unique_vals = self.levels[num] # .values
labels = self.labels[num]
return unique_vals.take(labels)
values = unique_vals.take(labels)
values.name = self.names[num]
return values

def format(self, space=2, sparsify=None, adjoin=True, names=False,
na_rep='NaN', formatter=None):
Expand Down
108 changes: 76 additions & 32 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,20 @@

import numpy as np

import six

from pandas.core.series import Series
from pandas.core.frame import DataFrame

from pandas.core.categorical import Categorical
from pandas.core.common import (notnull, _ensure_platform_int, _maybe_promote,
_maybe_upcast, isnull)
isnull)
from pandas.core.groupby import (get_group_index, _compress_group_index,
decons_group_index)
import pandas.core.common as com
import pandas.algos as algos
from pandas import lib

from pandas.core.index import MultiIndex, Index
from pandas.core.index import MultiIndex


class ReshapeError(Exception):
Expand All @@ -35,21 +36,26 @@ class _Unstacker(object):

Examples
--------
>>> import pandas as pd
>>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
... ('two', 'a'), ('two', 'b')])
>>> s = pd.Series(np.arange(1.0, 5.0), index=index)
>>> s
one a 1.
one b 2.
two a 3.
two b 4.
one a 1
b 2
two a 3
b 4
dtype: float64

>>> s.unstack(level=-1)
a b
one 1. 2.
two 3. 4.
one 1 2
two 3 4

>>> s.unstack(level=0)
one two
a 1. 2.
b 3. 4.
a 1 3
b 2 4

Returns
-------
Expand Down Expand Up @@ -159,7 +165,7 @@ def get_result(self):
values[j] = orig_values[i]
else:
index = index.take(self.unique_groups)

return DataFrame(values, index=index, columns=columns)

def get_new_values(self):
Expand Down Expand Up @@ -601,7 +607,7 @@ def _stack_multi_columns(frame, level=-1, dropna=True):


def melt(frame, id_vars=None, value_vars=None,
var_name=None, value_name='value'):
var_name=None, value_name='value', col_level=None):
"""
"Unpivots" a DataFrame from wide format to long format, optionally leaving
id variables set
Expand All @@ -613,27 +619,47 @@ def melt(frame, id_vars=None, value_vars=None,
value_vars : tuple, list, or ndarray
var_name : scalar, if None uses frame.columns.name or 'variable'
value_name : scalar, default 'value'
col_level : scalar, if columns are a MultiIndex then use this level to melt

Examples
--------
>>> import pandas as pd
>>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
... 'B': {0: 1, 1: 3, 2: 5},
... 'C': {0: 2, 1: 4, 2: 6}})

>>> df
A B C
a 1 2
b 3 4
c 5 6
A B C
0 a 1 2
1 b 3 4
2 c 5 6

>>> melt(df, id_vars=['A'], value_vars=['B'])
A variable value
a B 1
b B 3
c B 5
A variable value
0 a B 1
1 b B 3
2 c B 5

>>> melt(df, id_vars=['A'], value_vars=['B'],
... var_name='myVarname', value_name='myValname')
A myVarname myValname
a B 1
b B 3
c B 5
A myVarname myValname
0 a B 1
1 b B 3
2 c B 5

>>> df.columns = [list('ABC'), list('DEF')]

>>> melt(df, col_level=0, id_vars=['A'], value_vars=['B'])
A variable value
0 a B 1
1 b B 3
2 c B 5

>>> melt(df, id_vars=[('A', 'D')], value_vars=[('B', 'E')])
(A, D) variable_0 variable_1 value
0 a B E 1
1 b B E 3
2 c B E 5

"""
# TODO: what about the existing index?
Expand All @@ -652,8 +678,21 @@ def melt(frame, id_vars=None, value_vars=None,
else:
frame = frame.copy()

if col_level is not None: # allow list or other?
frame.columns = frame.columns.get_level_values(col_level) # frame is a copy

if var_name is None:
var_name = frame.columns.name if frame.columns.name is not None else 'variable'
if isinstance(frame.columns, MultiIndex):
if len(frame.columns.names) == len(set(frame.columns.names)):
var_name = frame.columns.names
else:
var_name = ['variable_%s' % i for i in
xrange(len(frame.columns.names))]
else:
var_name = [frame.columns.name if frame.columns.name is not None
else 'variable']
if isinstance(var_name, six.string_types):
var_name = [var_name]

N, K = frame.shape
K -= len(id_vars)
Expand All @@ -662,11 +701,13 @@ def melt(frame, id_vars=None, value_vars=None,
for col in id_vars:
mdata[col] = np.tile(frame.pop(col).values, K)

mcolumns = id_vars + [var_name, value_name]
mcolumns = id_vars + var_name + [value_name]

mdata[value_name] = frame.values.ravel('F')
mdata[var_name] = np.asarray(frame.columns).repeat(N)

for i, col in enumerate(var_name):
# asanyarray will keep the columns as an Index
mdata[col] = np.asanyarray(frame.columns.get_level_values(i)).repeat(N)

return DataFrame(mdata, columns=mcolumns)


Expand All @@ -683,13 +724,16 @@ def lreshape(data, groups, dropna=True, label=None):

Examples
--------
>>> import pandas as pd
>>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
... 'team': ['Red Sox', 'Yankees'],
... 'year1': [2007, 2008], 'year2': [2008, 2008]})
>>> data
hr1 hr2 team year1 year2
0 514 545 Red Sox 2007 2008
1 573 526 Yankees 2007 2008

>>> pd.lreshape(data, {'year': ['year1', 'year2'],
'hr': ['hr1', 'hr2']})
>>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
team hr year
0 Red Sox 514 2007
1 Yankees 573 2007
Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1029,6 +1029,8 @@ def test_get_level_values(self):
expected = ['foo', 'foo', 'bar', 'baz', 'qux', 'qux']
self.assert_(np.array_equal(result, expected))

self.assertEquals(result.name, 'first')

result = self.index.get_level_values('first')
expected = self.index.get_level_values(0)
self.assert_(np.array_equal(result, expected))
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import nose

from pandas import DataFrame
import pandas as pd

from numpy import nan
import numpy as np
Expand All @@ -30,6 +31,12 @@ def setUp(self):
self.var_name = 'var'
self.value_name = 'val'

self.df1 = pd.DataFrame([[ 1.067683, -1.110463, 0.20867 ],
[-1.321405, 0.368915, -1.055342],
[-0.807333, 0.08298 , -0.873361]])
self.df1.columns = [list('ABC'), list('abc')]
self.df1.columns.names = ['CAP', 'low']

def test_default_col_names(self):
result = melt(self.df)
self.assertEqual(result.columns.tolist(), ['variable', 'value'])
Expand Down Expand Up @@ -128,6 +135,17 @@ def test_custom_var_and_value_name(self):
result20 = melt(self.df)
self.assertEqual(result20.columns.tolist(), ['foo', 'value'])

def test_col_level(self):
    """Melting on one level of MultiIndex columns names the variable column
    after that level.

    ``self.df1`` has two column levels named 'CAP' and 'low'. The level is
    selected once by position (0) and once by name ('CAP'); both calls must
    produce the columns ``['CAP', 'value']``.
    """
    res1 = melt(self.df1, col_level=0)
    res2 = melt(self.df1, col_level='CAP')
    self.assertEqual(res1.columns.tolist(), ['CAP', 'value'])
    # The original asserted on res1 twice, so the by-name lookup was never
    # actually verified.
    self.assertEqual(res2.columns.tolist(), ['CAP', 'value'])

def test_multiindex(self):
    """Melting a frame with MultiIndex columns and no ``col_level`` yields
    one variable column per column level, followed by the value column.
    """
    melted = pd.melt(self.df1)
    column_labels = melted.columns.tolist()
    self.assertEqual(column_labels, ['CAP', 'low', 'value'])


class TestConvertDummies(unittest.TestCase):
def test_convert_dummies(self):
df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
Expand Down