Skip to content

Commit 798cf28

Browse files
committed
ENH: Add wide_to_long helper function.
1 parent 6b2c5fd commit 798cf28

File tree

3 files changed

+100
-2
lines changed

3 files changed

+100
-2
lines changed

pandas/core/api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from pandas.core.panel4d import Panel4D
1616
from pandas.core.groupby import groupby
1717
from pandas.core.reshape import (pivot_simple as pivot, get_dummies,
18-
lreshape)
18+
lreshape, wide_to_long)
1919

2020
WidePanel = Panel
2121

pandas/core/reshape.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -786,6 +786,82 @@ def lreshape(data, groups, dropna=True, label=None):
786786

787787
return DataFrame(mdata, columns=id_cols + pivot_cols)
788788

789+
def wide_to_long(df, stubnames, i, j):
    """
    Wide panel to long format. Less flexible but more user-friendly than melt.

    Parameters
    ----------
    df : DataFrame
        The wide-format DataFrame
    stubnames : list
        A list of stub names. The wide format variables are assumed to
        start with the stub names. The list is not modified.
    i : str
        The name of the id variable.
    j : str
        The name of the subobservation variable. The part of each wide
        column name that remains after removing the stub is converted to
        int and stored under this name.

    Returns
    -------
    DataFrame
        A DataFrame that contains each stub name as a variable as well as
        variables for i and j. The result is indexed by ``[i, j]``.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> np.random.seed(123)
    >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
    ...                    "A1980" : {0 : "d", 1 : "e", 2 : "f"},
    ...                    "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
    ...                    "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
    ...                    "X"     : dict(zip(range(3), np.random.randn(3)))
    ...                   })
    >>> df["id"] = df.index
    >>> df
      A1970 A1980  B1970  B1980         X  id
    0     a     d    2.5    3.2 -1.085631   0
    1     b     e    1.2    1.3  0.997345   1
    2     c     f    0.7    0.1  0.282978   2
    >>> wide_to_long(df, ["A", "B"], i="id", j="year")
                    X  A    B
    id year
    0  1970 -1.085631  a  2.5
    1  1970  0.997345  b  1.2
    2  1970  0.282978  c  0.7
    0  1980 -1.085631  d  3.2
    1  1980  0.997345  e  1.3
    2  1980  0.282978  f  0.1

    Notes
    -----
    All extra variables are treated as extra id variables. This simply uses
    `pandas.melt` under the hood, but is hard-coded to "do the right thing"
    in a typical case.
    """
    def get_var_names(df, regex):
        # Names of the columns whose label matches ``regex``.
        return df.filter(regex=regex).columns.tolist()

    def melt_stub(df, stub, i, j):
        # Melt every column starting with ``stub`` into one value column
        # named ``stub``; what is left of the column name becomes ``j``.
        varnames = get_var_names(df, "^"+stub)
        newdf = melt(df, id_vars=i, value_vars=varnames,
                     value_name=stub, var_name=j)
        newdf[j] = newdf[j].str.replace(stub, "").astype(int)
        return newdf

    # Work on a private copy so the caller's ``stubnames`` is never mutated
    # (the list used to lose its first element on every call).
    stubnames = list(stubnames)

    # Every column that does not start with one of the stubs is carried
    # along as an id variable; ``i`` is always part of that set.
    id_vars = get_var_names(df, "^(?!%s)" % "|".join(stubnames))
    if i not in id_vars:
        id_vars += [i]

    newdf = melt_stub(df, stubnames[0], id_vars, j)

    # Melt the remaining stubs one at a time and join them on the shared
    # id variables plus ``j``.
    for stub in stubnames[1:]:
        new = melt_stub(df, stub, id_vars, j)
        newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False)
    return newdf.set_index([i, j])
789865

790866
def convert_dummies(data, cat_variables, prefix_sep='_'):
791867
"""

pandas/tests/test_reshape.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
from pandas.util.testing import assert_frame_equal
1616
from numpy.testing import assert_array_equal
1717

18-
from pandas.core.reshape import melt, convert_dummies, lreshape, get_dummies
18+
from pandas.core.reshape import (melt, convert_dummies, lreshape, get_dummies,
19+
wide_to_long)
1920
import pandas.util.testing as tm
2021
from pandas.compat import StringIO, cPickle, range
2122

@@ -296,6 +297,27 @@ def test_pairs(self):
296297
'wt': ['wt%d' % i for i in range(1, 4)]}
297298
self.assertRaises(ValueError, lreshape, df, spec)
298299

300+
class TestWideToLong(tm.TestCase):
    """Checks for ``wide_to_long``."""

    def test_simple(self):
        """Two stubs, two years, one extra id column."""
        np.random.seed(123)
        noise = np.random.randn(3)

        # Wide input: one column per (stub, year) pair plus the extra "X".
        wide = pd.DataFrame({
            "A1970": {0: "a", 1: "b", 2: "c"},
            "A1980": {0: "d", 1: "e", 2: "f"},
            "B1970": {0: 2.5, 1: 1.2, 2: .7},
            "B1980": {0: 3.2, 1: 1.3, 2: .1},
            "X": dict(zip(range(3), noise)),
        })
        wide["id"] = wide.index

        # Expected long output: the 1970 rows followed by the 1980 rows,
        # with "X" repeated once per year.
        expected = DataFrame({
            "X": noise.tolist() + noise.tolist(),
            "A": ['a', 'b', 'c', 'd', 'e', 'f'],
            "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
            "year": [1970, 1970, 1970, 1980, 1980, 1980],
            "id": [0, 1, 2, 0, 1, 2],
        })
        expected = expected.set_index(['id', 'year'])
        expected = expected[["X", "A", "B"]]

        result = wide_to_long(wide, ["A", "B"], i="id", j="year")
        tm.assert_frame_equal(result, expected)
320+
299321

300322
if __name__ == '__main__':
301323
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)