ENH: Add wide_to_long helper function.

jseabold · jseabold · commit 798cf28748a4 · 2013-12-03T12:10:46.000Z
diff --git a/pandas/core/api.py b/pandas/core/api.py
@@ -15,7 +15,7 @@
 from pandas.core.panel4d import Panel4D
 from pandas.core.groupby import groupby
 from pandas.core.reshape import (pivot_simple as pivot, get_dummies,
-                                 lreshape)
+                                 lreshape, wide_to_long)
 
 WidePanel = Panel
 
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
@@ -786,6 +786,82 @@ def lreshape(data, groups, dropna=True, label=None):
 
     return DataFrame(mdata, columns=id_cols + pivot_cols)
 
+def wide_to_long(df, stubnames, i, j):
+    """
+    Wide panel to long format. Less flexible but more user-friendly than melt.
+
+    Parameters
+    ----------
+    df : DataFrame
+        The wide-format DataFrame
+    stubnames : list
+        A list of stub names. The wide format variables are assumed to
+        start with the stub names (literal prefixes). Never modified.
+    i : str
+        The name of the id variable.
+    j : str
+        The name of the subobservation variable; holds the integer suffixes.
+
+    Returns
+    -------
+    DataFrame
+        A DataFrame indexed by i and j that contains each stub name as a
+        variable, plus every extra (non-stub) column of `df`.
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> import numpy as np
+    >>> np.random.seed(123)
+    >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
+    ...                    "A1980" : {0 : "d", 1 : "e", 2 : "f"},
+    ...                    "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
+    ...                    "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
+    ...                    "X"     : dict(zip(range(3), np.random.randn(3)))
+    ...                   })
+    >>> df["id"] = df.index
+    >>> df
+      A1970 A1980  B1970  B1980         X  id
+    0     a     d    2.5    3.2 -1.085631   0
+    1     b     e    1.2    1.3  0.997345   1
+    2     c     f    0.7    0.1  0.282978   2
+    >>> wide_to_long(df, ["A", "B"], i="id", j="year")
+                    X  A    B
+    id year
+    0  1970 -1.085631  a  2.5
+    1  1970  0.997345  b  1.2
+    2  1970  0.282978  c  0.7
+    0  1980 -1.085631  d  3.2
+    1  1980  0.997345  e  1.3
+    2  1980  0.282978  f  0.1
+
+    Notes
+    -----
+    All extra variables are treated as extra id variables. This simply uses
+    `pandas.melt` under the hood, but is hard-coded to "do the right thing"
+    in a typical case.
+    """
+    stubs = list(stubnames)  # copy so the caller's list is never mutated
+
+    def melt_stub(df, stub, i, j):
+        # Match the stub as a literal prefix, not as a regex pattern.
+        varnames = [c for c in df.columns if str(c).startswith(stub)]
+        newdf = melt(df, id_vars=i, value_vars=varnames,
+                     value_name=stub, var_name=j)
+        # Strip only the leading stub; what remains is the j value.
+        newdf[j] = newdf[j].str[len(stub):].astype(int)
+        return newdf
+
+    # Every column that does not start with a stub is carried along as an id.
+    id_vars = [c for c in df.columns if not str(c).startswith(tuple(stubs))]
+    if i not in id_vars:
+        id_vars += [i]
+
+    newdf = melt_stub(df, stubs[0], id_vars, j)
+    for stub in stubs[1:]:
+        new = melt_stub(df, stub, id_vars, j)
+        newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False)
+    return newdf.set_index([i, j])
 
 def convert_dummies(data, cat_variables, prefix_sep='_'):
     """
diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py
@@ -15,7 +15,8 @@
 from pandas.util.testing import assert_frame_equal
 from numpy.testing import assert_array_equal
 
-from pandas.core.reshape import melt, convert_dummies, lreshape, get_dummies
+from pandas.core.reshape import (melt, convert_dummies, lreshape, get_dummies,
+                                 wide_to_long)
 import pandas.util.testing as tm
 from pandas.compat import StringIO, cPickle, range
 
@@ -296,6 +297,27 @@ def test_pairs(self):
                 'wt': ['wt%d' % i for i in range(1, 4)]}
         self.assertRaises(ValueError, lreshape, df, spec)
 
+class TestWideToLong(tm.TestCase):
+    def test_simple(self):
+        # Two stubs (A, B) across two years; X is an extra, non-stub column.
+        np.random.seed(123)
+        noise = np.random.randn(3)
+        wide = pd.DataFrame({"A1970": {0: "a", 1: "b", 2: "c"},
+                             "A1980": {0: "d", 1: "e", 2: "f"},
+                             "B1970": {0: 2.5, 1: 1.2, 2: .7},
+                             "B1980": {0: 3.2, 1: 1.3, 2: .1},
+                             "X": dict(zip(range(3), noise))})
+        wide["id"] = wide.index
+        # Expected rows: all ids for 1970 first, then all ids for 1980.
+        expected = DataFrame({"X": noise.tolist() * 2,
+                              "A": ['a', 'b', 'c', 'd', 'e', 'f'],
+                              "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
+                              "year": [1970] * 3 + [1980] * 3,
+                              "id": [0, 1, 2] * 2})
+        expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
+        result = wide_to_long(wide, ["A", "B"], i="id", j="year")
+        tm.assert_frame_equal(result, expected)
+
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],