Closed
Description
There are already a few implementations floating around, but it would be nice to have something more structured, with more options, and living in the pandas namespace.
From an e-mail on the statsmodels mailing list:
Here's a quick hack at it (it looks not too dissimilar to Aman's code).
I should find a place in the library to put this:
def make_dummies(data, cat_variables):
    """Replace categorical columns of ``data`` with indicator columns.

    Parameters
    ----------
    data : DataFrame
        Input frame; it is not modified.
    cat_variables : list of column labels
        Columns to drop from ``data`` and expand via ``_get_dummy_frame``.

    Returns
    -------
    DataFrame
        ``data`` without the ``cat_variables`` columns, with the frame
        returned by ``_get_dummy_frame`` for each of them joined on, in
        the order the variables were given.
    """
    # Start from the columns that are left alone, then join on the
    # indicator frame of each categorical column in turn.
    encoded = data.drop(cat_variables, axis=1)
    for name in cat_variables:
        encoded = encoded.join(_get_dummy_frame(data, name))
    return encoded
def _get_dummy_frame(data, column):
from pandas import Factor
factor = Factor(data[column])
dummy_mat = np.eye(len(factor.levels)).take(factor.labels, axis=0)
dummy_cols = ['%s.%s' % (column, v) for v in factor.levels]
dummies = DataFrame(dummy_mat, index=data.index,
columns=dummy_cols)
return dummies
In [29]: df
Out[29]:
gender hand color height age
0 male right green 5.75 23
1 female right brown 5.42 27
2 female left green 5.58 31
3 male right brown 5.92 39
4 male right blue 5.83 33
In [30]: make_dummies(df, ['gender', 'hand', 'color']).T
Out[30]:
0 1 2 3 4
height 5.75 5.42 5.58 5.92 5.83
age 23 27 31 39 33
gender.female 0 1 1 0 0
gender.male 1 0 0 1 1
hand.left 0 0 1 0 0
hand.right 1 1 0 1 1
color.blue 0 0 0 0 1
color.brown 0 1 0 1 0
color.green 1 0 1 0 0
(BTW I read in that data using df = read_clipboard(sep=','))