From 45f14e81f4fed874dbb6e211292bccc2ef30e81e Mon Sep 17 00:00:00 2001 From: Bran Yang Date: Wed, 27 Jan 2016 23:31:14 +0800 Subject: [PATCH 1/3] ENH: GH12042 Add parameter `drop_first` to get_dummies to get k-1 variables out of n levels. --- doc/source/reshaping.rst | 26 +++++++++++ pandas/core/reshape.py | 35 +++++++++++++-- pandas/tests/test_reshape.py | 85 ++++++++++++++++++++++++++++++++++++ 3 files changed, 143 insertions(+), 3 deletions(-) diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index dbf3b838593a9..190b30af5acf6 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -518,6 +518,32 @@ the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways from_dict = pd.get_dummies(df, prefix={'B': 'from_B', 'A': 'from_A'}) from_dict +.. versionadded:: 0.18.0 + +Sometimes it will be useful to only keep k-1 levels of a categorical +variable to avoid collinearity when feeding the result to statistical models. +You can switch to this mode by turning on ``drop_first``. + +.. ipython:: python + + s = pd.Series(list('abcaa')) + + pd.get_dummies(s) + + pd.get_dummies(s, drop_first=True) + +When a column contains only one level, it will be omitted in the result. + +.. ipython:: python + + df = pd.DataFrame({'A':list('aaaaa'),'B':list('ababc')}) + + pd.get_dummies(df) + + pd.get_dummies(df, drop_first=True) + + + Factorizing values ------------------ diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 4dffaa0b0c416..fc6a660bf276d 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -971,7 +971,11 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, Otherwise returns a DataFrame with some SparseBlocks. .. versionadded:: 0.16.1 + drop_first : bool, default False + Whether to get k-1 dummies out of k categorical levels by removing the + first level. + .. 
versionadded:: 0.18.0 Returns ------- dummies : DataFrame or SparseDataFrame @@ -1011,6 +1015,21 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, 1 2 0 1 1 0 0 2 3 1 0 0 0 1 + >>> pd.get_dummies(pd.Series(list('abcaa'))) + a b c + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + 3 1 0 0 + 4 1 0 0 + + >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) + b c + 0 0 0 + 1 1 0 + 2 0 1 + 3 0 0 + 4 0 0 See also ``Series.str.get_dummies``. """ @@ -1060,17 +1079,18 @@ def check_len(item, name): for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep): dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep, - dummy_na=dummy_na, sparse=sparse) + dummy_na=dummy_na, sparse=sparse, + drop_first=drop_first) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, - sparse=sparse) + sparse=sparse, drop_first=drop_first) return result def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, - sparse=False): + sparse=False, drop_first=False): # Series avoids inconsistent NaN handling cat = Categorical.from_array(Series(data), ordered=True) levels = cat.categories @@ -1113,6 +1133,11 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, continue sp_indices[code].append(ndx) + if drop_first: + # remove first categorical level to avoid perfect collinearity + # GH12042 + sp_indices = sp_indices[1:] + dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray(np.ones(len(ixs)), sparse_index=IntIndex(N, ixs), fill_value=0) @@ -1127,6 +1152,10 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, # reset NaN GH4446 dummy_mat[codes == -1] = 0 + if drop_first: + # remove first GH12042 + dummy_mat = dummy_mat[:, 1:] + dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols) diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 6de589f87cfd8..98b15c0ec850c 100644 
--- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -411,6 +411,91 @@ def test_dataframe_dummies_with_categorical(self): ]] assert_frame_equal(result, expected) + # GH12042 Add a new parameter `drop_first` to avoid collinearity + def test_basic_drop_first(self): + # Basic case + s_list = list('abc') + s_series = Series(s_list) + s_series_index = Series(s_list, list('ABC')) + + expected = DataFrame({'b': {0: 0.0, + 1: 1.0, + 2: 0.0}, + 'c': {0: 0.0, + 1: 0.0, + 2: 1.0}}) + + result = get_dummies(s_list, sparse=self.sparse, drop_first=True) + assert_frame_equal(result, expected) + + result = get_dummies(s_series, sparse=self.sparse, drop_first=True) + assert_frame_equal(result, expected) + + expected.index = list('ABC') + result = get_dummies(s_series_index, sparse=self.sparse, drop_first=True) + assert_frame_equal(result, expected) + + def test_basic_drop_first_NA(self): + # Test NA hadling together with drop_first + s_NA = ['a', 'b', np.nan] + res = get_dummies(s_NA, sparse=self.sparse, drop_first=True) + exp = DataFrame({'b': {0: 0.0, + 1: 1.0, + 2: 0.0}}) + assert_frame_equal(res, exp) + + # Sparse dataframes do not allow nan labelled columns, see #GH8822 + res_na = get_dummies(s_NA, dummy_na=True, sparse=self.sparse, + drop_first=True) + exp_na = DataFrame({'b': {0: 0.0, + 1: 1.0, + 2: 0.0}, + nan: {0: 0.0, + 1: 0.0, + 2: 1.0}}).reindex_axis( + ['b', nan], 1) + assert_frame_equal(res_na, exp_na) + + res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse, + drop_first=True) + tm.assert_numpy_array_equal(res_just_na.empty, True) + + def test_dataframe_dummies_drop_first(self): + df = self.df[['A', 'B']] + result = get_dummies(df, sparse=self.sparse, drop_first=True) + expected = DataFrame({'A_b': [0., 1, 0], + 'B_c': [0., 0, 1]}) + assert_frame_equal(result, expected) + + def test_dataframe_dummies_drop_first_with_categorical(self): + df = self.df + df['cat'] = pd.Categorical(['x', 'y', 'y']) + result = get_dummies(df, 
sparse=self.sparse, drop_first=True) + expected = DataFrame({'C': [1, 2, 3], + 'A_b': [0., 1, 0], + 'B_c': [0., 0, 1], + 'cat_y': [0., 1, 1]}) + expected = expected[['C', 'A_b', 'B_c', 'cat_y']] + assert_frame_equal(result, expected) + + def test_dataframe_dummies_drop_first_with_na(self): + df = self.df + df.loc[3, :] = [np.nan, np.nan, np.nan] + result = get_dummies(df, dummy_na=True, sparse=self.sparse, + drop_first=True) + expected = DataFrame({'C': [1, 2, 3, np.nan], + 'A_b': [0., 1, 0, 0], + 'A_nan': [0., 0, 0, 1], + 'B_c': [0., 0, 1, 0], + 'B_nan': [0., 0, 0, 1]}) + expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']] + assert_frame_equal(result, expected) + + result = get_dummies(df, dummy_na=False, sparse=self.sparse, + drop_first=True) + expected = expected[['C', 'A_b', 'B_c']] + assert_frame_equal(result, expected) + class TestGetDummiesSparse(TestGetDummies): sparse = True From 0d99c2aeb6176239f28799c02cc4728e64db36d0 Mon Sep 17 00:00:00 2001 From: Bran Yang Date: Thu, 28 Jan 2016 00:03:47 +0800 Subject: [PATCH 2/3] Test the case that `drop_first` is on and categorical variable only has one level. 
--- pandas/core/reshape.py | 2 +- pandas/tests/test_reshape.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index fc6a660bf276d..bb7eba496e34a 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -944,7 +944,7 @@ def melt_stub(df, stub, i, j): def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, - columns=None, sparse=False): + columns=None, sparse=False, drop_first=False): """ Convert categorical variable into dummy/indicator variables diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 98b15c0ec850c..b0dd578ae2bf7 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -435,6 +435,11 @@ def test_basic_drop_first(self): result = get_dummies(s_series_index, sparse=self.sparse, drop_first=True) assert_frame_equal(result, expected) + # Test the case that categorical variable only has one level. + def test_basic_drop_first_one_level(self): + result = get_dummies(list('aaa'), sparse=self.sparse, drop_first=True) + self.assertEqual(result.empty, True) + def test_basic_drop_first_NA(self): # Test NA hadling together with drop_first s_NA = ['a', 'b', np.nan] From 0528c574f73c19aacc7285c89a555d9569a268bc Mon Sep 17 00:00:00 2001 From: Bran Yang Date: Tue, 2 Feb 2016 11:01:01 +0800 Subject: [PATCH 3/3] Compare with empty DataFrame, not just check empty --- pandas/core/reshape.py | 11 +++++++++-- pandas/tests/test_reshape.py | 27 +++++++++++++++++++++------ 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index bb7eba496e34a..c4b7005775536 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -1095,8 +1095,7 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, cat = Categorical.from_array(Series(data), ordered=True) levels = cat.categories - # if all NaN - if not dummy_na and len(levels) == 0: + def get_empty_Frame(data, sparse): if 
isinstance(data, Series): index = data.index else: @@ -1106,11 +1105,19 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, else: return SparseDataFrame(index=index) + # if all NaN + if not dummy_na and len(levels) == 0: + return get_empty_Frame(data, sparse) + codes = cat.codes.copy() if dummy_na: codes[codes == -1] = len(cat.categories) levels = np.append(cat.categories, np.nan) + # if dummy_na, we just fake a nan level. drop_first will drop it again + if drop_first and len(levels) == 1: + return get_empty_Frame(data, sparse) + number_of_cols = len(levels) if prefix is not None: diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index b0dd578ae2bf7..671c345898ec2 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -432,13 +432,28 @@ def test_basic_drop_first(self): assert_frame_equal(result, expected) expected.index = list('ABC') - result = get_dummies(s_series_index, sparse=self.sparse, drop_first=True) + result = get_dummies(s_series_index, sparse=self.sparse, + drop_first=True) assert_frame_equal(result, expected) - # Test the case that categorical variable only has one level. def test_basic_drop_first_one_level(self): - result = get_dummies(list('aaa'), sparse=self.sparse, drop_first=True) - self.assertEqual(result.empty, True) + # Test the case that categorical variable only has one level. 
+ s_list = list('aaa') + s_series = Series(s_list) + s_series_index = Series(s_list, list('ABC')) + + expected = DataFrame(index=np.arange(3)) + + result = get_dummies(s_list, sparse=self.sparse, drop_first=True) + assert_frame_equal(result, expected) + + result = get_dummies(s_series, sparse=self.sparse, drop_first=True) + assert_frame_equal(result, expected) + + expected = DataFrame(index=list('ABC')) + result = get_dummies(s_series_index, sparse=self.sparse, + drop_first=True) + assert_frame_equal(result, expected) def test_basic_drop_first_NA(self): # Test NA hadling together with drop_first @@ -449,7 +464,6 @@ def test_basic_drop_first_NA(self): 2: 0.0}}) assert_frame_equal(res, exp) - # Sparse dataframes do not allow nan labelled columns, see #GH8822 res_na = get_dummies(s_NA, dummy_na=True, sparse=self.sparse, drop_first=True) exp_na = DataFrame({'b': {0: 0.0, @@ -463,7 +477,8 @@ def test_basic_drop_first_NA(self): res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse, drop_first=True) - tm.assert_numpy_array_equal(res_just_na.empty, True) + exp_just_na = DataFrame(index=np.arange(1)) + assert_frame_equal(res_just_na, exp_just_na) def test_dataframe_dummies_drop_first(self): df = self.df[['A', 'B']]