From 45f14e81f4fed874dbb6e211292bccc2ef30e81e Mon Sep 17 00:00:00 2001
From: Bran Yang <yangbo.84@gmail.com>
Date: Wed, 27 Jan 2016 23:31:14 +0800
Subject: [PATCH 1/3] ENH: GH12042 Add parameter `drop_first` to get_dummies to
 get k-1 dummy variables out of k levels.

---
 doc/source/reshaping.rst     | 26 +++++++++++
 pandas/core/reshape.py       | 35 +++++++++++++--
 pandas/tests/test_reshape.py | 85 ++++++++++++++++++++++++++++++++++++
 3 files changed, 143 insertions(+), 3 deletions(-)

diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst
index dbf3b838593a9..190b30af5acf6 100644
--- a/doc/source/reshaping.rst
+++ b/doc/source/reshaping.rst
@@ -518,6 +518,32 @@ the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways
     from_dict = pd.get_dummies(df, prefix={'B': 'from_B', 'A': 'from_A'})
     from_dict
 
+.. versionadded:: 0.18.0
+
+Sometimes it will be useful to only keep k-1 levels of a categorical
+variable to avoid collinearity when feeding the result to statistical models.
+You can switch to this mode by turning on ``drop_first``.
+
+.. ipython:: python
+
+    s = pd.Series(list('abcaa'))
+
+    pd.get_dummies(s)
+
+    pd.get_dummies(s, drop_first=True)
+
+When a column contains only one level, it will be omitted in the result.
+
+.. ipython:: python
+
+    df = pd.DataFrame({'A':list('aaaaa'),'B':list('ababc')})
+
+    pd.get_dummies(df)
+
+    pd.get_dummies(df, drop_first=True)
+
+
+
 Factorizing values
 ------------------
 
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
index 4dffaa0b0c416..fc6a660bf276d 100644
--- a/pandas/core/reshape.py
+++ b/pandas/core/reshape.py
@@ -971,7 +971,11 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
         Otherwise returns a DataFrame with some SparseBlocks.
 
         .. versionadded:: 0.16.1
+    drop_first : bool, default False
+        Whether to get k-1 dummies out of k categorical levels by removing the
+        first level.
 
+        .. versionadded:: 0.18.0
     Returns
     -------
     dummies : DataFrame or SparseDataFrame
@@ -1011,6 +1015,21 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
     1  2       0       1       1       0       0
     2  3       1       0       0       0       1
 
+    >>> pd.get_dummies(pd.Series(list('abcaa')))
+       a  b  c
+    0  1  0  0
+    1  0  1  0
+    2  0  0  1
+    3  1  0  0
+    4  1  0  0
+
+    >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
+       b  c
+    0  0  0
+    1  1  0
+    2  0  1
+    3  0  0
+    4  0  0
     See also ``Series.str.get_dummies``.
 
     """
@@ -1060,17 +1079,18 @@ def check_len(item, name):
         for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):
 
             dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep,
-                                    dummy_na=dummy_na, sparse=sparse)
+                                    dummy_na=dummy_na, sparse=sparse,
+                                    drop_first=drop_first)
             with_dummies.append(dummy)
         result = concat(with_dummies, axis=1)
     else:
         result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,
-                                 sparse=sparse)
+                                 sparse=sparse, drop_first=drop_first)
     return result
 
 
 def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
-                    sparse=False):
+                    sparse=False, drop_first=False):
     # Series avoids inconsistent NaN handling
     cat = Categorical.from_array(Series(data), ordered=True)
     levels = cat.categories
@@ -1113,6 +1133,11 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
                 continue
             sp_indices[code].append(ndx)
 
+        if drop_first:
+            # remove first categorical level to avoid perfect collinearity
+            # GH12042
+            sp_indices = sp_indices[1:]
+            dummy_cols = dummy_cols[1:]
         for col, ixs in zip(dummy_cols, sp_indices):
             sarr = SparseArray(np.ones(len(ixs)),
                                sparse_index=IntIndex(N, ixs), fill_value=0)
@@ -1127,6 +1152,10 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
             # reset NaN GH4446
             dummy_mat[codes == -1] = 0
 
+        if drop_first:
+            # remove first GH12042
+            dummy_mat = dummy_mat[:, 1:]
+            dummy_cols = dummy_cols[1:]
         return DataFrame(dummy_mat, index=index, columns=dummy_cols)
 
 
diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py
index 6de589f87cfd8..98b15c0ec850c 100644
--- a/pandas/tests/test_reshape.py
+++ b/pandas/tests/test_reshape.py
@@ -411,6 +411,91 @@ def test_dataframe_dummies_with_categorical(self):
                              ]]
         assert_frame_equal(result, expected)
 
+    # GH12042 Add a new parameter `drop_first` to avoid collinearity
+    def test_basic_drop_first(self):
+        # Basic case
+        s_list = list('abc')
+        s_series = Series(s_list)
+        s_series_index = Series(s_list, list('ABC'))
+
+        expected = DataFrame({'b': {0: 0.0,
+                                    1: 1.0,
+                                    2: 0.0},
+                              'c': {0: 0.0,
+                                    1: 0.0,
+                                    2: 1.0}})
+
+        result = get_dummies(s_list, sparse=self.sparse, drop_first=True)
+        assert_frame_equal(result, expected)
+
+        result = get_dummies(s_series, sparse=self.sparse, drop_first=True)
+        assert_frame_equal(result, expected)
+
+        expected.index = list('ABC')
+        result = get_dummies(s_series_index, sparse=self.sparse, drop_first=True)
+        assert_frame_equal(result, expected)
+
+    def test_basic_drop_first_NA(self):
+        # Test NA hadling together with drop_first
+        s_NA = ['a', 'b', np.nan]
+        res = get_dummies(s_NA, sparse=self.sparse, drop_first=True)
+        exp = DataFrame({'b': {0: 0.0,
+                               1: 1.0,
+                               2: 0.0}})
+        assert_frame_equal(res, exp)
+
+        # Sparse dataframes do not allow nan labelled columns, see #GH8822
+        res_na = get_dummies(s_NA, dummy_na=True, sparse=self.sparse,
+                             drop_first=True)
+        exp_na = DataFrame({'b': {0: 0.0,
+                                  1: 1.0,
+                                  2: 0.0},
+                            nan: {0: 0.0,
+                                  1: 0.0,
+                                  2: 1.0}}).reindex_axis(
+                                      ['b', nan], 1)
+        assert_frame_equal(res_na, exp_na)
+
+        res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse,
+                                  drop_first=True)
+        tm.assert_numpy_array_equal(res_just_na.empty, True)
+
+    def test_dataframe_dummies_drop_first(self):
+        df = self.df[['A', 'B']]
+        result = get_dummies(df, sparse=self.sparse, drop_first=True)
+        expected = DataFrame({'A_b': [0., 1, 0],
+                              'B_c': [0., 0, 1]})
+        assert_frame_equal(result, expected)
+
+    def test_dataframe_dummies_drop_first_with_categorical(self):
+        df = self.df
+        df['cat'] = pd.Categorical(['x', 'y', 'y'])
+        result = get_dummies(df, sparse=self.sparse, drop_first=True)
+        expected = DataFrame({'C': [1, 2, 3],
+                              'A_b': [0., 1, 0],
+                              'B_c': [0., 0, 1],
+                              'cat_y': [0., 1, 1]})
+        expected = expected[['C', 'A_b', 'B_c', 'cat_y']]
+        assert_frame_equal(result, expected)
+
+    def test_dataframe_dummies_drop_first_with_na(self):
+        df = self.df
+        df.loc[3, :] = [np.nan, np.nan, np.nan]
+        result = get_dummies(df, dummy_na=True, sparse=self.sparse,
+                             drop_first=True)
+        expected = DataFrame({'C': [1, 2, 3, np.nan],
+                              'A_b': [0., 1, 0, 0],
+                              'A_nan': [0., 0, 0, 1],
+                              'B_c': [0., 0, 1, 0],
+                              'B_nan': [0., 0, 0, 1]})
+        expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']]
+        assert_frame_equal(result, expected)
+
+        result = get_dummies(df, dummy_na=False, sparse=self.sparse,
+                             drop_first=True)
+        expected = expected[['C', 'A_b', 'B_c']]
+        assert_frame_equal(result, expected)
+
 
 class TestGetDummiesSparse(TestGetDummies):
     sparse = True

From 0d99c2aeb6176239f28799c02cc4728e64db36d0 Mon Sep 17 00:00:00 2001
From: Bran Yang <yangbo.84@gmail.com>
Date: Thu, 28 Jan 2016 00:03:47 +0800
Subject: [PATCH 2/3] Test the case that `drop_first` is on and categorical
 variable only has one level.

---
 pandas/core/reshape.py       | 2 +-
 pandas/tests/test_reshape.py | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
index fc6a660bf276d..bb7eba496e34a 100644
--- a/pandas/core/reshape.py
+++ b/pandas/core/reshape.py
@@ -944,7 +944,7 @@ def melt_stub(df, stub, i, j):
 
 
 def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
-                columns=None, sparse=False):
+                columns=None, sparse=False, drop_first=False):
     """
     Convert categorical variable into dummy/indicator variables
 
diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py
index 98b15c0ec850c..b0dd578ae2bf7 100644
--- a/pandas/tests/test_reshape.py
+++ b/pandas/tests/test_reshape.py
@@ -435,6 +435,11 @@ def test_basic_drop_first(self):
         result = get_dummies(s_series_index, sparse=self.sparse, drop_first=True)
         assert_frame_equal(result, expected)
 
+    # Test the case that categorical variable only has one level.
+    def test_basic_drop_first_one_level(self):
+        result = get_dummies(list('aaa'), sparse=self.sparse, drop_first=True)
+        self.assertEqual(result.empty, True)
+
     def test_basic_drop_first_NA(self):
         # Test NA hadling together with drop_first
         s_NA = ['a', 'b', np.nan]

From 0528c574f73c19aacc7285c89a555d9569a268bc Mon Sep 17 00:00:00 2001
From: Bran Yang <yangbo.84@gmail.com>
Date: Tue, 2 Feb 2016 11:01:01 +0800
Subject: [PATCH 3/3] Compare with empty DataFrame, not just check empty

---
 pandas/core/reshape.py       | 11 +++++++++--
 pandas/tests/test_reshape.py | 27 +++++++++++++++++++++------
 2 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
index bb7eba496e34a..c4b7005775536 100644
--- a/pandas/core/reshape.py
+++ b/pandas/core/reshape.py
@@ -1095,8 +1095,7 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
     cat = Categorical.from_array(Series(data), ordered=True)
     levels = cat.categories
 
-    # if all NaN
-    if not dummy_na and len(levels) == 0:
+    def get_empty_Frame(data, sparse):
         if isinstance(data, Series):
             index = data.index
         else:
@@ -1106,11 +1105,19 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
         else:
             return SparseDataFrame(index=index)
 
+    # if all NaN
+    if not dummy_na and len(levels) == 0:
+        return get_empty_Frame(data, sparse)
+
     codes = cat.codes.copy()
     if dummy_na:
         codes[codes == -1] = len(cat.categories)
         levels = np.append(cat.categories, np.nan)
 
+    # if dummy_na, we just fake a nan level. drop_first will drop it again
+    if drop_first and len(levels) == 1:
+        return get_empty_Frame(data, sparse)
+
     number_of_cols = len(levels)
 
     if prefix is not None:
diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py
index b0dd578ae2bf7..671c345898ec2 100644
--- a/pandas/tests/test_reshape.py
+++ b/pandas/tests/test_reshape.py
@@ -432,13 +432,28 @@ def test_basic_drop_first(self):
         assert_frame_equal(result, expected)
 
         expected.index = list('ABC')
-        result = get_dummies(s_series_index, sparse=self.sparse, drop_first=True)
+        result = get_dummies(s_series_index, sparse=self.sparse,
+                             drop_first=True)
         assert_frame_equal(result, expected)
 
-    # Test the case that categorical variable only has one level.
     def test_basic_drop_first_one_level(self):
-        result = get_dummies(list('aaa'), sparse=self.sparse, drop_first=True)
-        self.assertEqual(result.empty, True)
+        # Test the case that categorical variable only has one level.
+        s_list = list('aaa')
+        s_series = Series(s_list)
+        s_series_index = Series(s_list, list('ABC'))
+
+        expected = DataFrame(index=np.arange(3))
+
+        result = get_dummies(s_list, sparse=self.sparse, drop_first=True)
+        assert_frame_equal(result, expected)
+
+        result = get_dummies(s_series, sparse=self.sparse, drop_first=True)
+        assert_frame_equal(result, expected)
+
+        expected = DataFrame(index=list('ABC'))
+        result = get_dummies(s_series_index, sparse=self.sparse,
+                             drop_first=True)
+        assert_frame_equal(result, expected)
 
     def test_basic_drop_first_NA(self):
         # Test NA hadling together with drop_first
@@ -449,7 +464,6 @@ def test_basic_drop_first_NA(self):
                                2: 0.0}})
         assert_frame_equal(res, exp)
 
-        # Sparse dataframes do not allow nan labelled columns, see #GH8822
         res_na = get_dummies(s_NA, dummy_na=True, sparse=self.sparse,
                              drop_first=True)
         exp_na = DataFrame({'b': {0: 0.0,
@@ -463,7 +477,8 @@ def test_basic_drop_first_NA(self):
 
         res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse,
                                   drop_first=True)
-        tm.assert_numpy_array_equal(res_just_na.empty, True)
+        exp_just_na = DataFrame(index=np.arange(1))
+        assert_frame_equal(res_just_na, exp_just_na)
 
     def test_dataframe_dummies_drop_first(self):
         df = self.df[['A', 'B']]