pandas-dev · TomAugspurger · May 1, 2018 · Apr 1, 2018 · Apr 22, 2018 · Apr 27, 2018
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
@@ -991,24 +991,24 @@ is only interesting over one column (here ``colname``), it may be filtered
 
 .. _groupby.observed:
 
-observed hanlding
-~~~~~~~~~~~~~~~~~
+Handling of (un)observed Categorical values
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 When using a ``Categorical`` grouper (as a single or as part of multipler groupers), the ``observed`` keyword
 controls whether to return a cartesian product of all possible groupers values (``observed=False``) or only those
-that are observed groupers (``observed=True``). The ``observed`` keyword will default to ``True`` in the future.
+that are observed groupers (``observed=True``).
 
-Show only the observed values:
+Show all values:
 
 .. ipython:: python
 
-   pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], categories=['a', 'b']), observed=True).count()
+   pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], categories=['a', 'b']), observed=False).count()
 
-Show all values:
+Show only the observed values:
 
 .. ipython:: python
 
-   pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], categories=['a', 'b']), observed=False).count()
+   pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], categories=['a', 'b']), observed=True).count()
 
 The returned dtype of the grouped will *always* include *all* of the catergories that were grouped.
 

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -396,6 +396,58 @@ documentation. If you build an extension array, publicize it on our
 
 .. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest/
 
+.. _whatsnew_0230.enhancements.categorical_grouping:
+
+Categorical Groupers have gained an observed keyword
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous versions, grouping by 1 or more categorical columns would result in an index that was the cartesian product of all of the categories for
+each grouper, not just the observed values. ``.groupby()`` has gained the ``observed`` keyword to toggle this behavior. The default remains backward
+compatible (generate a cartesian product). (:issue:`14942`, :issue:`8138`, :issue:`15217`, :issue:`17594`, :issue:`8669`, :issue:`20583`)
+
+
+.. ipython:: python
+
+   cat1 = pd.Categorical(["a", "a", "b", "b"],
+                         categories=["a", "b", "z"], ordered=True)
+   cat2 = pd.Categorical(["c", "d", "c", "d"],
+                         categories=["c", "d", "y"], ordered=True)
+   df = pd.DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
+   df['C'] = ['foo', 'bar'] * 2
+   df
+
+To show all values, the previous behavior:
+
+.. ipython:: python
+
+   df.groupby(['A', 'B', 'C'], observed=False).count()
+
+
+To show only observed values:
+
+.. ipython:: python
+
+   df.groupby(['A', 'B', 'C'], observed=True).count()
+
+For pivoting operations, this behavior is *already* controlled by the ``dropna`` keyword:
+
+.. ipython:: python
+
+   cat1 = pd.Categorical(["a", "a", "b", "b"],
+                         categories=["a", "b", "z"], ordered=True)
+   cat2 = pd.Categorical(["c", "d", "c", "d"],
+                         categories=["c", "d", "y"], ordered=True)
+   df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
+   df
+
+.. ipython:: python
+
+   pd.pivot_table(df, values='values', index=['A', 'B'],
+                  dropna=True)
+   pd.pivot_table(df, values='values', index=['A', 'B'],
+                  dropna=False)
+
+
 .. _whatsnew_0230.enhancements.other:
 
 Other Enhancements
@@ -527,68 +579,6 @@ If you wish to retain the old behavior while using Python >= 3.6, you can use
                'Taxes': -200,
                'Net result': 300}).sort_index()
 
-.. _whatsnew_0230.api_breaking.categorical_grouping:
-
-Categorical Groupers will now require passing the observed keyword
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-In previous versions, grouping by 1 or more categorical columns would result in an index that was the cartesian product of all of the categories for
-each grouper, not just the observed values.``.groupby()`` has gained the ``observed`` keyword to toggle this behavior. The default remains backward
-compatible (generate a cartesian product). Pandas will show a ``FutureWarning`` if the ``observed`` keyword is not passed; the default will
-change to ``observed=True`` in the future. (:issue:`14942`, :issue:`8138`, :issue:`15217`, :issue:`17594`, :issue:`8669`, :issue:`20583`)
-
-
-.. ipython:: python
-
-   cat1 = pd.Categorical(["a", "a", "b", "b"],
-                         categories=["a", "b", "z"], ordered=True)
-   cat2 = pd.Categorical(["c", "d", "c", "d"],
-                         categories=["c", "d", "y"], ordered=True)
-   df = pd.DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
-   df['C'] = ['foo', 'bar'] * 2
-   df
-
-``observed`` must now be passed when grouping by categoricals, or a
-``FutureWarning`` will show:
-
-.. ipython:: python
-   :okwarning:
-
-   df.groupby(['A', 'B', 'C']).count()
-
-
-To suppress the warning, with previous Behavior (show all values):
-
-.. ipython:: python
-
-   df.groupby(['A', 'B', 'C'], observed=False).count()
-
-
-Future Behavior (show only observed values):
-
-.. ipython:: python
-
-   df.groupby(['A', 'B', 'C'], observed=True).count()
-
-For pivotting operations, this behavior is *already* controlled by the ``dropna`` keyword:
-
-.. ipython:: python
-
-   cat1 = pd.Categorical(["a", "a", "b", "b"],
-                         categories=["a", "b", "z"], ordered=True)
-   cat2 = pd.Categorical(["c", "d", "c", "d"],
-                         categories=["c", "d", "y"], ordered=True)
-   df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
-   df
-
-.. ipython:: python
-
-   pd.pivot_table(df, values='values', index=['A', 'B'],
-                  dropna=True)
-   pd.pivot_table(df, values='values', index=['A', 'B'],
-                  dropna=False)
-
-
 .. _whatsnew_0230.api_breaking.deprecate_panel:
 
 Deprecate Panel

diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -66,12 +66,14 @@ def ip():
     return InteractiveShell()
 
 
-@pytest.fixture(params=[True, False])
+@pytest.fixture(params=[True, False, None])
 def observed(request):
     """ pass in the observed keyword to groupby for [True, False]
     This indicates whether categoricals should return values for
-    values which are not in the grouper [False], or only values which
-    appear in the grouper [True] """
+    values which are not in the grouper [False / None], or only values which
+    appear in the grouper [True]. [None] is supported for future compatibility
+    if we decide to change the default (and would need to warn if this
+    parameter is not passed)"""
     return request.param
 
 

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -649,6 +649,11 @@ def _set_categories(self, categories, fastpath=False):
 
     def _codes_for_groupby(self, sort, observed):
         """
+        Code the categories to ensure we can groupby for categoricals.
+
+        If observed=True, we return a new Categorical with the observed
+        categories only.
+
         If sort=False, return a copy of self, coded with categories as
         returned by .unique(), followed by any categories not appearing in
         the data. If sort=True, return self.

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -6633,10 +6633,10 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
             reduce the dimensionality of the return type if possible,
             otherwise return a consistent type
         observed : boolean, default None
-            if True: only show observed values for categorical groupers
-            if False: show all values for categorical groupers
+            if True: only show observed values for categorical groupers.
+            if False: show all values for categorical groupers.
             if None: if any categorical groupers, show a FutureWarning,
-                default to False
+                default to False.
 
             .. versionadded:: 0.23.0
 

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -2890,7 +2890,8 @@ class Grouping(object):
     obj :
     name :
     level :
-    observed : If we are a Categorical, use the observed values
+    observed : boolean, default False
+        If we are a Categorical, use the observed values
     in_axis : if the Grouping is a column in self.obj and hence among
         Groupby.exclusions list
 
@@ -2963,16 +2964,11 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
             # a passed Categorical
             elif is_categorical_dtype(self.grouper):
 
-                # Use the observed values of the grouper if inidcated
-                observed = self.observed
-                if observed is None:
-                    msg = ("pass observed=True to ensure that a "
-                           "categorical grouper only returns the "
-                           "observed groupers, or\n"
-                           "observed=False to include"
-                           "unobserved categories.\n")
-                    warnings.warn(msg, FutureWarning, stacklevel=5)
-                    observed = False
+                # observed can be True/False/None
+                # we treat None as False. If in the future
+                # we need to warn if observed is not passed
+                # then we have this option
+                # gh-20583
 
                 self.all_grouper = self.grouper
                 self.grouper = self.grouper._codes_for_groupby(

diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -246,23 +246,6 @@ def test_apply(ordered):
     assert_series_equal(result, expected)
 
 
-def test_observed_warning():
-    # 20583 - future warning on observe
-
-    cat1 = Categorical(["a", "a", "b", "b"],
-                       categories=["a", "b", "z"], ordered=True)
-    cat2 = Categorical(["c", "d", "c", "d"],
-                       categories=["c", "d", "y"], ordered=True)
-    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
-    df['C'] = ['foo', 'bar'] * 2
-
-    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
-        df.groupby(['A', 'B', 'C'])
-
-    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
-        df.groupby('A')
-
-
 def test_observed(observed):
     # multiple groupers, don't re-expand the output space
     # of the grouper
@@ -412,6 +395,26 @@ def test_observed_perf():
     assert result.index.levels[2].nunique() == df.other_id.nunique()
 
 
+def test_observed_groups(observed):
+    # gh-20583
+    # test that we have the appropriate groups
+
+    cat = pd.Categorical(['a', 'c', 'a'], categories=['a', 'b', 'c'])
+    df = pd.DataFrame({'cat': cat, 'vals': [1, 2, 3]})
+    g = df.groupby('cat', observed=observed)
+
+    result = g.groups
+    if observed:
+        expected = {'a': Index([0, 2], dtype='int64'),
+                    'c': Index([1], dtype='int64')}
+    else:
+        expected = {'a': Index([0, 2], dtype='int64'),
+                    'b': Index([], dtype='int64'),
+                    'c': Index([1], dtype='int64')}
+
+    tm.assert_dict_equal(result, expected)
+
+
 def test_datetime():
     # GH9049: ensure backward compatibility
     levels = pd.date_range('2014-01-01', periods=4)