From df32828a3f29e627f9d9c3ac25162df9dbdbdb05 Mon Sep 17 00:00:00 2001 From: Patrick Park Date: Thu, 11 Oct 2018 09:17:54 -0700 Subject: [PATCH 1/3] Added note about groupby excluding Decimal columns by default --- doc/source/groupby.rst | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 407fad39ba232..c0940823ad1f5 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -989,6 +989,31 @@ Note that ``df.groupby('A').colname.std().`` is more efficient than is only interesting over one column (here ``colname``), it may be filtered *before* applying the aggregation function. +.. note:: + Decimal and object columns are also "nuisance" columns. They are excluded from aggregate functions automatically in groupby. + + If you do wish to include decimal or object columns in an aggregation with other non-nuisance data types, you must do so explicitly. + +.. ipython:: python + + from decimal import Decimal + dec = pd.DataFrame( + {'id': [123, 456, 123, 456], + 'int_column': [1, 2, 3, 4], + 'dec_column1': [Decimal('0.50'), Decimal('0.15'), Decimal('0.25'), Decimal('0.40')] + }, + columns=['id','int_column','dec_column'] + ) + + # Decimal columns can be sum'd explicitly by themselves... + dec.groupby(['id'], as_index=False)['dec_column'].sum() + + # ...but cannot be combined with standard data types or they will be excluded + dec.groupby(['id'], as_index=False)['int_column','dec_column'].sum() + + # Use .agg function to aggregate over standard and "nuisance" data types at the same time + dec.groupby(['id'], as_index=False).agg({'int_column': 'sum', 'dec_column': 'sum'}) + .. 
_groupby.missing: NA and NaT group handling From 9e39cf2ae073dafc8117dba1ae98d77a166e3364 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 8 Nov 2018 15:09:51 +0100 Subject: [PATCH 2/3] reword + fix typo in example --- doc/source/groupby.rst | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 17f36706fb466..45c28a4cebc1e 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -985,29 +985,31 @@ is only interesting over one column (here ``colname``), it may be filtered *before* applying the aggregation function. .. note:: - Decimal and object columns are also "nuisance" columns. They are excluded from aggregate functions automatically in groupby. + Any object column, even if it contains numerical values such as ``Decimal`` + objects, is considered a "nuisance" column. Such columns are excluded from + aggregate functions automatically in groupby. - If you do wish to include decimal or object columns in an aggregation with other non-nuisance data types, you must do so explicitly. + If you do wish to include decimal or object columns in an aggregation with + other non-nuisance data types, you must do so explicitly. .. ipython:: python from decimal import Decimal - dec = pd.DataFrame( - {'id': [123, 456, 123, 456], + df_dec = pd.DataFrame( + {'id': [1, 2, 1, 2], 'int_column': [1, 2, 3, 4], - 'dec_column1': [Decimal('0.50'), Decimal('0.15'), Decimal('0.25'), Decimal('0.40')] - }, - columns=['id','int_column','dec_column'] + 'dec_column': [Decimal('0.50'), Decimal('0.15'), Decimal('0.25'), Decimal('0.40')] + } ) # Decimal columns can be sum'd explicitly by themselves... 
- dec.groupby(['id'], as_index=False)['dec_column'].sum() + df_dec.groupby(['id'])[['dec_column']].sum() # ...but cannot be combined with standard data types or they will be excluded - dec.groupby(['id'], as_index=False)['int_column','dec_column'].sum() + df_dec.groupby(['id'])[['int_column','dec_column']].sum() # Use .agg function to aggregate over standard and "nuisance" data types at the same time - dec.groupby(['id'], as_index=False).agg({'int_column': 'sum', 'dec_column': 'sum'}) + df_dec.groupby(['id']).agg({'int_column': 'sum', 'dec_column': 'sum'}) .. _groupby.observed: From c755f2c4914abf2d8cfe3ad4d280e102a6efbb5a Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Thu, 8 Nov 2018 14:43:19 +0000 Subject: [PATCH 3/3] Fixing couple of pep8 issues. Some lines are >80, but I think in rst files we don't care --- doc/source/groupby.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 45c28a4cebc1e..1f0b43bab8d4d 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -997,8 +997,8 @@ is only interesting over one column (here ``colname``), it may be filtered from decimal import Decimal df_dec = pd.DataFrame( {'id': [1, 2, 1, 2], - 'int_column': [1, 2, 3, 4], - 'dec_column': [Decimal('0.50'), Decimal('0.15'), Decimal('0.25'), Decimal('0.40')] + 'int_column': [1, 2, 3, 4], + 'dec_column': [Decimal('0.50'), Decimal('0.15'), Decimal('0.25'), Decimal('0.40')] } ) @@ -1006,7 +1006,7 @@ is only interesting over one column (here ``colname``), it may be filtered df_dec.groupby(['id'])[['dec_column']].sum() # ...but cannot be combined with standard data types or they will be excluded - df_dec.groupby(['id'])[['int_column','dec_column']].sum() + df_dec.groupby(['id'])[['int_column', 'dec_column']].sum() # Use .agg function to aggregate over standard and "nuisance" data types at the same time df_dec.groupby(['id']).agg({'int_column': 'sum', 'dec_column': 'sum'})