Skip to content

DOC: add documentation to DataFrameGroupBy.skew and SeriesGroupBy.skew #50958

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Feb 5, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 115 additions & 2 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1006,14 +1006,65 @@ def take(
result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs)
return result

@doc(Series.skew.__doc__)
def skew(
self,
axis: Axis | lib.NoDefault = lib.no_default,
skipna: bool = True,
numeric_only: bool = False,
**kwargs,
) -> Series:
"""
Return unbiased skew within groups.

Normalized by N-1.

Parameters
----------
axis : {0 or 'index', 1 or 'columns', None}, default 0
Axis for the function to be applied on.
This parameter is only for compatibility with DataFrame and is unused.

skipna : bool, default True
Exclude NA/null values when computing the result.

numeric_only : bool, default False
Include only float, int, boolean columns. Not implemented for Series.
Comment on lines +1030 to +1031
Copy link
Contributor Author

@marenwestermann marenwestermann Feb 1, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

numeric_only is a parameter for the skew method but it seems like it is not used. Should the documentation be updated? Or can numeric_only maybe be removed and this section with it?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm seeing numeric_only functioning here:

df = pd.DataFrame({'a': [1, 1, 2], 'b': list('xyz'), 'c': [3, 4, 5]})
gb = df.groupby('a')
result = gb.skew(numeric_only=True)
print(result)

#     c
# a    
# 1 NaN
# 2 NaN

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can reproduce this. However, when I run this:

ser = pd.Series([1, 2, 1, 'x', 3, 4, 3], index=['a', 'a', 'a', 'b', 'b', 'b', 'b'])
result = ser.groupby(level=0).skew(numeric_only=True)

I get the following error:

TypeError                                 Traceback (most recent call last)
Cell In[5], line 1
----> 1 result = ser.groupby(level=0).skew(numeric_only=True)

File ~/open-source/pandas-maren/pandas/core/groupby/generic.py:1063, in SeriesGroupBy.skew(self, axis, skipna, numeric_only, **kwargs)
   1004 def skew(
   1005     self,
   1006     axis: Axis | lib.NoDefault = lib.no_default,
   (...)
   1009     **kwargs,
   1010 ) -> Series:
   1011     """
   1012     Return unbiased skew within groups.
   1013
   (...)
   1061     Name: Max Speed, dtype: float64
   1062     """
-> 1063     result = self._op_via_apply(
   1064         "skew",
   1065         axis=axis,
   1066         skipna=skipna,
   1067         numeric_only=numeric_only,
   1068         **kwargs,
   1069     )
   1070     return result

File ~/open-source/pandas-maren/pandas/core/groupby/groupby.py:987, in GroupBy._op_via_apply(self, name, *args, **kwargs)
    985 if is_transform and self._obj_with_exclusions.empty:
    986     return self._obj_with_exclusions
--> 987 result = self._python_apply_general(
    988     curried,
    989     self._obj_with_exclusions,
    990     is_transform=is_transform,
    991     not_indexed_same=not is_transform,
    992 )
    994 if self.grouper.has_dropped_na and is_transform:
    995     # result will have dropped rows due to nans, fill with null
    996     # and ensure index is ordered same as the input
    997     result = self._set_result_index_ordered(result)

File ~/open-source/pandas-maren/pandas/core/groupby/groupby.py:1488, in GroupBy._python_apply_general(self, f, data, not_indexed_same, is_transform, is_agg)
   1451 @final
   1452 def _python_apply_general(
   1453     self,
   (...)
   1458     is_agg: bool = False,
   1459 ) -> NDFrameT:
   1460     """
   1461     Apply function f in python space
   1462
   (...)
   1486         data after applying f
   1487     """
-> 1488     values, mutated = self.grouper.apply(f, data, self.axis)
   1489     if not_indexed_same is None:
   1490         not_indexed_same = mutated or self.mutated

File ~/open-source/pandas-maren/pandas/core/groupby/ops.py:786, in BaseGrouper.apply(self, f, data, axis)
    784 # group might be modified
    785 group_axes = group.axes
--> 786 res = f(group)
    787 if not mutated and not _is_indexed_like(res, group_axes, axis):
    788     mutated = True

File ~/open-source/pandas-maren/pandas/core/groupby/groupby.py:972, in GroupBy._op_via_apply.<locals>.curried(x)
    971 def curried(x):
--> 972     return f(x, *args, **kwargs)

File ~/open-source/pandas-maren/pandas/core/generic.py:11511, in NDFrame._add_numeric_operations.<locals>.skew(self, axis, skipna, numeric_only, **kwargs)
  11494 @doc(
  11495     _num_doc,
  11496     desc="Return unbiased skew over requested axis.\n\nNormalized by N-1.",
   (...)
  11509     **kwargs,
  11510 ):
> 11511     return NDFrame.skew(self, axis, skipna, numeric_only, **kwargs)

File ~/open-source/pandas-maren/pandas/core/generic.py:11157, in NDFrame.skew(self, axis, skipna, numeric_only, **kwargs)
  11150 def skew(
  11151     self,
  11152     axis: Axis | None = 0,
   (...)
  11155     **kwargs,
  11156 ) -> Series | float:
> 11157     return self._stat_function(
  11158         "skew", nanops.nanskew, axis, skipna, numeric_only, **kwargs
  11159     )

File ~/open-source/pandas-maren/pandas/core/generic.py:11092, in NDFrame._stat_function(self, name, func, axis, skipna, numeric_only, **kwargs)
  11088     nv.validate_stat_func((), kwargs, fname=name)
  11090 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
> 11092 return self._reduce(
  11093     func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
  11094 )

File ~/open-source/pandas-maren/pandas/core/series.py:4628, in Series._reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
   4626         kwd_name = "bool_only"
   4627     # GH#47500 - change to TypeError to match other methods
-> 4628     raise TypeError(
   4629         f"Series.{name} does not allow {kwd_name}={numeric_only} "
   4630         "with non-numeric dtypes."
   4631     )
   4632 with np.errstate(all="ignore"):
   4633     return op(delegate, skipna=skipna, **kwds)

TypeError: Series.skew does not allow numeric_only=True with non-numeric dtypes.

When I set numeric_only=False, I get TypeError: could not convert string to float: 'x', which makes sense. So numeric_only can't really be used for Series because it results in an error either way.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah - I didn't realize you were referring to Series here. It is true that operating on a non-numeric Series will always fail with skew, and as far as I know this is consistent across Series and SeriesGroupBy ops. Perhaps there is a better behavior or the argument should be removed, but I think that's for a separate issue.


**kwargs
Additional keyword arguments to be passed to the function.

Returns
-------
Series

See Also
--------
Series.skew : Return unbiased skew over requested axis.

Examples
--------
>>> ser = pd.Series([390., 350., 357., np.nan, 22., 20., 30.],
... index=['Falcon', 'Falcon', 'Falcon', 'Falcon',
... 'Parrot', 'Parrot', 'Parrot'],
... name="Max Speed")
>>> ser
Falcon 390.0
Falcon 350.0
Falcon 357.0
Falcon NaN
Parrot 22.0
Parrot 20.0
Parrot 30.0
Name: Max Speed, dtype: float64
>>> ser.groupby(level=0).skew()
Falcon 1.525174
Parrot 1.457863
Name: Max Speed, dtype: float64
>>> ser.groupby(level=0).skew(skipna=False)
Falcon NaN
Parrot 1.457863
Name: Max Speed, dtype: float64
"""
result = self._op_via_apply(
"skew",
axis=axis,
Expand Down Expand Up @@ -2473,14 +2524,76 @@ def take(
result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs)
return result

@doc(DataFrame.skew.__doc__)
def skew(
self,
axis: Axis | None | lib.NoDefault = lib.no_default,
skipna: bool = True,
numeric_only: bool = False,
**kwargs,
) -> DataFrame:
"""
Return unbiased skew within groups.

Normalized by N-1.

Parameters
----------
axis : {0 or 'index', 1 or 'columns', None}, default 0
Axis for the function to be applied on.

Specifying ``axis=None`` will apply the aggregation across both axes.

.. versionadded:: 2.0.0

skipna : bool, default True
Exclude NA/null values when computing the result.

numeric_only : bool, default False
Include only float, int, boolean columns.

**kwargs
Additional keyword arguments to be passed to the function.

Returns
-------
DataFrame

See Also
--------
DataFrame.skew : Return unbiased skew over requested axis.

Examples
--------
>>> arrays = [['falcon', 'parrot', 'cockatoo', 'kiwi',
... 'lion', 'monkey', 'rabbit'],
... ['bird', 'bird', 'bird', 'bird',
... 'mammal', 'mammal', 'mammal']]
>>> index = pd.MultiIndex.from_arrays(arrays, names=('name', 'class'))
>>> df = pd.DataFrame({'max_speed': [389.0, 24.0, 70.0, np.nan,
... 80.5, 21.5, 15.0]},
... index=index)
>>> df
max_speed
name class
falcon bird 389.0
parrot bird 24.0
cockatoo bird 70.0
kiwi bird NaN
lion mammal 80.5
monkey mammal 21.5
rabbit mammal 15.0
>>> gb = df.groupby(["class"])
>>> gb.skew()
max_speed
class
bird 1.628296
mammal 1.669046
>>> gb.skew(skipna=False)
max_speed
class
bird NaN
mammal 1.669046
"""
result = self._op_via_apply(
"skew",
axis=axis,
Expand Down