From 14d49735d4af03854f153b0913185479546e7ef2 Mon Sep 17 00:00:00 2001 From: tomytp Date: Tue, 28 May 2024 17:41:47 -0300 Subject: [PATCH 01/13] Named agg allow kwargs --- pandas/core/groupby/generic.py | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a20577e8d3df9..8dadd74415f1c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -108,7 +108,12 @@ ScalarResult = TypeVar("ScalarResult") -class NamedAgg(NamedTuple): +class _BaseNamedAgg(NamedTuple): + column: Hashable + aggfunc: AggScalar + + +def NamedAgg(column: Hashable, aggfunc: AggScalar, *args, **kwargs): """ Helper for column specific aggregation with control over output column names. @@ -134,8 +139,28 @@ class NamedAgg(NamedTuple): 2 1 12.0 """ - column: Hashable - aggfunc: AggScalar + class NamedAggWrapper(_BaseNamedAgg): + def __new__(cls, _column, _aggfunc, *_args, **_kwargs): + original_aggfunc = _aggfunc + if not isinstance(_aggfunc, str): + _aggfunc = cls._get_wrapped_aggfunc(_aggfunc, *_args, **_kwargs) + + self = _BaseNamedAgg.__new__(_column, _aggfunc) + self.original_aggfunc = original_aggfunc + return self + + @staticmethod + def _get_wrapped_aggfunc(function, *initial_args, **initial_kwargs): + def wrapped_aggfunc(*new_args, **new_kwargs): + final_args = new_args + initial_args + final_kwargs = {**initial_kwargs, **new_kwargs} + return function(*final_args, **final_kwargs) + return wrapped_aggfunc + + def __repr__(self): + return f"NamedAgg(column='{self.column}', aggfunc={self.original_aggfunc})" + + return NamedAggWrapper(column, aggfunc, *args, **kwargs) class SeriesGroupBy(GroupBy[Series]): From 8ba87acb9ff064bc7cebdaffe9aaf39304817b49 Mon Sep 17 00:00:00 2001 From: tomytp Date: Tue, 28 May 2024 17:54:02 -0300 Subject: [PATCH 02/13] fixed NamedAgg initialization --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8dadd74415f1c..bb01ed088ff14 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -145,7 +145,7 @@ def __new__(cls, _column, _aggfunc, *_args, **_kwargs): if not isinstance(_aggfunc, str): _aggfunc = cls._get_wrapped_aggfunc(_aggfunc, *_args, **_kwargs) - self = _BaseNamedAgg.__new__(_column, _aggfunc) + self = _BaseNamedAgg.__new__(cls, _column, _aggfunc) self.original_aggfunc = original_aggfunc return self From 816aa1768a49bd492f1b53c6411868a423f344ad Mon Sep 17 00:00:00 2001 From: tomytp Date: Tue, 28 May 2024 19:24:15 -0300 Subject: [PATCH 03/13] Included NammedAgg changes in whatsnew --- doc/source/whatsnew/v3.0.0.rst | 3 +- .../tests/groupby/aggregate/test_aggregate.py | 28 +++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 6a6abcf2d48fe..b797e8a92dd40 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -31,6 +31,7 @@ Other enhancements - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`) - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) +- :func:`pandas.NamedAgg` now forwards any *args and **kwargs to calls of ``aggfunc`` (:issue:`58283`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`) - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`) @@ -43,7 +44,7 @@ Other enhancements - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) -- :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) +- :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie chgit aarts, allowing for explicit control over the y-axis label (:issue:`58239`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) - diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 3362d6209af6d..39357fa2b9547 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -827,6 +827,34 @@ def test_agg_namedtuple(self): expected = df.groupby("A").agg(b=("B", "sum"), c=("B", "count")) tm.assert_frame_equal(result, expected) + def test_single_named_agg_with_args_and_kwargs(self): + df = DataFrame({"A": [0, 1, 2, 3], "B": [1, 2, 3, 4]}) + + def n_between(ser, low, high): + return ser.between(low, high).sum() + + result = df.groupby("A").agg(n_between=pd.NamedAgg("B", n_between, 0, high=2)) + expected = df.groupby("A").agg(n_between=("B", lambda x: x.between(0, 2).sum())) + tm.assert_frame_equal(result, expected) + + def test_multiple_named_agg_with_args_and_kwargs(self): + df = DataFrame({"A": [0, 1, 2, 3], "B": [1, 2, 3, 4]}) + + def n_between(ser, low, high): + return ser.between(low, high).sum() + + result = df.groupby("A").agg( + n_between01=pd.NamedAgg("B", n_between, 0, 1), + n_between13=pd.NamedAgg("B", n_between, 1, 3), + n_between02=pd.NamedAgg("B", n_between, 0, 2), + ) + expected = df.groupby("A").agg( + n_between01=("B", lambda x: x.between(0, 1).sum()), + n_between13=("B", lambda x: x.between(0, 3).sum()), + n_between02=("B", lambda x: x.between(0, 2).sum()), + ) + tm.assert_frame_equal(result, expected) + def test_mangled(self): df = DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]}) result = df.groupby("A").agg(b=("B", lambda x: 0), c=("C", lambda x: 1)) From ed6b6509616cb3bf6321b9899c488bfddf41c552 Mon Sep 17 00:00:00 2001 From: tomytp Date: Tue, 28 May 2024 19:40:40 -0300 Subject: [PATCH 04/13] simplified NamedAgg class --- doc/source/user_guide/style.ipynb | 352 +++++++++++++++--------------- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/groupby/generic.py | 39 ++-- 3 files changed, 197 insertions(+), 196 deletions(-) diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 04ba3e5be8ff7..a6aa3e8847850 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -44,12 +44,12 @@ "metadata": { "nbsphinx": "hidden" }, - "outputs": [], "source": [ "import matplotlib.pyplot\n", "# We have this here to trigger matplotlib's font cache stuff.\n", "# This cell is hidden from the output" - ] + ], + "outputs": [] }, { "cell_type": "markdown", @@ -74,7 +74,6 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", @@ -89,7 +88,8 @@ " .format(precision=3, thousands=\".\", decimal=\",\") \\\n", " .format_index(str.upper, axis=1) \\\n", " .relabel_index([\"row 1\", \"row 2\"], axis=0)" - ] + ], + "outputs": [] }, { "cell_type": "markdown", @@ -102,7 +102,6 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "weather_df = pd.DataFrame(np.random.rand(10,2)*5, \n", " index=pd.date_range(start=\"2021-01-01\", periods=10),\n", @@ -123,16 +122,17 @@ " return styler\n", "\n", "weather_df" - ] + ], + "outputs": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "weather_df.loc[\"2021-01-04\":\"2021-01-08\"].style.pipe(make_pretty)" - ] + ], + "outputs": [] }, { "cell_type": "markdown", @@ -155,13 +155,13 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "df = pd.DataFrame(np.random.randn(5, 5))\n", "df.style \\\n", " .hide(subset=[0, 2, 4], axis=0) \\\n", " .hide(subset=[0, 2, 4], axis=1)" - ] + ], + "outputs": [] }, { "cell_type": "markdown", @@ -174,13 +174,13 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "show = [0, 2, 4]\n", "df.style \\\n", " .hide([row for row in df.index if row not in show], axis=0) \\\n", " .hide([col for col in df.columns if col not in show], axis=1)" - ] + ], + "outputs": [] }, { "cell_type": "markdown", @@ -197,13 +197,13 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "summary_styler = df.agg([\"sum\", \"mean\"]).style \\\n", " .format(precision=3) \\\n", " .relabel_index([\"Sum\", \"Average\"])\n", "df.style.format(precision=1).concat(summary_styler)" - ] + ], + "outputs": [] }, { "cell_type": "markdown", @@ -225,13 +225,13 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "df = pd.DataFrame([[38.0, 2.0, 18.0, 22.0, 21, np.nan],[19, 439, 6, 452, 226,232]], \n", " index=pd.Index(['Tumour (Positive)', 'Non-Tumour (Negative)'], name='Actual Label:'), \n", " columns=pd.MultiIndex.from_product([['Decision Tree', 'Regression', 'Random'],['Tumour', 'Non-Tumour']], names=['Model:', 'Predicted:']))\n", "df.style" - ] + ], + "outputs": [] }, { "cell_type": "code", @@ -239,7 +239,6 @@ "metadata": { "nbsphinx": "hidden" }, - "outputs": [], "source": [ "# Hidden cell to just create the below example: code is covered throughout the guide.\n", "s = df.style\\\n", @@ -299,16 +298,17 @@ " 'visibility: hidden; position: absolute; z-index: 1; border: 1px solid #000066;'\n", " 'background-color: white; color: #000066; font-size: 0.8em;' \n", " 'transform: translate(0px, -24px); padding: 0.6em; border-radius: 0.5em;')\n" - ] + ], + "outputs": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "s" - ] + ], + "outputs": [] }, { "cell_type": "markdown", @@ -323,11 +323,11 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "s = df.style.format('{:.0f}').hide([('Random', 'Tumour'), ('Random', 'Non-Tumour')], axis=\"columns\")\n", "s" - ] + ], + "outputs": [] }, { "cell_type": "code", @@ -335,11 +335,11 @@ "metadata": { "nbsphinx": "hidden" }, - "outputs": [], "source": [ "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", "s.set_uuid('after_hide')" - ] + ], + "outputs": [] }, { "cell_type": "markdown", @@ -392,7 +392,6 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "cell_hover = { # for row hover use instead of \n", " 'selector': 'td:hover',\n", @@ -407,7 +406,8 @@ " 'props': 'background-color: #000066; color: white;'\n", "}\n", "s.set_table_styles([cell_hover, index_names, headers])" - ] + ], + "outputs": [] }, { "cell_type": "code", @@ -415,11 +415,11 @@ "metadata": { "nbsphinx": "hidden" }, - "outputs": [], "source": [ "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", "s.set_uuid('after_tab_styles1')" - ] + ], + "outputs": [] }, { "cell_type": "markdown", @@ -432,14 +432,14 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "s.set_table_styles([\n", " {'selector': 'th.col_heading', 'props': 'text-align: center;'},\n", " {'selector': 'th.col_heading.level0', 'props': 'font-size: 1.5em;'},\n", " {'selector': 'td', 'props': 'text-align: center; font-weight: bold;'},\n", "], overwrite=False)" - ] + ], + "outputs": [] }, { "cell_type": "code", @@ -447,11 +447,11 @@ "metadata": { "nbsphinx": "hidden" }, - "outputs": [], "source": [ "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", "s.set_uuid('after_tab_styles2')" - ] + ], + "outputs": [] }, { "cell_type": "markdown", @@ -466,13 +466,13 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "s.set_table_styles({\n", " ('Regression', 'Tumour'): [{'selector': 'th', 'props': 'border-left: 1px solid white'},\n", " {'selector': 'td', 'props': 'border-left: 1px solid #000066'}]\n", "}, overwrite=False, axis=0)" - ] + ], + "outputs": [] }, { "cell_type": "code", @@ -480,11 +480,11 @@ "metadata": { "nbsphinx": "hidden" }, - "outputs": [], "source": [ "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", "s.set_uuid('xyz01')" - ] + ], + "outputs": [] }, { "cell_type": "markdown", @@ -505,11 +505,11 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "out = s.set_table_attributes('class=\"my-table-cls\"').to_html()\n", "print(out[out.find('