diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index e3dfb552651a0..0768a489c3370 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -46,6 +46,7 @@ Conversion DataFrame.astype DataFrame.convert_dtypes DataFrame.infer_objects + DataFrame.format DataFrame.copy DataFrame.bool diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 3b595ba5ab206..973af498ee90c 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -48,6 +48,7 @@ Conversion Series.astype Series.convert_dtypes Series.infer_objects + Series.format Series.copy Series.bool Series.to_numpy diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 22b83425b58c2..ff34f2a6f247c 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -30,6 +30,35 @@ For example, the below now works: ser[0] pd.Series([1, 2, np.nan], dtype="Int64").astype("string") +.. _whatsnew_110.format: + +``DataFrame.format`` and ``Series.format`` for complex conversion to StringDtype +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +New methods :meth:`DataFrame.format` and :meth:`Series.format` have been added (:issue:`xxxxx`). +These methods allow creating a ``string`` Series from arbitrary ``Series`` or ``DataFrame`` using standard Python format strings: + +.. ipython:: python + + df = pd.DataFrame({ + 'state_name': ['California', 'Texas', 'Florida'], + 'state_abbreviation': ['CA', 'TX', 'FL'], + 'population': [39_512_223, 28_995_881, 21_477_737], + }, index=[1, 2, 3]) + df + ser = df["population"] + df.format("{state_name} ({state_abbreviation}): {population:,}") + ser.format("Population: {population:,}") + +The output Series will always have dtype :class:`StringDtype`.
@doc(str_format)
def format(
    self,
    format: str,
    name: Optional[str] = None,
    positional_only: bool_t = False,
    how_na: str = "any",
) -> "Series":
    # Deliberately no docstring here: the method is decorated with
    # ``@doc(str_format)``, which presumably reuses the documentation of
    # ``str_format`` -- confirm against the ``doc`` decorator before adding one.
    #
    # Pure delegation: ``self`` is the object whose rows get formatted and
    # every option is forwarded unchanged as a keyword argument.
    options = {"name": name, "positional_only": positional_only, "how_na": how_na}
    return str_format(self, format, **options)
def str_format(
    arr,
    format: str,
    name: Optional[str] = None,
    positional_only: bool = False,
    how_na: str = "any",
) -> "Series":
    """
    Format rows according to the format and return a Series with one string per row.

    Parameters
    ----------
    arr : DataFrame or Series
        The values to format.
    format : str
        A standard Python format string, as accepted by :meth:`str.format`.
        Each row is formatted separately: the row's values are passed as
        positional arguments and, unless ``positional_only`` is True, also as
        keyword arguments keyed by column label, together with the row's index
        label under the keyword ``Index``.
    name : Label, optional
        The name of the returned Series.
    positional_only : bool, default False
        If True, only allow positional parameters (i.e. allow "{}", but not "{key}").
        The index is not passed to the format string in this mode.
        Setting to ``True`` will improve performance.
    how_na : str, one of {"all", "any"}, default "any"
        If "all", return ``NA`` if all values in row are nan values.
        If "any", return ``NA`` if at least one of the values in row is a nan value.

    Returns
    -------
    Series
        A Series with dtype ``StringDtype``, formatted according to ``format``.

    Raises
    ------
    ValueError
        If ``how_na`` is not "any" or "all".
    IndexError
        If ``format`` references more positional fields than there are columns.
    KeyError
        If ``format`` references a named field that is not available, which
        includes every named field when ``positional_only`` is True.

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     'state_name': ['California', 'Texas', 'Florida'],
    ...     'state_abbreviation': ['CA', 'TX', 'FL'],
    ...     'population': [39_512_223, 28_995_881, 21_477_737],
    ...     }, index=[1, 2, 3])
    >>> df
       state_name state_abbreviation  population
    1  California                 CA    39512223
    2       Texas                 TX    28995881
    3     Florida                 FL    21477737
    >>> ser = df["population"]

    Formatting using positional arguments:

    >>> ser.format("Population: {:,}")
    1    Population: 39,512,223
    2    Population: 28,995,881
    3    Population: 21,477,737
    dtype: string

    >>> df.format("{} ({}): {:,}")
    1    California (CA): 39,512,223
    2         Texas (TX): 28,995,881
    3       Florida (FL): 21,477,737
    dtype: string

    Using keyword arguments (only works if column labels are strings):

    >>> ser.format("Population: {population:,}")
    1    Population: 39,512,223
    2    Population: 28,995,881
    3    Population: 21,477,737
    dtype: string

    >>> df.format("{state_name} ({state_abbreviation}): {population:,}")
    1    California (CA): 39,512,223
    2         Texas (TX): 28,995,881
    3       Florida (FL): 21,477,737
    dtype: string

    The index can be added using the keyword 'Index':

    >>> df.format("{state_name} ({state_abbreviation}): {population:,} (no. {Index})")
    1    California (CA): 39,512,223 (no. 1)
    2         Texas (TX): 28,995,881 (no. 2)
    3       Florida (FL): 21,477,737 (no. 3)
    dtype: string
    """
    from pandas import NA
    from pandas.arrays import StringArray

    # Fail fast, and with a message that names the offending option, before
    # any per-row work is done.
    if how_na not in ("any", "all"):
        raise ValueError(f"how_na must be 'any' or 'all', got {how_na!r}")

    if isinstance(arr, ABCDataFrame):
        result_wrapper = arr._constructor_sliced
    else:
        # A Series is handled as a one-column frame so that both input kinds
        # share the row-wise code path below.
        result_wrapper = arr._constructor
        # An unnamed Series still needs a column label; "_1" looks chosen to
        # match the positional name a namedtuple gives the first field after
        # the index -- NOTE(review): confirm intent.
        arr_name = arr.name if arr.name is not None else "_1"
        arr = arr.to_frame(name=arr_name)

    na_mask = isna(arr)
    na_mask = na_mask.any(axis=1) if how_na == "any" else na_mask.all(axis=1)

    func = format.format
    if positional_only:
        # Index excluded: only the column values are passed, positionally.
        rows = arr.itertuples(index=False)
        formatted = [func(*row) for row in rows]
    else:
        # row[0] is the index label; it is only exposed through the keyword
        # arguments (as "Index"), never as a positional argument.
        rows = arr.itertuples()
        formatted = [func(*row[1:], **row._asdict()) for row in rows]

    result = np.array(formatted, dtype=object)
    # Rows are formatted first (missing values render as e.g. "nan") and the
    # rows selected by ``how_na`` are blanked out afterwards.
    result[na_mask] = NA
    return result_wrapper(StringArray(result), index=arr.index.copy(), name=name)
# ----------------------------------------------------------------------
# pandas/tests/frame/methods/test_format.py
# ----------------------------------------------------------------------
import pytest

import pandas as pd
import pandas._testing as tm


class TestFormat:
    @pytest.mark.parametrize("format_str", ["{}-{}", "{A}-{B}", "{}-{B}"])
    @pytest.mark.parametrize("name", [None, "X"])
    @pytest.mark.parametrize("how_na", ["all", "any"])
    def test_basic(self, format_str, name, how_na):
        # Positional, named and mixed fields must agree; without missing
        # values the choice of ``how_na`` makes no difference.
        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        expected = pd.Series(["1-4", "2-5", "3-6"], dtype="string", name=name)

        result = df.format(format_str, name=name, how_na=how_na)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_str", ["{Index}-{}-{}", "{Index}-{A}-{B}"])
    def test_with_index(self, format_str):
        # The index label is exposed under the keyword "Index".
        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        expected = pd.Series(["0-1-4", "1-2-5", "2-3-6"], dtype="string")

        result = df.format(format_str)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_str", ["{}-{}"])
    @pytest.mark.parametrize("positional_only", [True, False])
    def test_positional_only(self, format_str, positional_only):
        # A purely positional format string works in both modes.
        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        expected = pd.Series(["1-4", "2-5", "3-6"], dtype="string")

        result = df.format(format_str, positional_only=positional_only)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_str", ["{A}-{B}", "{A}-{}", "{Index}-{}"])
    def test_positional_only_raises(self, format_str):
        # Named fields (including "Index") are unavailable when
        # positional_only=True.
        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        with pytest.raises(KeyError):
            df.format(format_str, positional_only=True)

    @pytest.mark.parametrize(
        "how_na, expected",
        [("any", ["1-4", pd.NA, pd.NA]), ("all", ["1-4", "nan-5", pd.NA])],
    )
    def test_na_how(self, how_na, expected):
        # "any" blanks a row with at least one missing value, "all" only a row
        # where every value is missing (a partially missing row renders "nan").
        df = pd.DataFrame({"A": [1, None, None], "B": [4, 5, None]})
        expected = pd.Series(expected, dtype="string")

        result = df.format("{:.0f}-{:.0f}", how_na=how_na)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_string", ["{}-{}-{}", "{0}-{1}-{2}"])
    def test_too_many_positional_args(self, format_string):
        # Three positional fields, but only two columns.
        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        with pytest.raises(IndexError):
            df.format(format_string)

    @pytest.mark.parametrize("format_string", ["{A}-{B}-{C}", "{C}"])
    def test_too_many_named_args(self, format_string):
        # "C" is not a column of the frame.
        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        with pytest.raises(KeyError):
            df.format(format_string)


# ----------------------------------------------------------------------
# pandas/tests/series/methods/test_format.py
# (separate module in the patch; it starts with the same three imports)
# ----------------------------------------------------------------------


class TestFormat:
    @pytest.mark.parametrize("format_str", ["Value: {}", "Value: {A}"])
    @pytest.mark.parametrize("how_na", ["all", "any"])
    def test_basic(self, format_str, how_na):
        # Positional and named access to the single value must agree.
        ser = pd.Series([1, 2, 3], name="A")
        expected = pd.Series(
            ["Value: 1", "Value: 2", "Value: 3"], dtype="string", name="X"
        )

        result = ser.format(format_str, how_na=how_na, name="X")
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_str", ["{Index}-{}", "{Index}-{A}"])
    def test_with_index(self, format_str):
        # The index label is exposed under the keyword "Index".
        ser = pd.Series([1, 2, 3], name="A")
        expected = pd.Series(["0-1", "1-2", "2-3"], dtype="string", name="X")

        result = ser.format(format_str, name="X")
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_str", ["Value: {}"])
    @pytest.mark.parametrize("positional_only", [True, False])
    def test_positional_only(self, format_str, positional_only):
        # A purely positional format string works in both modes.
        ser = pd.Series([1, 2, 3], name="A")
        expected = pd.Series(["Value: 1", "Value: 2", "Value: 3"], dtype="string")

        result = ser.format(format_str, positional_only=positional_only)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_str", ["{A}-{}", "{Index}-{}"])
    def test_positional_only_raises(self, format_str):
        # Named fields (including "Index") are unavailable when
        # positional_only=True.
        ser = pd.Series([1, 2, 3], name="A")
        with pytest.raises(KeyError):
            ser.format(format_str, positional_only=True)

    @pytest.mark.parametrize(
        "how_na, expected",
        [("any", ["Value: 1", pd.NA, pd.NA]), ("all", ["Value: 1", pd.NA, pd.NA])],
    )
    @pytest.mark.parametrize("format_str", ["Value: {}", "Value: {A}"])
    def test_na_how(self, how_na, expected, format_str):
        # With a single value per row, "any" and "all" are equivalent.
        ser = pd.Series([1, pd.NA, pd.NA], name="A")
        expected = pd.Series(expected, dtype="string")

        # Use the parametrized format string (the original hard-coded
        # "Value: {}" here, so the named-field case was never exercised).
        result = ser.format(format_str, how_na=how_na)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_string", ["{}-{}", "{0}-{1}"])
    def test_too_many_positional_args(self, format_string):
        # Two positional fields, but a Series provides only one value per row.
        ser = pd.Series([1, 2, 3], name="A")
        with pytest.raises(IndexError):
            ser.format(format_string)

    @pytest.mark.parametrize("format_string", ["{A}-{B}", "{B}"])
    def test_unknown_named_args(self, format_string):
        # "B" is not the name of the Series.
        ser = pd.Series([1, 2, 3], name="A")
        with pytest.raises(KeyError):
            ser.format(format_string)