diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 0f0e82f4ad4e2..1947d681e70f4 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1255,6 +1255,7 @@ def convert_dtypes( convert_string: bool = True, convert_integer: bool = True, convert_boolean: bool = True, + convert_floating: bool = True, ) -> Dtype: """ Convert objects to best possible type, and optionally, @@ -1269,6 +1270,10 @@ def convert_dtypes( Whether, if possible, conversion can be done to integer extension types. convert_boolean : bool, defaults True Whether object dtypes should be converted to ``BooleanDtypes()``. + convert_floating : bool, defaults True + Whether, if possible, conversion can be done to floating extension types. + If `convert_integer` is also True, preference will be given to integer + dtypes if the floats can be faithfully cast to integers. Returns ------- @@ -1276,7 +1281,9 @@ def convert_dtypes( new dtype """ is_extension = is_extension_array_dtype(input_array.dtype) - if (convert_string or convert_integer or convert_boolean) and not is_extension: + if ( + convert_string or convert_integer or convert_boolean or convert_floating + ) and not is_extension: try: inferred_dtype = lib.infer_dtype(input_array) except ValueError: @@ -1304,6 +1311,29 @@ def convert_dtypes( if is_integer_dtype(inferred_dtype): inferred_dtype = input_array.dtype + if convert_floating: + if not is_integer_dtype(input_array.dtype) and is_numeric_dtype( + input_array.dtype + ): + from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE + + inferred_float_dtype = FLOAT_STR_TO_DTYPE.get( + input_array.dtype.name, "Float64" + ) + # if we could also convert to integer, check if all floats + # are actually integers + if convert_integer: + arr = input_array[notna(input_array)] + if (arr.astype(int) == arr).all(): + inferred_dtype = "Int64" + else: + inferred_dtype = inferred_float_dtype + else: + inferred_dtype = inferred_float_dtype + else: + if 
is_float_dtype(inferred_dtype): + inferred_dtype = input_array.dtype + if convert_boolean: if is_bool_dtype(input_array.dtype): inferred_dtype = "boolean" diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c7448cf8f8e40..c9f862d136477 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6088,6 +6088,7 @@ def convert_dtypes( convert_string: bool_t = True, convert_integer: bool_t = True, convert_boolean: bool_t = True, + convert_floating: bool_t = True, ) -> FrameOrSeries: """ Convert columns to best possible dtypes using dtypes supporting ``pd.NA``. @@ -6104,6 +6105,12 @@ def convert_dtypes( Whether, if possible, conversion can be done to integer extension types. convert_boolean : bool, defaults True Whether object dtypes should be converted to ``BooleanDtypes()``. + convert_floating : bool, defaults True + Whether, if possible, conversion can be done to floating extension types. + If `convert_integer` is also True, preference will be given to integer + dtypes if the floats can be faithfully cast to integers. + + .. versionadded:: 1.2.0 Returns ------- @@ -6121,19 +6128,25 @@ def convert_dtypes( ----- By default, ``convert_dtypes`` will attempt to convert a Series (or each Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options - ``convert_string``, ``convert_integer``, and ``convert_boolean``, it is - possible to turn off individual conversions to ``StringDtype``, the integer - extension types or ``BooleanDtype``, respectively. + ``convert_string``, ``convert_integer``, ``convert_boolean`` and + ``convert_floating``, it is possible to turn off individual conversions + to ``StringDtype``, the integer extension types, ``BooleanDtype`` + or floating extension types, respectively. For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference rules as during normal Series/DataFrame construction. 
Then, if possible, - convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension - type, otherwise leave as ``object``. + convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer + or floating extension type, otherwise leave as ``object``. If the dtype is integer, convert to an appropriate integer extension type. If the dtype is numeric, and consists of all integers, convert to an - appropriate integer extension type. + appropriate integer extension type. Otherwise, convert to an + appropriate floating extension type. + + .. versionchanged:: 1.2 + Starting with pandas 1.2, this method also converts float columns + to the nullable floating extension type. In the future, as new dtypes are added that support ``pd.NA``, the results of this method will change to support those new dtypes. @@ -6173,7 +6186,7 @@ def convert_dtypes( >>> dfn = df.convert_dtypes() >>> dfn a b c d e f - 0 1 x True h 10 NaN + 0 1 x True h 10 <NA> 1 2 y False i <NA> 100.5 2 3 z <NA> <NA> 20 200.0 @@ -6183,7 +6196,7 @@ def convert_dtypes( c boolean d string e Int64 - f float64 + f Float64 dtype: object Start with a Series of strings and missing data represented by ``np.nan``. 
@@ -6205,12 +6218,20 @@ def convert_dtypes( """ if self.ndim == 1: return self._convert_dtypes( - infer_objects, convert_string, convert_integer, convert_boolean + infer_objects, + convert_string, + convert_integer, + convert_boolean, + convert_floating, ) else: results = [ col._convert_dtypes( - infer_objects, convert_string, convert_integer, convert_boolean + infer_objects, + convert_string, + convert_integer, + convert_boolean, + convert_floating, ) for col_name, col in self.items() ] diff --git a/pandas/core/series.py b/pandas/core/series.py index d493ac0a8c051..1f4221206e5bc 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4706,6 +4706,7 @@ def _convert_dtypes( convert_string: bool = True, convert_integer: bool = True, convert_boolean: bool = True, + convert_floating: bool = True, ) -> "Series": input_series = self if infer_objects: @@ -4713,9 +4714,13 @@ def _convert_dtypes( if is_object_dtype(input_series): input_series = input_series.copy() - if convert_string or convert_integer or convert_boolean: + if convert_string or convert_integer or convert_boolean or convert_floating: inferred_dtype = convert_dtypes( - input_series._values, convert_string, convert_integer, convert_boolean + input_series._values, + convert_string, + convert_integer, + convert_boolean, + convert_floating, ) try: result = input_series.astype(inferred_dtype) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index d44667b258414..920182a99e9ef 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -58,9 +58,17 @@ [10, np.nan, 20], np.dtype("float"), "Int64", - {("convert_integer", False): np.dtype("float")}, + { + ("convert_integer", False, "convert_floating", True): "Float64", + ("convert_integer", False, "convert_floating", False): np.dtype("float"), + }, + ), + ( + [np.nan, 100.5, 200], + np.dtype("float"), + "Float64", + 
{("convert_floating", False): np.dtype("float")}, ), - ([np.nan, 100.5, 200], np.dtype("float"), np.dtype("float"), {}), ( [3, 4, 5], "Int8", @@ -85,20 +93,30 @@ "Int8", {("convert_integer", False): np.dtype("i1")}, ), + ( + [1.2, 1.3], + np.dtype("float32"), + "Float32", + {("convert_floating", False): np.dtype("float32")}, + ), ( [1, 2.0], object, "Int64", { - ("convert_integer", False): np.dtype("float"), + ("convert_integer", False): "Float64", + ("convert_integer", False, "convert_floating", False): np.dtype("float"), ("infer_objects", False): np.dtype("object"), }, ), ( [1, 2.5], object, - np.dtype("float"), - {("infer_objects", False): np.dtype("object")}, + "Float64", + { + ("convert_floating", False): np.dtype("float"), + ("infer_objects", False): np.dtype("object"), + }, ), (["a", "b"], pd.CategoricalDtype(), pd.CategoricalDtype(), {}), ( @@ -134,7 +152,7 @@ class TestSeriesConvertDtypes: "data, maindtype, expected_default, expected_other", test_cases, ) - @pytest.mark.parametrize("params", product(*[(True, False)] * 4)) + @pytest.mark.parametrize("params", product(*[(True, False)] * 5)) def test_convert_dtypes( self, data, maindtype, params, expected_default, expected_other ): @@ -150,12 +168,13 @@ def test_convert_dtypes( "convert_string", "convert_integer", "convert_boolean", + "convert_floating", ] params_dict = dict(zip(param_names, params)) expected_dtype = expected_default - for (key, val), dtype in expected_other.items(): - if params_dict[key] is val: + for spec, dtype in expected_other.items(): + if all(params_dict[key] is val for key, val in zip(spec[::2], spec[1::2])): expected_dtype = dtype expected = pd.Series(data, dtype=expected_dtype)