diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 2e104ac06f9f4..5149bd30dbbef 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -386,6 +386,27 @@ Replace NA with a scalar value df df.fillna(0) +When the data has object dtype, you can control what type of NA values are present. + +.. ipython:: python + + df = pd.DataFrame({"a": [pd.NA, np.nan, None]}, dtype=object) + df + df.fillna(None) + df.fillna(np.nan) + df.fillna(pd.NA) + +However, when the dtype is not object, these will all be replaced with the proper NA value for the dtype. + +.. ipython:: python + + data = {"np": [1.0, np.nan, np.nan, 2], "arrow": pd.array([1.0, pd.NA, pd.NA, 2], dtype="float64[pyarrow]")} + df = pd.DataFrame(data) + df + df.fillna(None) + df.fillna(np.nan) + df.fillna(pd.NA) + Fill gaps forward or backward .. ipython:: python diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e05cc87d1af14..e4a5f5e855fc7 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -37,6 +37,7 @@ Other enhancements - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`) - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) +- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) .. --------------------------------------------------------------------------- .. 
_whatsnew_300.notable_bug_fixes: diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 123dc679a83ea..cbd0221cc2082 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -328,7 +328,7 @@ def _pad_or_backfill( return new_values @doc(ExtensionArray.fillna) - def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Self: + def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: mask = self.isna() # error: Argument 2 to "check_value_size" has incompatible type # "ExtensionArray"; expected "ndarray" @@ -347,8 +347,7 @@ def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Sel new_values[mask] = value else: # We validate the fill_value even if there is nothing to fill - if value is not None: - self._validate_setitem_value(value) + self._validate_setitem_value(value) if not copy: new_values = self[:] diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 34ca81e36cbc5..1154130b9bed3 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1077,7 +1077,7 @@ def _pad_or_backfill( @doc(ExtensionArray.fillna) def fillna( self, - value: object | ArrayLike | None = None, + value: object | ArrayLike, limit: int | None = None, copy: bool = True, ) -> Self: diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index af666a591b1bc..86f58b48ea3be 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -892,7 +892,7 @@ def max(self, *, axis: AxisInt | None = None, skipna: bool = True) -> IntervalOr indexer = obj.argsort()[-1] return obj[indexer] - def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Self: + def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: """ Fill NA/NaN values using the specified method. 
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index d20d7f98b8aa8..190888d281ea9 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -236,7 +236,7 @@ def _pad_or_backfill( return new_values @doc(ExtensionArray.fillna) - def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Self: + def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: mask = self._mask value = missing.check_value_size(value, mask, len(self)) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 134702099371d..522d86fb165f6 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -706,7 +706,7 @@ def isna(self) -> Self: # type: ignore[override] def fillna( self, - value=None, + value, limit: int | None = None, copy: bool = True, ) -> Self: @@ -736,8 +736,6 @@ def fillna( When ``self.fill_value`` is not NA, the result dtype will be ``self.dtype``. Again, this preserves the amount of memory used. 
""" - if value is None: - raise ValueError("Must specify 'value'.") new_values = np.where(isna(self.sp_values), value, self.sp_values) if self._null_fill_value: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e1c1b21249362..523ca9de201bf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6752,7 +6752,7 @@ def _pad_or_backfill( @overload def fillna( self, - value: Hashable | Mapping | Series | DataFrame = ..., + value: Hashable | Mapping | Series | DataFrame, *, axis: Axis | None = ..., inplace: Literal[False] = ..., @@ -6762,7 +6762,7 @@ def fillna( @overload def fillna( self, - value: Hashable | Mapping | Series | DataFrame = ..., + value: Hashable | Mapping | Series | DataFrame, *, axis: Axis | None = ..., inplace: Literal[True], @@ -6772,7 +6772,7 @@ def fillna( @overload def fillna( self, - value: Hashable | Mapping | Series | DataFrame = ..., + value: Hashable | Mapping | Series | DataFrame, *, axis: Axis | None = ..., inplace: bool = ..., @@ -6786,7 +6786,7 @@ def fillna( ) def fillna( self, - value: Hashable | Mapping | Series | DataFrame | None = None, + value: Hashable | Mapping | Series | DataFrame, *, axis: Axis | None = None, inplace: bool = False, @@ -6827,6 +6827,12 @@ def fillna( reindex : Conform object to new index. asfreq : Convert TimeSeries to specified frequency. + Notes + ----- + For non-object dtype, ``value=None`` will use the NA value of the dtype. + See more details in the :ref:`Filling missing data <missing_data.fillna>` + section. 
+ Examples -------- >>> df = pd.DataFrame( @@ -6909,101 +6915,92 @@ def fillna( axis = 0 axis = self._get_axis_number(axis) - if value is None: - raise ValueError("Must specify a fill 'value'.") - else: - if self.ndim == 1: - if isinstance(value, (dict, ABCSeries)): - if not len(value): - # test_fillna_nonscalar - if inplace: - return None - return self.copy(deep=False) - from pandas import Series - - value = Series(value) - value = value.reindex(self.index) - value = value._values - elif not is_list_like(value): - pass - else: - raise TypeError( - '"value" parameter must be a scalar, dict ' - "or Series, but you passed a " - f'"{type(value).__name__}"' - ) + if self.ndim == 1: + if isinstance(value, (dict, ABCSeries)): + if not len(value): + # test_fillna_nonscalar + if inplace: + return None + return self.copy(deep=False) + from pandas import Series + + value = Series(value) + value = value.reindex(self.index) + value = value._values + elif not is_list_like(value): + pass + else: + raise TypeError( + '"value" parameter must be a scalar, dict ' + "or Series, but you passed a " + f'"{type(value).__name__}"' + ) - new_data = self._mgr.fillna(value=value, limit=limit, inplace=inplace) + new_data = self._mgr.fillna(value=value, limit=limit, inplace=inplace) - elif isinstance(value, (dict, ABCSeries)): - if axis == 1: - raise NotImplementedError( - "Currently only can fill " - "with dict/Series column " - "by column" - ) - result = self if inplace else self.copy(deep=False) - for k, v in value.items(): - if k not in result: - continue + elif isinstance(value, (dict, ABCSeries)): + if axis == 1: + raise NotImplementedError( + "Currently only can fill with dict/Series column by column" + ) + result = self if inplace else self.copy(deep=False) + for k, v in value.items(): + if k not in result: + continue - res_k = result[k].fillna(v, limit=limit) + res_k = result[k].fillna(v, limit=limit) - if not inplace: - result[k] = res_k + if not inplace: + result[k] = res_k + else: 
+ # We can write into our existing column(s) iff dtype + # was preserved. + if isinstance(res_k, ABCSeries): + # i.e. 'k' only shows up once in self.columns + if res_k.dtype == result[k].dtype: + result.loc[:, k] = res_k + else: + # Different dtype -> no way to do inplace. + result[k] = res_k else: - # We can write into our existing column(s) iff dtype - # was preserved. - if isinstance(res_k, ABCSeries): - # i.e. 'k' only shows up once in self.columns - if res_k.dtype == result[k].dtype: - result.loc[:, k] = res_k + # see test_fillna_dict_inplace_nonunique_columns + locs = result.columns.get_loc(k) + if isinstance(locs, slice): + locs = np.arange(self.shape[1])[locs] + elif isinstance(locs, np.ndarray) and locs.dtype.kind == "b": + locs = locs.nonzero()[0] + elif not ( + isinstance(locs, np.ndarray) and locs.dtype.kind == "i" + ): + # Should never be reached, but let's cover our bases + raise NotImplementedError( + "Unexpected get_loc result, please report a bug at " + "https://github.com/pandas-dev/pandas" + ) + + for i, loc in enumerate(locs): + res_loc = res_k.iloc[:, i] + target = self.iloc[:, loc] + + if res_loc.dtype == target.dtype: + result.iloc[:, loc] = res_loc else: - # Different dtype -> no way to do inplace. 
- result[k] = res_k - else: - # see test_fillna_dict_inplace_nonunique_columns - locs = result.columns.get_loc(k) - if isinstance(locs, slice): - locs = np.arange(self.shape[1])[locs] - elif ( - isinstance(locs, np.ndarray) and locs.dtype.kind == "b" - ): - locs = locs.nonzero()[0] - elif not ( - isinstance(locs, np.ndarray) and locs.dtype.kind == "i" - ): - # Should never be reached, but let's cover our bases - raise NotImplementedError( - "Unexpected get_loc result, please report a bug at " - "https://github.com/pandas-dev/pandas" - ) - - for i, loc in enumerate(locs): - res_loc = res_k.iloc[:, i] - target = self.iloc[:, loc] - - if res_loc.dtype == target.dtype: - result.iloc[:, loc] = res_loc - else: - result.isetitem(loc, res_loc) - if inplace: - return self._update_inplace(result) - else: - return result + result.isetitem(loc, res_loc) + if inplace: + return self._update_inplace(result) + else: + return result - elif not is_list_like(value): - if axis == 1: - result = self.T.fillna(value=value, limit=limit).T - new_data = result._mgr - else: - new_data = self._mgr.fillna( - value=value, limit=limit, inplace=inplace - ) - elif isinstance(value, ABCDataFrame) and self.ndim == 2: - new_data = self.where(self.notna(), value)._mgr + elif not is_list_like(value): + if axis == 1: + result = self.T.fillna(value=value, limit=limit).T + new_data = result._mgr else: - raise ValueError(f"invalid fill value with a {type(value)}") + new_data = self._mgr.fillna(value=value, limit=limit, inplace=inplace) + elif isinstance(value, ABCDataFrame) and self.ndim == 2: + new_data = self.where(self.notna(), value)._mgr + else: + raise ValueError(f"invalid fill value with a {type(value)}") result = self._constructor_from_mgr(new_data, axes=new_data.axes) if inplace: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d5517a210b39d..69f916bb3f769 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2543,7 +2543,7 @@ def notna(self) -> 
npt.NDArray[np.bool_]: notnull = notna - def fillna(self, value=None): + def fillna(self, value): """ Fill NA/NaN values with the specified value. diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 4affa1337aa2a..9df0d26ce622a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1675,7 +1675,7 @@ def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: # (previously declared in base class "IndexOpsMixin") _duplicated = duplicated # type: ignore[misc] - def fillna(self, value=None, downcast=None): + def fillna(self, value, downcast=None): """ fillna is not implemented for MultiIndex """ diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 328c6cd6164fb..4b9234a9904a2 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -68,6 +68,12 @@ def test_fillna_scalar(self, data_missing): expected = data_missing.fillna(valid) tm.assert_extension_array_equal(result, expected) + def test_fillna_with_none(self, data_missing): + # GH#57723 + result = data_missing.fillna(None) + expected = data_missing + tm.assert_extension_array_equal(result, expected) + def test_fillna_limit_pad(self, data_missing): arr = data_missing.take([1, 0, 0, 0, 1]) result = pd.Series(arr).ffill(limit=2) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index a2721908e858f..504bafc145108 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -144,6 +144,14 @@ def test_fillna_series(self, data_missing): ): super().test_fillna_series(data_missing) + def test_fillna_with_none(self, data_missing): + # GH#57723 + # EAs that don't have special logic for None will raise, unlike pandas' + # which interpret None as the NA value for the dtype. 
+ msg = "conversion from NoneType to Decimal is not supported" + with pytest.raises(TypeError, match=msg): + super().test_fillna_with_none(data_missing) + @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna): all_data = all_data[:10] diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 6ecbf2063f203..22ac9627f6cda 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -149,6 +149,13 @@ def test_fillna_frame(self): """We treat dictionaries as a mapping in fillna, not a scalar.""" super().test_fillna_frame() + def test_fillna_with_none(self, data_missing): + # GH#57723 + # EAs that don't have special logic for None will raise, unlike pandas' + # which interpret None as the NA value for the dtype. + with pytest.raises(AssertionError): + super().test_fillna_with_none(data_missing) + @pytest.mark.parametrize( "limit_area, input_ilocs, expected_ilocs", [ diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index b8f67138889cc..e858c123e4dae 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -64,8 +64,8 @@ def test_fillna_datetime(self, datetime_frame): padded.loc[padded.index[-5:], "A"] == padded.loc[padded.index[-5], "A"] ).all() - msg = "Must specify a fill 'value'" - with pytest.raises(ValueError, match=msg): + msg = r"missing 1 required positional argument: 'value'" + with pytest.raises(TypeError, match=msg): datetime_frame.fillna() @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") @@ -779,3 +779,17 @@ def test_ffill_bfill_limit_area(data, expected_data, method, kwargs): expected = DataFrame(expected_data) result = getattr(df, method)(**kwargs) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("test_frame", [True, False]) +@pytest.mark.parametrize("dtype", ["float", "object"]) 
+def test_fillna_with_none_object(test_frame, dtype): + # GH#57723 + obj = Series([1, np.nan, 3], dtype=dtype) + if test_frame: + obj = obj.to_frame() + result = obj.fillna(value=None) + expected = Series([1, None, 3], dtype=dtype) + if test_frame: + expected = expected.to_frame() + tm.assert_equal(result, expected)