def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool:
    """Check whether an astype from ``dtype`` to ``new_dtype`` may be a view.

    Parameters
    ----------
    dtype : DtypeObj
        Original dtype.
    new_dtype : DtypeObj
        Target dtype.

    Returns
    -------
    bool
        True if the new data is a view or is not guaranteed to be a copy,
        False if the conversion is known to copy.
    """
    if dtype == new_dtype:
        return True

    elif isinstance(dtype, np.dtype) and isinstance(new_dtype, np.dtype):
        # Only equal numpy dtypes avoid a copy
        return False

    elif is_string_dtype(dtype) and is_string_dtype(new_dtype):
        # Potentially a view when converting from object to string
        return True

    elif is_object_dtype(dtype) and new_dtype.kind == "O":
        # When the underlying array has dtype object, we don't have to make a copy
        return True

    elif dtype.kind in "mM" and new_dtype.kind in "mM":
        # Datetime-like / timedelta-like: compare the units of the
        # (numpy-level, when available) dtypes.
        dtype = getattr(dtype, "numpy_dtype", dtype)
        new_dtype = getattr(new_dtype, "numpy_dtype", new_dtype)
        return getattr(dtype, "unit", None) == getattr(new_dtype, "unit", None)

    numpy_dtype = getattr(dtype, "numpy_dtype", None)
    new_numpy_dtype = getattr(new_dtype, "numpy_dtype", None)

    if numpy_dtype is None and isinstance(dtype, np.dtype):
        numpy_dtype = dtype

    if new_numpy_dtype is None and isinstance(new_dtype, np.dtype):
        # Fill in the *target* side. Assigning to ``numpy_dtype`` here would
        # leave ``new_numpy_dtype`` as None, skip the comparison below and
        # wrongly report e.g. Int64 -> int32 as a potential view.
        new_numpy_dtype = new_dtype

    if numpy_dtype is not None and new_numpy_dtype is not None:
        # if both have NumPy dtype or one of them is a numpy dtype
        # they are only a view when the numpy dtypes are equal, e.g.
        # int64 -> Int64 or int64[pyarrow]
        # int64 -> Int32 copies
        return numpy_dtype == new_numpy_dtype

    # Assume this is a view since we don't know for sure if a copy was made
    return True
def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T:
    """Cast the manager's arrays to ``dtype`` via ``astype_array_safe``.

    ``copy=None`` is treated as ``copy=True`` here; any other value is
    forwarded unchanged together with ``dtype`` and ``errors``.
    """
    resolved_copy = True if copy is None else copy
    return self.apply(
        astype_array_safe,
        dtype=dtype,
        copy=resolved_copy,
        errors=errors,
    )
def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T:
    """Apply ``"astype"`` through ``self.apply`` with a resolved ``copy`` flag.

    When ``copy`` is None it is resolved from the copy-on-write setting:
    no copy when copy-on-write is enabled, a copy otherwise. The current
    copy-on-write setting is also forwarded as ``using_cow``.
    """
    if copy is None:
        # Under copy-on-write the copy can be skipped; otherwise copy eagerly.
        copy = not using_copy_on_write()

    cow_enabled = using_copy_on_write()
    return self.apply(
        "astype",
        dtype=dtype,
        copy=copy,
        errors=errors,
        using_cow=cow_enabled,
    )
@pytest.mark.parametrize("dtype", ["float64", "int32", "Int32", "int32[pyarrow]"])
def test_astype_different_target_dtype(using_copy_on_write, dtype):
    """astype to a different dtype must not share memory with the parent."""
    wants_pyarrow = dtype == "int32[pyarrow]"
    if wants_pyarrow and pa_version_under7p0:
        pytest.skip("pyarrow not installed")

    df = DataFrame({"a": [1, 2, 3]})
    df_orig = df.copy()
    df2 = df.astype(dtype)

    shared = np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
    assert not shared
    if using_copy_on_write:
        # the result must not hold a reference to the parent's data
        assert df2._mgr._has_no_reference(0)

    # mutating the result leaves the parent untouched
    df2.iloc[0, 0] = 5
    tm.assert_frame_equal(df, df_orig)

    # mutating parent also doesn't update result
    df2 = df.astype(dtype)
    df.iloc[0, 0] = 100
    expected = df_orig.astype(dtype)
    tm.assert_frame_equal(df2, expected)
def test_astype_different_datetime_resos(using_copy_on_write):
    """Casting a datetime column to ``datetime64[ms]`` must not share memory."""
    stamps = date_range("2019-12-31", periods=2, freq="D")
    df = DataFrame({"a": stamps})
    result = df.astype("datetime64[ms]")

    shared = np.shares_memory(get_array(df, "a"), get_array(result, "a"))
    assert not shared
    if using_copy_on_write:
        # the result must not keep a reference to the parent's data
        assert result._mgr._has_no_reference(0)
Timestamp("2020-01-01 01:01:01.000001"), + ] + }, + dtype="M8[ns]", + ) + result = df.astype("timestamp[ns][pyarrow]") + if using_copy_on_write: + assert not result._mgr._has_no_reference(0) + assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a")._data) diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index f5805455e326f..1b709a8aa8076 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas import Series @@ -6,13 +7,14 @@ # Copy/view behaviour for Series / DataFrame constructors -def test_series_from_series(using_copy_on_write): +@pytest.mark.parametrize("dtype", [None, "int64"]) +def test_series_from_series(dtype, using_copy_on_write): # Case: constructing a Series from another Series object follows CoW rules: # a new object is returned and thus mutations are not propagated ser = Series([1, 2, 3], name="name") # default is copy=False -> new Series is a shallow copy / view of original - result = Series(ser) + result = Series(ser, dtype=dtype) # the shallow copy still shares memory assert np.shares_memory(ser.values, result.values) @@ -34,7 +36,7 @@ def test_series_from_series(using_copy_on_write): assert np.shares_memory(ser.values, result.values) # the same when modifying the parent - result = Series(ser) + result = Series(ser, dtype=dtype) if using_copy_on_write: # mutating original doesn't mutate new series diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index cdacbf9c5bb5a..90f89d71b15a9 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -483,3 +483,14 @@ def test_to_numpy_keyword(): result = pd.Series(a).to_numpy(decimals=2) tm.assert_numpy_array_equal(result, expected) + + +def test_array_copy_on_write(using_copy_on_write): + df = pd.DataFrame({"a": 
[decimal.Decimal(2), decimal.Decimal(3)]}, dtype="object") + df2 = df.astype(DecimalDtype()) + df.iloc[0, 0] = 0 + if using_copy_on_write: + expected = pd.DataFrame( + {"a": [decimal.Decimal(2), decimal.Decimal(3)]}, dtype=DecimalDtype() + ) + tm.assert_equal(df2.values, expected.values) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 49c5b78a48a9f..e8d8024b57064 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -873,13 +873,16 @@ def test_constructor_invalid_coerce_ints_with_float_nan(self, any_int_numpy_dtyp with pytest.raises(IntCastingNaNError, match=msg): Series(np.array(vals), dtype=any_int_numpy_dtype) - def test_constructor_dtype_no_cast(self): + def test_constructor_dtype_no_cast(self, using_copy_on_write): # see gh-1572 s = Series([1, 2, 3]) s2 = Series(s, dtype=np.int64) s2[1] = 5 - assert s[1] == 5 + if using_copy_on_write: + assert s[1] == 2 + else: + assert s[1] == 5 def test_constructor_datelike_coercion(self):