diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 3c9c861afd989..7d7a280208de9 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1015,6 +1015,7 @@ Conversion - Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`) - Bug in :meth:`Series.astype` raising ``pyarrow.ArrowInvalid`` when converting from a non-pyarrow string dtype to a pyarrow numeric type (:issue:`50430`) - Bug in :meth:`Series.to_numpy` converting to NumPy array before applying ``na_value`` (:issue:`48951`) +- Bug in :meth:`DataFrame.astype` not copying data when converting to pyarrow dtype (:issue:`50984`) - Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`) - Bug in :meth:`TimedeltaArray.astype` raising ``TypeError`` when converting to a pyarrow duration type (:issue:`49795`) - diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0e70b3795bc85..aa2b706e2663f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1,5 +1,6 @@ from __future__ import annotations +from copy import deepcopy from typing import ( TYPE_CHECKING, Any, @@ -218,6 +219,9 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal if isinstance(scalars, cls): scalars = scalars._data elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)): + if copy and is_array_like(scalars): + # pa array should not get updated when numpy array is updated + scalars = deepcopy(scalars) try: scalars = pa.array(scalars, type=pa_dtype, from_pandas=True) except pa.ArrowInvalid: diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index e9f6371239b4a..1f43b51d4808f 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -3,6 +3,7 @@ import numpy as np import pytest +from pandas.compat import pa_version_under6p0 import pandas.util._test_decorators as td import pandas as pd @@ -867,3 +868,14 @@ def test_frame_astype_no_copy(): assert result.a.dtype == pd.Int16Dtype() assert np.shares_memory(df.b.values, result.b.values) + + +@pytest.mark.skipif(pa_version_under6p0, reason="pyarrow is required for this test") +@pytest.mark.parametrize("dtype", ["int64", "Int64"]) +def test_astype_copies(dtype): + # GH#50984 + df = DataFrame({"a": [1, 2, 3]}, dtype=dtype) + result = df.astype("int64[pyarrow]", copy=True) + df.iloc[0, 0] = 100 + expected = DataFrame({"a": [1, 2, 3]}, dtype="int64[pyarrow]") + tm.assert_frame_equal(result, expected)