From c91c5b951f9f18caf3c5913ba9ab614abc1b7ead Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 25 Jan 2023 20:48:42 -0500 Subject: [PATCH 1/3] BUG: astype to pyarrow does not copy np array --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/arrays/arrow/array.py | 3 +++ pandas/tests/frame/methods/test_astype.py | 12 ++++++++++++ 3 files changed, 16 insertions(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 3c9c861afd989..d2f95505dc7ce 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1015,6 +1015,7 @@ Conversion - Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`) - Bug in :meth:`Series.astype` raising ``pyarrow.ArrowInvalid`` when converting from a non-pyarrow string dtype to a pyarrow numeric type (:issue:`50430`) - Bug in :meth:`Series.to_numpy` converting to NumPy array before applying ``na_value`` (:issue:`48951`) +- Bug in :meth:`DataFrame.astype` not copying data when converting to pyarrow dtype (:issue:``) - Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`) - Bug in :meth:`TimedeltaArray.astype` raising ``TypeError`` when converting to a pyarrow duration type (:issue:`49795`) - diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0e70b3795bc85..49c11401fe9b3 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -218,6 +218,9 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal if isinstance(scalars, cls): scalars = scalars._data elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)): + if copy and is_array_like(scalars): + # pa array should not get updated when numpy array is updated + scalars = scalars.copy() try: scalars = pa.array(scalars, type=pa_dtype, from_pandas=True) except pa.ArrowInvalid: diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index e9f6371239b4a..25a76622ceec7 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -3,6 +3,7 @@ import numpy as np import pytest +from pandas.compat import pa_version_under6p0 import pandas.util._test_decorators as td import pandas as pd @@ -867,3 +868,14 @@ def test_frame_astype_no_copy(): assert result.a.dtype == pd.Int16Dtype() assert np.shares_memory(df.b.values, result.b.values) + + +@pytest.mark.skipif(pa_version_under6p0, reason="pyarrow is required for this test") +@pytest.mark.parametrize("dtype", ["int64", "Int64"]) +def test_astype_copies(dtype): + # GH# + df = DataFrame({"a": [1, 2, 3]}, dtype=dtype) + result = df.astype("int64[pyarrow]", copy=True) + df.iloc[0, 0] = 100 + expected = DataFrame({"a": [1, 2, 3]}, dtype="int64[pyarrow]") + tm.assert_frame_equal(result, expected) From 6df179ecf7904a2e7d328e58348c088e55ac8b6d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 25 Jan 2023 20:50:58 -0500 Subject: [PATCH 2/3] Add gh ref --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/tests/frame/methods/test_astype.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index d2f95505dc7ce..7d7a280208de9 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1015,7 +1015,7 @@ Conversion - Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`) - Bug in :meth:`Series.astype` raising ``pyarrow.ArrowInvalid`` when converting from a non-pyarrow string dtype to a pyarrow numeric type (:issue:`50430`) - Bug in :meth:`Series.to_numpy` converting to NumPy array before applying ``na_value`` (:issue:`48951`) -- Bug in :meth:`DataFrame.astype` not copying data when converting to pyarrow dtype (:issue:``) +- Bug in :meth:`DataFrame.astype` not copying data when converting to pyarrow dtype (:issue:`50984`) - Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`) - Bug in :meth:`TimedeltaArray.astype` raising ``TypeError`` when converting to a pyarrow duration type (:issue:`49795`) - diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 25a76622ceec7..1f43b51d4808f 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -873,7 +873,7 @@ def test_frame_astype_no_copy(): @pytest.mark.skipif(pa_version_under6p0, reason="pyarrow is required for this test") @pytest.mark.parametrize("dtype", ["int64", "Int64"]) def test_astype_copies(dtype): - # GH# + # GH#50984 df = DataFrame({"a": [1, 2, 3]}, dtype=dtype) result = df.astype("int64[pyarrow]", copy=True) df.iloc[0, 0] = 100 From 8defadffc032e6b20ec70a6ec04098de136e35a0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 26 Jan 2023 18:14:49 -0500 Subject: [PATCH 3/3] Use deepcopy --- pandas/core/arrays/arrow/array.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 49c11401fe9b3..aa2b706e2663f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1,5 +1,6 @@ from __future__ import annotations +from copy import deepcopy from typing import ( TYPE_CHECKING, Any, @@ -220,7 +221,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)): if copy and is_array_like(scalars): # pa array should not get updated when numpy array is updated - scalars = scalars.copy() + scalars = deepcopy(scalars) try: scalars = pa.array(scalars, type=pa_dtype, from_pandas=True) except pa.ArrowInvalid: