pandas-dev · jbrockmendel · Jul 26, 2020 · Jul 26, 2020 · Jul 27, 2020 · Jul 28, 2020
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -61,7 +61,82 @@ Notable bug fixes
 
 These are bug fixes that might have notable behavior changes.
 
+Assigning with ``DataFrame.__setitem__`` consistently creates a new array
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+Assigning values with ``DataFrame.__setitem__`` now consistently assigns a new array, rather than mutating inplace (:issue:`33457`, :issue:`35271`, :issue:`35266`)
+
+Previously, ``DataFrame.__setitem__`` would sometimes operate inplace on the
+underlying array, and sometimes assign a new array. Fixing this inconsistency
+can have behavior-changing implications for workloads that relied on inplace
+mutation. The two most common cases are creating a ``DataFrame`` from an array
+and slicing a ``DataFrame``.
+
+*Previous Behavior*
+
+The array would be mutated inplace for some dtypes, like NumPy's ``int64`` dtype.
+
+.. code-block:: python
+
+   >>> import pandas as pd
+   >>> import numpy as np
+   >>> a = np.array([1, 2, 3])
+   >>> df = pd.DataFrame(a, columns=['a'])
+   >>> df['a'] = 0
+   >>> a  # mutated inplace
+   array([0, 0, 0])
+
+But not others, like :class:`Int64Dtype`.
+
+.. code-block:: python
+
+   >>> import pandas as pd
+   >>> import numpy as np
+   >>> a = pd.array([1, 2, 3], dtype="Int64")
+   >>> df = pd.DataFrame(a, columns=['a'])
+   >>> df['a'] = 0
+   >>> a  # not mutated
+   <IntegerArray>
+   [1, 2, 3]
+   Length: 3, dtype: Int64
+
+
+*New Behavior*
+
+In pandas 1.3.0, ``DataFrame.__setitem__`` consistently sets on a new array rather than
+mutating the existing array inplace.
+
+For NumPy's ``int64`` dtype.
+
+.. ipython:: python
+
+   import pandas as pd
+   import numpy as np
+   a = np.array([1, 2, 3])
+   df = pd.DataFrame(a, columns=['a'])
+   df['a'] = 0
+   a  # not mutated
+
+For :class:`Int64Dtype`.
+
+.. ipython:: python
+
+   import pandas as pd
+   import numpy as np
+   a = pd.array([1, 2, 3], dtype="Int64")
+   df = pd.DataFrame(a, columns=['a'])
+   df['a'] = 0
+   a  # not mutated
+
+This also affects cases where a second ``Series`` or ``DataFrame`` is a view on a first ``DataFrame``.
+
+.. code-block:: python
+
+   df = pd.DataFrame({"A": [1, 2, 3]})
+   df2 = df[['A']]
+   df['A'] = np.array([0, 0, 0])
+
+Previously, whether ``df2`` was mutated depended on the dtype of the array being assigned. Now, a new array is consistently assigned, so ``df2`` is not mutated.
 
 .. _whatsnew_130.api_breaking.deps:
 
@@ -236,6 +311,7 @@ Indexing
 - Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`)
 - Bug in :meth:`DataFrame.iloc.__setitem__` and :meth:`DataFrame.loc.__setitem__` with mixed dtypes when setting with a dictionary value (:issue:`38335`)
 - Bug in :meth:`DataFrame.loc` dropping levels of :class:`MultiIndex` when :class:`DataFrame` used as input has only one row (:issue:`10521`)
+- Bug in :meth:`DataFrame.iloc.__setitem__` creating a new array instead of overwriting ``Categorical`` values in-place (:issue:`35417`)
 -
 
 Missing

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3219,10 +3219,21 @@ def _set_item_frame_value(self, key, value: "DataFrame") -> None:
         value = value.T
         self._set_item_mgr(key, value)
 
-    def _iset_item_mgr(self, loc: int, value) -> None:
-        self._mgr.iset(loc, value)
+    def _iset_item_mgr(self, loc: int, value, inplace: bool = False) -> None:
+        self._mgr.iset(loc, value, inplace=inplace)
         self._clear_item_cache()
 
+    def _iset_item(self, loc: int, value, *, inplace: bool = False):
+        value = self._sanitize_column(value)
+        value = _maybe_atleast_2d(value)
+        self._iset_item_mgr(loc, value, inplace=inplace)
+
+        # check if we are modifying a copy
+        # try to set first as we want an invalid
+        # value exception to occur first
+        if len(self):  # FIXME: this should depend on inplace, right?
+            self._check_setitem_copy()
+
     def _set_item_mgr(self, key, value):
         value = _maybe_atleast_2d(value)
 
@@ -3240,17 +3251,6 @@ def _set_item_mgr(self, key, value):
         if len(self):
             self._check_setitem_copy()
 
-    def _iset_item(self, loc: int, value):
-        value = self._sanitize_column(value)
-        value = _maybe_atleast_2d(value)
-        self._iset_item_mgr(loc, value)
-
-        # check if we are modifying a copy
-        # try to set first as we want an invalid
-        # value exception to occur first
-        if len(self):
-            self._check_setitem_copy()
-
     def _set_item(self, key, value):
         """
         Add series to DataFrame in specified column.

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -3439,7 +3439,7 @@ def _maybe_cache_changed(self, item, value) -> None:
         The object has called back to us saying maybe it has changed.
         """
         loc = self._info_axis.get_loc(item)
-        self._mgr.iset(loc, value)
+        self._mgr.iset(loc, value, inplace=False)
 
     @final
     @property

diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -11,6 +11,7 @@
 from pandas.errors import AbstractMethodError, InvalidIndexError
 from pandas.util._decorators import doc
 
+from pandas.core.dtypes.cast import maybe_infer_dtype_type
 from pandas.core.dtypes.common import (
     is_array_like,
     is_hashable,
@@ -675,8 +676,24 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None):
             # GH#38148
             keys = self.obj.columns.union(key, sort=False)
 
+            # Try to get the right dtype when we do this reindex.
+            fv = None
+            if not is_list_like(value):
+                fv = value
+            elif len(value) and not is_list_like(value[0]):
+                fv = value[0]
+            else:
+                dtype = maybe_infer_dtype_type(value)
+                if dtype is not None:
+                    fv = dtype.type(0)
+
             self.obj._mgr = self.obj._mgr.reindex_axis(
-                keys, axis=0, copy=False, consolidate=False, only_slice=True
+                keys,
+                axis=0,
+                copy=False,
+                consolidate=False,
+                only_slice=True,
+                fill_value=fv,
             )
 
     def __setitem__(self, key, value):
@@ -1580,7 +1597,6 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"):
                     # essentially this separates out the block that is needed
                     # to possibly be modified
                     if self.ndim > 1 and i == info_axis:
-
                         # add the new item, and set the value
                         # must have all defined axes if we have a scalar
                         # or a list-like on the non-info axes if we have a
@@ -1811,6 +1827,13 @@ def _setitem_single_column(self, loc: int, value, plane_indexer):
         # multi-dim object
         # GH#6149 (null slice), GH#10408 (full bounds)
         if com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)):
+            blk = ser._mgr.blocks[0]
+            if blk._can_hold_element(value) and is_scalar(value):
+                # FIXME: ExtensionBlock._can_hold_element
+                # We can do an inplace-setting, do it directly on _values
+                #  so that we write into the underlying array
+                ser._values[plane_indexer] = value
+                return
             ser = value
         else:
             # set the item, possibly having a dtype change
@@ -1819,7 +1842,7 @@ def _setitem_single_column(self, loc: int, value, plane_indexer):
             ser._maybe_update_cacher(clear=True)
 
         # reset the sliced object if unique
-        self.obj._iset_item(loc, ser)
+        self.obj._iset_item(loc, ser, inplace=True)
 
     def _setitem_single_block(self, indexer, value, name: str):
         """
@@ -1845,7 +1868,11 @@ def _setitem_single_block(self, indexer, value, name: str):
                 )
                 and item_labels.is_unique
             ):
-                self.obj[item_labels[indexer[info_axis]]] = value
+                col = item_labels[indexer[info_axis]]
+                loc = item_labels.get_loc(col)
+                if isinstance(value, ABCDataFrame):
+                    return self.obj._set_item_frame_value(col, value)
+                self.obj._iset_item(loc, value, inplace=True)
                 return
 
             indexer = maybe_convert_ix(*indexer)

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -1618,7 +1618,7 @@ def set_inplace(self, locs, values):
         # NB: This is a misnomer, is supposed to be inplace but is not,
         #  see GH#33457
         assert locs.tolist() == [0]
-        self.values = values
+        self.values[:] = values
 
     def putmask(self, mask, new, axis: int = 0) -> List["Block"]:
         """

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -1048,7 +1048,7 @@ def idelete(self, indexer):
         )
         self._rebuild_blknos_and_blklocs()
 
-    def iset(self, loc: Union[int, slice, np.ndarray], value):
+    def iset(self, loc: Union[int, slice, np.ndarray], value, inplace: bool = False):
         """
         Set new item in-place. Does not consolidate. Adds new Block if not
         contained in the current set of items
@@ -1100,7 +1100,7 @@ def value_getitem(placement):
         for blkno, val_locs in libinternals.get_blkno_placements(blknos, group=True):
             blk = self.blocks[blkno]
             blk_locs = blklocs[val_locs.indexer]
-            if blk.should_store(value):
+            if inplace and blk.should_store(value):
                 blk.set_inplace(blk_locs, value_getitem(val_locs))
             else:
                 unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])

diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
@@ -809,7 +809,7 @@ def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame):
 
         msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
         with pytest.raises(com.SettingWithCopyError, match=msg):
-            sliced["C"] = 4.0
+            sliced.loc[:, "C"] = 4.0
 
         assert (float_frame["C"] == 4).all()
 
@@ -1286,7 +1286,7 @@ def test_iloc_row(self):
         # setting it makes it raise/warn
         msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
         with pytest.raises(com.SettingWithCopyError, match=msg):
-            result[2] = 0.0
+            result.loc[:, 2] = 0.0
 
         exp_col = df[2].copy()
         exp_col[4:8] = 0.0
@@ -1318,7 +1318,7 @@ def test_iloc_col(self):
         # and that we are setting a copy
         msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
         with pytest.raises(com.SettingWithCopyError, match=msg):
-            result[8] = 0.0
+            result.loc[:, 8] = 0.0
 
         assert (df[8] == 0).all()
 

diff --git a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py
@@ -164,7 +164,7 @@ def test_rename_multiindex(self):
 
     def test_rename_nocopy(self, float_frame):
         renamed = float_frame.rename(columns={"C": "foo"}, copy=False)
-        renamed["foo"] = 1.0
+        renamed["foo"][:] = 1.0
         assert (float_frame["C"] == 1.0).all()
 
     def test_rename_inplace(self, float_frame):

diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
@@ -704,7 +704,7 @@ def test_identity_slice_returns_new_object(self):
         assert sliced_df is not original_df
 
         # should be a shallow copy
-        original_df["a"] = [4, 4, 4]
+        original_df.loc[:, "a"] = [4, 4, 4]
         assert (sliced_df["a"] == 4).all()
 
         original_series = Series([1, 2, 3, 4, 5, 6])
@@ -728,8 +728,8 @@ def test_series_indexing_zerodim_np_array(self):
         result = s.iloc[np.array(0)]
         assert result == 1
 
-    @pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/33457")
     def test_iloc_setitem_categorical_updates_inplace(self):
+        # GH#35417
         # Mixed dtype ensures we go through take_split_path in setitem_with_indexer
         cat = Categorical(["A", "B", "C"])
         df = DataFrame({1: cat, 2: [1, 2, 3]})

diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
@@ -902,7 +902,7 @@ def test_identity_slice_returns_new_object(self):
         assert original_df[:] is not original_df
 
         # should be a shallow copy
-        original_df["a"] = [4, 4, 4]
+        original_df["a"][:] = [4, 4, 4]
         assert (sliced_df["a"] == 4).all()
 
         # These should not return copies