diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 39ff807d6b1e4..51ddbdd4dbee6 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -91,6 +91,22 @@ API changes df df.dtypes +- ``SettingWithCopy`` raise/warnings (according to the option ``mode.chained_assignment``) will now be issued when setting a value on a sliced mixed-dtype DataFrame using chained-assignment. (:issue:`7845`) + + .. code-block:: python + + In [1]: df = DataFrame(np.arange(0,9), columns=['count']) + + In [2]: df['group'] = 'b' + + In [3]: df.iloc[0:5]['group'] = 'a' + /usr/local/bin/ipython:1: SettingWithCopyWarning: + A value is trying to be set on a copy of a slice from a DataFrame. + Try using .loc[row_indexer,col_indexer] = value instead + + See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy + + .. _whatsnew_0150.cat: Categoricals in Series/DataFrame diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 636dedfbeb7b7..4b8d13ce30355 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2075,13 +2075,12 @@ def _set_item(self, key, value): Add series to DataFrame in specified column. If series is a numpy-array (not a Series/TimeSeries), it must be the - same length as the DataFrame's index or an error will be thrown. + same length as the DataFrames index or an error will be thrown. - Series/TimeSeries will be conformed to the DataFrame's index to + Series/TimeSeries will be conformed to the DataFrames index to ensure homogeneity. """ - is_existing = key in self.columns self._ensure_valid_index(value) value = self._sanitize_column(key, value) NDFrame._set_item(self, key, value) @@ -2089,7 +2088,7 @@ def _set_item(self, key, value): # check if we are modifying a copy # try to set first as we want an invalid # value exeption to occur first - if is_existing: + if len(self): self._check_setitem_copy() def insert(self, loc, column, value, allow_duplicates=False): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 71da20af2ad43..cef18c5ad3c2b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1133,7 +1133,13 @@ def _slice(self, slobj, axis=0, typ=None): """ axis = self._get_block_manager_axis(axis) - return self._constructor(self._data.get_slice(slobj, axis=axis)) + result = self._constructor(self._data.get_slice(slobj, axis=axis)) + + # this could be a view + # but only in a single-dtyped view slicable case + is_copy = axis!=0 or result._is_view + result._set_is_copy(self, copy=is_copy) + return result def _set_item(self, key, value): self._data.set(key, value) @@ -1149,10 +1155,28 @@ def _set_is_copy(self, ref=None, copy=True): self.is_copy = None def _check_setitem_copy(self, stacklevel=4, t='setting'): - """ validate if we are doing a settitem on a chained copy. + """ + + validate if we are doing a settitem on a chained copy. If you call this function, be sure to set the stacklevel such that the - user will see the error *at the level of setting*""" + user will see the error *at the level of setting* + + It is technically possible to figure out that we are setting on + a copy even WITH a multi-dtyped pandas object. In other words, some blocks + may be views while other are not. Currently _is_view will ALWAYS return False + for multi-blocks to avoid having to handle this case. + + df = DataFrame(np.arange(0,9), columns=['count']) + df['group'] = 'b' + + # this technically need not raise SettingWithCopy if both are view (which is not + # generally guaranteed but is usually True + # however, this is in general not a good practice and we recommend using .loc + df.iloc[0:5]['group'] = 'a' + + """ + if self.is_copy: value = config.get_option('mode.chained_assignment') @@ -1170,14 +1194,18 @@ def _check_setitem_copy(self, stacklevel=4, t='setting'): pass if t == 'referant': - t = ("A value is trying to be set on a copy of a slice from a " + t = ("\n" + "A value is trying to be set on a copy of a slice from a " "DataFrame\n\n" - "See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy") + "See the the caveats in the documentation: " + "http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy") else: - t = ("A value is trying to be set on a copy of a slice from a " - "DataFrame.\nTry using .loc[row_index,col_indexer] = value " - "instead\n\n" - "See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy") + t = ("\n" + "A value is trying to be set on a copy of a slice from a " + "DataFrame.\n" + "Try using .loc[row_indexer,col_indexer] = value instead\n\n" + "See the the caveats in the documentation: " + "http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy") if value == 'raise': raise SettingWithCopyError(t) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 08d3fbe335f35..48c3b4ece1d95 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -25,7 +25,7 @@ notnull, _DATELIKE_DTYPES, is_numeric_dtype, is_timedelta64_dtype, is_datetime64_dtype, is_categorical_dtype) - +from pandas.core.config import option_context from pandas import _np_version_under1p7 import pandas.lib as lib from pandas.lib import Timestamp @@ -635,7 +635,9 @@ def apply(self, func, *args, **kwargs): @wraps(func) def f(g): - return func(g, *args, **kwargs) + # ignore SettingWithCopy here in case the user mutates + with option_context('mode.chained_assignment',None): + return func(g, *args, **kwargs) return self._python_apply_general(f) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index cad7b579aa554..98e8d4f88104f 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2519,6 +2519,14 @@ def is_view(self): """ return a boolean if we are a single block and are a view """ if len(self.blocks) == 1: return self.blocks[0].values.base is not None + + # It is technically possible to figure out which blocks are views + # e.g. [ b.values.base is not None for b in self.blocks ] + # but then we have the case of possibly some blocks being a view + # and some blocks not. setting in theory is possible on the non-view + # blocks w/o causing a SettingWithCopy raise/warn. But this is a bit + # complicated + return False def get_bool_data(self, copy=False): diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 0dd729d58f174..c5cacda17edba 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -418,8 +418,12 @@ def test_setitem(self): self.frame['col8'] = 'foo' assert((self.frame['col8'] == 'foo').all()) + # this is partially a view (e.g. some blocks are view) + # so raise/warn smaller = self.frame[:2] - smaller['col10'] = ['1', '2'] + def f(): + smaller['col10'] = ['1', '2'] + self.assertRaises(com.SettingWithCopyError, f) self.assertEqual(smaller['col10'].dtype, np.object_) self.assertTrue((smaller['col10'] == ['1', '2']).all()) @@ -830,8 +834,11 @@ def test_fancy_getitem_slice_mixed(self): self.assertEqual(sliced['D'].dtype, np.float64) # get view with single block + # setting it triggers setting with copy sliced = self.frame.ix[:, -3:] - sliced['C'] = 4. + def f(): + sliced['C'] = 4. + self.assertRaises(com.SettingWithCopyError, f) self.assertTrue((self.frame['C'] == 4).all()) def test_fancy_setitem_int_labels(self): @@ -1618,7 +1625,10 @@ def test_irow(self): assert_frame_equal(result, expected) # verify slice is view - result[2] = 0. + # setting it makes it raise/warn + def f(): + result[2] = 0. + self.assertRaises(com.SettingWithCopyError, f) exp_col = df[2].copy() exp_col[4:8] = 0. assert_series_equal(df[2], exp_col) @@ -1645,7 +1655,10 @@ def test_icol(self): assert_frame_equal(result, expected) # verify slice is view - result[8] = 0. + # and that we are setting a copy + def f(): + result[8] = 0. + self.assertRaises(com.SettingWithCopyError, f) self.assertTrue((df[8] == 0).all()) # list of integers diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index ea4d66074e65a..5adaacbeb9d29 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -13,6 +13,7 @@ from pandas.core.groupby import (SpecificationError, DataError, _nargsort, _lexsort_indexer) from pandas.core.series import Series +from pandas.core.config import option_context from pandas.util.testing import (assert_panel_equal, assert_frame_equal, assert_series_equal, assert_almost_equal, assert_index_equal, assertRaisesRegexp) @@ -2299,9 +2300,11 @@ def f(group): self.assertEqual(result['d'].dtype, np.float64) - for key, group in grouped: - res = f(group) - assert_frame_equal(res, result.ix[key]) + # this is by definition a mutating operation! + with option_context('mode.chained_assignment',None): + for key, group in grouped: + res = f(group) + assert_frame_equal(res, result.ix[key]) def test_groupby_wrong_multi_labels(self): from pandas import read_csv diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 0e962800fef08..64e9d18d0aa2f 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -3246,6 +3246,13 @@ def f(): df['column1'] = df['column1'] + 'c' str(df) + # from SO: http://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc + df = DataFrame(np.arange(0,9), columns=['count']) + df['group'] = 'b' + def f(): + df.iloc[0:5]['group'] = 'a' + self.assertRaises(com.SettingWithCopyError, f) + def test_float64index_slicing_bug(self): # GH 5557, related to slicing a float index ser = {256: 2321.0, 1: 78.0, 2: 2716.0, 3: 0.0, 4: 369.0, 5: 0.0, 6: 269.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 3536.0, 11: 0.0, 12: 24.0, 13: 0.0, 14: 931.0, 15: 0.0, 16: 101.0, 17: 78.0, 18: 9643.0, 19: 0.0, 20: 0.0, 21: 0.0, 22: 63761.0, 23: 0.0, 24: 446.0, 25: 0.0, 26: 34773.0, 27: 0.0, 28: 729.0, 29: 78.0, 30: 0.0, 31: 0.0, 32: 3374.0, 33: 0.0, 34: 1391.0, 35: 0.0, 36: 361.0, 37: 0.0, 38: 61808.0, 39: 0.0, 40: 0.0, 41: 0.0, 42: 6677.0, 43: 0.0, 44: 802.0, 45: 0.0, 46: 2691.0, 47: 0.0, 48: 3582.0, 49: 0.0, 50: 734.0, 51: 0.0, 52: 627.0, 53: 70.0, 54: 2584.0, 55: 0.0, 56: 324.0, 57: 0.0, 58: 605.0, 59: 0.0, 60: 0.0, 61: 0.0, 62: 3989.0, 63: 10.0, 64: 42.0, 65: 0.0, 66: 904.0, 67: 0.0, 68: 88.0, 69: 70.0, 70: 8172.0, 71: 0.0, 72: 0.0, 73: 0.0, 74: 64902.0, 75: 0.0, 76: 347.0, 77: 0.0, 78: 36605.0, 79: 0.0, 80: 379.0, 81: 70.0, 82: 0.0, 83: 0.0, 84: 3001.0, 85: 0.0, 86: 1630.0, 87: 7.0, 88: 364.0, 89: 0.0, 90: 67404.0, 91: 9.0, 92: 0.0, 93: 0.0, 94: 7685.0, 95: 0.0, 96: 1017.0, 97: 0.0, 98: 2831.0, 99: 0.0, 100: 2963.0, 101: 0.0, 102: 854.0, 103: 0.0, 104: 0.0, 105: 0.0, 106: 0.0, 107: 0.0, 108: 0.0, 109: 0.0, 110: 0.0, 111: 0.0, 112: 0.0, 113: 0.0, 114: 0.0, 115: 0.0, 116: 0.0, 117: 0.0, 118: 0.0, 119: 0.0, 120: 0.0, 121: 0.0, 122: 0.0, 123: 0.0, 124: 0.0, 125: 0.0, 126: 67744.0, 127: 22.0, 128: 264.0, 129: 0.0, 260: 197.0, 268: 0.0, 265: 0.0, 269: 0.0, 261: 0.0, 266: 1198.0, 267: 0.0, 262: 2629.0, 258: 775.0, 257: 0.0, 263: 0.0, 259: 0.0, 264: 163.0, 250: 10326.0, 251: 0.0, 252: 1228.0, 253: 0.0, 254: 2769.0, 255: 0.0} diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 9132fea089fe7..ada13d6f4bccb 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -228,9 +228,14 @@ def _all_key(key): if len(rows) > 0: margin = data[rows + values].groupby(rows).agg(aggfunc) cat_axis = 1 + for key, piece in table.groupby(level=0, axis=cat_axis): all_key = _all_key(key) + + # we are going to mutate this, so need to copy! + piece = piece.copy() piece[all_key] = margin[key] + table_pieces.append(piece) margin_keys.append(all_key) else: diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 4601ad0784562..df2f270346e20 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -1320,6 +1320,7 @@ def test_append_many(self): result = chunks[0].append(chunks[1:]) tm.assert_frame_equal(result, self.frame) + chunks[-1] = chunks[-1].copy() chunks[-1]['foo'] = 'bar' result = chunks[0].append(chunks[1:]) tm.assert_frame_equal(result.ix[:, self.frame.columns], self.frame) @@ -1673,7 +1674,7 @@ def test_join_dups(self): def test_handle_empty_objects(self): df = DataFrame(np.random.randn(10, 4), columns=list('abcd')) - baz = df[:5] + baz = df[:5].copy() baz['foo'] = 'bar' empty = df[5:5]