Skip to content

COMPAT: SettingWithCopy will now warn when slices which can generate views are then set #7845

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 26, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,22 @@ API changes
df
df.dtypes

- ``SettingWithCopy`` raise/warnings (according to the option ``mode.chained_assignment``) will now be issued when setting a value on a sliced mixed-dtype DataFrame using chained-assignment. (:issue:`7845`)

.. code-block:: python

In [1]: df = DataFrame(np.arange(0,9), columns=['count'])

In [2]: df['group'] = 'b'

In [3]: df.iloc[0:5]['group'] = 'a'
/usr/local/bin/ipython:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


.. _whatsnew_0150.cat:

Categoricals in Series/DataFrame
Expand Down
7 changes: 3 additions & 4 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2075,21 +2075,20 @@ def _set_item(self, key, value):
Add series to DataFrame in specified column.

If series is a numpy-array (not a Series/TimeSeries), it must be the
same length as the DataFrame's index or an error will be thrown.
same length as the DataFrames index or an error will be thrown.

Series/TimeSeries will be conformed to the DataFrame's index to
Series/TimeSeries will be conformed to the DataFrames index to
ensure homogeneity.
"""

is_existing = key in self.columns
self._ensure_valid_index(value)
value = self._sanitize_column(key, value)
NDFrame._set_item(self, key, value)

# check if we are modifying a copy
# try to set first as we want an invalid
# value exeption to occur first
if is_existing:
if len(self):
self._check_setitem_copy()

def insert(self, loc, column, value, allow_duplicates=False):
Expand Down
46 changes: 37 additions & 9 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1133,7 +1133,13 @@ def _slice(self, slobj, axis=0, typ=None):

"""
axis = self._get_block_manager_axis(axis)
return self._constructor(self._data.get_slice(slobj, axis=axis))
result = self._constructor(self._data.get_slice(slobj, axis=axis))

# this could be a view
# but only in a single-dtyped view slicable case
is_copy = axis!=0 or result._is_view
result._set_is_copy(self, copy=is_copy)
return result

def _set_item(self, key, value):
self._data.set(key, value)
Expand All @@ -1149,10 +1155,28 @@ def _set_is_copy(self, ref=None, copy=True):
self.is_copy = None

def _check_setitem_copy(self, stacklevel=4, t='setting'):
""" validate if we are doing a settitem on a chained copy.
"""

validate if we are doing a settitem on a chained copy.

If you call this function, be sure to set the stacklevel such that the
user will see the error *at the level of setting*"""
user will see the error *at the level of setting*

It is technically possible to figure out that we are setting on
a copy even WITH a multi-dtyped pandas object. In other words, some blocks
may be views while other are not. Currently _is_view will ALWAYS return False
for multi-blocks to avoid having to handle this case.

df = DataFrame(np.arange(0,9), columns=['count'])
df['group'] = 'b'

# this technically need not raise SettingWithCopy if both are view (which is not
# generally guaranteed but is usually True
# however, this is in general not a good practice and we recommend using .loc
df.iloc[0:5]['group'] = 'a'

"""

if self.is_copy:

value = config.get_option('mode.chained_assignment')
Expand All @@ -1170,14 +1194,18 @@ def _check_setitem_copy(self, stacklevel=4, t='setting'):
pass

if t == 'referant':
t = ("A value is trying to be set on a copy of a slice from a "
t = ("\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame\n\n"
"See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy")
"See the the caveats in the documentation: "
"http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy")
else:
t = ("A value is trying to be set on a copy of a slice from a "
"DataFrame.\nTry using .loc[row_index,col_indexer] = value "
"instead\n\n"
"See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy")
t = ("\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame.\n"
"Try using .loc[row_indexer,col_indexer] = value instead\n\n"
"See the the caveats in the documentation: "
"http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy")

if value == 'raise':
raise SettingWithCopyError(t)
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
notnull, _DATELIKE_DTYPES, is_numeric_dtype,
is_timedelta64_dtype, is_datetime64_dtype,
is_categorical_dtype)

from pandas.core.config import option_context
from pandas import _np_version_under1p7
import pandas.lib as lib
from pandas.lib import Timestamp
Expand Down Expand Up @@ -635,7 +635,9 @@ def apply(self, func, *args, **kwargs):

@wraps(func)
def f(g):
return func(g, *args, **kwargs)
# ignore SettingWithCopy here in case the user mutates
with option_context('mode.chained_assignment',None):
return func(g, *args, **kwargs)

return self._python_apply_general(f)

Expand Down
8 changes: 8 additions & 0 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -2519,6 +2519,14 @@ def is_view(self):
""" return a boolean if we are a single block and are a view """
if len(self.blocks) == 1:
return self.blocks[0].values.base is not None

# It is technically possible to figure out which blocks are views
# e.g. [ b.values.base is not None for b in self.blocks ]
# but then we have the case of possibly some blocks being a view
# and some blocks not. setting in theory is possible on the non-view
# blocks w/o causing a SettingWithCopy raise/warn. But this is a bit
# complicated

return False

def get_bool_data(self, copy=False):
Expand Down
21 changes: 17 additions & 4 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,8 +418,12 @@ def test_setitem(self):
self.frame['col8'] = 'foo'
assert((self.frame['col8'] == 'foo').all())

# this is partially a view (e.g. some blocks are view)
# so raise/warn
smaller = self.frame[:2]
smaller['col10'] = ['1', '2']
def f():
smaller['col10'] = ['1', '2']
self.assertRaises(com.SettingWithCopyError, f)
self.assertEqual(smaller['col10'].dtype, np.object_)
self.assertTrue((smaller['col10'] == ['1', '2']).all())

Expand Down Expand Up @@ -830,8 +834,11 @@ def test_fancy_getitem_slice_mixed(self):
self.assertEqual(sliced['D'].dtype, np.float64)

# get view with single block
# setting it triggers setting with copy
sliced = self.frame.ix[:, -3:]
sliced['C'] = 4.
def f():
sliced['C'] = 4.
self.assertRaises(com.SettingWithCopyError, f)
self.assertTrue((self.frame['C'] == 4).all())

def test_fancy_setitem_int_labels(self):
Expand Down Expand Up @@ -1618,7 +1625,10 @@ def test_irow(self):
assert_frame_equal(result, expected)

# verify slice is view
result[2] = 0.
# setting it makes it raise/warn
def f():
result[2] = 0.
self.assertRaises(com.SettingWithCopyError, f)
exp_col = df[2].copy()
exp_col[4:8] = 0.
assert_series_equal(df[2], exp_col)
Expand All @@ -1645,7 +1655,10 @@ def test_icol(self):
assert_frame_equal(result, expected)

# verify slice is view
result[8] = 0.
# and that we are setting a copy
def f():
result[8] = 0.
self.assertRaises(com.SettingWithCopyError, f)
self.assertTrue((df[8] == 0).all())

# list of integers
Expand Down
9 changes: 6 additions & 3 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from pandas.core.groupby import (SpecificationError, DataError,
_nargsort, _lexsort_indexer)
from pandas.core.series import Series
from pandas.core.config import option_context
from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
assert_series_equal, assert_almost_equal,
assert_index_equal, assertRaisesRegexp)
Expand Down Expand Up @@ -2299,9 +2300,11 @@ def f(group):

self.assertEqual(result['d'].dtype, np.float64)

for key, group in grouped:
res = f(group)
assert_frame_equal(res, result.ix[key])
# this is by definition a mutating operation!
with option_context('mode.chained_assignment',None):
for key, group in grouped:
res = f(group)
assert_frame_equal(res, result.ix[key])

def test_groupby_wrong_multi_labels(self):
from pandas import read_csv
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3246,6 +3246,13 @@ def f():
df['column1'] = df['column1'] + 'c'
str(df)

# from SO: http://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc
df = DataFrame(np.arange(0,9), columns=['count'])
df['group'] = 'b'
def f():
df.iloc[0:5]['group'] = 'a'
self.assertRaises(com.SettingWithCopyError, f)

def test_float64index_slicing_bug(self):
# GH 5557, related to slicing a float index
ser = {256: 2321.0, 1: 78.0, 2: 2716.0, 3: 0.0, 4: 369.0, 5: 0.0, 6: 269.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 3536.0, 11: 0.0, 12: 24.0, 13: 0.0, 14: 931.0, 15: 0.0, 16: 101.0, 17: 78.0, 18: 9643.0, 19: 0.0, 20: 0.0, 21: 0.0, 22: 63761.0, 23: 0.0, 24: 446.0, 25: 0.0, 26: 34773.0, 27: 0.0, 28: 729.0, 29: 78.0, 30: 0.0, 31: 0.0, 32: 3374.0, 33: 0.0, 34: 1391.0, 35: 0.0, 36: 361.0, 37: 0.0, 38: 61808.0, 39: 0.0, 40: 0.0, 41: 0.0, 42: 6677.0, 43: 0.0, 44: 802.0, 45: 0.0, 46: 2691.0, 47: 0.0, 48: 3582.0, 49: 0.0, 50: 734.0, 51: 0.0, 52: 627.0, 53: 70.0, 54: 2584.0, 55: 0.0, 56: 324.0, 57: 0.0, 58: 605.0, 59: 0.0, 60: 0.0, 61: 0.0, 62: 3989.0, 63: 10.0, 64: 42.0, 65: 0.0, 66: 904.0, 67: 0.0, 68: 88.0, 69: 70.0, 70: 8172.0, 71: 0.0, 72: 0.0, 73: 0.0, 74: 64902.0, 75: 0.0, 76: 347.0, 77: 0.0, 78: 36605.0, 79: 0.0, 80: 379.0, 81: 70.0, 82: 0.0, 83: 0.0, 84: 3001.0, 85: 0.0, 86: 1630.0, 87: 7.0, 88: 364.0, 89: 0.0, 90: 67404.0, 91: 9.0, 92: 0.0, 93: 0.0, 94: 7685.0, 95: 0.0, 96: 1017.0, 97: 0.0, 98: 2831.0, 99: 0.0, 100: 2963.0, 101: 0.0, 102: 854.0, 103: 0.0, 104: 0.0, 105: 0.0, 106: 0.0, 107: 0.0, 108: 0.0, 109: 0.0, 110: 0.0, 111: 0.0, 112: 0.0, 113: 0.0, 114: 0.0, 115: 0.0, 116: 0.0, 117: 0.0, 118: 0.0, 119: 0.0, 120: 0.0, 121: 0.0, 122: 0.0, 123: 0.0, 124: 0.0, 125: 0.0, 126: 67744.0, 127: 22.0, 128: 264.0, 129: 0.0, 260: 197.0, 268: 0.0, 265: 0.0, 269: 0.0, 261: 0.0, 266: 1198.0, 267: 0.0, 262: 2629.0, 258: 775.0, 257: 0.0, 263: 0.0, 259: 0.0, 264: 163.0, 250: 10326.0, 251: 0.0, 252: 1228.0, 253: 0.0, 254: 2769.0, 255: 0.0}
Expand Down
5 changes: 5 additions & 0 deletions pandas/tools/pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,9 +228,14 @@ def _all_key(key):
if len(rows) > 0:
margin = data[rows + values].groupby(rows).agg(aggfunc)
cat_axis = 1

for key, piece in table.groupby(level=0, axis=cat_axis):
all_key = _all_key(key)

# we are going to mutate this, so need to copy!
piece = piece.copy()
piece[all_key] = margin[key]

table_pieces.append(piece)
margin_keys.append(all_key)
else:
Expand Down
3 changes: 2 additions & 1 deletion pandas/tools/tests/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1320,6 +1320,7 @@ def test_append_many(self):
result = chunks[0].append(chunks[1:])
tm.assert_frame_equal(result, self.frame)

chunks[-1] = chunks[-1].copy()
chunks[-1]['foo'] = 'bar'
result = chunks[0].append(chunks[1:])
tm.assert_frame_equal(result.ix[:, self.frame.columns], self.frame)
Expand Down Expand Up @@ -1673,7 +1674,7 @@ def test_join_dups(self):
def test_handle_empty_objects(self):
df = DataFrame(np.random.randn(10, 4), columns=list('abcd'))

baz = df[:5]
baz = df[:5].copy()
baz['foo'] = 'bar'
empty = df[5:5]

Expand Down