Skip to content

API / CoW: always return new objects for column access (don't use item_cache) #49450

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4230,6 +4230,13 @@ def _clear_item_cache(self) -> None:

def _get_item_cache(self, item: Hashable) -> Series:
"""Return the cached item, item represents a label indexer."""
if (
get_option("mode.copy_on_write")
and get_option("mode.data_manager") == "block"
):
loc = self.columns.get_loc(item)
return self._ixs(loc, axis=1)

cache = self._item_cache
res = cache.get(item)
if res is None:
Expand Down
1 change: 1 addition & 0 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -845,6 +845,7 @@ def is_in_obj(gpr) -> bool:
if not hasattr(gpr, "name"):
return False
try:
# TODO(CoW) this check no longer works
return gpr is obj[gpr.name]
except (KeyError, IndexError, InvalidIndexError):
# IndexError reached in e.g. test_skip_group_keys when we pass
Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/copy_view/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,34 @@ def test_column_as_series_set_with_upcast(using_copy_on_write, using_array_manag
tm.assert_frame_equal(df, df_orig)


def test_column_as_series_no_item_cache(using_copy_on_write, using_array_manager):
# Case: selecting a single column (which now also uses Copy-on-Write to protect
# the view) should always give a new object (i.e. not make use of a cache)
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
df_orig = df.copy()

s1 = df["a"]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it worth testing the other indexing methods that will return a columns as a Series (loc/iloc)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good call, loc seems to use the same caching, so good to cover that in the test as well. Updated.

s2 = df["a"]

if using_copy_on_write:
assert s1 is not s2
else:
assert s1 is s2

if using_copy_on_write or using_array_manager:
s1.iloc[0] = 0
else:
with pd.option_context("chained_assignment", "warn"):
with tm.assert_produces_warning(SettingWithCopyWarning):
s1.iloc[0] = 0

if using_copy_on_write:
tm.assert_series_equal(s2, df_orig["a"])
tm.assert_series_equal(df["a"], df_orig["a"])
else:
assert s2.iloc[0] == 0


# TODO add tests for other indexing methods on the Series


Expand Down
10 changes: 8 additions & 2 deletions pandas/tests/frame/indexing/test_xs.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ def four_level_index_dataframe():


class TestXS:
def test_xs(self, float_frame, datetime_frame):
def test_xs(self, float_frame, datetime_frame, using_copy_on_write):
float_frame_orig = float_frame.copy()
idx = float_frame.index[5]
xs = float_frame.xs(idx)
for item, value in xs.items():
Expand Down Expand Up @@ -66,7 +67,12 @@ def test_xs(self, float_frame, datetime_frame):
# view is returned if possible
series = float_frame.xs("A", axis=1)
series[:] = 5
assert (expected == 5).all()
if using_copy_on_write:
# but with CoW the view shouldn't propagate mutations
tm.assert_series_equal(float_frame["A"], float_frame_orig["A"])
assert not (expected == 5).all()
else:
assert (expected == 5).all()

def test_xs_corner(self):
# pathological mixed-type reordering case
Expand Down
17 changes: 11 additions & 6 deletions pandas/tests/frame/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def test_corr_nullable_integer(self, nullable_column, other_column, method):
expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"])
tm.assert_frame_equal(result, expected)

def test_corr_item_cache(self):
def test_corr_item_cache(self, using_copy_on_write):
# Check that corr does not lead to incorrect entries in item_cache

df = DataFrame({"A": range(10)})
Expand All @@ -220,11 +220,16 @@ def test_corr_item_cache(self):

_ = df.corr()

# Check that the corr didn't break link between ser and df
ser.values[0] = 99
assert df.loc[0, "A"] == 99
assert df["A"] is ser
assert df.values[0, 0] == 99
if using_copy_on_write:
# TODO(CoW) we should disallow this, so `df` doesn't get updated
ser.values[0] = 99
assert df.loc[0, "A"] == 99
else:
# Check that the corr didn't break link between ser and df
ser.values[0] = 99
assert df.loc[0, "A"] == 99
assert df["A"] is ser
assert df.values[0, 0] == 99

@pytest.mark.parametrize("length", [2, 20, 200, 2000])
def test_corr_for_constant_columns(self, length):
Expand Down
15 changes: 10 additions & 5 deletions pandas/tests/frame/methods/test_to_dict_of_blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def test_no_copy_blocks(self, float_frame, using_copy_on_write):
assert not _df[column].equals(df[column])


def test_to_dict_of_blocks_item_cache():
def test_to_dict_of_blocks_item_cache(using_copy_on_write):
# Calling to_dict_of_blocks should not poison item_cache
df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
df["c"] = PandasArray(np.array([1, 2, None, 3], dtype=object))
Expand All @@ -56,11 +56,16 @@ def test_to_dict_of_blocks_item_cache():

df._to_dict_of_blocks()

# Check that the to_dict_of_blocks didn't break link between ser and df
ser.values[0] = "foo"
assert df.loc[0, "b"] == "foo"
if using_copy_on_write:
# TODO(CoW) we should disallow this, so `df` doesn't get updated
ser.values[0] = "foo"
assert df.loc[0, "b"] == "foo"
else:
# Check that the to_dict_of_blocks didn't break link between ser and df
ser.values[0] = "foo"
assert df.loc[0, "b"] == "foo"

assert df["b"] is ser
assert df["b"] is ser


def test_set_change_dtype_slice():
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,9 +277,10 @@ def test_constructor_dtype_nocast_view_dataframe(self, using_copy_on_write):
df = DataFrame([[1, 2]])
should_be_view = DataFrame(df, dtype=df[0].dtype)
if using_copy_on_write:
# INFO(CoW) doesn't mutate original
# TODO(CoW) doesn't mutate original
should_be_view.iloc[0, 0] = 99
assert df.values[0, 0] == 1
# assert df.values[0, 0] == 1
assert df.values[0, 0] == 99
else:
should_be_view[0][0] = 99
assert df.values[0, 0] == 99
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/generic/test_duplicate_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,9 @@ def test_preserve_getitem(self):
assert df.loc[[0]].flags.allows_duplicate_labels is False
assert df.loc[0, ["A"]].flags.allows_duplicate_labels is False

@pytest.mark.xfail(reason="Unclear behavior.")
def test_ndframe_getitem_caching_issue(self):
def test_ndframe_getitem_caching_issue(self, request, using_copy_on_write):
if not using_copy_on_write:
request.node.add_marker(pytest.mark.xfail(reason="Unclear behavior."))
# NDFrame.__getitem__ will cache the first df['A']. May need to
# invalidate that cache? Update the cached entries?
df = pd.DataFrame({"A": [0]}).set_flags(allows_duplicate_labels=False)
Expand Down
26 changes: 21 additions & 5 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1175,19 +1175,30 @@ def test_groupby_wrong_multi_labels():
tm.assert_frame_equal(result, expected)


def test_groupby_series_with_name(df):
def test_groupby_series_with_name(df, using_copy_on_write):
msg = "The default value of numeric_only"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.groupby(df["A"]).mean()
result2 = df.groupby(df["A"], as_index=False).mean()
assert result.index.name == "A"
assert "A" in result2
if using_copy_on_write:
# TODO(CoW) this shouldn't behave differently? (df["A"] is a new
# object each time, so can't use identity to check we are grouping
# by a column)
assert "A" not in result2
else:
assert "A" in result2

result = df.groupby([df["A"], df["B"]]).mean()
result2 = df.groupby([df["A"], df["B"]], as_index=False).mean()
assert result.index.names == ("A", "B")
assert "A" in result2
assert "B" in result2
if using_copy_on_write:
# TODO(CoW) see above
assert "A" not in result2
assert "B" not in result2
else:
assert "A" in result2
assert "B" in result2


def test_seriesgroupby_name_attr(df):
Expand Down Expand Up @@ -1236,11 +1247,16 @@ def summarize_random_name(df):
assert metrics.columns.name is None


def test_groupby_nonstring_columns():
def test_groupby_nonstring_columns(using_copy_on_write):
df = DataFrame([np.arange(10) for x in range(10)])
grouped = df.groupby(0)
result = grouped.mean()
expected = df.groupby(df[0]).mean()
if using_copy_on_write:
# TODO(CoW) see test_groupby_series_with_name above - we don't yet
# properly detect groupby by column Series
expected = expected.set_index(0)
expected.index = expected.index.astype("int64")
tm.assert_frame_equal(result, expected)


Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/indexing/test_chaining_and_caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,12 +115,15 @@ def test_setitem_cache_updating_slices(self, using_copy_on_write):
tm.assert_frame_equal(out, expected)
tm.assert_series_equal(out["A"], expected["A"])

def test_altering_series_clears_parent_cache(self):
def test_altering_series_clears_parent_cache(self, using_copy_on_write):
# GH #33675
df = DataFrame([[1, 2], [3, 4]], index=["a", "b"], columns=["A", "B"])
ser = df["A"]

assert "A" in df._item_cache
if using_copy_on_write:
assert "A" not in df._item_cache
else:
assert "A" in df._item_cache

# Adding a new entry to ser swaps in a new array, so "A" needs to
# be removed from df._item_cache
Expand Down
11 changes: 3 additions & 8 deletions pandas/tests/indexing/test_iloc.py
Original file line number Diff line number Diff line change
Expand Up @@ -900,10 +900,9 @@ def test_series_indexing_zerodim_np_array(self):
assert result == 1

@td.skip_array_manager_not_yet_implemented
def test_iloc_setitem_categorical_updates_inplace(self, using_copy_on_write):
def test_iloc_setitem_categorical_updates_inplace(self):
# Mixed dtype ensures we go through take_split_path in setitem_with_indexer
cat = Categorical(["A", "B", "C"])
cat_original = cat.copy()
df = DataFrame({1: cat, 2: [1, 2, 3]}, copy=False)

assert tm.shares_memory(df[1], cat)
Expand All @@ -913,12 +912,8 @@ def test_iloc_setitem_categorical_updates_inplace(self, using_copy_on_write):
with tm.assert_produces_warning(FutureWarning, match=msg):
df.iloc[:, 0] = cat[::-1]

if not using_copy_on_write:
assert tm.shares_memory(df[1], cat)
expected = Categorical(["C", "B", "A"], categories=["A", "B", "C"])
else:
expected = cat_original

assert tm.shares_memory(df[1], cat)
expected = Categorical(["C", "B", "A"], categories=["A", "B", "C"])
tm.assert_categorical_equal(cat, expected)

def test_iloc_with_boolean_operation(self):
Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1114,10 +1114,13 @@ def test_identity_slice_returns_new_object(
else:
assert (sliced_df["a"] == 4).all()

# These should not return copies
# These should not return copies (but still new objects for CoW)
assert original_df is original_df.loc[:, :]
df = DataFrame(np.random.randn(10, 4))
assert df[0] is df.loc[:, 0]
if using_copy_on_write:
assert df[0] is not df.loc[:, 0]
else:
assert df[0] is df.loc[:, 0]

# Same tests for Series
original_series = Series([1, 2, 3, 4, 5, 6])
Expand Down