Skip to content

Backport PR #57174 on branch 2.2.x (BUG: Interchange protocol implementation allows non-string column names) #57203

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ Bug fixes
~~~~~~~~~
- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for Nullable integers (:issue:`55069`)
- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for empty inputs (:issue:`56700`)
- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which wasn't converting column names to strings (:issue:`56701`)
- Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`)

.. ---------------------------------------------------------------------------
Expand Down
8 changes: 8 additions & 0 deletions pandas/core/interchange/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,14 @@ def __init__(self, column: pd.Series, allow_copy: bool = True) -> None:
Note: doesn't deal with extension arrays yet, just assume a regular
Series/ndarray for now.
"""
if isinstance(column, pd.DataFrame):
raise TypeError(
"Expected a Series, got a DataFrame. This likely happened "
"because you called __dataframe__ on a DataFrame which, "
"after converting column names to string, resulted in duplicated "
f"names: {column.columns}. Please rename these columns before "
"using the interchange protocol."
)
if not isinstance(column, pd.Series):
raise NotImplementedError(f"Columns of type {type(column)} not handled yet")

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/interchange/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def __init__(self, df: DataFrame, allow_copy: bool = True) -> None:
Constructor - an instance of this (private) class is returned from
`pd.DataFrame.__dataframe__`.
"""
self._df = df
self._df = df.rename(columns=str, copy=False)
self._allow_copy = allow_copy

def __dataframe__(
Expand Down
26 changes: 24 additions & 2 deletions pandas/tests/interchange/test_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,6 @@ def test_missing_from_masked():
}
)

df2 = df.__dataframe__()

rng = np.random.default_rng(2)
dict_null = {col: rng.integers(low=0, high=len(df)) for col in df.columns}
for col, num_nulls in dict_null.items():
Expand Down Expand Up @@ -382,6 +380,30 @@ def test_large_string():
tm.assert_frame_equal(result, expected)


def test_non_str_names():
# https://github.com/pandas-dev/pandas/issues/56701
df = pd.Series([1, 2, 3], name=0).to_frame()
names = df.__dataframe__().column_names()
assert names == ["0"]


def test_non_str_names_w_duplicates():
    """Check that labels colliding after stringification raise ``TypeError``.

    The frame uses both ``"0"`` and ``0`` as column labels. Converting its
    interchange object back with ``from_dataframe`` is expected to fail with
    the message matched below.
    """
    # https://github.com/pandas-dev/pandas/issues/56701
    colliding = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]})
    # Built outside the ``raises`` context: only the conversion back is
    # expected to raise, not the creation of the interchange object.
    interchange_obj = colliding.__dataframe__()
    expected_msg = (
        "Expected a Series, got a DataFrame. This likely happened because you "
        "called __dataframe__ on a DataFrame which, after converting column "
        r"names to string, resulted in duplicated names: Index\(\['0', '0'\], "
        r"dtype='object'\). Please rename these columns before using the "
        "interchange protocol."
    )
    with pytest.raises(TypeError, match=expected_msg):
        pd.api.interchange.from_dataframe(interchange_obj, allow_copy=False)


@pytest.mark.parametrize(
"dtype", ["Int8", pytest.param("Int8[pyarrow]", marks=td.skip_if_no("pyarrow"))]
)
Expand Down