Skip to content

ENH: add from_dummies #31795

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ Other enhancements

- :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`)
- When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`)
- We have added a :meth:`pandas.from_dummies`, which is an inverse transformation of :meth:`pandas.get_dummies` (:issue:`8745`)
- `OptionError` is now exposed in `pandas.errors` (:issue:`27553`)
-

Expand Down
1 change: 1 addition & 0 deletions pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@
get_dummies,
cut,
qcut,
from_dummies,
)

import pandas.api
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/reshape/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@
from pandas.core.reshape.melt import lreshape, melt, wide_to_long
from pandas.core.reshape.merge import merge, merge_asof, merge_ordered
from pandas.core.reshape.pivot import crosstab, pivot, pivot_table
from pandas.core.reshape.reshape import get_dummies
from pandas.core.reshape.reshape import from_dummies, get_dummies
from pandas.core.reshape.tile import cut, qcut
172 changes: 157 additions & 15 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from functools import partial
import itertools
from typing import List, Optional, Union
from typing import List, Optional, Set, Union

import numpy as np

Expand Down Expand Up @@ -751,6 +751,159 @@ def _convert_level_number(level_num, columns):
return result


def from_dummies(data, prefix=None, prefix_sep="_", dtype="category") -> "DataFrame":
"""
The inverse transformation of ``pandas.get_dummies``.

.. versionadded:: 1.1.0

Parameters
----------
data : DataFrame
Data which contains dummy indicators.
prefix : list-like, default None
How to name the decoded groups of columns. If there are columns
containing `prefix_sep`, then the part of their name preceding
`prefix_sep` will be used (see examples below).
prefix_sep : str, default '_'
Separator between original column name and dummy variable.
dtype : dtype, default 'category'
Data dtype for new columns - only a single data type is allowed.

Returns
-------
DataFrame
Decoded data.

See Also
--------
get_dummies : The inverse operation.

Examples
--------
Say we have a dataframe where some variables have been dummified:

>>> df = pd.DataFrame(
... {
... "baboon": [0, 0, 1],
... "lemur": [0, 1, 0],
... "zebra": [1, 0, 0],
... }
... )
>>> df
baboon lemur zebra
0 0 0 1
1 0 1 0
2 1 0 0

We can recover the original dataframe using `from_dummies`:

>>> pd.from_dummies(df, prefix='animal')
animal
0 zebra
1 lemur
2 baboon

If our dataframe already has columns with `prefix_sep` in them,
we don't need to pass in the `prefix` argument:

>>> df = pd.DataFrame(
... {
... "animal_baboon": [0, 0, 1],
... "animal_lemur": [0, 1, 0],
... "animal_zebra": [1, 0, 0],
... "other": ['a', 'b', 'c'],
... }
... )
>>> df
animal_baboon animal_lemur animal_zebra other
0 0 0 1 a
1 0 1 0 b
2 1 0 0 c

>>> pd.from_dummies(df)
other animal
0 a zebra
1 b lemur
2 c baboon
"""
if dtype is None:
dtype = "category"

columns_to_decode = [i for i in data.columns if prefix_sep in i]
if not columns_to_decode:
if prefix is None:
raise ValueError(
"If no columns contain `prefix_sep`, you must"
" pass a value to `prefix` with which to name"
" the decoded columns."
)
# If no column contains `prefix_sep`, we add `prefix`_`prefix_sep` to
# each column.
out = data.rename(columns=lambda x: f"{prefix}{prefix_sep}{x}").copy()
columns_to_decode = out.columns
else:
out = data.copy()

data_to_decode = out[columns_to_decode]

if prefix is None:
# If no prefix has been passed, extract it from columns containing
# `prefix_sep`
seen: Set[str] = set()
prefix = []
for i in columns_to_decode:
i = i.split(prefix_sep)[0]
if i in seen:
continue
seen.add(i)
prefix.append(i)
elif isinstance(prefix, str):
prefix = [prefix]

# Check each row sums to 1 or 0
def _validate_values(data):
if not all(i in [0, 1] for i in data.sum(axis=1).unique().tolist()):
raise ValueError(
"Data cannot be decoded! Each row must contain only 0s and"
" 1s, and each row may have at most one 1."
)

for prefix_ in prefix:
cols, labels = (
[
i.replace(x, "")
for i in data_to_decode.columns
if prefix_ + prefix_sep in i
]
for x in ["", prefix_ + prefix_sep]
)
if not cols:
continue
_validate_values(data_to_decode[cols])
out = out.drop(cols, axis=1)
out[prefix_] = Series(
np.array(labels)[np.argmax(data_to_decode[cols].to_numpy(), axis=1)],
dtype=dtype,
)
return out


def _check_len(item, name, data_to_encode):
""" Validate prefixes and separator to avoid silently dropping cols. """
len_msg = (
"Length of '{name}' ({len_item}) did not match the "
"length of the columns being encoded ({len_enc})."
)

if is_list_like(item):
if not len(item) == data_to_encode.shape[1]:
len_msg = len_msg.format(
name=name, len_item=len(item), len_enc=data_to_encode.shape[1]
)
raise ValueError(len_msg)


def get_dummies(
data,
prefix=None,
Expand Down Expand Up @@ -801,6 +954,7 @@ def get_dummies(
See Also
--------
Series.str.get_dummies : Convert Series to dummy codes.
from_dummies : The inverse operation.

Examples
--------
Expand Down Expand Up @@ -871,20 +1025,8 @@ def get_dummies(
else:
data_to_encode = data[columns]

# validate prefixes and separator to avoid silently dropping cols
def check_len(item, name):

if is_list_like(item):
if not len(item) == data_to_encode.shape[1]:
len_msg = (
f"Length of '{name}' ({len(item)}) did not match the "
"length of the columns being encoded "
f"({data_to_encode.shape[1]})."
)
raise ValueError(len_msg)

check_len(prefix, "prefix")
check_len(prefix_sep, "prefix_sep")
_check_len(prefix, "prefix", data_to_encode)
_check_len(prefix_sep, "prefix_sep", data_to_encode)

if isinstance(prefix, str):
prefix = itertools.cycle([prefix])
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/api/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ class TestPDApi(Base):
"eval",
"factorize",
"get_dummies",
"from_dummies",
"infer_freq",
"isna",
"isnull",
Expand Down
62 changes: 62 additions & 0 deletions pandas/tests/reshape/test_from_dummies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import pytest

import pandas as pd
import pandas._testing as tm


@pytest.mark.parametrize(
"dtype, expected_dict",
[
("str", {"col1": ["a", "a", "b"]}),
(str, {"col1": ["a", "a", "b"]},),
(None, {"col1": ["a", "a", "b"]}),
],
)
def test_dtype(dtype, expected_dict):
df = pd.DataFrame({"col1_a": [1, 1, 0], "col1_b": [0, 0, 1]})
result = pd.from_dummies(df, dtype=dtype)
expected = pd.DataFrame(expected_dict)
if dtype is None:
expected = expected.astype("category")
tm.assert_frame_equal(result, expected)


def test_malformed():
df = pd.DataFrame({"col1_a": [1, 1, 0], "col1_b": [1, 0, 1]})
msg = (
"Data cannot be decoded! Each row must contain only 0s and 1s"
", and each row may have at most one 1"
)
with pytest.raises(ValueError, match=msg):
pd.from_dummies(df)


@pytest.mark.parametrize(
"prefix_sep, input_dict",
[
("_", {"col1_a": [1, 1, 0], "col1_b": [0, 0, 1]}),
("*", {"col1*a": [1, 1, 0], "col1*b": [0, 0, 1]}),
(".", {"col1.a": [1, 1, 0], "col1.b": [0, 0, 1]}),
],
)
def test_prefix_sep(prefix_sep, input_dict):
df = pd.DataFrame(input_dict)
result = pd.from_dummies(df, prefix_sep=prefix_sep)
expected = pd.DataFrame({"col1": ["a", "a", "b"]}, dtype="category")
tm.assert_frame_equal(result, expected)


def test_no_prefix():
df = pd.DataFrame({"a": [1, 1, 0], "b": [0, 0, 1]})
result = pd.from_dummies(df, prefix="letter")
expected = pd.DataFrame({"letter": ["a", "a", "b"]}, dtype="category")
tm.assert_frame_equal(result, expected)


def test_multiple_columns():
df = pd.DataFrame(
{"col1_a": [1, 0], "col1_b": [0, 1], "col2_a": [0, 0], "col2_c": [1, 1]}
)
result = pd.from_dummies(df)
expected = pd.DataFrame({"col1": ["a", "b"], "col2": ["c", "c"]}, dtype="category")
tm.assert_frame_equal(result, expected)