From 159d54e61c8bc2b9bf8e8543e373e39981356eae Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sun, 2 Feb 2020 13:22:21 +0000 Subject: [PATCH 01/16] first draft --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/__init__.py | 1 + pandas/core/reshape/api.py | 2 +- pandas/core/reshape/reshape.py | 148 ++++++++++++++++++++-- pandas/tests/mytest.py | 10 ++ pandas/tests/reshape/test_from_dummies.py | 63 +++++++++ 6 files changed, 210 insertions(+), 16 deletions(-) create mode 100644 pandas/tests/mytest.py create mode 100644 pandas/tests/reshape/test_from_dummies.py diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 13827e8fc4c33..68059bea56cd2 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -43,7 +43,7 @@ Other enhancements - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) - When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`) -- +- We have added a :meth:`pandas.from_dummies`, which is an inverse transformation of :meth:`pandas.get_dummies` (:issue:`8745`) - .. --------------------------------------------------------------------------- diff --git a/pandas/__init__.py b/pandas/__init__.py index d526531b159b2..1c379ab2544d3 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -135,6 +135,7 @@ get_dummies, cut, qcut, + from_dummies, ) import pandas.api diff --git a/pandas/core/reshape/api.py b/pandas/core/reshape/api.py index 3c76eef809c7a..7054926b9c0c4 100644 --- a/pandas/core/reshape/api.py +++ b/pandas/core/reshape/api.py @@ -4,5 +4,5 @@ from pandas.core.reshape.melt import lreshape, melt, wide_to_long from pandas.core.reshape.merge import merge, merge_asof, merge_ordered from pandas.core.reshape.pivot import crosstab, pivot, pivot_table -from pandas.core.reshape.reshape import get_dummies +from pandas.core.reshape.reshape import from_dummies, get_dummies from pandas.core.reshape.tile import cut, qcut diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 359e5b956f8a5..d6e4212bce60a 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -751,6 +751,138 @@ def _convert_level_number(level_num, columns): return result +def from_dummies(data, columns=None, prefix_sep="_", dtype="category", fill_first=None): + """ + The inverse transformation of ``pandas.get_dummies``. + + Parameters + ---------- + data : DataFrame + columns : list-like, default None + Column names in the DataFrame to be decoded. + If `columns` is None then all the columns will be converted. + prefix_sep : str, default '_' + Separator between original column name and dummy variable + dtype : dtype, default 'category' + Data dtype for new columns - only a single data type is allowed + fill_first : str, list, or dict, default None + Used to fill rows for which all the dummy variables are 0 + + Returns + ------- + transformed : DataFrame + + Examples + -------- + Say we have a dataframe where some variables have been dummified: + + >>> df = pd.DataFrame( + ... { + ... "animal_baboon": [0, 0, 1], + ... "animal_lemur": [0, 1, 0], + ... "animal_zebra": [1, 0, 0], + ... "other_col": ["a", "b", "c"], + ... } + ... 
) + >>> df + animal_baboon animal_lemur animal_zebra other_col + 0 0 0 1 a + 1 0 1 0 b + 2 1 0 0 c + + We can recover the original dataframe using `from_dummies`: + + >>> pd.from_dummies(df, columns=['animal']) + other_col animal + 0 a zebra + 1 b lemur + 2 c baboon + + Suppose our dataframe has one column from each dummified column + dropped: + + >>> df = df.drop('animal_zebra', axis=1) + >>> df + animal_baboon animal_lemur other_col + 0 0 0 a + 1 0 1 b + 2 1 0 c + + We can still recover the original dataframe, by using the argument + `fill_first`: + + >>> pd.from_dummies(df, columns=["animal"], fill_first=["zebra"]) + other_col animal + 0 a zebra + 1 b lemur + 2 c baboon + """ + if dtype is None: + dtype = "category" + + if columns is None: + data_to_decode = data.copy() + columns = data.columns.tolist() + columns = list( + {i.split(prefix_sep)[0] for i in data.columns if prefix_sep in i} + ) + + data_to_decode = data[ + [i for i in data.columns for c in columns if i.startswith(c + prefix_sep)] + ] + + # Check each row sums to 1 or 0 + if not all(i in [0, 1] for i in data_to_decode.sum(axis=1).unique().tolist()): + raise ValueError( + "Data cannot be decoded! Each row must contain only 0s and" + " 1s, and each row may have at most one 1" + ) + + if fill_first is None: + fill_first = [None] * len(columns) + elif isinstance(fill_first, str): + fill_first = itertools.cycle([fill_first]) + elif isinstance(fill_first, dict): + fill_first = [fill_first[col] for col in columns] + + out = data.copy() + for column, fill_first_ in zip(columns, fill_first): + cols, labels = [ + [ + i.replace(x, "") + for i in data_to_decode.columns + if column + prefix_sep in i + ] + for x in ["", column + prefix_sep] + ] + if not cols: + continue + out = out.drop(cols, axis=1) + if fill_first_: + cols = [column + prefix_sep + fill_first_] + cols + labels = [fill_first_] + labels + data[cols[0]] = (1 - data[cols[1:]]).all(axis=1) + out[column] = Series( + np.array(labels)[np.argmax(data[cols].to_numpy(), axis=1)], dtype=dtype + ) + return out + + +def _check_len(item, name, data_to_encode): + """ Validate prefixes and separator to avoid silently dropping cols. """ + len_msg = ( + "Length of '{name}' ({len_item}) did not match the " + "length of the columns being encoded ({len_enc})." + ) + + if is_list_like(item): + if not len(item) == data_to_encode.shape[1]: + len_msg = len_msg.format( + name=name, len_item=len(item), len_enc=data_to_encode.shape[1] + ) + raise ValueError(len_msg) + + def get_dummies( data, prefix=None, @@ -871,20 +1003,8 @@ def get_dummies( else: data_to_encode = data[columns] - # validate prefixes and separator to avoid silently dropping cols - def check_len(item, name): - - if is_list_like(item): - if not len(item) == data_to_encode.shape[1]: - len_msg = ( - f"Length of '{name}' ({len(item)}) did not match the " - "length of the columns being encoded " - f"({data_to_encode.shape[1]})." 
- ) - raise ValueError(len_msg) - - check_len(prefix, "prefix") - check_len(prefix_sep, "prefix_sep") + _check_len(prefix, "prefix", data_to_encode) + _check_len(prefix_sep, "prefix_sep", data_to_encode) if isinstance(prefix, str): prefix = itertools.cycle([prefix]) diff --git a/pandas/tests/mytest.py b/pandas/tests/mytest.py new file mode 100644 index 0000000000000..3e8916b75e977 --- /dev/null +++ b/pandas/tests/mytest.py @@ -0,0 +1,10 @@ +import pandas as pd + +def test_me(): + pd.eval( + """ + A = df.A - df.B + B = df.A + df.B + """, + target=pd.DataFrame(), + ) \ No newline at end of file diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py new file mode 100644 index 0000000000000..a416a9abecc3d --- /dev/null +++ b/pandas/tests/reshape/test_from_dummies.py @@ -0,0 +1,63 @@ +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize( + "dtype, expected_dict", + [ + ("str", {"col1": ["a", "a", "b"]}), + (str, {"col1": ["a", "a", "b"]},), + (None, {"col1": ["a", "a", "b"]}), + ], +) +def test_dtype(dtype, expected_dict): + df = pd.DataFrame({"col1_a": [1, 1, 0], "col1_b": [0, 0, 1]}) + result = pd.from_dummies(df, dtype=dtype) + expected = pd.DataFrame(expected_dict) + if dtype is None: + expected = expected.astype("category") + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "fill_first, expected_dict", + [ + ("a", {"col1": ["a", "a", "b"]}), + (["a"], {"col1": ["a", "a", "b"]}), + ({"col1": "a"}, {"col1": ["a", "a", "b"]}), + ], +) +def test_fill_first(fill_first, expected_dict): + df = pd.DataFrame({"col1_b": [0, 0, 1]}) + result = pd.from_dummies(df, fill_first=fill_first) + # get_dummies changes the ordering of columns, + # see https://github.com/pandas-dev/pandas/issues/17612 + expected = pd.DataFrame(expected_dict, dtype="category") + tm.assert_frame_equal(result, expected) + + +def test_malformed(): + df = pd.DataFrame({"col1_a": [1, 1, 0], "col1_b": [1, 0, 1]}) + msg = ( + "Data cannot be decoded! Each row must contain only 0s and 1s" + ", and each row may have at most one 1" + ) + with pytest.raises(ValueError, match=msg): + pd.from_dummies(df) + + +@pytest.mark.parametrize( + "prefix_sep, input_dict", + [ + ("_", {"col1_a": [1, 1, 0], "col1_b": [0, 0, 1]}), + ("*", {"col1*a": [1, 1, 0], "col1*b": [0, 0, 1]}), + (".", {"col1.a": [1, 1, 0], "col1.b": [0, 0, 1]}), + ], +) +def test_prefix_sep(prefix_sep, input_dict): + df = pd.DataFrame(input_dict) + result = pd.from_dummies(df, prefix_sep=prefix_sep) + expected = pd.DataFrame({"col1": ["a", "a", "b"]}, dtype="category") + tm.assert_frame_equal(result, expected) From 3bfafe6eab39c61745169abcdb9638911c4af132 Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sun, 9 Feb 2020 16:18:59 +0000 Subject: [PATCH 02/16] rename columns to prefix --- pandas/core/reshape/reshape.py | 36 ++++++++++++++++------------------ 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index d6e4212bce60a..e1e5b96f8b052 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -751,16 +751,16 @@ def _convert_level_number(level_num, columns): return result -def from_dummies(data, columns=None, prefix_sep="_", dtype="category", fill_first=None): +def from_dummies(data, prefix=None, prefix_sep="_", dtype="category", fill_first=None): """ The inverse transformation of ``pandas.get_dummies``. 
Parameters ---------- data : DataFrame - columns : list-like, default None - Column names in the DataFrame to be decoded. - If `columns` is None then all the columns will be converted. + prefix : list-like, default None + Prefixes of the columns in the DataFrame to be decoded. + If `prefix` is None then all the columns will be decoded. prefix_sep : str, default '_' Separator between original column name and dummy variable dtype : dtype, default 'category' @@ -792,7 +792,7 @@ def from_dummies(data, columns=None, prefix_sep="_", dtype="category", fill_firs We can recover the original dataframe using `from_dummies`: - >>> pd.from_dummies(df, columns=['animal']) + >>> pd.from_dummies(df, prefix=['animal']) other_col animal 0 a zebra 1 b lemur @@ -811,7 +811,7 @@ def from_dummies(data, columns=None, prefix_sep="_", dtype="category", fill_firs We can still recover the original dataframe, by using the argument `fill_first`: - >>> pd.from_dummies(df, columns=["animal"], fill_first=["zebra"]) + >>> pd.from_dummies(df, prefix=["animal"], fill_first=["zebra"]) other_col animal 0 a zebra 1 b lemur @@ -820,15 +820,13 @@ def from_dummies(data, columns=None, prefix_sep="_", dtype="category", fill_firs if dtype is None: dtype = "category" - if columns is None: + if prefix is None: data_to_decode = data.copy() - columns = data.columns.tolist() - columns = list( - {i.split(prefix_sep)[0] for i in data.columns if prefix_sep in i} - ) + prefix = data.columns.tolist() + prefix = list({i.split(prefix_sep)[0] for i in data.columns if prefix_sep in i}) data_to_decode = data[ - [i for i in data.columns for c in columns if i.startswith(c + prefix_sep)] + [i for i in data.columns for p in prefix if i.startswith(p + prefix_sep)] ] # Check each row sums to 1 or 0 @@ -839,30 +837,30 @@ def from_dummies(data, columns=None, prefix_sep="_", dtype="category", fill_firs ) if fill_first is None: - fill_first = [None] * len(columns) + fill_first = [None] * len(prefix) elif isinstance(fill_first, str): fill_first = itertools.cycle([fill_first]) elif isinstance(fill_first, dict): - fill_first = [fill_first[col] for col in columns] + fill_first = [fill_first[p] for p in prefix] out = data.copy() - for column, fill_first_ in zip(columns, fill_first): + for prefix_, fill_first_ in zip(prefix, fill_first): cols, labels = [ [ i.replace(x, "") for i in data_to_decode.columns - if column + prefix_sep in i + if prefix_ + prefix_sep in i ] - for x in ["", column + prefix_sep] + for x in ["", prefix_ + prefix_sep] ] if not cols: continue out = out.drop(cols, axis=1) if fill_first_: - cols = [column + prefix_sep + fill_first_] + cols + cols = [prefix_ + prefix_sep + fill_first_] + cols labels = [fill_first_] + labels data[cols[0]] = (1 - data[cols[1:]]).all(axis=1) - out[column] = Series( + out[prefix_] = Series( np.array(labels)[np.argmax(data[cols].to_numpy(), axis=1)], dtype=dtype ) return out From 15f68187ab5cb34d83368ea8c322d0a6d811dde7 Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sun, 9 Feb 2020 16:27:10 +0000 Subject: [PATCH 03/16] fix docstring --- pandas/core/reshape/reshape.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index e1e5b96f8b052..0bfc227ee86ee 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -751,26 +751,28 @@ def _convert_level_number(level_num, columns): return result -def from_dummies(data, prefix=None, prefix_sep="_", dtype="category", fill_first=None): +def from_dummies(data, 
prefix=None, prefix_sep="_", dtype="category", fill_first=None) -> "DataFrame": """ The inverse transformation of ``pandas.get_dummies``. Parameters ---------- data : DataFrame + Data which contains dummy indicators. prefix : list-like, default None Prefixes of the columns in the DataFrame to be decoded. If `prefix` is None then all the columns will be decoded. prefix_sep : str, default '_' - Separator between original column name and dummy variable + Separator between original column name and dummy variable. dtype : dtype, default 'category' - Data dtype for new columns - only a single data type is allowed + Data dtype for new columns - only a single data type is allowed. fill_first : str, list, or dict, default None - Used to fill rows for which all the dummy variables are 0 + Used to fill rows for which all the dummy variables are 0. Returns ------- - transformed : DataFrame + DataFrame + Decoded data. Examples -------- From 9eed071ba5c875094b34ac8b2474e3dca2cf6a16 Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sun, 9 Feb 2020 16:35:15 +0000 Subject: [PATCH 04/16] black --- pandas/core/reshape/reshape.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 0bfc227ee86ee..9255b8639490d 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -751,7 +751,9 @@ def _convert_level_number(level_num, columns): return result -def from_dummies(data, prefix=None, prefix_sep="_", dtype="category", fill_first=None) -> "DataFrame": +def from_dummies( + data, prefix=None, prefix_sep="_", dtype="category", fill_first=None +) -> "DataFrame": """ The inverse transformation of ``pandas.get_dummies``. From f5d90888e1d3382f38e3217d3c158148af8c35ae Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 14 Feb 2020 15:53:20 +0000 Subject: [PATCH 05/16] infer prefixes --- pandas/core/reshape/reshape.py | 138 ++++++++++++---------- pandas/tests/reshape/test_from_dummies.py | 29 ++--- 2 files changed, 88 insertions(+), 79 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 9255b8639490d..d67912f7c7668 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -752,7 +752,7 @@ def _convert_level_number(level_num, columns): def from_dummies( - data, prefix=None, prefix_sep="_", dtype="category", fill_first=None + data, prefix=None, prefix_sep="_", dtype="category" ) -> "DataFrame": """ The inverse transformation of ``pandas.get_dummies``. @@ -762,14 +762,13 @@ def from_dummies( data : DataFrame Data which contains dummy indicators. prefix : list-like, default None - Prefixes of the columns in the DataFrame to be decoded. - If `prefix` is None then all the columns will be decoded. + How to name the decoded groups of columns. If there are columns + containing `prefix_sep`, then the part of their name preceding + `prefix_sep` will be used (see examples below). prefix_sep : str, default '_' Separator between original column name and dummy variable. dtype : dtype, default 'category' Data dtype for new columns - only a single data type is allowed. - fill_first : str, list, or dict, default None - Used to fill rows for which all the dummy variables are 0. Returns ------- @@ -782,90 +781,105 @@ def from_dummies( >>> df = pd.DataFrame( ... { - ... "animal_baboon": [0, 0, 1], - ... "animal_lemur": [0, 1, 0], - ... "animal_zebra": [1, 0, 0], - ... "other_col": ["a", "b", "c"], + ... "baboon": [0, 0, 1], + ... "lemur": [0, 1, 0], + ... 
"zebra": [1, 0, 0], ... } ... ) >>> df - animal_baboon animal_lemur animal_zebra other_col - 0 0 0 1 a - 1 0 1 0 b - 2 1 0 0 c + baboon lemur zebra + 0 0 0 1 + 1 0 1 0 + 2 1 0 0 We can recover the original dataframe using `from_dummies`: - >>> pd.from_dummies(df, prefix=['animal']) - other_col animal - 0 a zebra - 1 b lemur - 2 c baboon + >>> pd.from_dummies(df, prefix='animal') + animal + 0 zebra + 1 lemur + 2 baboon - Suppose our dataframe has one column from each dummified column - dropped: + If our dataframe already has columns with `prefix_sep` in them, + we don't need to pass in the `prefix` argument: - >>> df = df.drop('animal_zebra', axis=1) + >>> df = pd.DataFrame( + ... { + ... "animal_baboon": [0, 0, 1], + ... "animal_lemur": [0, 1, 0], + ... "animal_zebra": [1, 0, 0], + ... "other": ['a', 'b', 'c'], + ... } + ... ) >>> df - animal_baboon animal_lemur other_col - 0 0 0 a - 1 0 1 b - 2 1 0 c - - We can still recover the original dataframe, by using the argument - `fill_first`: - - >>> pd.from_dummies(df, prefix=["animal"], fill_first=["zebra"]) - other_col animal - 0 a zebra - 1 b lemur - 2 c baboon + animal_baboon animal_lemur animal_zebra other + 0 0 0 1 a + 1 0 1 0 b + 2 1 0 0 c + + >>> pd.from_dummies(df) + other animal + 0 a zebra + 1 b lemur + 2 c baboon """ if dtype is None: dtype = "category" - if prefix is None: - data_to_decode = data.copy() - prefix = data.columns.tolist() - prefix = list({i.split(prefix_sep)[0] for i in data.columns if prefix_sep in i}) + columns_to_decode = [i for i in data.columns if prefix_sep in i] + if not columns_to_decode: + if prefix is None: + raise ValueError( + "If no columns contain `prefix_sep`, you must" + " pass a value to `prefix` with which to name" + " the decoded columns." + ) + # If no column contains `prefix_sep`, we add `prefix`_`prefix_sep` to + # each column. + out = data.rename(columns = lambda x: f'{prefix}{prefix_sep}{x}').copy() + columns_to_decode = out.columns + else: + out = data.copy() - data_to_decode = data[ - [i for i in data.columns for p in prefix if i.startswith(p + prefix_sep)] - ] + data_to_decode = out[columns_to_decode] - # Check each row sums to 1 or 0 - if not all(i in [0, 1] for i in data_to_decode.sum(axis=1).unique().tolist()): - raise ValueError( - "Data cannot be decoded! Each row must contain only 0s and" - " 1s, and each row may have at most one 1" - ) + if prefix is None: + # If no prefix has been passed, extract it from columns containing + # `prefix_sep` + seen = set() + prefix = [] + for i in columns_to_decode: + i = i.split(prefix_sep)[0] + if i in seen: + continue + seen.add(i) + prefix.append(i) + elif isinstance(prefix, str): + prefix = [prefix] - if fill_first is None: - fill_first = [None] * len(prefix) - elif isinstance(fill_first, str): - fill_first = itertools.cycle([fill_first]) - elif isinstance(fill_first, dict): - fill_first = [fill_first[p] for p in prefix] + # Check each row sums to 1 or 0 + def _validate_values(data): + if not all(i in [0, 1] for i in data.sum(axis=1).unique().tolist()): + raise ValueError( + "Data cannot be decoded! Each row must contain only 0s and" + " 1s, and each row may have at most one 1." 
+ ) - out = data.copy() - for prefix_, fill_first_ in zip(prefix, fill_first): - cols, labels = [ + for prefix_ in prefix: + cols, labels = ( [ i.replace(x, "") for i in data_to_decode.columns if prefix_ + prefix_sep in i ] for x in ["", prefix_ + prefix_sep] - ] + ) if not cols: continue + _validate_values(data_to_decode[cols]) out = out.drop(cols, axis=1) - if fill_first_: - cols = [prefix_ + prefix_sep + fill_first_] + cols - labels = [fill_first_] + labels - data[cols[0]] = (1 - data[cols[1:]]).all(axis=1) out[prefix_] = Series( - np.array(labels)[np.argmax(data[cols].to_numpy(), axis=1)], dtype=dtype + np.array(labels)[np.argmax(data_to_decode[cols].to_numpy(), axis=1)], dtype=dtype ) return out diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index a416a9abecc3d..50ca78aca9ee4 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -21,23 +21,6 @@ def test_dtype(dtype, expected_dict): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "fill_first, expected_dict", - [ - ("a", {"col1": ["a", "a", "b"]}), - (["a"], {"col1": ["a", "a", "b"]}), - ({"col1": "a"}, {"col1": ["a", "a", "b"]}), - ], -) -def test_fill_first(fill_first, expected_dict): - df = pd.DataFrame({"col1_b": [0, 0, 1]}) - result = pd.from_dummies(df, fill_first=fill_first) - # get_dummies changes the ordering of columns, - # see https://github.com/pandas-dev/pandas/issues/17612 - expected = pd.DataFrame(expected_dict, dtype="category") - tm.assert_frame_equal(result, expected) - - def test_malformed(): df = pd.DataFrame({"col1_a": [1, 1, 0], "col1_b": [1, 0, 1]}) msg = ( @@ -61,3 +44,15 @@ def test_prefix_sep(prefix_sep, input_dict): result = pd.from_dummies(df, prefix_sep=prefix_sep) expected = pd.DataFrame({"col1": ["a", "a", "b"]}, dtype="category") tm.assert_frame_equal(result, expected) + +def test_no_prefix(): + df = pd.DataFrame({"a": [1, 1, 0], "b": [0, 0, 1]}) + result = pd.from_dummies(df, prefix='letter') + expected = pd.DataFrame({'letter': ['a', 'a', 'b']}, dtype='category') + tm.assert_frame_equal(result, expected) + +def test_multiple_columns(): + df = pd.DataFrame({"col1_a": [1, 0], "col1_b": [0, 1], "col2_a": [0, 0], "col2_c": [1, 1]}) + result = pd.from_dummies(df) + expected = pd.DataFrame({'col1': ['a', 'b'], 'col2': ['c', 'c']}, dtype='category') + tm.assert_frame_equal(result, expected) \ No newline at end of file From 4be43af882cb30216cff6d1f9588df9d258cfb6d Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 14 Feb 2020 15:56:42 +0000 Subject: [PATCH 06/16] lint --- pandas/core/reshape/reshape.py | 15 +++++++-------- pandas/tests/mytest.py | 3 ++- pandas/tests/reshape/test_from_dummies.py | 14 +++++++++----- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index d67912f7c7668..5066ec261aad5 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1,6 +1,6 @@ from functools import partial import itertools -from typing import List, Optional, Union +from typing import List, Optional, Set, Union import numpy as np @@ -751,9 +751,7 @@ def _convert_level_number(level_num, columns): return result -def from_dummies( - data, prefix=None, prefix_sep="_", dtype="category" -) -> "DataFrame": +def from_dummies(data, prefix=None, prefix_sep="_", dtype="category") -> "DataFrame": """ The inverse transformation of ``pandas.get_dummies``. 
@@ -833,10 +831,10 @@ def from_dummies( "If no columns contain `prefix_sep`, you must" " pass a value to `prefix` with which to name" " the decoded columns." - ) + ) # If no column contains `prefix_sep`, we add `prefix`_`prefix_sep` to # each column. - out = data.rename(columns = lambda x: f'{prefix}{prefix_sep}{x}').copy() + out = data.rename(columns=lambda x: f"{prefix}{prefix_sep}{x}").copy() columns_to_decode = out.columns else: out = data.copy() @@ -846,7 +844,7 @@ def from_dummies( if prefix is None: # If no prefix has been passed, extract it from columns containing # `prefix_sep` - seen = set() + seen: Set[str] = set() prefix = [] for i in columns_to_decode: i = i.split(prefix_sep)[0] @@ -879,7 +877,8 @@ def _validate_values(data): _validate_values(data_to_decode[cols]) out = out.drop(cols, axis=1) out[prefix_] = Series( - np.array(labels)[np.argmax(data_to_decode[cols].to_numpy(), axis=1)], dtype=dtype + np.array(labels)[np.argmax(data_to_decode[cols].to_numpy(), axis=1)], + dtype=dtype, ) return out diff --git a/pandas/tests/mytest.py b/pandas/tests/mytest.py index 3e8916b75e977..d301fdc1ef135 100644 --- a/pandas/tests/mytest.py +++ b/pandas/tests/mytest.py @@ -1,5 +1,6 @@ import pandas as pd + def test_me(): pd.eval( """ @@ -7,4 +8,4 @@ def test_me(): B = df.A + df.B """, target=pd.DataFrame(), - ) \ No newline at end of file + ) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 50ca78aca9ee4..477d06708a49b 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -45,14 +45,18 @@ def test_prefix_sep(prefix_sep, input_dict): expected = pd.DataFrame({"col1": ["a", "a", "b"]}, dtype="category") tm.assert_frame_equal(result, expected) + def test_no_prefix(): df = pd.DataFrame({"a": [1, 1, 0], "b": [0, 0, 1]}) - result = pd.from_dummies(df, prefix='letter') - expected = pd.DataFrame({'letter': ['a', 'a', 'b']}, dtype='category') + result = pd.from_dummies(df, prefix="letter") + expected = pd.DataFrame({"letter": ["a", "a", "b"]}, dtype="category") tm.assert_frame_equal(result, expected) + def test_multiple_columns(): - df = pd.DataFrame({"col1_a": [1, 0], "col1_b": [0, 1], "col2_a": [0, 0], "col2_c": [1, 1]}) + df = pd.DataFrame( + {"col1_a": [1, 0], "col1_b": [0, 1], "col2_a": [0, 0], "col2_c": [1, 1]} + ) result = pd.from_dummies(df) - expected = pd.DataFrame({'col1': ['a', 'b'], 'col2': ['c', 'c']}, dtype='category') - tm.assert_frame_equal(result, expected) \ No newline at end of file + expected = pd.DataFrame({"col1": ["a", "b"], "col2": ["c", "c"]}, dtype="category") + tm.assert_frame_equal(result, expected) From 9b111b764b8006efca8aa7b244513184719fd392 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 14 Feb 2020 16:30:40 +0000 Subject: [PATCH 07/16] update test_api --- pandas/tests/api/test_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 406d5f055797d..0fbf9653bae18 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -120,6 +120,7 @@ class TestPDApi(Base): "eval", "factorize", "get_dummies", + "from_dummies", "infer_freq", "isna", "isnull", From cfa74cec3090877bf66d9a1c55938f032dc163aa Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 14 Feb 2020 17:10:40 +0000 Subject: [PATCH 08/16] add see also and versionadded --- pandas/core/reshape/reshape.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/reshape/reshape.py 
b/pandas/core/reshape/reshape.py index 5066ec261aad5..9ca579c27923a 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -755,6 +755,8 @@ def from_dummies(data, prefix=None, prefix_sep="_", dtype="category") -> "DataFr """ The inverse transformation of ``pandas.get_dummies``. + .. versionadded:: 1.1.0 + Parameters ---------- data : DataFrame @@ -773,6 +775,10 @@ def from_dummies(data, prefix=None, prefix_sep="_", dtype="category") -> "DataFr DataFrame Decoded data. + See Also + -------- + get_dummies : the inverse operation + Examples -------- Say we have a dataframe where some variables have been dummified: @@ -948,6 +954,7 @@ def get_dummies( See Also -------- Series.str.get_dummies : Convert Series to dummy codes. + from_dummies : the inverse operation. Examples -------- From 3827e01fb70e723a993909ec0b45ad0969bedfa4 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 14 Feb 2020 17:14:06 +0000 Subject: [PATCH 09/16] remove mytest --- pandas/tests/mytest.py | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 pandas/tests/mytest.py diff --git a/pandas/tests/mytest.py b/pandas/tests/mytest.py deleted file mode 100644 index d301fdc1ef135..0000000000000 --- a/pandas/tests/mytest.py +++ /dev/null @@ -1,11 +0,0 @@ -import pandas as pd - - -def test_me(): - pd.eval( - """ - A = df.A - df.B - B = df.A + df.B - """, - target=pd.DataFrame(), - ) From 5cf2ab8ff1b2b40ef6f2cbcc05a5cf5d2567d332 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 14 Feb 2020 17:33:04 +0000 Subject: [PATCH 10/16] fix docstring validation --- pandas/core/reshape/reshape.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 9ca579c27923a..06795a6654ff1 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -777,7 +777,7 @@ def from_dummies(data, prefix=None, prefix_sep="_", dtype="category") -> "DataFr See Also -------- - get_dummies : the inverse operation + get_dummies : The inverse operation. Examples -------- @@ -954,7 +954,7 @@ def get_dummies( See Also -------- Series.str.get_dummies : Convert Series to dummy codes. - from_dummies : the inverse operation. + from_dummies : The inverse operation. Examples -------- From 8ffb0baf0c9f4a387253fb76d4ee0a9dde03845d Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 10 Apr 2020 12:50:48 +0100 Subject: [PATCH 11/16] type signature, simplify validation --- pandas/__init__.py | 204 +++++++++++----------- pandas/core/reshape/reshape.py | 10 +- pandas/tests/reshape/test_from_dummies.py | 6 +- 3 files changed, 108 insertions(+), 112 deletions(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index 85d9a452c4cc9..fcfdefda428e9 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -18,171 +18,163 @@ ) del hard_dependencies, dependency, missing_dependencies +from pandas._config import ( + describe_option, + get_option, + option_context, + options, + reset_option, + set_option, +) + # numpy compat from pandas.compat.numpy import ( + _is_numpy_dev, _np_version_under1p14, _np_version_under1p15, _np_version_under1p16, _np_version_under1p17, _np_version_under1p18, - _is_numpy_dev, -) - -try: - from pandas._libs import hashtable as _hashtable, lib as _lib, tslib as _tslib -except ImportError as e: # pragma: no cover - # hack but overkill to use re - module = str(e).replace("cannot import name ", "") - raise ImportError( - f"C extension: {module} not built. 
If you want to import " - "pandas from the source directory, you may need to run " - "'python setup.py build_ext --inplace --force' to build the C extensions first." - ) from e - -from pandas._config import ( - get_option, - set_option, - reset_option, - describe_option, - option_context, - options, ) +from pandas.util._print_versions import show_versions +from pandas.util._tester import test # let init-time option registration happen -import pandas.core.config_init - -from pandas.core.api import ( - # dtype +import pandas.api +import pandas.arrays +from pandas.core.api import ( # dtype; missing; indexes; tseries; conversion; misc + NA, + BooleanDtype, + Categorical, + CategoricalDtype, + CategoricalIndex, + DataFrame, + DateOffset, + DatetimeIndex, + DatetimeTZDtype, + Float64Index, + Grouper, + Index, + IndexSlice, Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, + Int64Index, + Interval, + IntervalDtype, + IntervalIndex, + MultiIndex, + NamedAgg, + NaT, + Period, + PeriodDtype, + PeriodIndex, + RangeIndex, + Series, + StringDtype, + Timedelta, + TimedeltaIndex, + Timestamp, UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype, - CategoricalDtype, - PeriodDtype, - IntervalDtype, - DatetimeTZDtype, - StringDtype, - BooleanDtype, - # missing - NA, + UInt64Index, + array, + bdate_range, + date_range, + factorize, + interval_range, isna, isnull, notna, notnull, - # indexes - Index, - CategoricalIndex, - Int64Index, - UInt64Index, - RangeIndex, - Float64Index, - MultiIndex, - IntervalIndex, - TimedeltaIndex, - DatetimeIndex, - PeriodIndex, - IndexSlice, - # tseries - NaT, - Period, period_range, - Timedelta, + set_eng_float_format, timedelta_range, - Timestamp, - date_range, - bdate_range, - Interval, - interval_range, - DateOffset, - # conversion - to_numeric, to_datetime, + to_numeric, to_timedelta, - # misc - Grouper, - factorize, unique, value_counts, - NamedAgg, - array, - Categorical, - set_eng_float_format, - Series, - DataFrame, ) - from pandas.core.arrays.sparse import SparseDtype - -from pandas.tseries.api import infer_freq -from pandas.tseries import offsets - from pandas.core.computation.api import eval - +import pandas.core.config_init from pandas.core.reshape.api import ( concat, + crosstab, + cut, + from_dummies, + get_dummies, lreshape, melt, - wide_to_long, merge, merge_asof, merge_ordered, - crosstab, pivot, pivot_table, - get_dummies, - cut, qcut, - from_dummies, + wide_to_long, ) +import pandas.testing -import pandas.api -from pandas.util._print_versions import show_versions - -from pandas.io.api import ( - # excel +from pandas.io.api import ( # excel; parsers; pickle; pytables; sql; misc ExcelFile, ExcelWriter, - read_excel, - # parsers - read_csv, - read_fwf, - read_table, - # pickle - read_pickle, - to_pickle, - # pytables HDFStore, - read_hdf, - # sql - read_sql, - read_sql_query, - read_sql_table, - # misc read_clipboard, - read_parquet, - read_orc, + read_csv, + read_excel, read_feather, + read_fwf, read_gbq, + read_hdf, read_html, read_json, - read_stata, + read_orc, + read_parquet, + read_pickle, read_sas, read_spss, + read_sql, + read_sql_query, + read_sql_table, + read_stata, + read_table, + to_pickle, ) - from pandas.io.json import _json_normalize as json_normalize - -from pandas.util._tester import test -import pandas.testing -import pandas.arrays +from pandas.tseries import offsets +from pandas.tseries.api import infer_freq # use the closest tagged version if possible from ._version import get_versions +try: + from pandas._libs import hashtable as _hashtable, lib 
as _lib, tslib as _tslib +except ImportError as e: # pragma: no cover + # hack but overkill to use re + module = str(e).replace("cannot import name ", "") + raise ImportError( + f"C extension: {module} not built. If you want to import " + "pandas from the source directory, you may need to run " + "'python setup.py build_ext --inplace --force' to build the C extensions first." + ) from e + + + + + + + + + + + + + v = get_versions() __version__ = v.get("closest-tag", v["version"]) __git_version__ = v.get("full-revisionid") diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 4880e886fc4a5..9cbcf7da8718a 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -6,6 +6,7 @@ import pandas._libs.algos as libalgos import pandas._libs.reshape as libreshape from pandas._libs.sparse import IntIndex +from pandas._typing import Dtype from pandas.util._decorators import cache_readonly from pandas.core.dtypes.cast import maybe_promote @@ -746,7 +747,12 @@ def _convert_level_number(level_num, columns): return result -def from_dummies(data, prefix=None, prefix_sep="_", dtype="category") -> "DataFrame": +def from_dummies( + data: "DataFrame", + prefix: Optional[Union[str, List[str]]] = None, + prefix_sep: str = "_", + dtype: Dtype = "category", +) -> "DataFrame": """ The inverse transformation of ``pandas.get_dummies``. @@ -858,7 +864,7 @@ def from_dummies(data, prefix=None, prefix_sep="_", dtype="category") -> "DataFr # Check each row sums to 1 or 0 def _validate_values(data): - if not all(i in [0, 1] for i in data.sum(axis=1).unique().tolist()): + if (data.sum(axis=1) != 1).any(): raise ValueError( "Data cannot be decoded! Each row must contain only 0s and" " 1s, and each row may have at most one 1." diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 477d06708a49b..89334e2348e0b 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -9,15 +9,13 @@ [ ("str", {"col1": ["a", "a", "b"]}), (str, {"col1": ["a", "a", "b"]},), - (None, {"col1": ["a", "a", "b"]}), + ("category", {"col1": ["a", "a", "b"]}), ], ) def test_dtype(dtype, expected_dict): df = pd.DataFrame({"col1_a": [1, 1, 0], "col1_b": [0, 0, 1]}) result = pd.from_dummies(df, dtype=dtype) - expected = pd.DataFrame(expected_dict) - if dtype is None: - expected = expected.astype("category") + expected = pd.DataFrame(expected_dict, dtype=dtype) tm.assert_frame_equal(result, expected) From 2e2cb572346193b805b2c5f523da2344b7198fea Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 11 Apr 2020 13:53:19 +0100 Subject: [PATCH 12/16] don't blacken pandas/__init__ --- pandas/__init__.py | 204 +++++++++++++++++++++++---------------------- 1 file changed, 106 insertions(+), 98 deletions(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index fcfdefda428e9..85d9a452c4cc9 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -18,163 +18,171 @@ ) del hard_dependencies, dependency, missing_dependencies -from pandas._config import ( - describe_option, - get_option, - option_context, - options, - reset_option, - set_option, -) - # numpy compat from pandas.compat.numpy import ( - _is_numpy_dev, _np_version_under1p14, _np_version_under1p15, _np_version_under1p16, _np_version_under1p17, _np_version_under1p18, + _is_numpy_dev, +) + +try: + from pandas._libs import hashtable as _hashtable, lib as _lib, tslib as _tslib +except ImportError as e: # pragma: no cover + # hack but overkill to use re + 
module = str(e).replace("cannot import name ", "") + raise ImportError( + f"C extension: {module} not built. If you want to import " + "pandas from the source directory, you may need to run " + "'python setup.py build_ext --inplace --force' to build the C extensions first." + ) from e + +from pandas._config import ( + get_option, + set_option, + reset_option, + describe_option, + option_context, + options, ) -from pandas.util._print_versions import show_versions -from pandas.util._tester import test # let init-time option registration happen -import pandas.api -import pandas.arrays -from pandas.core.api import ( # dtype; missing; indexes; tseries; conversion; misc - NA, - BooleanDtype, - Categorical, - CategoricalDtype, - CategoricalIndex, - DataFrame, - DateOffset, - DatetimeIndex, - DatetimeTZDtype, - Float64Index, - Grouper, - Index, - IndexSlice, +import pandas.core.config_init + +from pandas.core.api import ( + # dtype Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, - Int64Index, - Interval, - IntervalDtype, - IntervalIndex, - MultiIndex, - NamedAgg, - NaT, - Period, - PeriodDtype, - PeriodIndex, - RangeIndex, - Series, - StringDtype, - Timedelta, - TimedeltaIndex, - Timestamp, UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype, - UInt64Index, - array, - bdate_range, - date_range, - factorize, - interval_range, + CategoricalDtype, + PeriodDtype, + IntervalDtype, + DatetimeTZDtype, + StringDtype, + BooleanDtype, + # missing + NA, isna, isnull, notna, notnull, + # indexes + Index, + CategoricalIndex, + Int64Index, + UInt64Index, + RangeIndex, + Float64Index, + MultiIndex, + IntervalIndex, + TimedeltaIndex, + DatetimeIndex, + PeriodIndex, + IndexSlice, + # tseries + NaT, + Period, period_range, - set_eng_float_format, + Timedelta, timedelta_range, - to_datetime, + Timestamp, + date_range, + bdate_range, + Interval, + interval_range, + DateOffset, + # conversion to_numeric, + to_datetime, to_timedelta, + # misc + Grouper, + factorize, unique, value_counts, + NamedAgg, + array, + Categorical, + set_eng_float_format, + Series, + DataFrame, ) + from pandas.core.arrays.sparse import SparseDtype + +from pandas.tseries.api import infer_freq +from pandas.tseries import offsets + from pandas.core.computation.api import eval -import pandas.core.config_init + from pandas.core.reshape.api import ( concat, - crosstab, - cut, - from_dummies, - get_dummies, lreshape, melt, + wide_to_long, merge, merge_asof, merge_ordered, + crosstab, pivot, pivot_table, + get_dummies, + cut, qcut, - wide_to_long, + from_dummies, ) -import pandas.testing -from pandas.io.api import ( # excel; parsers; pickle; pytables; sql; misc +import pandas.api +from pandas.util._print_versions import show_versions + +from pandas.io.api import ( + # excel ExcelFile, ExcelWriter, + read_excel, + # parsers + read_csv, + read_fwf, + read_table, + # pickle + read_pickle, + to_pickle, + # pytables HDFStore, + read_hdf, + # sql + read_sql, + read_sql_query, + read_sql_table, + # misc read_clipboard, - read_csv, - read_excel, + read_parquet, + read_orc, read_feather, - read_fwf, read_gbq, - read_hdf, read_html, read_json, - read_orc, - read_parquet, - read_pickle, + read_stata, read_sas, read_spss, - read_sql, - read_sql_query, - read_sql_table, - read_stata, - read_table, - to_pickle, ) + from pandas.io.json import _json_normalize as json_normalize -from pandas.tseries import offsets -from pandas.tseries.api import infer_freq + +from pandas.util._tester import test +import pandas.testing +import pandas.arrays # use the closest tagged version 
if possible from ._version import get_versions -try: - from pandas._libs import hashtable as _hashtable, lib as _lib, tslib as _tslib -except ImportError as e: # pragma: no cover - # hack but overkill to use re - module = str(e).replace("cannot import name ", "") - raise ImportError( - f"C extension: {module} not built. If you want to import " - "pandas from the source directory, you may need to run " - "'python setup.py build_ext --inplace --force' to build the C extensions first." - ) from e - - - - - - - - - - - - - v = get_versions() __version__ = v.get("closest-tag", v["version"]) __git_version__ = v.get("full-revisionid") From d2916d95bcef8f5ee5ba4bbed9fbb18bfbe65adf Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 18 Apr 2020 09:32:54 +0100 Subject: [PATCH 13/16] revert taking check_len out of get_dummies --- pandas/core/reshape/reshape.py | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index de5fe04ae33c8..e5e887a06bac4 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -871,21 +871,6 @@ def _validate_values(data): return out -def _check_len(item, name, data_to_encode): - """ Validate prefixes and separator to avoid silently dropping cols. """ - len_msg = ( - "Length of '{name}' ({len_item}) did not match the " - "length of the columns being encoded ({len_enc})." - ) - - if is_list_like(item): - if not len(item) == data_to_encode.shape[1]: - len_msg = len_msg.format( - name=name, len_item=len(item), len_enc=data_to_encode.shape[1] - ) - raise ValueError(len_msg) - - def get_dummies( data, prefix=None, @@ -1007,8 +992,20 @@ def get_dummies( else: data_to_encode = data[columns] - _check_len(prefix, "prefix", data_to_encode) - _check_len(prefix_sep, "prefix_sep", data_to_encode) + # validate prefixes and separator to avoid silently dropping cols + def check_len(item, name): + + if is_list_like(item): + if not len(item) == data_to_encode.shape[1]: + len_msg = ( + f"Length of '{name}' ({len(item)}) did not match the " + "length of the columns being encoded " + f"({data_to_encode.shape[1]})." + ) + raise ValueError(len_msg) + + check_len(prefix, "prefix") + check_len(prefix_sep, "prefix_sep") if isinstance(prefix, str): prefix = itertools.cycle([prefix]) From 6a8456900bc8feca782b07d77e9bda1418f6e2ab Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 18 Apr 2020 09:52:53 +0100 Subject: [PATCH 14/16] reword comment --- pandas/core/reshape/reshape.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index e5e887a06bac4..91e4414a580c2 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -820,8 +820,8 @@ def from_dummies( " pass a value to `prefix` with which to name" " the decoded columns." ) - # If no column contains `prefix_sep`, we add `prefix`_`prefix_sep` to - # each column. + # If no column contains `prefix_sep`, we prepend `prefix` and + # `prefix_sep` to each column. 
out = data.rename(columns=lambda x: f"{prefix}{prefix_sep}{x}").copy() columns_to_decode = out.columns else: From 5efee2969cc5ebfe5532eda72352a7de0628e47a Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 18 Apr 2020 10:36:33 +0100 Subject: [PATCH 15/16] put space at end of string, rather than beginning --- pandas/core/reshape/reshape.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 91e4414a580c2..258ee6c2460d8 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -816,9 +816,9 @@ def from_dummies( if not columns_to_decode: if prefix is None: raise ValueError( - "If no columns contain `prefix_sep`, you must" - " pass a value to `prefix` with which to name" - " the decoded columns." + "If no columns contain `prefix_sep`, you must " + "pass a value to `prefix` with which to name " + "the decoded columns." ) # If no column contains `prefix_sep`, we prepend `prefix` and # `prefix_sep` to each column. From 1bf16c9916c98147905b14d99b7ee37ac04130e7 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 18 Apr 2020 11:00:06 +0100 Subject: [PATCH 16/16] put space at end of string --- pandas/core/reshape/reshape.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 258ee6c2460d8..23f14cc84ad62 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -847,8 +847,8 @@ def from_dummies( def _validate_values(data): if (data.sum(axis=1) != 1).any(): raise ValueError( - "Data cannot be decoded! Each row must contain only 0s and" - " 1s, and each row may have at most one 1." + "Data cannot be decoded! Each row must contain only 0s and " + "1s, and each row may have at most one 1." ) for prefix_ in prefix:
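
A minimal usage sketch of the ``from_dummies`` API proposed in this series, using the signature as of patch 11, ``from_dummies(data, prefix=None, prefix_sep="_", dtype="category")``. This is illustrative only: it assumes a build of this branch is installed (the function is not in a released pandas at this point), and the example data follows the docstring examples and tests above.

    import pandas as pd

    # A dummy-encoded frame, as produced by pd.get_dummies(..., prefix="animal"),
    # plus one pass-through column.
    df = pd.DataFrame(
        {
            "animal_baboon": [0, 0, 1],
            "animal_lemur": [0, 1, 0],
            "animal_zebra": [1, 0, 0],
            "other": ["a", "b", "c"],
        }
    )

    # Prefixes are inferred from columns containing prefix_sep ("_" by default),
    # so no arguments are needed; non-dummy columns are carried through unchanged
    # and the decoded column defaults to dtype "category".
    decoded = pd.from_dummies(df)
    #   other   animal
    # 0     a    zebra
    # 1     b    lemur
    # 2     c   baboon

    # When the dummy columns carry no prefix, name the decoded column explicitly
    # via `prefix`, and optionally choose a different dtype.
    letters = pd.DataFrame({"a": [1, 1, 0], "b": [0, 0, 1]})
    pd.from_dummies(letters, prefix="letter", dtype=str)
    #   letter
    # 0      a
    # 1      a
    # 2      b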