From ed949ae05990694d800c02eaffbae34d3bf0f2a8 Mon Sep 17 00:00:00 2001 From: changhiskhan Date: Wed, 19 Dec 2018 16:07:39 -0800 Subject: [PATCH 01/34] [ENH] Add DataFrame method to explode a list-like column (GH #16538) Sometimes a values column is presented with list-like values on one row. Instead we may want to split each individual value onto its own row, keeping the same mapping to the other key columns. While it's possible to chain together existing pandas operations (in fact that's exactly what this implementation is) to do this, the sequence of operations is not obvious. By contrast this is available as a built-in operation in say Spark and is a fairly common use case. --- asv_bench/benchmarks/reshape.py | 18 ++++++ doc/source/user_guide/reshaping.rst | 31 ++++++++++ doc/source/whatsnew/v0.24.0.rst | 30 +++++++++ pandas/core/frame.py | 51 ++++++++++++++++ pandas/tests/frame/test_reshape.py | 95 +++++++++++++++++++++++++++++ 5 files changed, 225 insertions(+) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index f41e13163b3f5..25d4c106c0e20 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -240,4 +240,22 @@ def time_qcut_datetime(self, bins): pd.qcut(self.datetime_series, bins) +class Explode(object): + param_names = ['n_rows', 'max_list_length'] + params = [[100, 1000, 10000], [3, 5, 10]] + + def setup(self, n_rows, max_list_length): + import string + num_letters = np.random.randint(0, max_list_length, n_rows) + key_column = [','.join([np.random.choice(list(string.ascii_letters)) + for _ in range(k)]) + for k in num_letters] + value_column = np.random.randn(n_rows) + self.frame = pd.DataFrame({'key': key_column, + 'value': value_column}) + + def time_explode(self, n_rows, max_list_length): + self.frame.explode('key', sep=',') + + from .pandas_vb_common import setup # noqa: F401 diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 
b7b6dd0a69c24..699b6374ca35a 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -801,3 +801,34 @@ Note to subdivide over multiple columns we can pass in a list to the df.pivot_table( values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean']) + +.. _reshaping.explode: + +Exploding a List-like Column +---------------------------- + +Sometimes the value column is list-like: + +.. ipython:: python + + keys = ['panda1', 'panda2', 'panda3'] + values = [['eats', 'shoots'], ['shoots', 'leaves'], ['eats', 'leaves']] + df = pd.DataFrame({'keys': keys, 'values': values}) + df + +But we actually want to put each value onto its own row. +For this purpose we can use ``DataFrame.explode``: + +.. ipython:: python + + df.explode('values') + +For convenience, we can use the optional keyword ``sep`` to automatically +split a string column before exploding: + +.. ipython:: python + + values = ['eats,shoots', 'shoots,leaves', 'eats,shoots,leaves'] + df2 = pd.DataFrame({'keys': keys, 'values': values}) + df2 + df2.explode('values', sep=',') diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index a66056f661de3..249fc72596dda 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -15,7 +15,37 @@ This is a major release from 0.23.4 and includes a number of API changes, new features, enhancements, and performance improvements along with a large number of bug fixes. +<<<<<<< HEAD Highlights include: +======= +These are the changes in pandas 0.24.0. See :ref:`release` for a full changelog +including other versions of pandas. + +.. 
_whatsnew_0240.enhancements: + +New features +~~~~~~~~~~~~ +- :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`) +- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) +- ``FrozenList`` has gained the ``.union()`` and ``.difference()`` methods. This functionality greatly simplifies groupby's that rely on explicitly excluding certain columns. See :ref:`Splitting an object into groups ` for more information (:issue:`15475`, :issue:`15506`). +- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing + the user to override the engine's default behavior to include or omit the + dataframe's indexes from the resulting Parquet file. (:issue:`20768`) +- :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) +- :func:`DataFrame.to_string` now accepts ``decimal`` as an argument, allowing the user to specify which decimal separator should be used in the output. (:issue:`23614`) +- :func:`read_feather` now accepts ``columns`` as an argument, allowing the user to specify which columns should be read. (:issue:`24025`) +- :func:`DataFrame.to_html` now accepts ``render_links`` as an argument, allowing the user to generate HTML with links to any URLs that appear in the DataFrame. + See the :ref:`section on writing HTML ` in the IO docs for example usage. (:issue:`2679`) +- :func:`DataFrame.explode` to split list-like values onto individual rows. See :ref:`section on Exploding list-like column ` in docs for more information (:issue:`16538`) + +.. 
_whatsnew_0240.values_api: + +Accessing the values in a Series or Index +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:attr:`Series.array` and :attr:`Index.array` have been added for extracting the array backing a +``Series`` or ``Index``. (:issue:`19954`, :issue:`23623`) +>>>>>>> 2138ef063... [ENH] Add DataFrame method to explode a list-like column (GH #16538) * :ref:`Optional Integer NA Support ` * :ref:`New APIs for accessing the array backing a Series or Index ` diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f45a13249b16c..79566cf9f583c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6423,6 +6423,57 @@ def melt( col_level=col_level, ) + def explode(self, col_name, sep=None, dtype=None): + """ + Create new DataFrame expanding a list-like column. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + col_name : str + Name of the column to be exploded. + sep : str, default None + Convenience to split a string `col_name` before exploding. + dtype : str or dtype, default None + Optionally coerce the dtype of exploded column. + + Returns + ------- + exploded: DataFrame + + See Also + -------- + Series.str.split: Split string values on specified separator. + Series.str.extract: Extract groups from the first regex match. 
+ + Examples + -------- + >>> df = pd.DataFrame({'k': ['a,b', 'c,d'], 'v': [0, 1]}) + >>> df.explode('k', sep=',') + k v + 0 a 0 + 0 b 0 + 1 c 1 + 1 d 1 + """ + col = self[col_name] + if len(self) == 0: + return self.copy() + if sep: + col_expanded = col.str.split(sep, expand=True) + else: + col_expanded = col.apply(Series) + col_stacked = (col_expanded + .stack() + .reset_index(level=-1, drop=True) + .rename(col_name)) + if dtype: + col_stacked = col_stacked.astype(dtype) + return (col_stacked.to_frame() + .join(self.drop(col_name, axis=1)) + .reindex(self.columns, axis=1)) + # ---------------------------------------------------------------------- # Time series-related diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index f3452e9a85fb3..363191b92c78a 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -1043,6 +1043,101 @@ def test_unstack_swaplevel_sortlevel(self, level): tm.assert_frame_equal(result, expected) +class TestDataFrameExplode(object): + # GH 16538 + columns = ['a', 'b', 'c'] + + def test_sep(self): + # Automatically do str.split + df = pd.DataFrame([['foo,bar', 'x', 42], + ['fizz,buzz', 'y', 43]], + columns=self.columns) + rs = df.explode('a', sep=',') + xp = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'], + 'b': ['x', 'x', 'y', 'y'], + 'c': [42, 42, 43, 43]}, + index=[0, 0, 1, 1]) + tm.assert_frame_equal(rs, xp) + + def test_dtype(self): + # Coerce dtype + df = pd.DataFrame([[[0, 1, 4], 'x', 42], + [[2, 3], 'y', 43]], + columns=self.columns) + rs = df.explode('a', dtype='int') + xp = pd.DataFrame({'a': np.array([0, 1, 4, 2, 3], dtype='int'), + 'b': ['x', 'x', 'x', 'y', 'y'], + 'c': [42, 42, 42, 43, 43]}, + index=[0, 0, 0, 1, 1]) + tm.assert_frame_equal(rs, xp) + + def test_na(self): + # NaN's and empty lists are omitted + # TODO: option to preserve explicit NAs instead + df = pd.DataFrame([[[], 'x', 42], + [[2.0, np.nan], 'y', 43]], + columns=self.columns) + rs = 
df.explode('a') + xp = pd.DataFrame({'a': [2.0], + 'b': ['y'], + 'c': [43]}, + index=[1]) + tm.assert_frame_equal(rs, xp) + + def test_nonuniform_type(self): + # Not everything is a list + df = pd.DataFrame([[[0, 1, 4], 'x', 42], + [3, 'y', 43]], + columns=self.columns) + rs = df.explode('a', dtype='int') + xp = pd.DataFrame({'a': np.array([0, 1, 4, 3], dtype='int'), + 'b': ['x', 'x', 'x', 'y'], + 'c': [42, 42, 42, 43]}, + index=[0, 0, 0, 1]) + tm.assert_frame_equal(rs, xp) + + def test_all_scalars(self): + # Nothing is a list + df = pd.DataFrame([[0, 'x', 42], + [3, 'y', 43]], + columns=self.columns) + rs = df.explode('a') + xp = pd.DataFrame({'a': [0, 3], + 'b': ['x', 'y'], + 'c': [42, 43]}, + index=[0, 1]) + tm.assert_frame_equal(rs, xp) + + def test_empty(self): + # Empty frame + rs = pd.DataFrame(columns=['a', 'b']).explode('a') + xp = pd.DataFrame(columns=['a', 'b']) + tm.assert_frame_equal(rs, xp) + + def test_missing_column(self): + # Bad column name + df = pd.DataFrame([[0, 'x', 42], + [3, 'y', 43]], + columns=self.columns) + pytest.raises(KeyError, df.explode, 'badcolumnname') + + def test_multi_index(self): + # Multi-index + idx = pd.MultiIndex.from_tuples([(0, 'a'), (1, 'b')]) + df = pd.DataFrame([['foo,bar', 'x', 42], + ['fizz,buzz', 'y', 43]], + columns=self.columns, + index=idx) + rs = df.explode('a', sep=',') + idx = pd.MultiIndex.from_tuples( + [(0, 'a'), (0, 'a'), (1, 'b'), (1, 'b')]) + xp = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'], + 'b': ['x', 'x', 'y', 'y'], + 'c': [42, 42, 43, 43]}, + index=idx) + tm.assert_frame_equal(rs, xp) + + def test_unstack_fill_frame_object(): # GH12815 Test unstacking with object. 
data = pd.Series(["a", "b", "c", "a"], dtype="object") From 959ed4cf74a89e4a0e6917826692341a5f677a75 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Jul 2019 12:39:45 -0400 Subject: [PATCH 02/34] move to Series --- Makefile | 2 + asv_bench/benchmarks/reshape.py | 18 +++--- doc/source/reference/series.rst | 2 +- doc/source/user_guide/reshaping.rst | 14 ++--- doc/source/whatsnew/v0.24.0.rst | 30 --------- doc/source/whatsnew/v0.25.0.rst | 14 +++++ pandas/_libs/reshape.pyx | 59 +++++++++++++++++- pandas/core/frame.py | 51 ---------------- pandas/core/series.py | 48 ++++++++++++++- pandas/tests/frame/test_reshape.py | 95 ----------------------------- pandas/tests/series/test_explode.py | 52 ++++++++++++++++ 11 files changed, 188 insertions(+), 197 deletions(-) create mode 100644 pandas/tests/series/test_explode.py diff --git a/Makefile b/Makefile index a02fe145c5f0e..baceefe6d49ff 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,7 @@ .PHONY : develop build clean clean_pyc doc lint-diff black +all: develop + clean: -python setup.py clean diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 25d4c106c0e20..503fbd507dc8c 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -241,21 +241,19 @@ def time_qcut_datetime(self, bins): class Explode(object): - param_names = ['n_rows', 'max_list_length'] + param_names = ["n_rows", "max_list_length"] params = [[100, 1000, 10000], [3, 5, 10]] def setup(self, n_rows, max_list_length): - import string - num_letters = np.random.randint(0, max_list_length, n_rows) - key_column = [','.join([np.random.choice(list(string.ascii_letters)) - for _ in range(k)]) - for k in num_letters] - value_column = np.random.randn(n_rows) - self.frame = pd.DataFrame({'key': key_column, - 'value': value_column}) + + l = [[np.arange(np.random.randint(max_list_length)) for _ in range(n_rows)]] + self.series = pd.Series(l) def time_explode(self, n_rows, max_list_length): - 
self.frame.explode('key', sep=',') + try: + self.series.explode() + except AttributeError: + pass from .pandas_vb_common import setup # noqa: F401 diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 8d2a764c33a43..7ba625c141f24 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -245,6 +245,7 @@ Reshaping, sorting Series.sort_index Series.swaplevel Series.unstack + Series.explode Series.searchsorted Series.ravel Series.repeat @@ -590,4 +591,3 @@ Sparse SparseSeries.to_coo SparseSeries.from_coo - diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 699b6374ca35a..4a625f0b95857 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -807,6 +807,8 @@ Note to subdivide over multiple columns we can pass in a list to the Exploding a List-like Column ---------------------------- +.. versionadded:: 0.25.0 + Sometimes the value column is list-like: .. ipython:: python @@ -817,18 +819,14 @@ Sometimes the value column is list-like: df But we actually want to put each value onto its own row. -For this purpose we can use ``DataFrame.explode``: +For this purpose we can use ``~Series.explode``. This will replicate the index values: .. ipython:: python - df.explode('values') + df['values'].explode() -For convenience, we can use the optional keyword ``sep`` to automatically -split a string column before exploding: +You can merge this back into the original to expand the dataframe. .. 
ipython:: python - values = ['eats,shoots', 'shoots,leaves', 'eats,shoots,leaves'] - df2 = pd.DataFrame({'keys': keys, 'values': values}) - df2 - df2.explode('values', sep=',') + pd.merge(df, df['values'].explode(), left_index=True, right_index=True) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 249fc72596dda..a66056f661de3 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -15,37 +15,7 @@ This is a major release from 0.23.4 and includes a number of API changes, new features, enhancements, and performance improvements along with a large number of bug fixes. -<<<<<<< HEAD Highlights include: -======= -These are the changes in pandas 0.24.0. See :ref:`release` for a full changelog -including other versions of pandas. - -.. _whatsnew_0240.enhancements: - -New features -~~~~~~~~~~~~ -- :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`) -- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) -- ``FrozenList`` has gained the ``.union()`` and ``.difference()`` methods. This functionality greatly simplifies groupby's that rely on explicitly excluding certain columns. See :ref:`Splitting an object into groups ` for more information (:issue:`15475`, :issue:`15506`). -- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing - the user to override the engine's default behavior to include or omit the - dataframe's indexes from the resulting Parquet file. (:issue:`20768`) -- :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. 
histogram intersection (:issue:`22684`) -- :func:`DataFrame.to_string` now accepts ``decimal`` as an argument, allowing the user to specify which decimal separator should be used in the output. (:issue:`23614`) -- :func:`read_feather` now accepts ``columns`` as an argument, allowing the user to specify which columns should be read. (:issue:`24025`) -- :func:`DataFrame.to_html` now accepts ``render_links`` as an argument, allowing the user to generate HTML with links to any URLs that appear in the DataFrame. - See the :ref:`section on writing HTML ` in the IO docs for example usage. (:issue:`2679`) -- :func:`DataFrame.explode` to split list-like values onto individual rows. See :ref:`section on Exploding list-like column ` in docs for more information (:issue:`16538`) - -.. _whatsnew_0240.values_api: - -Accessing the values in a Series or Index -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:attr:`Series.array` and :attr:`Index.array` have been added for extracting the array backing a -``Series`` or ``Index``. (:issue:`19954`, :issue:`23623`) ->>>>>>> 2138ef063... [ENH] Add DataFrame method to explode a list-like column (GH #16538) * :ref:`Optional Integer NA Support ` * :ref:`New APIs for accessing the array backing a Series or Index ` diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 59cd6615b7395..cf9db2a87694c 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -182,6 +182,20 @@ The repr now looks like this: json_normalize(data, max_level=1) +.. _whatsnew_0250.enhancements.explode: + +Series.explode to split list-like values to rows +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`Series` has gained the :meth:`Series.explode` method to tranform list-likes to onto individual rows. See :ref:`section on Exploding list-like column ` in docs for more information (:issue:`16538`) + + +.. ipython:: python + + s = pd.Series([[1, 2, 3], np.nan, [], [3, 4]]) + s + s.explode() + .. 
_whatsnew_0250.enhancements.other: Other enhancements diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 35b2ab4aa5326..ce6402c1e490a 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -2,7 +2,8 @@ import cython from cython import Py_ssize_t from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, - uint32_t, uint64_t, float32_t, float64_t) + uint32_t, uint64_t, float32_t, float64_t, ndarray) +import numpy ctypedef fused reshape_t: @@ -91,3 +92,59 @@ unstack_int64 = unstack["int64_t"] unstack_float32 = unstack["float32_t"] unstack_float64 = unstack["float64_t"] unstack_object = unstack["object"] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def explode(ndarray[object] values): + """ + transform array list-likes to long form + preserve non-list entries + + Parameters + ---------- + values : object ndarray + + Returns + ------- + tuple(values, counts) + """ + cdef: + Py_ssize_t i, j, count, n + object v + ndarray[object] result + ndarray[uint8_t] counts + + # find the resulting len + n = len(values) + counts = numpy.zeros(n, dtype='uint8') + for i in range(n): + v = values[i] + if isinstance(v, (list, tuple)): + if len(v): + counts[i] += len(v) + else: + # empty list-like, use a nan marker + counts[i] += 1 + else: + counts[i] += 1 + + result = numpy.empty(counts.sum(), dtype='object') + count = 0 + for i in range(n): + v = values[i] + + if isinstance(v, (list, tuple)): + if len(v): + for j in range(len(v)): + result[count] = v[j] + count += 1 + else: + # empty list-like, use a nan marker + result[count] = numpy.nan + count += 1 + else: + # replace with the existing scalar + result[count] = v + count += 1 + return result, counts diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 79566cf9f583c..f45a13249b16c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6423,57 +6423,6 @@ def melt( col_level=col_level, ) - def explode(self, col_name, sep=None, dtype=None): - """ - 
Create new DataFrame expanding a list-like column. - - .. versionadded:: 0.24.0 - - Parameters - ---------- - col_name : str - Name of the column to be exploded. - sep : str, default None - Convenience to split a string `col_name` before exploding. - dtype : str or dtype, default None - Optionally coerce the dtype of exploded column. - - Returns - ------- - exploded: DataFrame - - See Also - -------- - Series.str.split: Split string values on specified separator. - Series.str.extract: Extract groups from the first regex match. - - Examples - -------- - >>> df = pd.DataFrame({'k': ['a,b', 'c,d'], 'v': [0, 1]}) - >>> df.explode('k', sep=',') - k v - 0 a 0 - 0 b 0 - 1 c 1 - 1 d 1 - """ - col = self[col_name] - if len(self) == 0: - return self.copy() - if sep: - col_expanded = col.str.split(sep, expand=True) - else: - col_expanded = col.apply(Series) - col_stacked = (col_expanded - .stack() - .reset_index(level=-1, drop=True) - .rename(col_name)) - if dtype: - col_stacked = col_stacked.astype(dtype) - return (col_stacked.to_frame() - .join(self.drop(col_name, axis=1)) - .reindex(self.columns, axis=1)) - # ---------------------------------------------------------------------- # Time series-related diff --git a/pandas/core/series.py b/pandas/core/series.py index 46b96c1ece77c..ff326b96e3d2d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -12,7 +12,7 @@ from pandas._config import get_option -from pandas._libs import iNaT, index as libindex, lib, tslibs +from pandas._libs import iNaT, index as libindex, lib, tslibs, reshape from pandas.compat import PY36 from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, deprecate @@ -3635,6 +3635,52 @@ def reorder_levels(self, order): result.index = result.index.reorder_levels(order) return result + def explode(self): + """ + Create new Series expanding a list-like column. + + .. 
versionadded:: 0.25.0 + + Returns + ------- + exploded: Series + + See Also + -------- + Series.str.split: Split string values on specified separator. + Series.unstakc: Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. + + Examples + -------- + In [1]: s = pd.Series([[1, 2, 3], np.nan, [], [3, 4]]) + + In [2]: s + Out[2]: + 0 [1, 2, 3] + 1 NaN + 2 [] + 3 [3, 4] + dtype: object + + In [3]: s.explode() + Out[3]: + 0 1 + 0 2 + 0 3 + 1 NaN + 2 NaN + 3 3 + 3 4 + dtype: object + """ + if not len(self): + return self.copy() + + values, counts = reshape.explode(np.asarray(self.array)) + + result = Series(values, index=self.index.repeat(counts), name=self.name) + return result + def unstack(self, level=-1, fill_value=None): """ Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 363191b92c78a..f3452e9a85fb3 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -1043,101 +1043,6 @@ def test_unstack_swaplevel_sortlevel(self, level): tm.assert_frame_equal(result, expected) -class TestDataFrameExplode(object): - # GH 16538 - columns = ['a', 'b', 'c'] - - def test_sep(self): - # Automatically do str.split - df = pd.DataFrame([['foo,bar', 'x', 42], - ['fizz,buzz', 'y', 43]], - columns=self.columns) - rs = df.explode('a', sep=',') - xp = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'], - 'b': ['x', 'x', 'y', 'y'], - 'c': [42, 42, 43, 43]}, - index=[0, 0, 1, 1]) - tm.assert_frame_equal(rs, xp) - - def test_dtype(self): - # Coerce dtype - df = pd.DataFrame([[[0, 1, 4], 'x', 42], - [[2, 3], 'y', 43]], - columns=self.columns) - rs = df.explode('a', dtype='int') - xp = pd.DataFrame({'a': np.array([0, 1, 4, 2, 3], dtype='int'), - 'b': ['x', 'x', 'x', 'y', 'y'], - 'c': [42, 42, 42, 43, 43]}, - index=[0, 0, 0, 1, 1]) - tm.assert_frame_equal(rs, xp) - - def test_na(self): - # NaN's and empty lists are omitted - # TODO: option 
to preserve explicit NAs instead - df = pd.DataFrame([[[], 'x', 42], - [[2.0, np.nan], 'y', 43]], - columns=self.columns) - rs = df.explode('a') - xp = pd.DataFrame({'a': [2.0], - 'b': ['y'], - 'c': [43]}, - index=[1]) - tm.assert_frame_equal(rs, xp) - - def test_nonuniform_type(self): - # Not everything is a list - df = pd.DataFrame([[[0, 1, 4], 'x', 42], - [3, 'y', 43]], - columns=self.columns) - rs = df.explode('a', dtype='int') - xp = pd.DataFrame({'a': np.array([0, 1, 4, 3], dtype='int'), - 'b': ['x', 'x', 'x', 'y'], - 'c': [42, 42, 42, 43]}, - index=[0, 0, 0, 1]) - tm.assert_frame_equal(rs, xp) - - def test_all_scalars(self): - # Nothing is a list - df = pd.DataFrame([[0, 'x', 42], - [3, 'y', 43]], - columns=self.columns) - rs = df.explode('a') - xp = pd.DataFrame({'a': [0, 3], - 'b': ['x', 'y'], - 'c': [42, 43]}, - index=[0, 1]) - tm.assert_frame_equal(rs, xp) - - def test_empty(self): - # Empty frame - rs = pd.DataFrame(columns=['a', 'b']).explode('a') - xp = pd.DataFrame(columns=['a', 'b']) - tm.assert_frame_equal(rs, xp) - - def test_missing_column(self): - # Bad column name - df = pd.DataFrame([[0, 'x', 42], - [3, 'y', 43]], - columns=self.columns) - pytest.raises(KeyError, df.explode, 'badcolumnname') - - def test_multi_index(self): - # Multi-index - idx = pd.MultiIndex.from_tuples([(0, 'a'), (1, 'b')]) - df = pd.DataFrame([['foo,bar', 'x', 42], - ['fizz,buzz', 'y', 43]], - columns=self.columns, - index=idx) - rs = df.explode('a', sep=',') - idx = pd.MultiIndex.from_tuples( - [(0, 'a'), (0, 'a'), (1, 'b'), (1, 'b')]) - xp = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'], - 'b': ['x', 'x', 'y', 'y'], - 'c': [42, 42, 43, 43]}, - index=idx) - tm.assert_frame_equal(rs, xp) - - def test_unstack_fill_frame_object(): # GH12815 Test unstacking with object. 
data = pd.Series(["a", "b", "c", "a"], dtype="object") diff --git a/pandas/tests/series/test_explode.py b/pandas/tests/series/test_explode.py new file mode 100644 index 0000000000000..b5c7af83547c7 --- /dev/null +++ b/pandas/tests/series/test_explode.py @@ -0,0 +1,52 @@ +import numpy as np + +import pandas as pd +from pandas.util import testing as tm + + +def test_basic(): + s = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], name="foo") + result = s.explode() + expected = pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], + index=[0, 0, 0, 1, 2, 3, 3], + dtype=object, + name="foo", + ) + tm.assert_series_equal(result, expected) + + +def test_mixed_type(): + s = pd.Series([[0, 1, 2], np.nan, None, [], ("a", "b")], name="foo") + result = s.explode() + expected = pd.Series( + [0, 1, 2, np.nan, None, np.nan, "a", "b"], + index=[0, 0, 0, 1, 2, 3, 4, 4], + dtype=object, + name="foo", + ) + tm.assert_series_equal(result, expected) + + +def test_empty(): + s = pd.Series() + result = s.explode() + expected = s.copy() + tm.assert_series_equal(result, expected) + + +def test_multi_index(): + s = pd.Series( + [[0, 1, 2], np.nan, [], (3, 4)], + name="foo", + index=pd.MultiIndex.from_product([list("ab"), range(2)], names=["foo", "bar"]), + ) + result = s.explode() + index = pd.MultiIndex.from_tuples( + [("a", 0), ("a", 0), ("a", 0), ("a", 1), ("b", 0), ("b", 1), ("b", 1)], + names=["foo", "bar"], + ) + expected = pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], index=index, dtype=object, name="foo" + ) + tm.assert_series_equal(result, expected) From 1e4b4bd248a444094db5337180935412dcff3c20 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Jul 2019 15:37:12 -0400 Subject: [PATCH 03/34] handle generic list-like --- pandas/_libs/lib.pyx | 16 ++++++++++++++++ pandas/_libs/reshape.pyx | 5 +++-- pandas/tests/series/test_explode.py | 4 +++- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1936404b75602..85fe3b672e2dd 100644 --- 
a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -886,6 +886,22 @@ def is_period(val: object) -> bool: return util.is_period_object(val) +cpdef inline bint is_list_like(object obj): + """ + Parameters + ---------- + obj : object + + Returns + ------- + bool : if this is a list-like object + """ + return (hasattr(obj, '__iter__') and + not isinstance(obj, (str, bytes)) + # exclude zero-dimensional numpy arrays, effectively scalars + and not (isinstance(obj, ndarray) and obj.ndim == 0)) + + _TYPE_MAP = { 'categorical': 'categorical', 'category': 'categorical', diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index ce6402c1e490a..8014e64ae6e66 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -4,6 +4,7 @@ from cython import Py_ssize_t from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t, float32_t, float64_t, ndarray) import numpy +from pandas._libs.lib import is_list_like ctypedef fused reshape_t: @@ -120,7 +121,7 @@ def explode(ndarray[object] values): counts = numpy.zeros(n, dtype='uint8') for i in range(n): v = values[i] - if isinstance(v, (list, tuple)): + if is_list_like(v): if len(v): counts[i] += len(v) else: @@ -134,7 +135,7 @@ def explode(ndarray[object] values): for i in range(n): v = values[i] - if isinstance(v, (list, tuple)): + if is_list_like(v): if len(v): for j in range(len(v)): result[count] = v[j] diff --git a/pandas/tests/series/test_explode.py b/pandas/tests/series/test_explode.py index b5c7af83547c7..0acbdfd8808b4 100644 --- a/pandas/tests/series/test_explode.py +++ b/pandas/tests/series/test_explode.py @@ -17,7 +17,9 @@ def test_basic(): def test_mixed_type(): - s = pd.Series([[0, 1, 2], np.nan, None, [], ("a", "b")], name="foo") + s = pd.Series( + [[0, 1, 2], np.nan, None, np.array([]), pd.Series(["a", "b"])], name="foo" + ) result = s.explode() expected = pd.Series( [0, 1, 2, np.nan, None, np.nan, "a", "b"], From ef9eae86b3c75050aa25955189e24bb7e573f673 Mon Sep 
17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Jul 2019 15:44:08 -0400 Subject: [PATCH 04/34] lint on asv --- asv_bench/benchmarks/reshape.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 503fbd507dc8c..c0f12f1fb75a1 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -246,8 +246,8 @@ class Explode(object): def setup(self, n_rows, max_list_length): - l = [[np.arange(np.random.randint(max_list_length)) for _ in range(n_rows)]] - self.series = pd.Series(l) + data = [[np.arange(np.random.randint(max_list_length)) for _ in range(n_rows)]] + self.series = pd.Series(data) def time_explode(self, n_rows, max_list_length): try: From 790f06ecacebd1b8e4746b47c3110116eace4922 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Jul 2019 15:51:18 -0400 Subject: [PATCH 05/34] move is_list_like to cython and share impl --- pandas/_libs/lib.pyx | 53 ++++++++++++++++++++++++++----- pandas/core/dtypes/inference.py | 56 ++------------------------------- 2 files changed, 48 insertions(+), 61 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 85fe3b672e2dd..5952b3deef0ae 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1,3 +1,4 @@ +from collections import abc from decimal import Decimal from fractions import Fraction from numbers import Number @@ -886,20 +887,58 @@ def is_period(val: object) -> bool: return util.is_period_object(val) -cpdef inline bint is_list_like(object obj): +cpdef is_list_like(obj: object, allow_sets: bool = True): """ + Check if the object is list-like. + + Objects that are considered list-like are for example Python + lists, tuples, sets, NumPy arrays, and Pandas Series. + + Strings and datetime objects, however, are not considered list-like. 
+ Parameters ---------- - obj : object + obj : The object to check + allow_sets : boolean, default True + If this parameter is False, sets will not be considered list-like + + .. versionadded:: 0.24.0 Returns ------- - bool : if this is a list-like object + is_list_like : bool + Whether `obj` has list-like properties. + + Examples + -------- + >>> is_list_like([1, 2, 3]) + True + >>> is_list_like({1, 2, 3}) + True + >>> is_list_like(datetime(2017, 1, 1)) + False + >>> is_list_like("foo") + False + >>> is_list_like(1) + False + >>> is_list_like(np.array([2])) + True + >>> is_list_like(np.array(2))) + False """ - return (hasattr(obj, '__iter__') and - not isinstance(obj, (str, bytes)) - # exclude zero-dimensional numpy arrays, effectively scalars - and not (isinstance(obj, ndarray) and obj.ndim == 0)) + + return ( + isinstance(obj, abc.Iterable) + and + # we do not count strings/unicode/bytes as list-like + not isinstance(obj, (str, bytes)) + and + # exclude zero-dimensional numpy arrays, effectively scalars + not (isinstance(obj, np.ndarray) and obj.ndim == 0) + and + # exclude sets if allow_sets is False + not (allow_sets is False and isinstance(obj, abc.Set)) + ) _TYPE_MAP = { diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 9373ea18e8a24..461b5cc6232cd 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -23,6 +23,8 @@ is_interval = lib.is_interval +is_list_like = lib.is_list_like + def is_number(obj): """ @@ -241,60 +243,6 @@ def is_re_compilable(obj): return True -def is_list_like(obj, allow_sets=True): - """ - Check if the object is list-like. - - Objects that are considered list-like are for example Python - lists, tuples, sets, NumPy arrays, and Pandas Series. - - Strings and datetime objects, however, are not considered list-like. 
- - Parameters - ---------- - obj : The object to check - allow_sets : boolean, default True - If this parameter is False, sets will not be considered list-like - - .. versionadded:: 0.24.0 - - Returns - ------- - is_list_like : bool - Whether `obj` has list-like properties. - - Examples - -------- - >>> is_list_like([1, 2, 3]) - True - >>> is_list_like({1, 2, 3}) - True - >>> is_list_like(datetime(2017, 1, 1)) - False - >>> is_list_like("foo") - False - >>> is_list_like(1) - False - >>> is_list_like(np.array([2])) - True - >>> is_list_like(np.array(2))) - False - """ - - return ( - isinstance(obj, abc.Iterable) - and - # we do not count strings/unicode/bytes as list-like - not isinstance(obj, (str, bytes)) - and - # exclude zero-dimensional numpy arrays, effectively scalars - not (isinstance(obj, np.ndarray) and obj.ndim == 0) - and - # exclude sets if allow_sets is False - not (allow_sets is False and isinstance(obj, abc.Set)) - ) - - def is_array_like(obj): """ Check if the object is array-like. From d8e4801842e606482369d4c86cabf4baf644764a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Jul 2019 16:14:44 -0400 Subject: [PATCH 06/34] moar docs --- doc/source/user_guide/reshaping.rst | 18 +++++++++++++----- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/series.py | 9 ++++++++- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 4a625f0b95857..a6815b7095aa1 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -809,7 +809,7 @@ Exploding a List-like Column .. versionadded:: 0.25.0 -Sometimes the value column is list-like: +Sometimes the value column is list-like. .. ipython:: python @@ -818,15 +818,23 @@ Sometimes the value column is list-like: df = pd.DataFrame({'keys': keys, 'values': values}) df -But we actually want to put each value onto its own row. -For this purpose we can use ``~Series.explode``. 
This will replicate the index values: +We can 'explode' this transforming each element of a list-like to a row, by using ``~Series.explode``. This will replicate the index values: .. ipython:: python df['values'].explode() -You can merge this back into the original to expand the dataframe. +You can join this with the original to get an expanded ``DataFrame``. .. ipython:: python - pd.merge(df, df['values'].explode(), left_index=True, right_index=True) + df[['keys']]join(df['values'].explode()) + + +This routine will preserve replace empty lists with ``np.nan`` and preserve scalar entries. The dtype of the resulting ``Series`` is always ``object``. + +.. ipython:: python + + s = pd.Series([[1, 2, 3], np.nan, [], ['a', 'b']]) + s + s.explode() diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index cf9db2a87694c..0d858f0df7beb 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -187,7 +187,7 @@ The repr now looks like this: Series.explode to split list-like values to rows ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:class:`Series` has gained the :meth:`Series.explode` method to tranform list-likes to onto individual rows. See :ref:`section on Exploding list-like column ` in docs for more information (:issue:`16538`) +:class:`Series` has gained the :meth:`Series.explode` method to transform list-likes to individual rows. See :ref:`section on Exploding list-like column ` in docs for more information (:issue:`16538`) .. ipython:: python diff --git a/pandas/core/series.py b/pandas/core/series.py index ff326b96e3d2d..cedcae0f9fa88 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3635,7 +3635,7 @@ def reorder_levels(self, order): result.index = result.index.reorder_levels(order) return result - def explode(self): + def explode(self) -> 'Series': """ Create new Series expanding a list-like column. 
@@ -3672,6 +3672,13 @@ def explode(self): 3 3 3 4 dtype: object + + Notes + ----- + This routine will explode list-likes including lists, tuples, Series, and np.ndarray. + The result dtype of the returned Series will always be object. + Scalars will be returned unchanged. + Empty list-likes will result in a np.nan for that row. """ if not len(self): return self.copy() From f055b48aec53e7ffaa7faeec4eea47db5bc76648 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Jul 2019 16:18:07 -0400 Subject: [PATCH 07/34] test larger sides to avoid a segfault --- pandas/_libs/reshape.pyx | 4 ++-- pandas/tests/series/test_explode.py | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 8014e64ae6e66..ebba41dd0ffbf 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -114,11 +114,11 @@ def explode(ndarray[object] values): Py_ssize_t i, j, count, n object v ndarray[object] result - ndarray[uint8_t] counts + ndarray[int64_t] counts # find the resulting len n = len(values) - counts = numpy.zeros(n, dtype='uint8') + counts = numpy.zeros(n, dtype='int64') for i in range(n): v = values[i] if is_list_like(v): diff --git a/pandas/tests/series/test_explode.py b/pandas/tests/series/test_explode.py index 0acbdfd8808b4..a1df947e40eec 100644 --- a/pandas/tests/series/test_explode.py +++ b/pandas/tests/series/test_explode.py @@ -52,3 +52,9 @@ def test_multi_index(): [0, 1, 2, np.nan, np.nan, 3, 4], index=index, dtype=object, name="foo" ) tm.assert_series_equal(result, expected) + + +def test_large(): + s = pd.Series([range(256)]).explode() + result = s.explode() + tm.assert_series_equal(result, s) From bd854caf1415480b100a57391d9d226a77349a05 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Jul 2019 16:19:44 -0400 Subject: [PATCH 08/34] fix ref --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst 
b/doc/source/whatsnew/v0.25.0.rst index 0d858f0df7beb..04248fcf62628 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -187,7 +187,7 @@ The repr now looks like this: Series.explode to split list-like values to rows ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:class:`Series` has gained the :meth:`Series.explode` method to transform list-likes to individual rows. See :ref:`section on Exploding list-like column ` in docs for more information (:issue:`16538`) +:class:`Series` has gained the :meth:`Series.explode` method to transform list-likes to individual rows. See :ref:`section on Exploding list-like column ` in docs for more information (:issue:`16538`) .. ipython:: python From 774d2858219929fc8157c3b7219967d619892808 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Jul 2019 16:22:00 -0400 Subject: [PATCH 09/34] typos --- doc/source/user_guide/reshaping.rst | 4 ++-- pandas/core/series.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index a6815b7095aa1..b46f1de769f70 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -831,10 +831,10 @@ You can join this with the original to get an expanded ``DataFrame``. df[['keys']]join(df['values'].explode()) -This routine will preserve replace empty lists with ``np.nan`` and preserve scalar entries. The dtype of the resulting ``Series`` is always ``object``. +This routine will replace empty lists with ``np.nan`` and preserve scalar entries. The dtype of the resulting ``Series`` is always ``object``. .. 
ipython:: python - s = pd.Series([[1, 2, 3], np.nan, [], ['a', 'b']]) + s = pd.Series([[1, 2, 3], 'foo', [], ['a', 'b']]) s s.explode() diff --git a/pandas/core/series.py b/pandas/core/series.py index cedcae0f9fa88..522c6e1d14f66 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3652,12 +3652,12 @@ def explode(self) -> 'Series': Examples -------- - In [1]: s = pd.Series([[1, 2, 3], np.nan, [], [3, 4]]) + In [1]: s = pd.Series([[1, 2, 3], 'foo', [], [3, 4]]) In [2]: s Out[2]: 0 [1, 2, 3] - 1 NaN + 1 foo 2 [] 3 [3, 4] dtype: object @@ -3667,7 +3667,7 @@ def explode(self) -> 'Series': 0 1 0 2 0 3 - 1 NaN + 1 foo 2 NaN 3 3 3 4 From 854a2af76b5e6e1b0cfdb2679ae4ea4a3701fd26 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Jul 2019 16:34:33 -0400 Subject: [PATCH 10/34] benchmarks wrong --- asv_bench/benchmarks/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index c0f12f1fb75a1..32ca8e6b68d0b 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -246,7 +246,7 @@ class Explode(object): def setup(self, n_rows, max_list_length): - data = [[np.arange(np.random.randint(max_list_length)) for _ in range(n_rows)]] + data = [np.arange(np.random.randint(max_list_length)) for _ in range(n_rows)] self.series = pd.Series(data) def time_explode(self, n_rows, max_list_length): From c4d1bd3d494207423e1c95ec770828f31ff7c70e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Jul 2019 16:46:23 -0400 Subject: [PATCH 11/34] add inversion --- pandas/tests/series/test_explode.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/series/test_explode.py b/pandas/tests/series/test_explode.py index a1df947e40eec..1f861b91bd20a 100644 --- a/pandas/tests/series/test_explode.py +++ b/pandas/tests/series/test_explode.py @@ -58,3 +58,11 @@ def test_large(): s = pd.Series([range(256)]).explode() result = s.explode() 
tm.assert_series_equal(result, s) + + +def test_invert_array(): + df = pd.DataFrame({"a": pd.date_range("20190101", periods=3, tz="UTC")}) + + listify = df.apply(lambda x: x.array, axis=1) + result = listify.explode() + tm.assert_series_equal(result, df["a"].rename()) From 01edb45ea668abb80f19055229bb098ce494e590 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Jul 2019 16:58:16 -0400 Subject: [PATCH 12/34] add usecase --- doc/source/user_guide/reshaping.rst | 16 ++++++++++++++++ doc/source/whatsnew/v0.25.0.rst | 18 +++++++++++++++++- pandas/tests/series/test_explode.py | 12 ++++++++++++ 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index b46f1de769f70..39adbe1cd62a9 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -838,3 +838,19 @@ This routine will replace empty lists with ``np.nan`` and preserve scalar entrie s = pd.Series([[1, 2, 3], 'foo', [], ['a', 'b']]) s s.explode() + +Here is a typical usecase. You have comma separated string in a column. + +.. ipython:: python + + df = DataFrame([{'var1': 'a,b,c', 'var2': 1}, + {'var1': 'd,e,f', 'var2': 2}]) + df + +Creating a long form DataFrame is now straightforward using chained operations + +.. ipython:: python + + exploded = df.var1.str.split(',').explode() + exploded + df[['var2']].join(exploded) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 04248fcf62628..6cb3403710acf 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -187,7 +187,7 @@ The repr now looks like this: Series.explode to split list-like values to rows ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:class:`Series` has gained the :meth:`Series.explode` method to transform list-likes to individual rows. 
See :ref:`section on Exploding list-like column ` in docs for more information (:issue:`16538`) +:class:`Series` has gained the :meth:`Series.explode` method to transform list-likes to individual rows. See :ref:`section on Exploding list-like column ` in docs for more information (:issue:`16538`, :issue:`10511`) .. ipython:: python @@ -196,6 +196,22 @@ Series.explode to split list-like values to rows s s.explode() +Here is a typical usecase. You have comma separated string in a column. + +.. ipython:: python + + df = DataFrame([{'var1': 'a,b,c', 'var2': 1}, + {'var1': 'd,e,f', 'var2': 2}]) + df + +Creating a long form DataFrame is now straightforward using chained operations + +.. ipython:: python + + exploded = df.var1.str.split(',').explode() + exploded + df[['var2']].join(exploded) + .. _whatsnew_0250.enhancements.other: Other enhancements diff --git a/pandas/tests/series/test_explode.py b/pandas/tests/series/test_explode.py index 1f861b91bd20a..e4e54034a26ec 100644 --- a/pandas/tests/series/test_explode.py +++ b/pandas/tests/series/test_explode.py @@ -66,3 +66,15 @@ def test_invert_array(): listify = df.apply(lambda x: x.array, axis=1) result = listify.explode() tm.assert_series_equal(result, df["a"].rename()) + + +def test_typical_usecase(): + + df = pd.DataFrame([{"var1": "a,b,c", "var2": 1}, {"var1": "d,e,f", "var2": 2}]) + exploded = df.var1.str.split(",").explode() + exploded + result = df[["var2"]].join(exploded) + expected = pd.DataFrame( + {"var2": [1, 1, 1, 2, 2, 2], "var1": list("abcdef")}, index=[0, 0, 0, 1, 1, 1] + ) + tm.assert_frame_equal(result, expected, check_like=True) From af058139d237672d5869b9816fd19e3c57db87e6 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Jul 2019 17:24:27 -0400 Subject: [PATCH 13/34] cimport is_list_like --- doc/source/user_guide/reshaping.rst | 2 +- pandas/_libs/lib.pxd | 1 + pandas/_libs/lib.pyx | 8 ++++++-- pandas/_libs/reshape.pyx | 6 +++--- pandas/core/series.py | 7 ++++--- 
pandas/tests/series/test_explode.py | 9 +++++++++ 6 files changed, 24 insertions(+), 9 deletions(-) create mode 100644 pandas/_libs/lib.pxd diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 39adbe1cd62a9..b092c60e93f50 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -828,7 +828,7 @@ You can join this with the original to get an expanded ``DataFrame``. .. ipython:: python - df[['keys']]join(df['values'].explode()) + df[['keys']].join(df['values'].explode()) This routine will replace empty lists with ``np.nan`` and preserve scalar entries. The dtype of the resulting ``Series`` is always ``object``. diff --git a/pandas/_libs/lib.pxd b/pandas/_libs/lib.pxd new file mode 100644 index 0000000000000..12aca9dabe2e7 --- /dev/null +++ b/pandas/_libs/lib.pxd @@ -0,0 +1 @@ +cdef bint c_is_list_like(object, bint) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5952b3deef0ae..0f9fa7ef85c1b 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -887,7 +887,7 @@ def is_period(val: object) -> bool: return util.is_period_object(val) -cpdef is_list_like(obj: object, allow_sets: bool = True): +def is_list_like(obj: object, allow_sets: bool = True): """ Check if the object is list-like. 
@@ -926,7 +926,11 @@ cpdef is_list_like(obj: object, allow_sets: bool = True): >>> is_list_like(np.array(2))) False """ + return c_is_list_like(obj, allow_sets) + + +cdef inline bint c_is_list_like(object obj, bint allow_sets): return ( isinstance(obj, abc.Iterable) and @@ -934,7 +938,7 @@ cpdef is_list_like(obj: object, allow_sets: bool = True): not isinstance(obj, (str, bytes)) and # exclude zero-dimensional numpy arrays, effectively scalars - not (isinstance(obj, np.ndarray) and obj.ndim == 0) + not (util.is_array(obj) and obj.ndim == 0) and # exclude sets if allow_sets is False not (allow_sets is False and isinstance(obj, abc.Set)) diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index ebba41dd0ffbf..0736fa7c4eccd 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -4,7 +4,7 @@ from cython import Py_ssize_t from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t, float32_t, float64_t, ndarray) import numpy -from pandas._libs.lib import is_list_like +from pandas._libs.lib cimport c_is_list_like ctypedef fused reshape_t: @@ -121,7 +121,7 @@ def explode(ndarray[object] values): counts = numpy.zeros(n, dtype='int64') for i in range(n): v = values[i] - if is_list_like(v): + if c_is_list_like(v, False): if len(v): counts[i] += len(v) else: @@ -135,7 +135,7 @@ def explode(ndarray[object] values): for i in range(n): v = values[i] - if is_list_like(v): + if c_is_list_like(v, False): if len(v): for j in range(len(v)): result[count] = v[j] diff --git a/pandas/core/series.py b/pandas/core/series.py index 522c6e1d14f66..1ddf6e5cd3e4c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -27,6 +27,7 @@ is_datetime64_dtype, is_datetimelike, is_dict_like, + is_object_dtype, is_extension_array_dtype, is_extension_type, is_hashable, @@ -3635,7 +3636,7 @@ def reorder_levels(self, order): result.index = result.index.reorder_levels(order) return result - def explode(self) -> 'Series': + def 
explode(self) -> "Series": """ Create new Series expanding a list-like column. @@ -3648,7 +3649,7 @@ def explode(self) -> 'Series': See Also -------- Series.str.split: Split string values on specified separator. - Series.unstakc: Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. + Series.unstack: Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. Examples -------- @@ -3680,7 +3681,7 @@ def explode(self) -> 'Series': Scalars will be returned unchanged. Empty list-likes will result in a np.nan for that row. """ - if not len(self): + if not len(self) or not is_object_dtype(self): return self.copy() values, counts = reshape.explode(np.asarray(self.array)) diff --git a/pandas/tests/series/test_explode.py b/pandas/tests/series/test_explode.py index e4e54034a26ec..10bf488a13d53 100644 --- a/pandas/tests/series/test_explode.py +++ b/pandas/tests/series/test_explode.py @@ -1,3 +1,4 @@ +import pytest import numpy as np import pandas as pd @@ -68,6 +69,14 @@ def test_invert_array(): tm.assert_series_equal(result, df["a"].rename()) +@pytest.mark.parametrize( + "s", [pd.Series([1, 2, 3]), pd.Series(pd.date_range("2019", periods=3, tz="UTC"))] +) +def non_object_dtype(s): + result = s.explode() + tm.assert_series_equal(result, s) + + def test_typical_usecase(): df = pd.DataFrame([{"var1": "a,b,c", "var2": 1}, {"var1": "d,e,f", "var2": 2}]) From 84e19969f5260b80684bccd5eb2678ef95c13348 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Jul 2019 17:26:13 -0400 Subject: [PATCH 14/34] use cimports --- pandas/_libs/reshape.pyx | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 0736fa7c4eccd..f229de002ce5c 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -3,9 +3,10 @@ from cython import Py_ssize_t from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t, float32_t, float64_t, ndarray) -import numpy 
+cimport numpy as cnp +import numpy as np from pandas._libs.lib cimport c_is_list_like - +cnp.import_array() ctypedef fused reshape_t: uint8_t @@ -118,7 +119,7 @@ def explode(ndarray[object] values): # find the resulting len n = len(values) - counts = numpy.zeros(n, dtype='int64') + counts = np.zeros(n, dtype='int64') for i in range(n): v = values[i] if c_is_list_like(v, False): @@ -130,7 +131,7 @@ def explode(ndarray[object] values): else: counts[i] += 1 - result = numpy.empty(counts.sum(), dtype='object') + result = np.empty(counts.sum(), dtype='object') count = 0 for i in range(n): v = values[i] @@ -142,7 +143,7 @@ def explode(ndarray[object] values): count += 1 else: # empty list-like, use a nan marker - result[count] = numpy.nan + result[count] = np.nan count += 1 else: # replace with the existing scalar From 040ba2e5bf38c8df9ed19da51b1d95b571c229e8 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Jul 2019 17:28:36 -0400 Subject: [PATCH 15/34] doc-string --- pandas/core/series.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 1ddf6e5cd3e4c..d0755cf497758 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3651,6 +3651,13 @@ def explode(self) -> "Series": Series.str.split: Split string values on specified separator. Series.unstack: Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. + Notes + ----- + This routine will explode list-likes including lists, tuples, Series, and np.ndarray. + The result dtype of the returned Series will always be object. + Scalars will be returned unchanged. + Empty list-likes will result in a np.nan for that row. + Examples -------- In [1]: s = pd.Series([[1, 2, 3], 'foo', [], [3, 4]]) @@ -3673,13 +3680,6 @@ def explode(self) -> "Series": 3 3 3 4 dtype: object - - Notes - ----- - This routine will explode list-likes including lists, tuples, Series, and np.ndarray. 
- The result dtype of the returned Series will always be object. - Scalars will be returned unchanged. - Empty list-likes will result in a np.nan for that row. """ if not len(self) or not is_object_dtype(self): return self.copy() From 93b1fd6f2ff8b09927dfe8c4c39f48b12ea8ad89 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Jul 2019 18:08:33 -0400 Subject: [PATCH 16/34] docs & lint --- doc/source/user_guide/reshaping.rst | 15 +++++++-------- doc/source/whatsnew/v0.25.0.rst | 12 ++++++------ pandas/_libs/lib.pyx | 1 - 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index b092c60e93f50..e12acee72ee39 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -818,20 +818,19 @@ Sometimes the value column is list-like. df = pd.DataFrame({'keys': keys, 'values': values}) df -We can 'explode' this transforming each element of a list-like to a row, by using ``~Series.explode``. This will replicate the index values: +We can 'explode' this transforming each element of a list-like to a row, by using :meth:`~Series.explode`. This will replicate the index values: .. ipython:: python df['values'].explode() -You can join this with the original to get an expanded ``DataFrame``. +You can easily join this with the original to get an expanded ``DataFrame``. .. ipython:: python df[['keys']].join(df['values'].explode()) - -This routine will replace empty lists with ``np.nan`` and preserve scalar entries. The dtype of the resulting ``Series`` is always ``object``. +:meth:`Series.explode` will replace empty lists with ``np.nan`` and preserve scalar entries. The dtype of the resulting ``Series`` is always ``object``. .. ipython:: python @@ -839,15 +838,15 @@ This routine will replace empty lists with ``np.nan`` and preserve scalar entrie s s.explode() -Here is a typical usecase. You have comma separated string in a column. +Here is a typical usecase. 
You have comma separated strings in a column and want to expand this. .. ipython:: python - df = DataFrame([{'var1': 'a,b,c', 'var2': 1}, - {'var1': 'd,e,f', 'var2': 2}]) + df = pd.DataFrame([{'var1': 'a,b,c', 'var2': 1}, + {'var1': 'd,e,f', 'var2': 2}]) df -Creating a long form DataFrame is now straightforward using chained operations +Creating a long form DataFrame is now straightforward using explode and chained operations .. ipython:: python diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 6cb3403710acf..6fdd069e8d9ca 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -200,17 +200,17 @@ Here is a typical usecase. You have comma separated string in a column. .. ipython:: python - df = DataFrame([{'var1': 'a,b,c', 'var2': 1}, - {'var1': 'd,e,f', 'var2': 2}]) + df = pd.DataFrame([{'var1': 'a,b,c', 'var2': 1}, + {'var1': 'd,e,f', 'var2': 2}]) df -Creating a long form DataFrame is now straightforward using chained operations +Creating a long form ``DataFrame`` is now straightforward using chained operations .. ipython:: python - exploded = df.var1.str.split(',').explode() - exploded - df[['var2']].join(exploded) + exploded = df.var1.str.split(',').explode() + exploded + df[['var2']].join(exploded) .. 
_whatsnew_0250.enhancements.other: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0f9fa7ef85c1b..e6d3e839f1019 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -929,7 +929,6 @@ def is_list_like(obj: object, allow_sets: bool = True): return c_is_list_like(obj, allow_sets) - cdef inline bint c_is_list_like(object obj, bint allow_sets): return ( isinstance(obj, abc.Iterable) From fdafb0cd692cc0fb64725aaf847323b08f7696e6 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Jul 2019 19:14:32 -0400 Subject: [PATCH 17/34] isort --- pandas/core/series.py | 4 ++-- pandas/tests/series/test_explode.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index d0755cf497758..e0fcbd422a23e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -12,7 +12,7 @@ from pandas._config import get_option -from pandas._libs import iNaT, index as libindex, lib, tslibs, reshape +from pandas._libs import iNaT, index as libindex, lib, reshape, tslibs from pandas.compat import PY36 from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, deprecate @@ -27,13 +27,13 @@ is_datetime64_dtype, is_datetimelike, is_dict_like, - is_object_dtype, is_extension_array_dtype, is_extension_type, is_hashable, is_integer, is_iterator, is_list_like, + is_object_dtype, is_scalar, is_string_like, is_timedelta64_dtype, diff --git a/pandas/tests/series/test_explode.py b/pandas/tests/series/test_explode.py index 10bf488a13d53..0c97026ccb4d5 100644 --- a/pandas/tests/series/test_explode.py +++ b/pandas/tests/series/test_explode.py @@ -1,5 +1,5 @@ -import pytest import numpy as np +import pytest import pandas as pd from pandas.util import testing as tm From b47e68d9115005bc64a1fc3f275594d0418534e5 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 7 Jul 2019 10:20:39 -0400 Subject: [PATCH 18/34] clean object check & update doc-strings --- 
asv_bench/benchmarks/io/parsers.py | 4 ++-- asv_bench/benchmarks/reshape.py | 7 ++----- asv_bench/benchmarks/series_methods.py | 6 +++--- asv_bench/benchmarks/timeseries.py | 2 +- ci/code_checks.sh | 2 +- pandas/core/frame.py | 1 + pandas/core/series.py | 8 +++++--- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index 40256e043a008..c5e099bd44eac 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -10,7 +10,7 @@ pass -class DoesStringLookLikeDatetime(object): +class DoesStringLookLikeDatetime: params = (["2Q2005", "0.0", "10000"],) param_names = ["value"] @@ -23,7 +23,7 @@ def time_check_datetimes(self, value): _does_string_look_like_datetime(obj) -class ConcatDateCols(object): +class ConcatDateCols: params = ([1234567890, "AAAA"], [1, 2]) param_names = ["value", "dim"] diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 32ca8e6b68d0b..14505ad49dafa 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -240,7 +240,7 @@ def time_qcut_datetime(self, bins): pd.qcut(self.datetime_series, bins) -class Explode(object): +class Explode param_names = ["n_rows", "max_list_length"] params = [[100, 1000, 10000], [3, 5, 10]] @@ -250,10 +250,7 @@ def setup(self, n_rows, max_list_length): self.series = pd.Series(data) def time_explode(self, n_rows, max_list_length): - try: - self.series.explode() - except AttributeError: - pass + self.series.explode() from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index e2835c5156f55..6038a2ab4bd9f 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -219,7 +219,7 @@ def time_series_datetimeindex_repr(self): getattr(self.s, "a", None) -class All(object): +class All: params = [[10 ** 3, 10 ** 6], ["fast", "slow"]] 
param_names = ["N", "case"] @@ -232,7 +232,7 @@ def time_all(self, N, case): self.s.all() -class Any(object): +class Any: params = [[10 ** 3, 10 ** 6], ["fast", "slow"]] param_names = ["N", "case"] @@ -245,7 +245,7 @@ def time_any(self, N, case): self.s.any() -class NanOps(object): +class NanOps: params = [ [ diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index a74527df25f9b..1020b773f8acb 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -293,7 +293,7 @@ def time_format_YYYYMMDD(self): to_datetime(self.stringsD, format="%Y%m%d") -class ToDatetimeCacheSmallCount(object): +class ToDatetimeCacheSmallCount: params = ([True, False], [50, 500, 5000, 100000]) param_names = ["cache", "count"] diff --git a/ci/code_checks.sh b/ci/code_checks.sh index fec2a88292280..96a8440d85694 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -156,7 +156,7 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Check for python2 new-style classes and for empty parentheses' ; echo $MSG - invgrep -R --include="*.py" --include="*.pyx" -E "class\s\S*\((object)?\):" pandas scripts + invgrep -R --include="*.py" --include="*.pyx" -E "class\s\S*\((object)?\):" pandas asv_bench/benchmarks scripts RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Check for backticks incorrectly rendering because of missing spaces' ; echo $MSG diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f45a13249b16c..dddeac718607c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6339,6 +6339,7 @@ def unstack(self, level=-1, fill_value=None): %(other)s pivot_table DataFrame.pivot + Series.explode Examples -------- diff --git a/pandas/core/series.py b/pandas/core/series.py index e0fcbd422a23e..a4acbdfaa28a8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3644,12 +3644,14 @@ def explode(self) -> "Series": Returns ------- - exploded: Series + Series + exploded 
lists to rows; index will be duplicated for these rows See Also -------- - Series.str.split: Split string values on specified separator. - Series.unstack: Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. + Series.str.split : Split string values on specified separator. + Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. + DataFrame.melt : Unpivot a DataFrame from wide format to long format Notes ----- From 985673214bd031220d54350c1daf3ecb8c3c2a9c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 7 Jul 2019 21:03:21 -0400 Subject: [PATCH 19/34] lint --- asv_bench/benchmarks/reshape.py | 2 +- pandas/core/series.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 14505ad49dafa..1aed756b841a5 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -240,7 +240,7 @@ def time_qcut_datetime(self, bins): pd.qcut(self.datetime_series, bins) -class Explode +class Explode: param_names = ["n_rows", "max_list_length"] params = [[100, 1000, 10000], [3, 5, 10]] diff --git a/pandas/core/series.py b/pandas/core/series.py index a4acbdfaa28a8..1308a924a8b57 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3645,7 +3645,7 @@ def explode(self) -> "Series": Returns ------- Series - exploded lists to rows; index will be duplicated for these rows + Exploded lists to rows; index will be duplicated for these rows. 
See Also -------- From 0fa9c0d7ebb83005c1aff1935ee76fd049d53486 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 9 Jul 2019 15:08:49 -0400 Subject: [PATCH 20/34] test for nested --- pandas/tests/series/test_explode.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/series/test_explode.py b/pandas/tests/series/test_explode.py index 0c97026ccb4d5..c3a0ad49aa256 100644 --- a/pandas/tests/series/test_explode.py +++ b/pandas/tests/series/test_explode.py @@ -38,6 +38,14 @@ def test_empty(): tm.assert_series_equal(result, expected) +def test_nested_lists(): + s = pd.Series([[[1, 2, 3]], [1, 2], 1]) + result = s.explode() + expected = pd.Series([[1, 2, 3], 1, 2, 1], + index=[0, 1, 1, 2]) + tm.assert_series_equal(result, expected) + + def test_multi_index(): s = pd.Series( [[0, 1, 2], np.nan, [], (3, 4)], From c42662756eecb30c869744873add7b617d4d917c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 9 Jul 2019 15:15:54 -0400 Subject: [PATCH 21/34] better test --- pandas/tests/series/test_explode.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/test_explode.py b/pandas/tests/series/test_explode.py index c3a0ad49aa256..661d4cb320cca 100644 --- a/pandas/tests/series/test_explode.py +++ b/pandas/tests/series/test_explode.py @@ -6,11 +6,13 @@ def test_basic(): - s = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], name="foo") + s = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], + index=list('abcd'), + name="foo") result = s.explode() expected = pd.Series( [0, 1, 2, np.nan, np.nan, 3, 4], - index=[0, 0, 0, 1, 2, 3, 3], + index=list('aaabcdd'), dtype=object, name="foo", ) From 9d20b616ed3b6b8025e3a578016163a8e703ccba Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 9 Jul 2019 16:34:23 -0400 Subject: [PATCH 22/34] try adding frame --- doc/source/reference/frame.rst | 1 + pandas/core/frame.py | 74 +++++++++++++++++++- pandas/core/series.py | 1 + pandas/tests/frame/test_explode.py | 103 
++++++++++++++++++++++++++++ pandas/tests/series/test_explode.py | 12 +--- 5 files changed, 181 insertions(+), 10 deletions(-) create mode 100644 pandas/tests/frame/test_explode.py diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index c0b58fd2d99f5..b1c6172fb1261 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -239,6 +239,7 @@ Reshaping, sorting, transposing DataFrame.unstack DataFrame.swapaxes DataFrame.melt + DataFrame.explode DataFrame.squeeze DataFrame.to_xarray DataFrame.T diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dddeac718607c..9eeb40dd2aee5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -15,7 +15,7 @@ import itertools import sys from textwrap import dedent -from typing import FrozenSet, List, Optional, Set, Type, Union +from typing import FrozenSet, Iterable, List, Optional, Set, Type, Union import warnings import numpy as np @@ -6237,6 +6237,78 @@ def stack(self, level=-1, dropna=True): else: return stack(self, level, dropna=dropna) + def explode(self, subset: Iterable) -> "DataFrame": + """ + Create new DataFrame expanding a list-like columns. + + .. versionadded:: 0.25.0 + + Parameters + ---------- + subset : list-like + + Returns + ------- + DataFrame + Exploded lists to rows of the subset columns; index will be duplicated for these rows. + + Raises + ------ + ValueError : + if columns & subset are not unique. + ValueError : + subset must be list-like + + See Also + -------- + Series.str.split : Split string values on specified separator. + Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. + DataFrame.melt : Unpivot a DataFrame from wide format to long format + Series.explode : Explode a DataFrame from list-like columns to long format. + + Notes + ----- + This routine will explode list-likes including lists, tuples, Series, and np.ndarray. + The result dtype of the subset rows will be object. 
+ Scalars will be returned unchanged. + Empty list-likes will result in a np.nan for that row. + + Examples + -------- + In [1]: df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1}) + + In [3]: df.explode() + Out[3]: + 0 1 + 0 2 + 0 3 + 1 foo + 2 NaN + 3 3 + 3 4 + dtype: object + """ + + if not is_list_like(subset): + raise ValueError("subset must be a list-like") + if not Index(subset).is_unique: + raise ValueError("subset must be unique") + if not self.columns.is_unique: + raise ValueError("columns must be unique") + + results = [self[s].explode() for s in subset] + result = self.drop(subset, axis=1) + + # recursive merge + from pandas.core.reshape.merge import merge + + def merger(left, right): + return merge(left, right, left_index=True, right_index=True) + + return functools.reduce(merger, [result] + results).reindex( + columns=self.columns, copy=False + ) + def unstack(self, level=-1, fill_value=None): """ Pivot a level of the (necessarily hierarchical) index labels, returning diff --git a/pandas/core/series.py b/pandas/core/series.py index 1308a924a8b57..3aa45d92d9533 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3652,6 +3652,7 @@ def explode(self) -> "Series": Series.str.split : Split string values on specified separator. Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. DataFrame.melt : Unpivot a DataFrame from wide format to long format + DataFrame.explode : Explode a DataFrame from list-like columns to long format. 
Notes ----- diff --git a/pandas/tests/frame/test_explode.py b/pandas/tests/frame/test_explode.py new file mode 100644 index 0000000000000..56438e7cc290d --- /dev/null +++ b/pandas/tests/frame/test_explode.py @@ -0,0 +1,103 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.util import testing as tm + + +def test_error(): + df = pd.DataFrame( + {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} + ) + df.columns = list("AA") + with pytest.raises(ValueError): + df.explode(subset=list("AA")) + + +def test_basic(): + df = pd.DataFrame( + {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} + ) + result = df.explode(subset=["A"]) + expected = pd.DataFrame( + { + "A": pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object + ), + "B": 1, + } + ) + tm.assert_frame_equal(result, expected) + + +def test_all_columns(): + df = pd.DataFrame( + {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} + ) + result = df.explode(subset=["A", "B"]) + expected = pd.DataFrame( + { + "A": pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object + ), + "B": 1, + } + ) + tm.assert_frame_equal(result, expected) + + +def test_multiple_columns(): + df = pd.DataFrame( + { + "A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), + "B": pd.Series([[0, 1, 2], np.nan, np.nan, 3], index=list("abcd")), + } + ) + result = df.explode(subset=["A", "B"]) + expected = pd.DataFrame( + { + "A": [0, 0, 0, 1, 1, 1, 2, 2, 2, np.nan, np.nan, 3, 4], + "B": [0, 1, 2, 0, 1, 2, 0, 1, 2, np.nan, np.nan, 3, 3], + }, + dtype=object, + index=list("aaaaaaaaabcdd"), + ) + tm.assert_frame_equal(result, expected) + + +def test_usecase(): + # explode a single column + # gh-10511 + df = pd.DataFrame( + [[11, range(5), 10], [22, range(3), 20]], columns=["A", "B", "C"] + ).set_index("C") + result = df.explode(["B"]) + + expected = pd.DataFrame( + { + "A": [11, 11, 
11, 11, 11, 22, 22, 22], + "B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=object), + "C": [10, 10, 10, 10, 10, 20, 20, 20], + }, + columns=list("ABC"), + ).set_index("C") + + tm.assert_frame_equal(result, expected) + + # gh-8517 + df = pd.DataFrame( + [["2014-01-01", "Alice", "A B"], ["2014-01-02", "Bob", "C D"]], + columns=["dt", "name", "text"], + ) + result = df.assign(text=df.text.str.split(" ")).explode(["text"]) + expected = pd.DataFrame( + [ + ["2014-01-01", "Alice", "A"], + ["2014-01-01", "Alice", "B"], + ["2014-01-02", "Bob", "C"], + ["2014-01-02", "Bob", "D"], + ], + columns=["dt", "name", "text"], + index=[0, 0, 1, 1], + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_explode.py b/pandas/tests/series/test_explode.py index 661d4cb320cca..3f57f71561232 100644 --- a/pandas/tests/series/test_explode.py +++ b/pandas/tests/series/test_explode.py @@ -6,15 +6,10 @@ def test_basic(): - s = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], - index=list('abcd'), - name="foo") + s = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd"), name="foo") result = s.explode() expected = pd.Series( - [0, 1, 2, np.nan, np.nan, 3, 4], - index=list('aaabcdd'), - dtype=object, - name="foo", + [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object, name="foo" ) tm.assert_series_equal(result, expected) @@ -43,8 +38,7 @@ def test_empty(): def test_nested_lists(): s = pd.Series([[[1, 2, 3]], [1, 2], 1]) result = s.explode() - expected = pd.Series([[1, 2, 3], 1, 2, 1], - index=[0, 1, 1, 2]) + expected = pd.Series([[1, 2, 3], 1, 2, 1], index=[0, 1, 1, 2]) tm.assert_series_equal(result, expected) From 5512e290db18aa9384b716f2fca19a43224d0083 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 10 Jul 2019 19:50:16 -0500 Subject: [PATCH 23/34] test for nested EA --- pandas/tests/series/test_explode.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/series/test_explode.py 
b/pandas/tests/series/test_explode.py index 3f57f71561232..e73f23fd9bb71 100644 --- a/pandas/tests/series/test_explode.py +++ b/pandas/tests/series/test_explode.py @@ -91,3 +91,18 @@ def test_typical_usecase(): {"var2": [1, 1, 1, 2, 2, 2], "var1": list("abcdef")}, index=[0, 0, 0, 1, 1, 1] ) tm.assert_frame_equal(result, expected, check_like=True) + + +def test_nested_EA(): + # a nested EA array + s = pd.Series( + [ + pd.date_range("20170101", periods=3, tz="UTC"), + pd.date_range("20170104", periods=3, tz="UTC"), + ] + ) + result = s.explode() + expected = pd.Series( + pd.date_range("20170101", periods=6, tz="UTC"), index=[0, 0, 0, 1, 1, 1] + ) + tm.assert_series_equal(result, expected) From dc17ef6e69d8f0ff5485257d2b83942f455af113 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 10 Jul 2019 20:19:56 -0500 Subject: [PATCH 24/34] lint --- pandas/core/frame.py | 40 ++++++++++++++++++++++++---------------- pandas/core/series.py | 18 +++++++++--------- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9eeb40dd2aee5..ff2eea2b362d7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6250,7 +6250,8 @@ def explode(self, subset: Iterable) -> "DataFrame": Returns ------- DataFrame - Exploded lists to rows of the subset columns; index will be duplicated for these rows. + Exploded lists to rows of the subset columns; + index will be duplicated for these rows. Raises ------ @@ -6261,32 +6262,39 @@ def explode(self, subset: Iterable) -> "DataFrame": See Also -------- - Series.str.split : Split string values on specified separator. - Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. + DataFrame.unstack : Pivot a level of the (necessarily hierarchical) + index labels DataFrame.melt : Unpivot a DataFrame from wide format to long format Series.explode : Explode a DataFrame from list-like columns to long format. 
Notes ----- - This routine will explode list-likes including lists, tuples, Series, and np.ndarray. + This routine will explode list-likes including lists, tuples, + Series, and np.ndarray. The result dtype of the subset rows will be object. Scalars will be returned unchanged. Empty list-likes will result in a np.nan for that row. Examples -------- - In [1]: df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1}) - - In [3]: df.explode() - Out[3]: - 0 1 - 0 2 - 0 3 - 1 foo - 2 NaN - 3 3 - 3 4 - dtype: object + >>> df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1}) + >>> df + A B + 0 [1, 2, 3] 1 + 1 foo 1 + 2 [] 1 + 3 [3, 4] 1 + + >>> df.explode(['A']) + A B + 0 1 1 + 0 2 1 + 0 3 1 + 1 foo 1 + 2 NaN 1 + 3 3 1 + 3 4 1 + """ if not is_list_like(subset): diff --git a/pandas/core/series.py b/pandas/core/series.py index 3aa45d92d9533..0c48dd23b6902 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3650,31 +3650,31 @@ def explode(self) -> "Series": See Also -------- Series.str.split : Split string values on specified separator. - Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. + Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex + to produce DataFrame. DataFrame.melt : Unpivot a DataFrame from wide format to long format - DataFrame.explode : Explode a DataFrame from list-like columns to long format. + DataFrame.explode : Explode a DataFrame from list-like + columns to long format. Notes ----- - This routine will explode list-likes including lists, tuples, Series, and np.ndarray. + This routine will explode list-likes including lists, tuples, + Series, and np.ndarray. The result dtype of the returned Series will always be object. Scalars will be returned unchanged. Empty list-likes will result in a np.nan for that row. 
Examples -------- - In [1]: s = pd.Series([[1, 2, 3], 'foo', [], [3, 4]]) - - In [2]: s - Out[2]: + >>> s = pd.Series([[1, 2, 3], 'foo', [], [3, 4]]) + >>> s 0 [1, 2, 3] 1 foo 2 [] 3 [3, 4] dtype: object - In [3]: s.explode() - Out[3]: + >>> s.explode() 0 1 0 2 0 3 From 94f319e72a7b49db5f60515c7760e7fe3a39d4ad Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 11 Jul 2019 09:18:06 -0500 Subject: [PATCH 25/34] remove multi subset support --- pandas/core/frame.py | 34 ++++++++-------------- pandas/tests/frame/test_explode.py | 46 +++++------------------------- 2 files changed, 19 insertions(+), 61 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ff2eea2b362d7..a3a42dc604d76 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6237,15 +6237,15 @@ def stack(self, level=-1, dropna=True): else: return stack(self, level, dropna=dropna) - def explode(self, subset: Iterable) -> "DataFrame": + def explode(self, column: str) -> "DataFrame": """ - Create new DataFrame expanding a list-like columns. + Create new DataFrame expanding a specified list-like column. .. versionadded:: 0.25.0 Parameters ---------- - subset : list-like + column : str Returns ------- @@ -6256,9 +6256,7 @@ def explode(self, subset: Iterable) -> "DataFrame": Raises ------ ValueError : - if columns & subset are not unique. - ValueError : - subset must be list-like + if columns of the frame are not unique. 
See Also -------- @@ -6285,7 +6283,7 @@ def explode(self, subset: Iterable) -> "DataFrame": 2 [] 1 3 [3, 4] 1 - >>> df.explode(['A']) + >>> df.explode('A') A B 0 1 1 0 2 1 @@ -6297,24 +6295,16 @@ def explode(self, subset: Iterable) -> "DataFrame": """ - if not is_list_like(subset): - raise ValueError("subset must be a list-like") - if not Index(subset).is_unique: - raise ValueError("subset must be unique") + if not is_scalar(column): + raise ValueError("column must be a scalar") if not self.columns.is_unique: raise ValueError("columns must be unique") - results = [self[s].explode() for s in subset] - result = self.drop(subset, axis=1) - - # recursive merge - from pandas.core.reshape.merge import merge - - def merger(left, right): - return merge(left, right, left_index=True, right_index=True) - - return functools.reduce(merger, [result] + results).reindex( - columns=self.columns, copy=False + result = self[column].explode() + return ( + self.drop([column], axis=1) + .join(result) + .reindex(columns=self.columns, copy=False) ) def unstack(self, level=-1, fill_value=None): diff --git a/pandas/tests/frame/test_explode.py b/pandas/tests/frame/test_explode.py index 56438e7cc290d..debb94ded093d 100644 --- a/pandas/tests/frame/test_explode.py +++ b/pandas/tests/frame/test_explode.py @@ -9,16 +9,19 @@ def test_error(): df = pd.DataFrame( {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} ) + with pytest.raises(ValueError): + df.explode(list("AA")) + df.columns = list("AA") with pytest.raises(ValueError): - df.explode(subset=list("AA")) + df.explode("A") def test_basic(): df = pd.DataFrame( {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} ) - result = df.explode(subset=["A"]) + result = df.explode("A") expected = pd.DataFrame( { "A": pd.Series( @@ -30,48 +33,13 @@ def test_basic(): tm.assert_frame_equal(result, expected) -def test_all_columns(): - df = pd.DataFrame( - {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], 
index=list("abcd")), "B": 1} - ) - result = df.explode(subset=["A", "B"]) - expected = pd.DataFrame( - { - "A": pd.Series( - [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object - ), - "B": 1, - } - ) - tm.assert_frame_equal(result, expected) - - -def test_multiple_columns(): - df = pd.DataFrame( - { - "A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), - "B": pd.Series([[0, 1, 2], np.nan, np.nan, 3], index=list("abcd")), - } - ) - result = df.explode(subset=["A", "B"]) - expected = pd.DataFrame( - { - "A": [0, 0, 0, 1, 1, 1, 2, 2, 2, np.nan, np.nan, 3, 4], - "B": [0, 1, 2, 0, 1, 2, 0, 1, 2, np.nan, np.nan, 3, 3], - }, - dtype=object, - index=list("aaaaaaaaabcdd"), - ) - tm.assert_frame_equal(result, expected) - - def test_usecase(): # explode a single column # gh-10511 df = pd.DataFrame( [[11, range(5), 10], [22, range(3), 20]], columns=["A", "B", "C"] ).set_index("C") - result = df.explode(["B"]) + result = df.explode("B") expected = pd.DataFrame( { @@ -89,7 +57,7 @@ def test_usecase(): [["2014-01-01", "Alice", "A B"], ["2014-01-02", "Bob", "C D"]], columns=["dt", "name", "text"], ) - result = df.assign(text=df.text.str.split(" ")).explode(["text"]) + result = df.assign(text=df.text.str.split(" ")).explode("text") expected = pd.DataFrame( [ ["2014-01-01", "Alice", "A"], From b9d8a4217f083117f7985908547a0095dab6b933 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 11 Jul 2019 09:21:36 -0500 Subject: [PATCH 26/34] update docs --- doc/source/user_guide/reshaping.rst | 8 +++----- doc/source/whatsnew/v0.25.0.rst | 12 ++---------- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index e12acee72ee39..03683f1394e21 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -824,11 +824,11 @@ We can 'explode' this transforming each element of a list-like to a row, by usin df['values'].explode() -You can easily 
join this with the original to get an expanded ``DataFrame``. +You can also explode the column in the ``DataFrame``. .. ipython:: python - df[['keys']].join(df['values'].explode()) + df.explode('values') :meth:`Series.explode` will replace empty lists with ``np.nan`` and preserve scalar entries. The dtype of the resulting ``Series`` is always ``object``. @@ -850,6 +850,4 @@ Creating a long form DataFrame is now straightforward using explode and chained .. ipython:: python - exploded = df.var1.str.split(',').explode() - exploded - df[['var2']].join(exploded) + df.assign(var1=df.var1.str.split(',')).explode('var1') diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 6fdd069e8d9ca..a08159e6c3199 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -187,15 +187,9 @@ The repr now looks like this: Series.explode to split list-like values to rows ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:class:`Series` has gained the :meth:`Series.explode` method to transform list-likes to individual rows. See :ref:`section on Exploding list-like column <reshaping.explode>` in docs for more information (:issue:`16538`, :issue:`10511`) +:class:`Series` and :class:`DataFrame` have gained the :meth:`DataFrame.explode` methods to transform list-likes to individual rows. See :ref:`section on Exploding list-like column <reshaping.explode>` in docs for more information (:issue:`16538`, :issue:`10511`) -.. ipython:: python - - s = pd.Series([[1, 2, 3], np.nan, [], [3, 4]]) - s - s.explode() - Here is a typical usecase. You have comma separated string in a column. .. ipython:: python @@ -208,9 +202,7 @@ Creating a long form ``DataFrame`` is now straightforward using chained operatio .. ipython:: python - exploded = df.var1.str.split(',').explode() - exploded - df[['var2']].join(exploded) + df.assign(var1=df.var1.str.split(',')).explode('var1') ..
_whatsnew_0250.enhancements.other: From df0fccfc3ed5cf01cdcea5cb4c64fd5cf96c017e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 11 Jul 2019 10:03:26 -0500 Subject: [PATCH 27/34] doc-string --- pandas/core/frame.py | 2 +- pandas/core/series.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a3a42dc604d76..39e01e8d6a8fa 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6283,7 +6283,7 @@ def explode(self, column: str) -> "DataFrame": 2 [] 1 3 [3, 4] 1 - >>> df.explode('A') + >>> df.explode('A') A B 0 1 1 0 2 1 diff --git a/pandas/core/series.py b/pandas/core/series.py index 0c48dd23b6902..0a95848598178 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3683,6 +3683,7 @@ def explode(self) -> "Series": 3 3 3 4 dtype: object + """ if not len(self) or not is_object_dtype(self): return self.copy() From 720e309d727a9ba59ca9dc590c734612a7e87288 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 11 Jul 2019 10:43:19 -0500 Subject: [PATCH 28/34] add test for MI --- pandas/tests/frame/test_explode.py | 30 +++++++++++++++++++++++++++++ pandas/tests/series/test_explode.py | 2 +- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_explode.py b/pandas/tests/frame/test_explode.py index debb94ded093d..b2224a772a80c 100644 --- a/pandas/tests/frame/test_explode.py +++ b/pandas/tests/frame/test_explode.py @@ -33,6 +33,36 @@ def test_basic(): tm.assert_frame_equal(result, expected) +def test_multi_index(): + df = pd.DataFrame( + {"A": np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), "B": 1}, + index=pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]), + ) + + result = df.explode("A") + expected = pd.DataFrame( + { + "A": pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], + index=pd.MultiIndex.from_tuples( + [ + ("a", 1), + ("a", 1), + ("a", 1), + ("a", 2), + ("b", 1), + ("b", 2), + ("b", 2), + ] + ), + dtype=object, + ), + "B": 
1, + } + ) + tm.assert_frame_equal(result, expected) + + def test_usecase(): # explode a single column # gh-10511 diff --git a/pandas/tests/series/test_explode.py b/pandas/tests/series/test_explode.py index e73f23fd9bb71..15fe14cf6ba7b 100644 --- a/pandas/tests/series/test_explode.py +++ b/pandas/tests/series/test_explode.py @@ -90,7 +90,7 @@ def test_typical_usecase(): expected = pd.DataFrame( {"var2": [1, 1, 1, 2, 2, 2], "var1": list("abcdef")}, index=[0, 0, 0, 1, 1, 1] ) - tm.assert_frame_equal(result, expected, check_like=True) + tm.assert_frame_equal(result, expected) def test_nested_EA(): From 26b91c4879793ce0c57d1108cd1da44d914a7d50 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 11 Jul 2019 10:55:44 -0500 Subject: [PATCH 29/34] lint and docs --- doc/source/user_guide/reshaping.rst | 6 +++--- pandas/_libs/lib.pyx | 9 +++------ pandas/core/frame.py | 13 ++++++------- pandas/core/series.py | 11 +++++------ 4 files changed, 17 insertions(+), 22 deletions(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 03683f1394e21..0470a6c0c2f42 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -804,12 +804,12 @@ Note to subdivide over multiple columns we can pass in a list to the .. _reshaping.explode: -Exploding a List-like Column +Exploding a list-like column ---------------------------- .. versionadded:: 0.25.0 -Sometimes the value column is list-like. +Sometimes the values in a column are list-like. .. ipython:: python @@ -818,7 +818,7 @@ Sometimes the value column is list-like. df = pd.DataFrame({'keys': keys, 'values': values}) df -We can 'explode' this transforming each element of a list-like to a row, by using :meth:`~Series.explode`. This will replicate the index values: +We can 'explode' the ``values`` column, transforming each list-like to a separate row, by using :meth:`~Series.explode`. This will replicate the index values from the original row: .. 
ipython:: python diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e6d3e839f1019..27ee685acfde7 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -932,15 +932,12 @@ def is_list_like(obj: object, allow_sets: bool = True): cdef inline bint c_is_list_like(object obj, bint allow_sets): return ( isinstance(obj, abc.Iterable) - and # we do not count strings/unicode/bytes as list-like - not isinstance(obj, (str, bytes)) - and + and not isinstance(obj, (str, bytes)) # exclude zero-dimensional numpy arrays, effectively scalars - not (util.is_array(obj) and obj.ndim == 0) - and + and not (util.is_array(obj) and obj.ndim == 0) # exclude sets if allow_sets is False - not (allow_sets is False and isinstance(obj, abc.Set)) + and not (allow_sets is False and isinstance(obj, abc.Set)) ) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 39e01e8d6a8fa..8f9dbbd9a6ae0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -15,7 +15,7 @@ import itertools import sys from textwrap import dedent -from typing import FrozenSet, Iterable, List, Optional, Set, Type, Union +from typing import FrozenSet, List, Optional, Set, Type, Union import warnings import numpy as np @@ -6239,7 +6239,8 @@ def stack(self, level=-1, dropna=True): def explode(self, column: str) -> "DataFrame": """ - Create new DataFrame expanding a specified list-like column. + Transforms each element of a list-like to a row, replicating the + index values. .. versionadded:: 0.25.0 @@ -6268,10 +6269,9 @@ def explode(self, column: str) -> "DataFrame": Notes ----- This routine will explode list-likes including lists, tuples, - Series, and np.ndarray. - The result dtype of the subset rows will be object. - Scalars will be returned unchanged. - Empty list-likes will result in a np.nan for that row. + Series, and np.ndarray. The result dtype of the subset rows will + be object. Scalars will be returned unchanged. Empty list-likes will + result in a np.nan for that row. 
Examples -------- @@ -6292,7 +6292,6 @@ def explode(self, column: str) -> "DataFrame": 2 NaN 1 3 3 1 3 4 1 - """ if not is_scalar(column): diff --git a/pandas/core/series.py b/pandas/core/series.py index 0a95848598178..cc6013f5a8950 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3638,7 +3638,8 @@ def reorder_levels(self, order): def explode(self) -> "Series": """ - Create new Series expanding a list-like column. + Transforms each element of a list-like to a row, replicating the + index values. .. versionadded:: 0.25.0 @@ -3659,10 +3660,9 @@ def explode(self) -> "Series": Notes ----- This routine will explode list-likes including lists, tuples, - Series, and np.ndarray. - The result dtype of the returned Series will always be object. - Scalars will be returned unchanged. - Empty list-likes will result in a np.nan for that row. + Series, and np.ndarray. The result dtype of the subset rows will + be object. Scalars will be returned unchanged. Empty list-likes will + result in a np.nan for that row. 
Examples -------- @@ -3683,7 +3683,6 @@ def explode(self) -> "Series": 3 3 3 4 dtype: object - """ if not len(self) or not is_object_dtype(self): return self.copy() From d51aa471cfc38697b759e0b38255eb139b7195de Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 11 Jul 2019 11:24:47 -0500 Subject: [PATCH 30/34] ordering --- pandas/tests/frame/test_explode.py | 2 +- pandas/tests/series/test_explode.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_explode.py b/pandas/tests/frame/test_explode.py index b2224a772a80c..4a970879942fe 100644 --- a/pandas/tests/frame/test_explode.py +++ b/pandas/tests/frame/test_explode.py @@ -67,7 +67,7 @@ def test_usecase(): # explode a single column # gh-10511 df = pd.DataFrame( - [[11, range(5), 10], [22, range(3), 20]], columns=["A", "B", "C"] + [[11, range(5), 10], [22, range(3), 20]], columns=list("ABC") ).set_index("C") result = df.explode("B") diff --git a/pandas/tests/series/test_explode.py b/pandas/tests/series/test_explode.py index 15fe14cf6ba7b..331546f7dc73d 100644 --- a/pandas/tests/series/test_explode.py +++ b/pandas/tests/series/test_explode.py @@ -83,12 +83,17 @@ def non_object_dtype(s): def test_typical_usecase(): - df = pd.DataFrame([{"var1": "a,b,c", "var2": 1}, {"var1": "d,e,f", "var2": 2}]) + df = pd.DataFrame( + [{"var1": "a,b,c", "var2": 1}, {"var1": "d,e,f", "var2": 2}], + columns=["var1", "var2"], + ) exploded = df.var1.str.split(",").explode() exploded result = df[["var2"]].join(exploded) expected = pd.DataFrame( - {"var2": [1, 1, 1, 2, 2, 2], "var1": list("abcdef")}, index=[0, 0, 0, 1, 1, 1] + {"var2": [1, 1, 1, 2, 2, 2], "var1": list("abcdef")}, + columns=["var2", "var1"], + index=[0, 0, 0, 1, 1, 1], ) tm.assert_frame_equal(result, expected) From f9f82fc07b9c17a5fe137b52682d08a8f536717b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 11 Jul 2019 11:26:24 -0500 Subject: [PATCH 31/34] moar lint --- pandas/core/frame.py | 2 +- pandas/core/series.py | 4 
++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8f9dbbd9a6ae0..8ff94385be491 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6239,7 +6239,7 @@ def stack(self, level=-1, dropna=True): def explode(self, column: str) -> "DataFrame": """ - Transforms each element of a list-like to a row, replicating the + Transform each element of a list-like to a row, replicating the index values. .. versionadded:: 0.25.0 diff --git a/pandas/core/series.py b/pandas/core/series.py index cc6013f5a8950..8082069efce3c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2008,7 +2008,7 @@ def drop_duplicates(self, keep="first", inplace=False): Examples -------- - Generate an Series with duplicated entries. + Generate a Series with duplicated entries. >>> s = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'], ... name='animal') @@ -3638,7 +3638,7 @@ def reorder_levels(self, order): def explode(self) -> "Series": """ - Transforms each element of a list-like to a row, replicating the + Transform each element of a list-like to a row, replicating the index values. .. 
versionadded:: 0.25.0 From 5c4635f159798a43d0a2eca526d55a566b9119d4 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 17 Jul 2019 07:41:33 -0400 Subject: [PATCH 32/34] multi-index column support --- pandas/core/frame.py | 8 ++++---- pandas/tests/frame/test_explode.py | 21 ++++++++++++++++++++- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8ff94385be491..c15f4ad8e1900 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -15,7 +15,7 @@ import itertools import sys from textwrap import dedent -from typing import FrozenSet, List, Optional, Set, Type, Union +from typing import FrozenSet, List, Optional, Set, Tuple, Type, Union import warnings import numpy as np @@ -6237,7 +6237,7 @@ def stack(self, level=-1, dropna=True): else: return stack(self, level, dropna=dropna) - def explode(self, column: str) -> "DataFrame": + def explode(self, column: Union[str, Tuple]) -> "DataFrame": """ Transform each element of a list-like to a row, replicating the index values. 
@@ -6246,7 +6246,7 @@ def explode(self, column: str) -> "DataFrame": Parameters ---------- - column : str + column : str or tuple Returns ------- @@ -6294,7 +6294,7 @@ def explode(self, column: str) -> "DataFrame": 3 4 1 """ - if not is_scalar(column): + if not (is_scalar(column) or isinstance(column, tuple)): raise ValueError("column must be a scalar") if not self.columns.is_unique: raise ValueError("columns must be unique") diff --git a/pandas/tests/frame/test_explode.py b/pandas/tests/frame/test_explode.py index 4a970879942fe..b4330aadbfba3 100644 --- a/pandas/tests/frame/test_explode.py +++ b/pandas/tests/frame/test_explode.py @@ -33,7 +33,7 @@ def test_basic(): tm.assert_frame_equal(result, expected) -def test_multi_index(): +def test_multi_index_rows(): df = pd.DataFrame( {"A": np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), "B": 1}, index=pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]), @@ -63,6 +63,25 @@ def test_multi_index(): tm.assert_frame_equal(result, expected) +def test_multi_index_columns(): + df = pd.DataFrame( + {("A", 1): np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), ("A", 2): 1} + ) + + result = df.explode(("A", 1)) + expected = pd.DataFrame( + { + ("A", 1): pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], + index=pd.Index([0, 0, 0, 1, 2, 3, 3]), + dtype=object, + ), + ("A", 2): 1, + } + ) + tm.assert_frame_equal(result, expected) + + def test_usecase(): # explode a single column # gh-10511 From 7fc21593a079c36860227cfa669795d0a0e0e5cd Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 17 Jul 2019 14:36:11 -0400 Subject: [PATCH 33/34] 32-bit compat --- pandas/core/indexes/multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 670a4666a3440..33521d4e3a355 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2094,7 +2094,7 @@ def repeat(self, repeats, axis=None): return MultiIndex( 
levels=self.levels, codes=[ - level_codes.view(np.ndarray).repeat(repeats) + level_codes.view(np.ndarray).astype(np.intp).repeat(repeats) for level_codes in self.codes ], names=self.names, From 4e7755b72a7be47aa2579ea810c6c1032a5cc81b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 17 Jul 2019 15:23:21 -0400 Subject: [PATCH 34/34] moar 32-bit compat --- pandas/core/indexes/base.py | 1 + pandas/core/indexes/multi.py | 1 + 2 files changed, 2 insertions(+) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e084f99ec5a2c..7bbd30e0c28b1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -963,6 +963,7 @@ def _assert_take_fillable( @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats, axis=None): + repeats = ensure_platform_int(repeats) nv.validate_repeat(tuple(), dict(axis=axis)) return self._shallow_copy(self._values.repeat(repeats)) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 33521d4e3a355..b673c119c0498 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2091,6 +2091,7 @@ def argsort(self, *args, **kwargs): @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) + repeats = ensure_platform_int(repeats) return MultiIndex( levels=self.levels, codes=[