From b83151619379435b3a8236d9de378eb5b263768b Mon Sep 17 00:00:00 2001 From: Piotr Jucha Date: Mon, 5 Sep 2016 00:26:37 -0400 Subject: [PATCH] BUG/TST: Empty input arrays in cartesian_product and MultiIndex (#12258) --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/categorical.py | 22 ++++++++++- pandas/tests/indexes/test_multi.py | 63 ++++++++++++++++++++++++++++++ pandas/tools/tests/test_util.py | 23 +++++++++++ pandas/tools/util.py | 29 +++++++++++++- 5 files changed, 135 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 454ffc5e5c685..67beb468dce8a 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1471,6 +1471,7 @@ Bug Fixes - Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`) - Bug in ``.str.replace`` does not raise ``TypeError`` for invalid replacement (:issue:`13438`) - Bug in ``MultiIndex.from_arrays`` which didn't check for input array lengths matching (:issue:`13599`) +- Bug in ``cartesian_product`` and ``MultiIndex.from_product`` which may raise with empty input arrays (:issue:`12258`) - Bug in ``pd.read_csv()`` which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 0a13c8936eeec..6b37a5e2cd202 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1979,13 +1979,16 @@ def _factorize_from_iterable(values): Returns ------- - codes : np.array + codes : ndarray categories : Index If `values` has a categorical dtype, then `categories` is a CategoricalIndex keeping the categories and order of `values`. """ from pandas.indexes.category import CategoricalIndex + if not is_list_like(values): + raise TypeError("Input must be list-like") + if is_categorical(values): if isinstance(values, (ABCCategoricalIndex, ABCSeries)): values = values._values @@ -2003,8 +2006,23 @@ def _factorize_from_iterable(values): def _factorize_from_iterables(iterables): """ A higher-level wrapper over `_factorize_from_iterable`. - See `_factorize_from_iterable` for more info. *This is an internal function* + + Parameters + ---------- + iterables : list-like of list-likes + + Returns + ------- + codes_tuple : tuple of ndarrays + categories_tuple : tuple of Indexes + + Notes + ----- + See `_factorize_from_iterable` for more info. """ + if len(iterables) == 0: + # For consistency, it should return a list of 2 tuples. + return [(), ()] return lzip(*[_factorize_from_iterable(it) for it in iterables]) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 5248f0775d22f..92061eab61b78 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -691,6 +691,32 @@ def test_from_arrays_index_series_categorical(self): tm.assert_index_equal(result3.get_level_values(0), idx1) tm.assert_index_equal(result3.get_level_values(1), idx2) + def test_from_arrays_empty(self): + # 0 levels + with tm.assertRaisesRegexp( + ValueError, "Must pass non-zero number of levels/labels"): + MultiIndex.from_arrays(arrays=[]) + + # 1 level + result = MultiIndex.from_arrays(arrays=[[]], names=['A']) + expected = Index([], name='A') + tm.assert_index_equal(result, expected) + + # N levels + for N in [2, 3]: + arrays = [[]] * N + names = list('ABC')[:N] + result = MultiIndex.from_arrays(arrays=arrays, names=names) + expected = MultiIndex(levels=[np.array([])] * N, labels=[[]] * N, + names=names) + tm.assert_index_equal(result, expected) + + def test_from_arrays_invalid_input(self): + invalid_inputs = [1, [1], [1, 2], [[1], 2], + 'a', ['a'], ['a', 'b'], [['a'], 'b']] + for i in invalid_inputs: + tm.assertRaises(TypeError, MultiIndex.from_arrays, arrays=i) + def test_from_arrays_different_lengths(self): # GH13599 idx1 = [1, 2, 3] @@ -723,6 +749,43 @@ def test_from_product(self): tm.assert_index_equal(result, expected) self.assertEqual(result.names, names) + def test_from_product_empty(self): + # 0 levels + with tm.assertRaisesRegexp( + ValueError, "Must pass non-zero number of levels/labels"): + MultiIndex.from_product([]) + + # 1 level + result = MultiIndex.from_product([[]], names=['A']) + expected = pd.Float64Index([], name='A') + tm.assert_index_equal(result, expected) + + # 2 levels + l1 = [[], ['foo', 'bar', 'baz'], []] + l2 = [[], [], ['a', 'b', 'c']] + names = ['A', 'B'] + for first, second in zip(l1, l2): + result = MultiIndex.from_product([first, second], names=names) + expected = MultiIndex(levels=[np.array(first), np.array(second)], + labels=[[], []], names=names) + tm.assert_index_equal(result, expected) + + # GH12258 + names = ['A', 'B', 'C'] + for N in range(4): + lvl2 = lrange(N) + result = MultiIndex.from_product([[], lvl2, []], names=names) + expected = MultiIndex(levels=[np.array(A) + for A in [[], lvl2, []]], + labels=[[], [], []], names=names) + tm.assert_index_equal(result, expected) + + def test_from_product_invalid_input(self): + invalid_inputs = [1, [1], [1, 2], [[1], 2], + 'a', ['a'], ['a', 'b'], [['a'], 'b']] + for i in invalid_inputs: + tm.assertRaises(TypeError, MultiIndex.from_product, iterables=i) + def test_from_product_datetimeindex(self): dt_index = date_range('2000-01-01', periods=2) mi = pd.MultiIndex.from_product([[1, 2], dt_index]) diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py index d8a98bbb3fd27..8c16308d79a31 100644 --- a/pandas/tools/tests/test_util.py +++ b/pandas/tools/tests/test_util.py @@ -34,6 +34,29 @@ def test_datetimeindex(self): tm.assert_numpy_array_equal(result1, expected1) tm.assert_numpy_array_equal(result2, expected2) + def test_empty(self): + # product of empty factors + X = [[], [0, 1], []] + Y = [[], [], ['a', 'b', 'c']] + for x, y in zip(X, Y): + expected1 = np.array([], dtype=np.asarray(x).dtype) + expected2 = np.array([], dtype=np.asarray(y).dtype) + result1, result2 = cartesian_product([x, y]) + tm.assert_numpy_array_equal(result1, expected1) + tm.assert_numpy_array_equal(result2, expected2) + + # empty product (empty input): + result = cartesian_product([]) + expected = [] + tm.assert_equal(result, expected) + + def test_invalid_input(self): + invalid_inputs = [1, [1], [1, 2], [[1], 2], + 'a', ['a'], ['a', 'b'], [['a'], 'b']] + msg = "Input must be a list-like of list-likes" + for X in invalid_inputs: + tm.assertRaisesRegexp(TypeError, msg, cartesian_product, X=X) + class TestLocaleUtils(tm.TestCase): diff --git a/pandas/tools/util.py b/pandas/tools/util.py index b8b28663387cc..fec56328c1721 100644 --- a/pandas/tools/util.py +++ b/pandas/tools/util.py @@ -4,6 +4,7 @@ from pandas.types.common import (is_number, is_numeric_dtype, is_datetime_or_timedelta_dtype, + is_list_like, _ensure_object) from pandas.types.cast import _possibly_downcast_to_dtype @@ -24,13 +25,35 @@ def cartesian_product(X): Numpy version of itertools.product or pandas.compat.product. Sometimes faster (for large inputs)... + Parameters + ---------- + X : list-like of list-likes + + Returns + ------- + product : list of ndarrays + Examples -------- >>> cartesian_product([list('ABC'), [1, 2]]) [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'), array([1, 2, 1, 2, 1, 2])] + See also + -------- + itertools.product : Cartesian product of input iterables. Equivalent to + nested for-loops. + pandas.compat.product : An alias for itertools.product. """ + msg = "Input must be a list-like of list-likes" + if not is_list_like(X): + raise TypeError(msg) + for x in X: + if not is_list_like(x): + raise TypeError(msg) + + if len(X) == 0: + return [] lenX = np.fromiter((len(x) for x in X), dtype=int) cumprodX = np.cumproduct(lenX) @@ -38,7 +61,11 @@ def cartesian_product(X): a = np.roll(cumprodX, 1) a[0] = 1 - b = cumprodX[-1] / cumprodX + if cumprodX[-1] != 0: + b = cumprodX[-1] / cumprodX + else: + # if any factor is empty, the cartesian product is empty + b = np.zeros_like(cumprodX) return [np.tile(np.repeat(np.asarray(com._values_from_object(x)), b[i]), np.product(a[i]))