From 13be18ed75ade74e8270c692f956c6cb274baaa5 Mon Sep 17 00:00:00 2001 From: tv3141 Date: Wed, 18 Mar 2020 21:56:00 +0000 Subject: [PATCH 01/12] Enable bounds checking. --- pandas/_libs/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index dc2e8c097bc14..2a123f4c2028f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -775,7 +775,7 @@ def get_level_sorter(const int64_t[:] label, const int64_t[:] starts): return out -@cython.boundscheck(False) +@cython.boundscheck(True) @cython.wraparound(False) def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, const int64_t[:] labels, From 36513c3fe91c241f794a0c1fd0cf9040dc844859 Mon Sep 17 00:00:00 2001 From: tv3141 Date: Wed, 18 Mar 2020 21:52:05 +0000 Subject: [PATCH 02/12] Add failing tests. --- pandas/tests/groupby/test_counting.py | 55 ++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index b4239d7d34a90..c6246ffa4e3b3 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -3,7 +3,8 @@ import numpy as np import pytest -from pandas import DataFrame, MultiIndex, Period, Series, Timedelta, Timestamp +from pandas import (DataFrame, Index, MultiIndex, Period, Series, Timedelta, + Timestamp) import pandas._testing as tm @@ -220,3 +221,55 @@ def test_count_with_only_nans_in_first_group(self): mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"]) expected = Series([], index=mi, dtype=np.int64, name="C") tm.assert_series_equal(result, expected, check_index_type=False) + + #def test_count_groupby_index_with_nan_in_data(self): + # # https://github.com/pandas-dev/pandas/issues/21824 + # df = DataFrame({"Person": ["John", "Myla", "John", "John", "Myla"], + # "Age": [24., np.nan, 21., 33, 26], + # "Single": [False, True, True, True, False]}) + + # res = df.set_index(["Person", "Single"]).count(level="Person") + # expected = DataFrame(index=Index(["John", "Myla"], name="Person"), + # data={"Age": [2, 1]}) + + def test_count_groupby_index_with_nan_in_index(self): + # https://github.com/pandas-dev/pandas/issues/21824 + df = DataFrame({"Person": ["John", "Myla", None, "John", "Myla"], + "Age": [24., 5, 21., 33, 26], + "Single": [False, True, True, True, False]}) + + res = df.set_index(["Person", "Single"]).count(level="Person") + expected = DataFrame(index=Index(["John", "Myla"], name="Person"), + data={"Age": [2, 2]}) + + #def test_count_groupby_index_with_nan_in_data_and_in_index(self): + # # https://github.com/pandas-dev/pandas/issues/21824 + # df = DataFrame({"Person": ["John", "Myla", None, "John", "Myla"], + # "Age": [24., np.nan, 21., 33, 26], + # "Single": [False, True, True, True, False]}) + + # res = df.set_index(["Person", "Single"]).count(level="Person") + # expected = DataFrame(index=Index(["John", "Myla"], name="Person"), + # data={"Age": [2, 1]}) + + + def test_count_groupby_column_with_nan_in_groupby_column(self): + df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.NaN, 3, 0]}) + res = df.groupby(["B"]).count() + expected = DataFrame(index=Index([0.0, 3.0, 4.0, 5.0], name="B"), + data={"A": [1, 1, 1, 1]}) + tm.assert_frame_equal(expected, res) + + #def test_count_groupby_column_with_nan_in_data(self): + # df = DataFrame({"A": [1, np.NaN, 1, 1, 1], "B": [5, 4., 2, 3, 0]}) + # res = df.groupby(["B"]).count() + # expected = DataFrame(index=Index([0.0, 2.0, 3.0, 4.0, 5.0], name="B"), + # data={"A": [1, 1, 1, 0, 1]}) + # tm.assert_frame_equal(expected, res) + + #def test_count_groupby_column_with_nan_in_data_and_in_groupby_column(self): + # df = DataFrame({"A": [1, np.NaN, 1, 1, 1], "B": [5, 4, np.NaN, 3, 0]}) + # res = df.groupby(["B"]).count() + # expected = DataFrame(index=Index([0.0, 3.0, 4.0, 5.0], name="B"), + # data={"A": [1, 1, 0, 1]}) + # tm.assert_frame_equal(expected, res) From d6df0f0c845da2319851d65f7d5978ae8a3a5649 Mon Sep 17 00:00:00 2001 From: tv3141 Date: Tue, 17 Mar 2020 20:26:56 +0000 Subject: [PATCH 03/12] Count only masked (non-na) values. --- pandas/_libs/lib.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 2a123f4c2028f..f18fab9207fda 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -793,14 +793,16 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, with nogil: for i in range(n): for j in range(k): - counts[labels[i], j] += mask[i, j] + if mask[i, j]: + counts[labels[i], j] += 1 else: # axis == 1 counts = np.zeros((n, max_bin), dtype='i8') with nogil: for i in range(n): for j in range(k): - counts[i, labels[j]] += mask[i, j] + if mask[i, j]: + counts[i, labels[j]] += 1 return counts From 049b897f603a5f96cfe55ba071228a3eda7470f5 Mon Sep 17 00:00:00 2001 From: tv3141 Date: Sat, 21 Mar 2020 18:15:12 +0000 Subject: [PATCH 04/12] Mask NaNs in index and dataframe values. --- pandas/core/frame.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index baa6fb07ff233..745646b2d7a9c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7820,13 +7820,21 @@ def _count_level(self, level, axis=0, numeric_only=False): f"Can only count levels on hierarchical {self._get_axis_name(axis)}." ) + # Mask NaNs: Mask rows where the index level is NaN and all values in + # the DataFrame that are NaN if frame._is_mixed_type: # Since we have mixed types, calling notna(frame.values) might # upcast everything to object - mask = notna(frame).values + mask = ( + notna(frame.index.get_level_values(level=level)).reshape(-1, 1) & + notna(frame).values + ) else: # But use the speedup when we have homogeneous dtypes - mask = notna(frame.values) + mask = ( + notna(frame.index.get_level_values(level=level)).reshape(-1, 1) & + notna(frame.values) + ) if axis == 1: # We're transposing the mask rather than frame to avoid potential From 1e755b3449733c955177b2af880b16bf03c319b8 Mon Sep 17 00:00:00 2001 From: tv3141 Date: Sat, 21 Mar 2020 18:22:33 +0000 Subject: [PATCH 05/12] Clean up tests. --- pandas/tests/groupby/test_counting.py | 42 +++------------------------ 1 file changed, 4 insertions(+), 38 deletions(-) diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index c6246ffa4e3b3..869276ca6785e 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -4,7 +4,7 @@ import pytest from pandas import (DataFrame, Index, MultiIndex, Period, Series, Timedelta, - Timestamp) + Timestamp) import pandas._testing as tm @@ -222,54 +222,20 @@ def test_count_with_only_nans_in_first_group(self): expected = Series([], index=mi, dtype=np.int64, name="C") tm.assert_series_equal(result, expected, check_index_type=False) - #def test_count_groupby_index_with_nan_in_data(self): - # # https://github.com/pandas-dev/pandas/issues/21824 - # df = DataFrame({"Person": ["John", "Myla", "John", "John", "Myla"], - # "Age": [24., np.nan, 21., 33, 26], - # "Single": [False, True, True, True, False]}) - - # res = df.set_index(["Person", "Single"]).count(level="Person") - # expected = DataFrame(index=Index(["John", "Myla"], name="Person"), - # data={"Age": [2, 1]}) - def test_count_groupby_index_with_nan_in_index(self): # https://github.com/pandas-dev/pandas/issues/21824 df = DataFrame({"Person": ["John", "Myla", None, "John", "Myla"], - "Age": [24., 5, 21., 33, 26], - "Single": [False, True, True, True, False]}) + "Age": [24., 5, 21., 33, 26], + "Single": [False, True, True, True, False]}) res = df.set_index(["Person", "Single"]).count(level="Person") expected = DataFrame(index=Index(["John", "Myla"], name="Person"), data={"Age": [2, 2]}) - #def test_count_groupby_index_with_nan_in_data_and_in_index(self): - # # https://github.com/pandas-dev/pandas/issues/21824 - # df = DataFrame({"Person": ["John", "Myla", None, "John", "Myla"], - # "Age": [24., np.nan, 21., 33, 26], - # "Single": [False, True, True, True, False]}) - - # res = df.set_index(["Person", "Single"]).count(level="Person") - # expected = DataFrame(index=Index(["John", "Myla"], name="Person"), - # data={"Age": [2, 1]}) - - def test_count_groupby_column_with_nan_in_groupby_column(self): + # https://github.com/pandas-dev/pandas/issues/32841 df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.NaN, 3, 0]}) res = df.groupby(["B"]).count() expected = DataFrame(index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]}) tm.assert_frame_equal(expected, res) - - #def test_count_groupby_column_with_nan_in_data(self): - # df = DataFrame({"A": [1, np.NaN, 1, 1, 1], "B": [5, 4., 2, 3, 0]}) - # res = df.groupby(["B"]).count() - # expected = DataFrame(index=Index([0.0, 2.0, 3.0, 4.0, 5.0], name="B"), - # data={"A": [1, 1, 1, 0, 1]}) - # tm.assert_frame_equal(expected, res) - - #def test_count_groupby_column_with_nan_in_data_and_in_groupby_column(self): - # df = DataFrame({"A": [1, np.NaN, 1, 1, 1], "B": [5, 4, np.NaN, 3, 0]}) - # res = df.groupby(["B"]).count() - # expected = DataFrame(index=Index([0.0, 3.0, 4.0, 5.0], name="B"), - # data={"A": [1, 1, 0, 1]}) - # tm.assert_frame_equal(expected, res) From de752483534146e3c54152974858d0a5da73e321 Mon Sep 17 00:00:00 2001 From: tv3141 Date: Sat, 21 Mar 2020 18:33:40 +0000 Subject: [PATCH 06/12] Re-disable bounds checking. --- pandas/_libs/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index f18fab9207fda..3dfc373372859 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -775,7 +775,7 @@ def get_level_sorter(const int64_t[:] label, const int64_t[:] starts): return out -@cython.boundscheck(True) +@cython.boundscheck(False) @cython.wraparound(False) def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, const int64_t[:] labels, From 3bea5af618e505c7721126f538f99c8643688b2e Mon Sep 17 00:00:00 2001 From: tv3141 Date: Sat, 21 Mar 2020 19:21:19 +0000 Subject: [PATCH 07/12] Move test into right module. --- pandas/tests/groupby/test_counting.py | 10 ---------- pandas/tests/test_multilevel.py | 11 +++++++++++ 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 869276ca6785e..4e00b3f4a44d0 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -222,16 +222,6 @@ def test_count_with_only_nans_in_first_group(self): expected = Series([], index=mi, dtype=np.int64, name="C") tm.assert_series_equal(result, expected, check_index_type=False) - def test_count_groupby_index_with_nan_in_index(self): - # https://github.com/pandas-dev/pandas/issues/21824 - df = DataFrame({"Person": ["John", "Myla", None, "John", "Myla"], - "Age": [24., 5, 21., 33, 26], - "Single": [False, True, True, True, False]}) - - res = df.set_index(["Person", "Single"]).count(level="Person") - expected = DataFrame(index=Index(["John", "Myla"], name="Person"), - data={"Age": [2, 2]}) - def test_count_groupby_column_with_nan_in_groupby_column(self): # https://github.com/pandas-dev/pandas/issues/32841 df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.NaN, 3, 0]}) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 84279d874bae1..80cc57604dc80 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -248,6 +248,17 @@ def _check_counts(frame, axis=0): result = self.frame.count(level=0, numeric_only=True) tm.assert_index_equal(result.columns, Index(list("ABC"), name="exp")) + def test_count_index_with_nan(self): + # https://github.com/pandas-dev/pandas/issues/21824 + df = DataFrame({"Person": ["John", "Myla", None, "John", "Myla"], + "Age": [24., 5, 21., 33, 26], + "Single": [False, True, True, True, False]}) + + res = df.set_index(["Person", "Single"]).count(level="Person") + expected = DataFrame(index=Index(["John", "Myla"], name="Person"), + data={"Age": [2, 2]}) + + def test_count_level_series(self): index = MultiIndex( levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]], From 432cd6895d4f1710070379c3deb6e92795c33e14 Mon Sep 17 00:00:00 2001 From: tv3141 Date: Sun, 22 Mar 2020 20:10:11 +0000 Subject: [PATCH 08/12] Fix masking logic to avoid SegFaults with DataFrame.count(). --- pandas/core/frame.py | 36 +++++++++++---------------------- pandas/tests/test_multilevel.py | 27 ++++++++++++++++++++----- 2 files changed, 34 insertions(+), 29 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 745646b2d7a9c..cb1128c845a2f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7820,26 +7820,15 @@ def _count_level(self, level, axis=0, numeric_only=False): f"Can only count levels on hierarchical {self._get_axis_name(axis)}." ) - # Mask NaNs: Mask rows where the index level is NaN and all values in - # the DataFrame that are NaN - if frame._is_mixed_type: - # Since we have mixed types, calling notna(frame.values) might - # upcast everything to object - mask = ( - notna(frame.index.get_level_values(level=level)).reshape(-1, 1) & - notna(frame).values - ) - else: - # But use the speedup when we have homogeneous dtypes - mask = ( - notna(frame.index.get_level_values(level=level)).reshape(-1, 1) & - notna(frame.values) - ) + # Mask NaNs: Mask rows or columns where the index level is NaN, and all + # values in the DataFrame that are NaN + values_mask = notna(frame.values) + index_mask = notna(count_axis.get_level_values(level=level)) if axis == 1: - # We're transposing the mask rather than frame to avoid potential - # upcasts to object, which induces a ~20x slowdown - mask = mask.T + mask = index_mask & values_mask + else: + mask = index_mask.reshape(-1, 1) & values_mask if isinstance(level, str): level = count_axis._get_level_number(level) @@ -7847,15 +7836,14 @@ def _count_level(self, level, axis=0, numeric_only=False): level_name = count_axis._names[level] level_index = count_axis.levels[level]._shallow_copy(name=level_name) level_codes = ensure_int64(count_axis.codes[level]) - counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=0) - - result = DataFrame(counts, index=level_index, columns=agg_axis) + counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=axis) if axis == 1: - # Undo our earlier transpose - return result.T + result = DataFrame(counts, index=agg_axis, columns=level_index) else: - return result + result = DataFrame(counts, index=level_index, columns=agg_axis) + + return result def _reduce( self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 80cc57604dc80..0e12f7207295e 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -250,14 +250,31 @@ def _check_counts(frame, axis=0): def test_count_index_with_nan(self): # https://github.com/pandas-dev/pandas/issues/21824 - df = DataFrame({"Person": ["John", "Myla", None, "John", "Myla"], - "Age": [24., 5, 21., 33, 26], - "Single": [False, True, True, True, False]}) + df = DataFrame( + { + "Person": ["John", "Myla", None, "John", "Myla"], + "Age": [24.0, 5, 21.0, 33, 26], + "Single": [False, True, True, True, False], + } + ) + # count on row labels res = df.set_index(["Person", "Single"]).count(level="Person") - expected = DataFrame(index=Index(["John", "Myla"], name="Person"), - data={"Age": [2, 2]}) + expected = DataFrame( + index=Index(["John", "Myla"], name="Person"), + columns=Index(["Age"]), + data=np.array([2, 2]), + ) + tm.assert_frame_equal(res, expected) + # count on column labels + res = df.set_index(["Person", "Single"]).T.count(level="Person", axis=1) + expected = DataFrame( + columns=Index(["John", "Myla"], name="Person"), + index=Index(["Age"]), + data=np.array([[2, 2]]), + ) + tm.assert_frame_equal(res, expected) def test_count_level_series(self): index = MultiIndex( From 7e379b1a20abf359c2a8daa1c50ce6cc57dc2d90 Mon Sep 17 00:00:00 2001 From: tv3141 Date: Sun, 22 Mar 2020 20:10:27 +0000 Subject: [PATCH 09/12] black --- pandas/tests/groupby/test_counting.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 4e00b3f4a44d0..56a18757da6e7 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -3,8 +3,7 @@ import numpy as np import pytest -from pandas import (DataFrame, Index, MultiIndex, Period, Series, Timedelta, - Timestamp) +from pandas import DataFrame, Index, MultiIndex, Period, Series, Timedelta, Timestamp import pandas._testing as tm @@ -226,6 +225,7 @@ def test_count_groupby_column_with_nan_in_groupby_column(self): # https://github.com/pandas-dev/pandas/issues/32841 df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.NaN, 3, 0]}) res = df.groupby(["B"]).count() - expected = DataFrame(index=Index([0.0, 3.0, 4.0, 5.0], name="B"), - data={"A": [1, 1, 1, 1]}) + expected = DataFrame( + index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]} + ) tm.assert_frame_equal(expected, res) From a660d0f3b94c65e498f65dc8ef5b1a25a1815ede Mon Sep 17 00:00:00 2001 From: tv3141 Date: Mon, 23 Mar 2020 22:33:15 +0000 Subject: [PATCH 10/12] Add info to whatsnew. --- doc/source/whatsnew/v1.1.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 0d3a9a8f969a4..1565f4c16ee80 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -271,7 +271,7 @@ Numeric - Bug in :meth:`to_numeric` with string argument ``"uint64"`` and ``errors="coerce"`` silently fails (:issue:`32394`) - Bug in :meth:`to_numeric` with ``downcast="unsigned"`` fails for empty data (:issue:`32493`) - Bug in :meth:`DataFrame.mean` with ``numeric_only=False`` and either ``datetime64`` dtype or ``PeriodDtype`` column incorrectly raising ``TypeError`` (:issue:`32426`) -- +- Bug in :meth:`DataFrame.count` with ``level="foo"`` and index level ``"foo"`` containing NaNs causes segmentation fault (:issue:`21824`) Conversion ^^^^^^^^^^ @@ -361,6 +361,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) +- Bug in :meth:`GroupBy.count` causes segmentation fault when grouped-by column contains NaNs (:issue:`32841`) Reshaping ^^^^^^^^^ From 52a6f99b4230c958e8f8d6e813cd89661c05d81c Mon Sep 17 00:00:00 2001 From: tv3141 Date: Tue, 24 Mar 2020 20:16:17 +0000 Subject: [PATCH 11/12] Fix issue with default int32 on Windows. --- pandas/tests/test_multilevel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 0e12f7207295e..f025abd5628cf 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -263,7 +263,7 @@ def test_count_index_with_nan(self): expected = DataFrame( index=Index(["John", "Myla"], name="Person"), columns=Index(["Age"]), - data=np.array([2, 2]), + data=[2, 2], ) tm.assert_frame_equal(res, expected) @@ -272,7 +272,7 @@ def test_count_index_with_nan(self): expected = DataFrame( columns=Index(["John", "Myla"], name="Person"), index=Index(["Age"]), - data=np.array([[2, 2]]), + data=[[2, 2]], ) tm.assert_frame_equal(res, expected) From 60289c28bf9aa2e0203ef3edc5d19e85a584e953 Mon Sep 17 00:00:00 2001 From: tv3141 Date: Tue, 31 Mar 2020 22:16:09 +0100 Subject: [PATCH 12/12] Re-add notna optimisation. --- pandas/core/frame.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5ea5ec6832821..6ddb00db350af 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7895,7 +7895,13 @@ def _count_level(self, level, axis=0, numeric_only=False): # Mask NaNs: Mask rows or columns where the index level is NaN, and all # values in the DataFrame that are NaN - values_mask = notna(frame.values) + if frame._is_mixed_type: + # Since we have mixed types, calling notna(frame.values) might + # upcast everything to object + values_mask = notna(frame).values + else: + # But use the speedup when we have homogeneous dtypes + values_mask = notna(frame.values) index_mask = notna(count_axis.get_level_values(level=level)) if axis == 1: