From f58048d72c3b69dc6e7d9e11ff61d9e8d831d466 Mon Sep 17 00:00:00 2001 From: Kenil Mehta Date: Wed, 28 Apr 2021 19:45:47 +0530 Subject: [PATCH 1/8] PERF: Optimising Series.nunique() for NaN values #40865 --- pandas/core/base.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 42f52618eb07b..0b1c426b18a19 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1040,8 +1040,11 @@ def nunique(self, dropna: bool = True) -> int: >>> s.nunique() 4 """ - obj = remove_na_arraylike(self) if dropna else self - return len(obj.unique()) + uniqs = self.unique() + if dropna: + return (~np.isnan(uniqs)).sum() + else: + return len(uniqs) @property def is_unique(self) -> bool: From b09c3367f353650eb74012313edec3a678083762 Mon Sep 17 00:00:00 2001 From: Kenil Mehta Date: Thu, 29 Apr 2021 19:00:21 +0530 Subject: [PATCH 2/8] using isna inplace of np.isnan --- pandas/core/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 0b1c426b18a19..def5703b4d01f 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1042,7 +1042,7 @@ def nunique(self, dropna: bool = True) -> int: """ uniqs = self.unique() if dropna: - return (~np.isnan(uniqs)).sum() + return (~isna(uniqs)).sum() else: return len(uniqs) From 47c7911680cc915e825352aa83e6eda935d15b1e Mon Sep 17 00:00:00 2001 From: Kenil Mehta Date: Fri, 30 Apr 2021 19:20:07 +0530 Subject: [PATCH 3/8] using numpy isnan function --- pandas/core/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index def5703b4d01f..cf5941607bbf8 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1041,10 +1041,10 @@ def nunique(self, dropna: bool = True) -> int: 4 """ uniqs = self.unique() - if dropna: - return (~isna(uniqs)).sum() - else: - return len(uniqs) + l = len(uniqs) + if dropna and np.isnan(uniqs).any(): + l = l - 1 + return l @property def is_unique(self) -> bool: From b438052c24a8817fc8f9bd9ff2cfb0f3438f6b60 Mon Sep 17 00:00:00 2001 From: Kenil Mehta Date: Fri, 30 Apr 2021 20:32:59 +0530 Subject: [PATCH 4/8] using isna isplace of np.isnan --- pandas/core/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index cf5941607bbf8..23dcf385aeb3e 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1041,10 +1041,10 @@ def nunique(self, dropna: bool = True) -> int: 4 """ uniqs = self.unique() - l = len(uniqs) - if dropna and np.isnan(uniqs).any(): - l = l - 1 - return l + length = len(uniqs) + if dropna and isna(uniqs).any(): + length = length - 1 + return length @property def is_unique(self) -> bool: From b2f9b0cb40f4e45a610ecb53628465ebfa6868d5 Mon Sep 17 00:00:00 2001 From: Kenil Mehta Date: Fri, 30 Apr 2021 21:32:04 +0530 Subject: [PATCH 5/8] small change --- pandas/core/base.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 23dcf385aeb3e..36a90468eff7f 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1041,10 +1041,9 @@ def nunique(self, dropna: bool = True) -> int: 4 """ uniqs = self.unique() - length = len(uniqs) - if dropna and isna(uniqs).any(): - length = length - 1 - return length + if dropna: + uniqs = remove_na_arraylike(uniqs) + return len(uniqs) @property def is_unique(self) -> bool: From 7f20a6ce8808c1e66fb6fce5a4a5294b922c15ca Mon Sep 17 00:00:00 2001 From: Kenil Mehta Date: Sat, 1 May 2021 16:34:03 +0530 Subject: [PATCH 6/8] adding asv for Series Nunique() with Nan values --- asv_bench/benchmarks/frame_methods.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 65167e6467fd5..05b1c2d7cf544 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -562,6 +562,12 @@ def setup(self): def time_frame_nunique(self): self.df.nunique() +class SeriesNuniqueWithNan: + def setup(self): + self.ser = Series(100000 * (100 * [np.nan] + list(range(100)))).astype(float) + + def time_series_nunique_nan(self): + self.ser.nunique() class Duplicated: def setup(self): From 2985b9d63f53567934866eb03c34ac3a37c83e5c Mon Sep 17 00:00:00 2001 From: Kenil Mehta Date: Sat, 1 May 2021 21:10:29 +0530 Subject: [PATCH 7/8] precommit check changes --- asv_bench/benchmarks/frame_methods.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 05b1c2d7cf544..760da36a30075 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -562,6 +562,7 @@ def setup(self): def time_frame_nunique(self): self.df.nunique() + class SeriesNuniqueWithNan: def setup(self): self.ser = Series(100000 * (100 * [np.nan] + list(range(100)))).astype(float) @@ -569,6 +570,7 @@ def setup(self): def time_series_nunique_nan(self): self.ser.nunique() + class Duplicated: def setup(self): n = 1 << 20 From 6628cae2f2afe178b4f6111cda11e7215cbb779e Mon Sep 17 00:00:00 2001 From: Kenil Mehta Date: Mon, 3 May 2021 10:32:30 +0530 Subject: [PATCH 8/8] adding entry in whatsnew doc --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 74710ca48308c..be2dbd83dd23d 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -635,7 +635,7 @@ Performance improvements - Performance improvement in the conversion of pyarrow boolean array to a pandas nullable boolean array (:issue:`41051`) - Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`) - Performance improvement in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with nullable data types (:issue:`37493`) -- +- Performance improvement in :meth:`Series.nunique` with nan values (:issue:`40865`) .. ---------------------------------------------------------------------------