From 0296f0c1ede16bc37e7cc19b07065265506e54dc Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 15 Feb 2023 16:28:16 -0800 Subject: [PATCH 1/6] PERF: DataFrame.corr avoid copy --- pandas/core/frame.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 71dc3b523fca6..67107d2a744b4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10019,7 +10019,15 @@ def corr( data = self._get_numeric_data() if numeric_only else self cols = data.columns idx = cols.copy() - mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) + if data._can_fast_transpose: + # Avoid an expensive copy + values = self._values + if not isinstance(values, np.ndarray): + mat = values.to_numpy(dtype=np.float64, na_value=np.nan, copy=False) + else: + mat = values.astype(np.float64, copy=False) + else: + mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) if method == "pearson": correl = libalgos.nancorr(mat, minp=min_periods) From 621e8fef16e4787066b2e816c4e0f890be9573a0 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 15 Feb 2023 16:28:16 -0800 Subject: [PATCH 2/6] PERF: DataFrame.corr avoid copy --- pandas/core/frame.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 79f423b17383d..fb0720d7c1894 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10025,7 +10025,15 @@ def corr( data = self._get_numeric_data() if numeric_only else self cols = data.columns idx = cols.copy() - mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) + if data._can_fast_transpose: + # Avoid an expensive copy + values = self._values + if not isinstance(values, np.ndarray): + mat = values.to_numpy(dtype=np.float64, na_value=np.nan, copy=False) + else: + mat = values.astype(np.float64, copy=False) + else: + mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) if method == "pearson": correl = libalgos.nancorr(mat, minp=min_periods) From 531600806d66910bfd00c492b8eaad743afb7cda Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 16 Feb 2023 09:27:00 -0800 Subject: [PATCH 3/6] typo fix numeric_only --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fb0720d7c1894..0a7ee90271325 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10027,7 +10027,7 @@ def corr( idx = cols.copy() if data._can_fast_transpose: # Avoid an expensive copy - values = self._values + values = data._values if not isinstance(values, np.ndarray): mat = values.to_numpy(dtype=np.float64, na_value=np.nan, copy=False) else: From 520bd03f62c407203d1526e442ede6f2659ba219 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 11 Mar 2023 20:55:57 -0800 Subject: [PATCH 4/6] Avoid copy in to_numpy --- pandas/core/frame.py | 10 +--------- pandas/core/internals/managers.py | 17 +++++++++++------ pandas/tests/frame/methods/test_to_numpy.py | 4 ++++ 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e923e0f5dd2ff..70019030da182 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10480,15 +10480,7 @@ def corr( data = self._get_numeric_data() if numeric_only else self cols = data.columns idx = cols.copy() - if data._can_fast_transpose: - # Avoid an expensive copy - values = data._values - if not isinstance(values, np.ndarray): - mat = values.to_numpy(dtype=np.float64, na_value=np.nan, copy=False) - else: - mat = values.astype(np.float64, copy=False) - else: - mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) + mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) if method == "pearson": correl = libalgos.nancorr(mat, minp=min_periods) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 1324f5aeccc0d..e3f16213a2f7f 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1694,15 +1694,15 @@ def as_array( arr = np.empty(self.shape, dtype=float) return arr.transpose() - # We want to copy when na_value is provided to avoid - # mutating the original object - copy = copy or na_value is not lib.no_default - if self.is_single_block: blk = self.blocks[0] if blk.is_extension: # Avoid implicit conversion of extension blocks to object + # We want to copy when na_value is provided to avoid + # mutating the original object + copy = copy or na_value is not lib.no_default + # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no # attribute "to_numpy" arr = blk.values.to_numpy( # type: ignore[union-attr] @@ -1712,7 +1712,8 @@ def as_array( else: arr = np.asarray(blk.get_values()) if dtype: - arr = arr.astype(dtype, copy=False) + arr = arr.astype(dtype, copy=copy) + copy = False else: arr = self._interleave(dtype=dtype, na_value=na_value) # The underlying data was copied within _interleave @@ -1721,7 +1722,11 @@ def as_array( if copy: arr = arr.copy() - if na_value is not lib.no_default: + if na_value is lib.no_default: + pass + elif arr.dtype.kind == "f" and lib.is_float(na_value) and np.isnan(na_value): + pass + else: arr[isna(arr)] = na_value return arr.transpose() diff --git a/pandas/tests/frame/methods/test_to_numpy.py b/pandas/tests/frame/methods/test_to_numpy.py index 532f7c87557c8..20e2a63cc793c 100644 --- a/pandas/tests/frame/methods/test_to_numpy.py +++ b/pandas/tests/frame/methods/test_to_numpy.py @@ -30,6 +30,10 @@ def test_to_numpy_copy(self): assert df.to_numpy(copy=False).base is arr assert df.to_numpy(copy=True).base is not arr + # we still don't want a copy when na_value=np.nan is passed, + # and that can be respected because we are already numpy-float + assert df.to_numpy(copy=False, na_value=np.nan).base is arr + def test_to_numpy_mixed_dtype_to_str(self): # https://github.com/pandas-dev/pandas/issues/35455 df = DataFrame([[Timestamp("2020-01-01 00:00:00"), 100.0]]) From a8c5575fc6eb7f847bf9a05e1e0fea09cb332ee4 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 12 Mar 2023 12:14:27 -0700 Subject: [PATCH 5/6] fix test --- pandas/core/internals/managers.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index e3f16213a2f7f..78c488b4b08b6 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1689,6 +1689,8 @@ def as_array( ------- arr : ndarray """ + passed_nan = lib.is_float(na_value) and np.isnan(na_value) + # TODO(CoW) handle case where resulting array is a view if len(self.blocks) == 0: arr = np.empty(self.shape, dtype=float) @@ -1696,12 +1698,22 @@ def as_array( if self.is_single_block: blk = self.blocks[0] - if blk.is_extension: - # Avoid implicit conversion of extension blocks to object + if na_value is not lib.no_default: # We want to copy when na_value is provided to avoid # mutating the original object - copy = copy or na_value is not lib.no_default + if ( + isinstance(blk.dtype, np.dtype) + and blk.dtype.kind == "f" + and passed_nan + ): + # We are already numpy-float and na_value=np.nan + pass + else: + copy = True + + if blk.is_extension: + # Avoid implicit conversion of extension blocks to object # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no # attribute "to_numpy" @@ -1724,7 +1736,7 @@ def as_array( if na_value is lib.no_default: pass - elif arr.dtype.kind == "f" and lib.is_float(na_value) and np.isnan(na_value): + elif arr.dtype.kind == "f" and passed_nan: pass else: arr[isna(arr)] = na_value From 811c4c17df5b1872ad4929595b8b57e5206670ca Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 12 Mar 2023 18:21:20 -0700 Subject: [PATCH 6/6] mypy fixup --- pandas/core/internals/managers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 78c488b4b08b6..dc50014548d2f 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1689,7 +1689,7 @@ def as_array( ------- arr : ndarray """ - passed_nan = lib.is_float(na_value) and np.isnan(na_value) + passed_nan = lib.is_float(na_value) and isna(na_value) # TODO(CoW) handle case where resulting array is a view if len(self.blocks) == 0: