From 6bd6ddd02efa729ec302a0d5a36b6877f3a3af02 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 29 Jan 2018 14:06:45 +0700 Subject: [PATCH 01/11] BUG: don't assume series is length > 0 --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/lib.pyx | 3 ++- pandas/tests/sparse/series/test_series.py | 11 +++++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 1890636bc8e1a..8dc06b6819b3c 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -557,3 +557,4 @@ Other ^^^^^ - Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`) +- Bug in `Series.memory_usage` which assumes series will always have more than one element (:issue:`19368`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c3a654b01022c..b8ace1d2aa50d 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -67,7 +67,8 @@ def memory_usage_of_objects(ndarray[object, ndim=1] arr): cdef int64_t s = 0 n = len(arr) - for i from 0 <= i < n: + # Hrm why was this 0 <= i < n + for i from 0 < i < n: s += arr[i].__sizeof__() return s diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 2ea1e63433520..4e997b055f25b 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -971,6 +971,17 @@ def test_combine_first(self): tm.assert_sp_series_equal(result, result2) tm.assert_sp_series_equal(result, expected) + @pytest.mark.parametrize('deep,fill_values', [([True, False], + [0, 1, np.nan, None])]) + def test_memory_usage_deep(self, deep, fill_values): + for fv in fill_values: + sparse_series = SparseSeries(fill_values, fill_value=fv) + dense_series = Series(fill_values) + sparse_usage = sparse_series.memory_usage(deep=deep) + dense_usage = dense_series.memory_usage(deep=deep) + + assert sparse_usage < dense_usage + class TestSparseHandlingMultiIndexes(object): From 7fdd03e946012bb1671a88a250eaf52f55e4ddde Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 29 Jan 2018 19:30:32 +0700 Subject: [PATCH 02/11] Take out comment and use product --- pandas/_libs/lib.pyx | 1 - pandas/tests/sparse/series/test_series.py | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index b8ace1d2aa50d..8874a1a37c0ed 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -67,7 +67,6 @@ def memory_usage_of_objects(ndarray[object, ndim=1] arr): cdef int64_t s = 0 n = len(arr) - # Hrm why was this 0 <= i < n for i from 0 < i < n: s += arr[i].__sizeof__() return s diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 4e997b055f25b..4f0aae8d61c04 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -23,6 +23,7 @@ from pandas.core.sparse.api import SparseSeries from pandas.tests.series.test_api import SharedWithSparse +from itertools import product def _test_data1(): # nan-based @@ -974,8 +975,8 @@ def test_combine_first(self): @pytest.mark.parametrize('deep,fill_values', [([True, False], [0, 1, np.nan, None])]) def test_memory_usage_deep(self, deep, fill_values): - for fv in fill_values: - sparse_series = SparseSeries(fill_values, fill_value=fv) + for deep, fill_value in product(deep, fill_values): + sparse_series = SparseSeries(fill_values, fill_value=fill_value) dense_series = Series(fill_values) sparse_usage = sparse_series.memory_usage(deep=deep) dense_usage = dense_series.memory_usage(deep=deep) From e25158713a2acb2132c97f45215ac113dc4d7091 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Fri, 2 Feb 2018 15:54:28 +0700 Subject: [PATCH 03/11] Fix failing test with indexing --- pandas/_libs/lib.pyx | 2 +- pandas/tests/sparse/series/test_series.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 87ce382584cf0..0954a74cb4a7b 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -60,7 +60,7 @@ def memory_usage_of_objects(ndarray[object, ndim=1] arr): cdef int64_t s = 0 n = len(arr) - for i from 0 < i < n: + for i in range(n): s += arr[i].__sizeof__() return s diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 4f0aae8d61c04..3f5d5a59cc540 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -25,6 +25,7 @@ from itertools import product + def _test_data1(): # nan-based arr = np.arange(20, dtype=float) From 5e59e9cbcc3653d0faa4505bad1a7ed8c3f797f8 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Fri, 2 Feb 2018 16:07:40 +0700 Subject: [PATCH 04/11] Use range over 0 <= for loops --- pandas/_libs/lib.pyx | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0954a74cb4a7b..28b1b47e3bbcd 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -131,10 +131,10 @@ def fast_unique_multiple(list arrays): dict table = {} object val, stub = 0 - for i from 0 <= i < k: + for i in range(k): buf = arrays[i] n = len(buf) - for j from 0 <= j < n: + for j in range(n): val = buf[j] if val not in table: table[val] = stub @@ -158,10 +158,10 @@ def fast_unique_multiple_list(list lists): dict table = {} object val, stub = 0 - for i from 0 <= i < k: + for i in range(k): buf = lists[i] n = len(buf) - for j from 0 <= j < n: + for j in range(n): val = buf[j] if val not in table: table[val] = stub @@ -200,7 +200,7 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True): for buf in gen: n = len(buf) - for j from 0 <= j < n: + for j in range(n): val = buf[j] if val not in table: table[val] = stub @@ -830,15 +830,15 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, if axis == 0: counts = np.zeros((max_bin, k), dtype='i8') with nogil: - for i from 0 <= i < n: - for j from 0 <= j < k: + for i in range(n): + for j in range(n): counts[labels[i], j] += mask[i, j] else: # axis == 1 counts = np.zeros((n, max_bin), dtype='i8') with nogil: - for i from 0 <= i < n: - for j from 0 <= j < k: + for i in range(n): + for j in range(k): counts[i, labels[j]] += mask[i, j] return counts From 3f52a44f6ffb3ae52c5fd5b029804228215bbcb2 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Fri, 2 Feb 2018 16:23:04 +0700 Subject: [PATCH 05/11] Ah ha I think I got it --- pandas/_libs/lib.pyx | 6 ++++++ pandas/core/base.py | 14 ++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 28b1b47e3bbcd..a4b36c880fe9f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -59,6 +59,12 @@ def memory_usage_of_objects(ndarray[object, ndim=1] arr): cdef Py_ssize_t i, n cdef int64_t s = 0 + # The problem here is that... + # A SparseArray of size 1 that has fill_value = the only value + # will cause this + + # n = 1 + # n = len(arr) for i in range(n): s += arr[i].__sizeof__() diff --git a/pandas/core/base.py b/pandas/core/base.py index 54d25a16a10a3..8ee9aaa71f75f 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1069,12 +1069,18 @@ def memory_usage(self, deep=False): -------- numpy.ndarray.nbytes """ - if hasattr(self.values, 'memory_usage'): - return self.values.memory_usage(deep=deep) + # Use sparse values if they exist for memory consumption + if hasattr(self.values, 'sp_values'): + values = self.values.sp_values + else: + values = self.values + + if hasattr(values, 'memory_usage'): + return values.memory_usage(deep=deep) - v = self.values.nbytes + v = values.nbytes if deep and is_object_dtype(self) and not PYPY: - v += lib.memory_usage_of_objects(self.values) + v += lib.memory_usage_of_objects(values) return v def factorize(self, sort=False, na_sentinel=-1): From 21ae14707dfbadc365ce65769f3849dd62dee598 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 5 Feb 2018 10:19:16 +0700 Subject: [PATCH 06/11] FIX: revert change to lib.pyx --- pandas/_libs/lib.pyx | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a4b36c880fe9f..e1d59f807a7fd 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -59,14 +59,8 @@ def memory_usage_of_objects(ndarray[object, ndim=1] arr): cdef Py_ssize_t i, n cdef int64_t s = 0 - # The problem here is that... - # A SparseArray of size 1 that has fill_value = the only value - # will cause this - - # n = 1 - # n = len(arr) - for i in range(n): + for i from 0 <= i < n: s += arr[i].__sizeof__() return s @@ -137,10 +131,10 @@ def fast_unique_multiple(list arrays): dict table = {} object val, stub = 0 - for i in range(k): + for i from 0 <= i < k: buf = arrays[i] n = len(buf) - for j in range(n): + for j from 0 <= j < n: val = buf[j] if val not in table: table[val] = stub @@ -164,10 +158,10 @@ def fast_unique_multiple_list(list lists): dict table = {} object val, stub = 0 - for i in range(k): + for i from 0 <= i < k: buf = lists[i] n = len(buf) - for j in range(n): + for j from 0 <= j < n: val = buf[j] if val not in table: table[val] = stub @@ -206,7 +200,7 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True): for buf in gen: n = len(buf) - for j in range(n): + for j from 0 <= j < n: val = buf[j] if val not in table: table[val] = stub @@ -836,15 +830,15 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, if axis == 0: counts = np.zeros((max_bin, k), dtype='i8') with nogil: - for i in range(n): - for j in range(n): + for i from 0 <= i < n: + for j from 0 <= j < k: counts[labels[i], j] += mask[i, j] else: # axis == 1 counts = np.zeros((n, max_bin), dtype='i8') with nogil: - for i in range(n): - for j in range(k): + for i from 0 <= i < n: + for j from 0 <= j < k: counts[i, labels[j]] += mask[i, j] return counts From 207bc74d2e4825102b88bde86cecdd11c589adb4 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 5 Feb 2018 11:30:18 +0700 Subject: [PATCH 07/11] Define memory_usage on SparseArray --- pandas/core/base.py | 8 ++++---- pandas/core/sparse/array.py | 38 ++++++++++++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 8ee9aaa71f75f..ebbc50795a3db 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1070,10 +1070,10 @@ def memory_usage(self, deep=False): numpy.ndarray.nbytes """ # Use sparse values if they exist for memory consumption - if hasattr(self.values, 'sp_values'): - values = self.values.sp_values - else: - values = self.values + # if hasattr(self.values, 'sp_values'): + # values = self.values.sp_values + # else: + values = self.values if hasattr(values, 'memory_usage'): return values.memory_usage(deep=deep) diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index fa07400a0706e..b3a2a0ac7a502 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -11,7 +11,7 @@ from pandas.core.base import PandasObject from pandas import compat -from pandas.compat import range +from pandas.compat import range, PYPY from pandas.compat.numpy import function as nv from pandas.core.dtypes.generic import ABCSparseSeries @@ -30,6 +30,7 @@ from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype import pandas._libs.sparse as splib +import pandas._libs.lib as lib from pandas._libs.sparse import SparseIndex, BlockIndex, IntIndex from pandas._libs import index as libindex import pandas.core.algorithms as algos @@ -238,6 +239,41 @@ def kind(self): elif isinstance(self.sp_index, IntIndex): return 'integer' + def memory_usage(self, deep=False): + """Memory usage of SparseArray + + Parameters + ---------- + deep : bool + Introspect the data deeply, interrogate + `object` dtypes for system-level memory consumption + + Returns + ------- + scalar bytes of memory consumed + + Notes + ----- + Memory usage does not include memory of empty cells filled by + fill_value. And it does not include memory consumed by + elements that are not components of the array if deep=False + + See also + -------- + Series.memory_usage + """ + + values = self.sp_values + if hasattr(values, 'memory_usage'): + return values.memory_usage(deep=deep) + + v = values.nbytes + + if deep and is_object_dtype(self) and not PYPY: + v += lib.memory_usage_of_objects(values) + + return v + def __array_wrap__(self, out_arr, context=None): """ NumPy calls this method when ufunc is applied From cdd4141e4137375053d68eba6ccab1eecb5c464e Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 5 Feb 2018 11:31:14 +0700 Subject: [PATCH 08/11] Fix linting error --- pandas/core/sparse/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index b3a2a0ac7a502..8d5a5e1dbfe1e 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -255,7 +255,7 @@ def memory_usage(self, deep=False): Notes ----- Memory usage does not include memory of empty cells filled by - fill_value. And it does not include memory consumed by + fill_value. And it does not include memory consumed by elements that are not components of the array if deep=False See also From ae9f74d588fd34db80feacd21b630eba0599617b Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 5 Feb 2018 11:33:43 +0700 Subject: [PATCH 09/11] Revert base.py --- pandas/core/base.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index ebbc50795a3db..54d25a16a10a3 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1069,18 +1069,12 @@ def memory_usage(self, deep=False): -------- numpy.ndarray.nbytes """ - # Use sparse values if they exist for memory consumption - # if hasattr(self.values, 'sp_values'): - # values = self.values.sp_values - # else: - values = self.values + if hasattr(self.values, 'memory_usage'): + return self.values.memory_usage(deep=deep) - if hasattr(values, 'memory_usage'): - return values.memory_usage(deep=deep) - - v = values.nbytes + v = self.values.nbytes if deep and is_object_dtype(self) and not PYPY: - v += lib.memory_usage_of_objects(values) + v += lib.memory_usage_of_objects(self.values) return v def factorize(self, sort=False, na_sentinel=-1): From 4ead141c0422200638a1729bf8a109c77411c645 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 5 Feb 2018 11:37:30 +0700 Subject: [PATCH 10/11] Move whatsnew doc to Sparse --- doc/source/whatsnew/v0.23.0.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 8579d16bcb140..e8d2ec5eb0d9e 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -555,7 +555,7 @@ Sparse - Bug in which creating a ``SparseDataFrame`` from a dense ``Series`` or an unsupported type raised an uncontrolled exception (:issue:`19374`) - Bug in :class:`SparseDataFrame.to_csv` causing exception (:issue:`19384`) -- +- Bug in :class:`SparseArray.memory_usage` which caused segfault by accessing non sparse elements (:issue:`19368`) Reshaping ^^^^^^^^^ @@ -591,4 +591,3 @@ Other ^^^^^ - Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`) -- Bug in `Series.memory_usage` which assumes series will always have more than one element (:issue:`19368`) From f9433d844ccdda495bc17aad2da23673020aa8dc Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 5 Feb 2018 19:38:53 +0700 Subject: [PATCH 11/11] Use shared docstring and get rid of if condition --- pandas/core/base.py | 2 +- pandas/core/sparse/array.py | 28 ++-------------------------- 2 files changed, 3 insertions(+), 27 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 54d25a16a10a3..d5b204dba063e 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1048,7 +1048,7 @@ def is_monotonic_decreasing(self): def memory_usage(self, deep=False): """ - Memory usage of my values + Memory usage of the values Parameters ---------- diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 8d5a5e1dbfe1e..65aefd9fb8c0a 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -8,7 +8,7 @@ import warnings import pandas as pd -from pandas.core.base import PandasObject +from pandas.core.base import PandasObject, IndexOpsMixin from pandas import compat from pandas.compat import range, PYPY @@ -239,33 +239,9 @@ def kind(self): elif isinstance(self.sp_index, IntIndex): return 'integer' + @Appender(IndexOpsMixin.memory_usage.__doc__) def memory_usage(self, deep=False): - """Memory usage of SparseArray - - Parameters - ---------- - deep : bool - Introspect the data deeply, interrogate - `object` dtypes for system-level memory consumption - - Returns - ------- - scalar bytes of memory consumed - - Notes - ----- - Memory usage does not include memory of empty cells filled by - fill_value. And it does not include memory consumed by - elements that are not components of the array if deep=False - - See also - -------- - Series.memory_usage - """ - values = self.sp_values - if hasattr(values, 'memory_usage'): - return values.memory_usage(deep=deep) v = values.nbytes