diff --git a/doc/source/api.rst b/doc/source/api.rst index bfd1c92d14acd..6bee0a1ceafb8 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -284,6 +284,7 @@ Attributes Series.itemsize Series.base Series.T + Series.memory_usage Conversion ~~~~~~~~~~ @@ -772,6 +773,7 @@ Attributes and underlying data DataFrame.ndim DataFrame.size DataFrame.shape + DataFrame.memory_usage Conversion ~~~~~~~~~~ @@ -1333,6 +1335,7 @@ Attributes Index.itemsize Index.base Index.T + Index.memory_usage Modifying and Computations ~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/faq.rst b/doc/source/faq.rst index 7714d937e15d6..82102296c4198 100644 --- a/doc/source/faq.rst +++ b/doc/source/faq.rst @@ -50,6 +50,16 @@ The ``+`` symbol indicates that the true memory usage could be higher, because pandas does not count the memory used by values in columns with ``dtype=object``. +.. versionadded:: 0.17.1 + +Passing ``memory_usage='deep'`` will enable a more accurate memory usage report, +that accounts for the full usage of the contained objects. This is optional +as it can be expensive to do this deeper introspection. + +.. ipython:: python + + df.info(memory_usage='deep') + By default the display option is set to ``True`` but can be explicitly overridden by passing the ``memory_usage`` argument when invoking ``df.info()``. diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt index 1d9b02e6a7bb1..f92ed4af50b6c 100755 --- a/doc/source/whatsnew/v0.17.1.txt +++ b/doc/source/whatsnew/v0.17.1.txt @@ -27,6 +27,19 @@ Enhancements - Improve the error message displayed in :func:`pandas.io.gbq.to_gbq` when the DataFrame does not match the schema of the destination table (:issue:`11359`) - Added ``axvlines_kwds`` to parallel coordinates plot (:issue:`10709`) +- Option to ``.info()`` and ``.memory_usage()`` to provide for deep introspection of memory consumption. Note that this can be expensive to compute and therefore is an optional parameter. (:issue:`11595`) + +.. 
ipython:: python + + df = DataFrame({'A' : ['foo']*1000}) + df['B'] = df['A'].astype('category') + + # shows the '+' as we have object dtypes + df.info() + + # we have an accurate memory assessment (but can be expensive to compute this) + df.info(memory_usage='deep') + - ``Index`` now has ``fillna`` method (:issue:`10089`) .. ipython:: python diff --git a/pandas/core/base.py b/pandas/core/base.py index d3850be13b6f0..173423007037a 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -489,6 +489,36 @@ def nunique(self, dropna=True): n -= 1 return n + def memory_usage(self, deep=False): + """ + Memory usage of my values + + Parameters + ---------- + deep : bool + Introspect the data deeply, interrogate + `object` dtypes for system-level memory consumption + + Returns + ------- + bytes used + + Notes + ----- + Memory usage does not include memory consumed by elements that + are not components of the array if deep=False + + See Also + -------- + numpy.ndarray.nbytes + """ + if hasattr(self.values,'memory_usage'): + return self.values.memory_usage(deep=deep) + + v = self.values.nbytes + if deep and com.is_object_dtype(self): + v += lib.memory_usage_of_objects(self.values) + return v def factorize(self, sort=False, na_sentinel=-1): """ diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 514f907d943a8..ccfd4e657fe39 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -924,6 +924,31 @@ def T(self): def nbytes(self): return self._codes.nbytes + self._categories.values.nbytes + def memory_usage(self, deep=False): + """ + Memory usage of my values + + Parameters + ---------- + deep : bool + Introspect the data deeply, interrogate + `object` dtypes for system-level memory consumption + + Returns + ------- + bytes used + + Notes + ----- + Memory usage does not include memory consumed by elements that + are not components of the array if deep=False + + See Also + -------- + numpy.ndarray.nbytes + """ + return 
self._codes.nbytes + self._categories.memory_usage(deep=deep) + def searchsorted(self, v, side='left', sorter=None): """Find indices where elements should be inserted to maintain order. diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 751a530ce73cc..35689030d9c09 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -215,9 +215,9 @@ """ pc_memory_usage_doc = """ -: bool or None +: bool, string or None This specifies if the memory usage of a DataFrame should be displayed when - df.info() is called. + df.info() is called. Valid values True,False,'deep' """ style_backup = dict() @@ -292,7 +292,7 @@ def mpl_style_cb(key): cf.register_option('line_width', get_default_val('display.width'), pc_line_width_doc) cf.register_option('memory_usage', True, pc_memory_usage_doc, - validator=is_instance_factory([type(None), bool])) + validator=is_one_of_factory([None, True, False, 'deep'])) cf.register_option('unicode.east_asian_width', False, pc_east_asian_width_doc, validator=is_bool) cf.register_option('unicode.ambiguous_as_wide', False, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 538b9d3f8e712..22d0026f27742 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1582,11 +1582,12 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_co max_cols : int, default None Determines whether full summary or short summary is printed. None follows the `display.max_info_columns` setting. - memory_usage : boolean, default None + memory_usage : boolean/string, default None Specifies whether total memory usage of the DataFrame elements (including index) should be displayed. None follows the `display.memory_usage` setting. True or False overrides - the `display.memory_usage` setting. Memory usage is shown in + the `display.memory_usage` setting. A value of 'deep' is equivalent + of True, with deep introspection. Memory usage is shown in human-readable units (base-2 representation). 
null_counts : boolean, default None Whether to show the non-null counts @@ -1676,20 +1677,27 @@ def _sizeof_fmt(num, size_qualifier): counts = self.get_dtype_counts() dtypes = ['%s(%d)' % k for k in sorted(compat.iteritems(counts))] lines.append('dtypes: %s' % ', '.join(dtypes)) + if memory_usage is None: memory_usage = get_option('display.memory_usage') - if memory_usage: # append memory usage of df to display - # size_qualifier is just a best effort; not guaranteed to catch all - # cases (e.g., it misses categorical data even with object - # categories) - size_qualifier = ('+' if 'object' in counts - or is_object_dtype(self.index) else '') - mem_usage = self.memory_usage(index=True).sum() + if memory_usage: + # append memory usage of df to display + size_qualifier = '' + if memory_usage == 'deep': + deep=True + else: + # size_qualifier is just a best effort; not guaranteed to catch all + # cases (e.g., it misses categorical data even with object + # categories) + deep=False + if 'object' in counts or is_object_dtype(self.index): + size_qualifier = '+' + mem_usage = self.memory_usage(index=True, deep=deep).sum() lines.append("memory usage: %s\n" % _sizeof_fmt(mem_usage, size_qualifier)) _put_lines(buf, lines) - def memory_usage(self, index=False): + def memory_usage(self, index=False, deep=False): """Memory usage of DataFrame columns. Parameters @@ -1698,6 +1706,9 @@ def memory_usage(self, index=False): Specifies whether to include memory usage of DataFrame's index in returned Series. If `index=True` (default is False) the first index of the Series is `Index`. + deep : bool + Introspect the data deeply, interrogate + `object` dtypes for system-level memory consumption Returns ------- @@ -1708,17 +1719,17 @@ def memory_usage(self, index=False): Notes ----- Memory usage does not include memory consumed by elements that - are not components of the array. 
+ are not components of the array if deep=False See Also -------- numpy.ndarray.nbytes """ - result = Series([ c.values.nbytes for col, c in self.iteritems() ], + result = Series([ c.memory_usage(index=False, deep=deep) for col, c in self.iteritems() ], index=self.columns) if index: - result = Series(self.index.nbytes, - index=['Index']).append(result) + result = Series(self.index.memory_usage(deep=deep), + index=['Index']).append(result) return result def transpose(self): diff --git a/pandas/core/series.py b/pandas/core/series.py index b12a31d64eaf7..5106225cdd3c9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2281,6 +2281,35 @@ def reindex_axis(self, labels, axis=0, **kwargs): raise ValueError("cannot reindex series on non-zero axis!") return self.reindex(index=labels, **kwargs) + def memory_usage(self, index=False, deep=False): + """Memory usage of the Series + + Parameters + ---------- + index : bool + Specifies whether to include memory usage of Series index + deep : bool + Introspect the data deeply, interrogate + `object` dtypes for system-level memory consumption + + Returns + ------- + scalar bytes of memory consumed + + Notes + ----- + Memory usage does not include memory consumed by elements that + are not components of the array if deep=False + + See Also + -------- + numpy.ndarray.nbytes + """ + v = super(Series, self).memory_usage(deep=deep) + if index: + v += self.index.memory_usage(deep=deep) + return v + def take(self, indices, axis=0, convert=True, is_copy=False): """ return Series corresponding to requested indices diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 74842d9a165fe..1a1f04cba1cb9 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -182,6 +182,19 @@ def ismember_int64(ndarray[int64_t] arr, set values): return result.view(np.bool_) +@cython.wraparound(False) +@cython.boundscheck(False) +def memory_usage_of_objects(ndarray[object, ndim=1] arr): + """ return the memory usage of an object array in bytes, + does not 
include the actual bytes of the pointers """ + cdef Py_ssize_t i, n + cdef int64_t s = 0 + + n = len(arr) + for i from 0 <= i < n: + s += arr[i].__sizeof__() + return s + #---------------------------------------------------------------------- # datetime / io related diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 3a42059a63b0d..5fb37abbbb3d2 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -877,6 +877,28 @@ def get_fill_value(obj): self.assertFalse(o is result) + def test_memory_usage(self): + for o in self.objs: + res = o.memory_usage() + res2 = o.memory_usage(deep=True) + + if com.is_object_dtype(o): + self.assertTrue(res2 > res) + else: + self.assertEqual(res, res2) + + if isinstance(o, Series): + res = o.memory_usage(index=True) + res2 = o.memory_usage(index=True, deep=True) + if com.is_object_dtype(o) or com.is_object_dtype(o.index): + self.assertTrue(res2 > res) + else: + self.assertEqual(res, res2) + + self.assertEqual(o.memory_usage(index=False) + o.index.memory_usage(), + o.memory_usage(index=True)) + + class TestFloat64HashTable(tm.TestCase): def test_lookup_nan(self): from pandas.hashtable import Float64HashTable diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 1d143236e285b..23484ee8c7e05 100755 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1197,6 +1197,15 @@ def test_nbytes(self): exp = cat._codes.nbytes + cat._categories.values.nbytes self.assertEqual(cat.nbytes, exp) + def test_memory_usage(self): + cat = pd.Categorical([1,2,3]) + self.assertEqual(cat.nbytes, cat.memory_usage()) + self.assertEqual(cat.nbytes, cat.memory_usage(deep=True)) + + cat = pd.Categorical(['foo','foo','bar']) + self.assertEqual(cat.nbytes, cat.memory_usage()) + self.assertTrue(cat.memory_usage(deep=True) > cat.nbytes) + def test_searchsorted(self): # https://github.com/pydata/pandas/issues/8420 s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk' 
]) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index a924d17aa7e8f..a743ce4ffef61 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -7614,6 +7614,17 @@ def test_info_memory_usage(self): res = buf.getvalue().splitlines() self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1])) + df_with_object_index.info(buf=buf, memory_usage='deep') + res = buf.getvalue().splitlines() + self.assertTrue(re.match(r"memory usage: [^+]+$", res[-1])) + + self.assertTrue(df_with_object_index.memory_usage(index=True, deep=True).sum() \ + > df_with_object_index.memory_usage(index=True).sum()) + + df_object = pd.DataFrame({'a': ['a']}) + self.assertTrue(df_object.memory_usage(deep=True).sum() \ + > df_object.memory_usage().sum()) + # Test a DataFrame with duplicate columns dtypes = ['int64', 'int64', 'int64', 'float64'] data = {} @@ -7630,6 +7641,9 @@ def test_info_memory_usage(self): size_df = np.size(df.columns.values) # index=False; default self.assertEqual(size_df, np.size(df.memory_usage())) + # assert deep works only on object + self.assertEqual(df.memory_usage().sum(),df.memory_usage(deep=True).sum()) + # test for validity DataFrame(1,index=['a'],columns=['A']).memory_usage(index=True) DataFrame(1,index=['a'],columns=['A']).index.nbytes