Skip to content

PERF/DOC: Option to .info() and .memory_usage() to provide for deep introspection of memory consumption #11595 #11596

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 13, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,7 @@ Attributes
Series.itemsize
Series.base
Series.T
Series.memory_usage

Conversion
~~~~~~~~~~
Expand Down Expand Up @@ -772,6 +773,7 @@ Attributes and underlying data
DataFrame.ndim
DataFrame.size
DataFrame.shape
DataFrame.memory_usage

Conversion
~~~~~~~~~~
Expand Down Expand Up @@ -1333,6 +1335,7 @@ Attributes
Index.itemsize
Index.base
Index.T
Index.memory_usage

Modifying and Computations
~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
10 changes: 10 additions & 0 deletions doc/source/faq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,16 @@ The ``+`` symbol indicates that the true memory usage could be higher, because
pandas does not count the memory used by values in columns with
``dtype=object``.

.. versionadded:: 0.17.1

Passing ``memory_usage='deep'`` will enable a more accurate memory usage report
that accounts for the full usage of the contained objects. This is optional,
as this deeper introspection can be expensive.

.. ipython:: python

df.info(memory_usage='deep')

By default the display option is set to ``True`` but can be explicitly
overridden by passing the ``memory_usage`` argument when invoking ``df.info()``.

Expand Down
13 changes: 13 additions & 0 deletions doc/source/whatsnew/v0.17.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,19 @@ Enhancements
- Improve the error message displayed in :func:`pandas.io.gbq.to_gbq` when the DataFrame does not match the schema of the destination table (:issue:`11359`)
- Added ``axvlines_kwds`` to parallel coordinates plot (:issue:`10709`)

- Option to ``.info()`` and ``.memory_usage()`` to provide for deep introspection of memory consumption. Note that this can be expensive to compute and therefore is an optional parameter. (:issue:`11595`)

.. ipython:: python

df = DataFrame({'A' : ['foo']*1000})
df['B'] = df['A'].astype('category')

# shows the '+' as we have object dtypes
df.info()

# we have an accurate memory assessment (but can be expensive to compute this)
df.info(memory_usage='deep')

- ``Index`` now has ``fillna`` method (:issue:`10089`)

.. ipython:: python
Expand Down
30 changes: 30 additions & 0 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,36 @@ def nunique(self, dropna=True):
n -= 1
return n

def memory_usage(self, deep=False):
    """
    Return the memory footprint of the underlying values, in bytes.

    Parameters
    ----------
    deep : bool, default False
        If True and the values are of ``object`` dtype, also interrogate
        the contained objects for their system-level memory consumption.

    Returns
    -------
    bytes used

    Notes
    -----
    With ``deep=False``, memory consumed by elements that are not
    components of the array itself is not counted.

    See Also
    --------
    numpy.ndarray.nbytes
    """
    values = self.values
    # Delegate when the values (e.g. a Categorical) know how to
    # report their own usage.
    if hasattr(values, 'memory_usage'):
        return values.memory_usage(deep=deep)

    usage = values.nbytes
    if deep and com.is_object_dtype(self):
        usage += lib.memory_usage_of_objects(values)
    return usage

def factorize(self, sort=False, na_sentinel=-1):
"""
Expand Down
25 changes: 25 additions & 0 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -924,6 +924,31 @@ def T(self):
def nbytes(self):
return self._codes.nbytes + self._categories.values.nbytes

def memory_usage(self, deep=False):
    """
    Return the memory footprint of this Categorical, in bytes.

    The total is the size of the integer codes plus the memory used by
    the categories.

    Parameters
    ----------
    deep : bool, default False
        If True, interrogate ``object``-dtype categories for their
        system-level memory consumption.

    Returns
    -------
    bytes used

    Notes
    -----
    With ``deep=False``, memory consumed by elements that are not
    components of the array itself is not counted.

    See Also
    --------
    numpy.ndarray.nbytes
    """
    codes_bytes = self._codes.nbytes
    categories_bytes = self._categories.memory_usage(deep=deep)
    return codes_bytes + categories_bytes

def searchsorted(self, v, side='left', sorter=None):
"""Find indices where elements should be inserted to maintain order.

Expand Down
6 changes: 3 additions & 3 deletions pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,9 +215,9 @@
"""

pc_memory_usage_doc = """
: bool or None
: bool, string or None
This specifies if the memory usage of a DataFrame should be displayed when
df.info() is called.
df.info() is called. Valid values True,False,'deep'
"""

style_backup = dict()
Expand Down Expand Up @@ -292,7 +292,7 @@ def mpl_style_cb(key):
cf.register_option('line_width', get_default_val('display.width'),
pc_line_width_doc)
cf.register_option('memory_usage', True, pc_memory_usage_doc,
validator=is_instance_factory([type(None), bool]))
validator=is_one_of_factory([None, True, False, 'deep']))
cf.register_option('unicode.east_asian_width', False,
pc_east_asian_width_doc, validator=is_bool)
cf.register_option('unicode.ambiguous_as_wide', False,
Expand Down
39 changes: 25 additions & 14 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1582,11 +1582,12 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_co
max_cols : int, default None
Determines whether full summary or short summary is printed.
None follows the `display.max_info_columns` setting.
memory_usage : boolean, default None
memory_usage : boolean/string, default None
Specifies whether total memory usage of the DataFrame
elements (including index) should be displayed. None follows
the `display.memory_usage` setting. True or False overrides
the `display.memory_usage` setting. Memory usage is shown in
the `display.memory_usage` setting. A value of 'deep' is equivalent
to True, with deep introspection. Memory usage is shown in
human-readable units (base-2 representation).
null_counts : boolean, default None
Whether to show the non-null counts
Expand Down Expand Up @@ -1676,20 +1677,27 @@ def _sizeof_fmt(num, size_qualifier):
counts = self.get_dtype_counts()
dtypes = ['%s(%d)' % k for k in sorted(compat.iteritems(counts))]
lines.append('dtypes: %s' % ', '.join(dtypes))

if memory_usage is None:
memory_usage = get_option('display.memory_usage')
if memory_usage: # append memory usage of df to display
# size_qualifier is just a best effort; not guaranteed to catch all
# cases (e.g., it misses categorical data even with object
# categories)
size_qualifier = ('+' if 'object' in counts
or is_object_dtype(self.index) else '')
mem_usage = self.memory_usage(index=True).sum()
if memory_usage:
# append memory usage of df to display
size_qualifier = ''
if memory_usage == 'deep':
deep=True
else:
# size_qualifier is just a best effort; not guaranteed to catch all
# cases (e.g., it misses categorical data even with object
# categories)
deep=False
if 'object' in counts or is_object_dtype(self.index):
size_qualifier = '+'
mem_usage = self.memory_usage(index=True, deep=deep).sum()
lines.append("memory usage: %s\n" %
_sizeof_fmt(mem_usage, size_qualifier))
_put_lines(buf, lines)

def memory_usage(self, index=False):
def memory_usage(self, index=False, deep=False):
"""Memory usage of DataFrame columns.

Parameters
Expand All @@ -1698,6 +1706,9 @@ def memory_usage(self, index=False):
Specifies whether to include memory usage of DataFrame's
index in returned Series. If `index=True` (default is False)
the first index of the Series is `Index`.
deep : bool
Introspect the data deeply, interrogate
`object` dtypes for system-level memory consumption

Returns
-------
Expand All @@ -1708,17 +1719,17 @@ def memory_usage(self, index=False):
Notes
-----
Memory usage does not include memory consumed by elements that
are not components of the array.
are not components of the array if deep=False

See Also
--------
numpy.ndarray.nbytes
"""
result = Series([ c.values.nbytes for col, c in self.iteritems() ],
result = Series([ c.memory_usage(index=False, deep=deep) for col, c in self.iteritems() ],
index=self.columns)
if index:
result = Series(self.index.nbytes,
index=['Index']).append(result)
result = Series(self.index.memory_usage(deep=deep),
index=['Index']).append(result)
return result

def transpose(self):
Expand Down
29 changes: 29 additions & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2281,6 +2281,35 @@ def reindex_axis(self, labels, axis=0, **kwargs):
raise ValueError("cannot reindex series on non-zero axis!")
return self.reindex(index=labels, **kwargs)

def memory_usage(self, index=False, deep=False):
    """Memory usage of the Series, in bytes.

    Parameters
    ----------
    index : bool, default False
        If True, include the memory usage of the Series index in the
        returned total.
    deep : bool, default False
        Introspect the data deeply, interrogating ``object`` dtypes
        for system-level memory consumption.

    Returns
    -------
    scalar bytes of memory consumed

    Notes
    -----
    With ``deep=False``, memory consumed by elements that are not
    components of the array itself is not counted.

    See Also
    --------
    numpy.ndarray.nbytes
    """
    total = super(Series, self).memory_usage(deep=deep)
    if not index:
        return total
    return total + self.index.memory_usage(deep=deep)

def take(self, indices, axis=0, convert=True, is_copy=False):
"""
return Series corresponding to requested indices
Expand Down
13 changes: 13 additions & 0 deletions pandas/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,19 @@ def ismember_int64(ndarray[int64_t] arr, set values):

return result.view(np.bool_)

@cython.wraparound(False)
@cython.boundscheck(False)
def memory_usage_of_objects(ndarray[object, ndim=1] arr):
    """ return the memory usage of an object array in bytes,
    does not include the actual bytes of the pointers """
    # Sums ``__sizeof__()`` for every element; duplicate references to the
    # same object are counted once per slot, so shared objects may be
    # overcounted relative to actual process memory.
    cdef Py_ssize_t i, n
    cdef int64_t s = 0  # C-level accumulator: no per-iteration Python ints

    n = len(arr)
    for i from 0 <= i < n:
        s += arr[i].__sizeof__()
    return s

#----------------------------------------------------------------------
# datetime / io related

Expand Down
22 changes: 22 additions & 0 deletions pandas/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -877,6 +877,28 @@ def get_fill_value(obj):
self.assertFalse(o is result)


def test_memory_usage(self):
    # deep introspection should only add bytes for object-dtype data
    for obj in self.objs:
        shallow = obj.memory_usage()
        deep = obj.memory_usage(deep=True)

        if com.is_object_dtype(obj):
            self.assertTrue(deep > shallow)
        else:
            self.assertEqual(shallow, deep)

        if isinstance(obj, Series):
            # including the index should follow the same deep/shallow rule
            shallow = obj.memory_usage(index=True)
            deep = obj.memory_usage(index=True, deep=True)
            if com.is_object_dtype(obj) or com.is_object_dtype(obj.index):
                self.assertTrue(deep > shallow)
            else:
                self.assertEqual(shallow, deep)

            # values + index must sum to the index=True total
            self.assertEqual(
                obj.memory_usage(index=False) + obj.index.memory_usage(),
                obj.memory_usage(index=True))


class TestFloat64HashTable(tm.TestCase):
def test_lookup_nan(self):
from pandas.hashtable import Float64HashTable
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1197,6 +1197,15 @@ def test_nbytes(self):
exp = cat._codes.nbytes + cat._categories.values.nbytes
self.assertEqual(cat.nbytes, exp)

def test_memory_usage(self):
    # numeric categories: nothing object-typed, so deep == shallow == nbytes
    numeric = pd.Categorical([1,2,3])
    self.assertEqual(numeric.nbytes, numeric.memory_usage())
    self.assertEqual(numeric.nbytes, numeric.memory_usage(deep=True))

    # string categories: deep introspection counts the str objects too
    strings = pd.Categorical(['foo','foo','bar'])
    self.assertEqual(strings.nbytes, strings.memory_usage())
    self.assertTrue(strings.memory_usage(deep=True) > strings.nbytes)

def test_searchsorted(self):
# https://github.com/pydata/pandas/issues/8420
s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk' ])
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7614,6 +7614,17 @@ def test_info_memory_usage(self):
res = buf.getvalue().splitlines()
self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1]))

df_with_object_index.info(buf=buf, memory_usage='deep')
res = buf.getvalue().splitlines()
self.assertTrue(re.match(r"memory usage: [^+]+$", res[-1]))

self.assertTrue(df_with_object_index.memory_usage(index=True, deep=True).sum() \
> df_with_object_index.memory_usage(index=True).sum())

df_object = pd.DataFrame({'a': ['a']})
self.assertTrue(df_object.memory_usage(deep=True).sum() \
> df_object.memory_usage().sum())

# Test a DataFrame with duplicate columns
dtypes = ['int64', 'int64', 'int64', 'float64']
data = {}
Expand All @@ -7630,6 +7641,9 @@ def test_info_memory_usage(self):
size_df = np.size(df.columns.values) # index=False; default
self.assertEqual(size_df, np.size(df.memory_usage()))

# assert deep works only on object
self.assertEqual(df.memory_usage().sum(),df.memory_usage(deep=True).sum())

# test for validity
DataFrame(1,index=['a'],columns=['A']).memory_usage(index=True)
DataFrame(1,index=['a'],columns=['A']).index.nbytes
Expand Down