Skip to content

Groupby chunks are garbage-collected even if still referenced #17718

Closed
@toobaz

Description

@toobaz

Code Sample, a copy-pastable example if possible

In [2]: df = pd.DataFrame(1, index=list(range(10))*10, columns=[0]).reset_index()

In [3]: groups = []

In [4]: def store(group):
   ...:     groups.append(group)
   ...:     

In [5]: df.groupby('index').apply(store)
Out[5]: 
Empty DataFrame
Columns: []
Index: []

In [6]: groups[-1]
Out[6]: ---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/home/nobackup/repo/ipython/IPython/core/formatters.py in __call__(self, obj)
    668                 type_pprinters=self.type_printers,
    669                 deferred_pprinters=self.deferred_printers)
--> 670             printer.pretty(obj)
    671             printer.flush()
    672             return stream.getvalue()

/home/nobackup/repo/ipython/IPython/lib/pretty.py in pretty(self, obj)
    381                             if callable(meth):
    382                                 return meth(obj, self, cycle)
--> 383             return _default_pprint(obj, self, cycle)
    384         finally:
    385             self.end_group()

/home/nobackup/repo/ipython/IPython/lib/pretty.py in _default_pprint(obj, p, cycle)
    501     if _safe_getattr(klass, '__repr__', None) not in _baseclass_reprs:
    502         # A user-provided repr. Find newlines and replace them with p.break_()
--> 503         _repr_pprint(obj, p, cycle)
    504         return
    505     p.begin_group(1, '<')

/home/nobackup/repo/ipython/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
    692     """A pprint that just redirects to the normal repr function."""
    693     # Find newlines and replace them with p.break_()
--> 694     output = repr(obj)
    695     for idx,output_line in enumerate(output.splitlines()):
    696         if idx:

/home/nobackup/repo/pandas/pandas/core/base.py in __repr__(self)
     78         Yields Bytestring in Py2, Unicode String in py3.
     79         """
---> 80         return str(self)
     81 
     82 

/home/nobackup/repo/pandas/pandas/core/base.py in __str__(self)
     57 
     58         if compat.PY3:
---> 59             return self.__unicode__()
     60         return self.__bytes__()
     61 

/home/nobackup/repo/pandas/pandas/core/frame.py in __unicode__(self)
    627             width = None
    628         self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
--> 629                        line_width=width, show_dimensions=show_dimensions)
    630 
    631         return buf.getvalue()

/home/nobackup/repo/pandas/pandas/core/frame.py in to_string(self, buf, columns, col_space, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, line_width, max_rows, max_cols, show_dimensions)
   1646                                            max_cols=max_cols,
   1647                                            show_dimensions=show_dimensions)
-> 1648         formatter.to_string()
   1649 
   1650         if buf is None:

/home/nobackup/repo/pandas/pandas/io/formats/format.py in to_string(self)
    590         else:
    591 
--> 592             strcols = self._to_str_columns()
    593             if self.line_width is None:  # no need to wrap around just print
    594                 # the whole frame

/home/nobackup/repo/pandas/pandas/io/formats/format.py in _to_str_columns(self)
    530                                                adj=self.adj)
    531 
--> 532                 max_len = max(np.max([self.adj.len(x) for x in fmt_values]),
    533                               header_colwidth)
    534                 cheader = self.adj.justify(cheader, max_len, mode=self.justify)

/home/pietro/.local/lib/python3.5/site-packages/numpy/core/fromnumeric.py in amax(a, axis, out, keepdims)
   2250 
   2251     return _methods._amax(a, axis=axis,
-> 2252                           out=out, **kwargs)
   2253 
   2254 

/home/pietro/.local/lib/python3.5/site-packages/numpy/core/_methods.py in _amax(a, axis, out, keepdims)
     24 # small reductions
     25 def _amax(a, axis=None, out=None, keepdims=False):
---> 26     return umr_maximum(a, axis, None, out, keepdims)
     27 
     28 def _amin(a, axis=None, out=None, keepdims=False):

ValueError: zero-size array to reduction operation maximum which has no identity

Problem description

I understand that replacing groups.append(group) with groups.append(group.copy()) works around this specific case, that freeing the memory of pandas objects is difficult because of cyclic references, and that this is an atypical function to pass to apply(). Still, it looks like the dereferencing code could be made more robust, so that a chunk which is still referenced remains usable afterwards.

Expected Output

In [7]: groups[0]
Out[7]: 
    index  0
0       0  1
10      0  1
20      0  1
30      0  1
40      0  1
50      0  1
60      0  1
70      0  1
80      0  1
90      0  1

Output of pd.show_versions()

INSTALLED VERSIONS

commit: ad7d051
python: 3.5.3.final.0
python-bits: 64
OS: Linux
OS-release: 4.9.0-3-amd64
machine: x86_64
processor:
byteorder: little
LC_ALL: None
LANG: it_IT.UTF-8
LOCALE: it_IT.UTF-8

pandas: 0.21.0.dev+546.gad7d051bd
pytest: 3.0.6
pip: 9.0.1
setuptools: None
Cython: 0.25.2
numpy: 1.12.1
scipy: 0.19.0
pyarrow: None
xarray: None
IPython: 5.1.0.dev
sphinx: 1.5.6
patsy: 0.4.1
dateutil: 2.6.0
pytz: 2017.2
blosc: None
bottleneck: 1.2.1
tables: 3.3.0
numexpr: 2.6.1
feather: 0.3.1
matplotlib: 2.0.2
openpyxl: None
xlrd: 1.0.0
xlwt: 1.1.2
xlsxwriter: 0.9.6
lxml: None
bs4: 4.5.3
html5lib: 0.999999999
sqlalchemy: 1.0.15
pymysql: None
psycopg2: None
jinja2: 2.9.6
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: 0.2.1

Metadata

Metadata

Assignees

Labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions