Skip to content

ENH: Implemented lazy iteration #20796

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Dec 25, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 62 additions & 1 deletion asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ def setup(self):
self.df2 = DataFrame(np.random.randn(N * 50, 10))
self.df3 = DataFrame(np.random.randn(N, 5 * N),
columns=['C' + str(c) for c in range(N * 5)])
self.df4 = DataFrame(np.random.randn(N * 1000, 10))

def time_iteritems(self):
# (monitor no-copying behaviour)
Expand All @@ -119,10 +120,70 @@ def time_iteritems_indexing(self):
for col in self.df3:
self.df3[col]

def time_itertuples_start(self):
self.df4.itertuples()

def time_itertuples_read_first(self):
next(self.df4.itertuples())

def time_itertuples(self):
for row in self.df2.itertuples():
for row in self.df4.itertuples():
pass

def time_itertuples_to_list(self):
list(self.df4.itertuples())

def mem_itertuples_start(self):
return self.df4.itertuples()

def peakmem_itertuples_start(self):
self.df4.itertuples()

def mem_itertuples_read_first(self):
return next(self.df4.itertuples())

def peakmem_itertuples(self):
for row in self.df4.itertuples():
pass

def mem_itertuples_to_list(self):
return list(self.df4.itertuples())

def peakmem_itertuples_to_list(self):
list(self.df4.itertuples())

def time_itertuples_raw_start(self):
self.df4.itertuples(index=False, name=None)

def time_itertuples_raw_read_first(self):
next(self.df4.itertuples(index=False, name=None))

def time_itertuples_raw_tuples(self):
for row in self.df4.itertuples(index=False, name=None):
pass

def time_itertuples_raw_tuples_to_list(self):
list(self.df4.itertuples(index=False, name=None))

def mem_itertuples_raw_start(self):
return self.df4.itertuples(index=False, name=None)

def peakmem_itertuples_raw_start(self):
self.df4.itertuples(index=False, name=None)

def peakmem_itertuples_raw_read_first(self):
next(self.df4.itertuples(index=False, name=None))

def peakmem_itertuples_raw(self):
for row in self.df4.itertuples(index=False, name=None):
pass

def mem_itertuples_raw_to_list(self):
return list(self.df4.itertuples(index=False, name=None))

def peakmem_itertuples_raw_to_list(self):
list(self.df4.itertuples(index=False, name=None))

def time_iterrows(self):
for row in self.df.iterrows():
pass
Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1251,6 +1251,8 @@ Performance Improvements
- Fixed a performance regression on Windows with Python 3.7 of :func:`read_csv` (:issue:`23516`)
- Improved performance of :class:`Categorical` constructor for ``Series`` objects (:issue:`23814`)
- Improved performance of :meth:`~DataFrame.where` for Categorical data (:issue:`24077`)
- Improved performance of iterating over a :class:`Series`. Using :meth:`DataFrame.itertuples` now creates iterators
without internally allocating lists of all elements (:issue:`20783`)

.. _whatsnew_0240.docs:

Expand Down
10 changes: 8 additions & 2 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import pandas._libs.lib as lib
import pandas.compat as compat
from pandas.compat import PYPY, OrderedDict, builtins
from pandas.compat import PYPY, OrderedDict, builtins, map, range
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
from pandas.util._decorators import Appender, Substitution, cache_readonly
Expand Down Expand Up @@ -1072,7 +1072,13 @@ def __iter__(self):
(for str, int, float) or a pandas scalar
(for Timestamp/Timedelta/Interval/Period)
"""
return iter(self.tolist())
# We are explicitly making element iterators.
if is_datetimelike(self._values):
return map(com.maybe_box_datetimelike, self._values)
elif is_extension_array_dtype(self._values):
return iter(self._values)
else:
return map(self._values.item, range(self._values.size))
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ha, I am looking at this now and I no longer understand why we are mapping over a range here. We should map over the values of self._values to cast them to Python scalars using self._values.item. Was this my original code?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, here's your commit.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I know. `item` takes the index of the element you want to get back.


@cache_readonly
def hasnans(self):
Expand Down
9 changes: 4 additions & 5 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -898,10 +898,10 @@ def itertuples(self, index=True, name="Pandas"):
Animal(Index='hawk', num_legs=2, num_wings=2)
"""
arrays = []
fields = []
fields = list(self.columns)
if index:
arrays.append(self.index)
fields.append("Index")
fields.insert(0, "Index")

# use integer indexing because of possible duplicate column names
arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
Expand All @@ -911,10 +911,9 @@ def itertuples(self, index=True, name="Pandas"):
if name is not None and len(self.columns) + index < 256:
# `rename` is unsupported in Python 2.6
try:
itertuple = collections.namedtuple(name,
fields + list(self.columns),
rename=True)
itertuple = collections.namedtuple(name, fields, rename=True)
return map(itertuple._make, zip(*arrays))

except Exception:
pass

Expand Down