diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 3c0dd646aa502..ba2e63c20d3f8 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -103,6 +103,7 @@ def setup(self): self.df2 = DataFrame(np.random.randn(N * 50, 10)) self.df3 = DataFrame(np.random.randn(N, 5 * N), columns=['C' + str(c) for c in range(N * 5)]) + self.df4 = DataFrame(np.random.randn(N * 1000, 10)) def time_iteritems(self): # (monitor no-copying behaviour) @@ -119,10 +120,70 @@ def time_iteritems_indexing(self): for col in self.df3: self.df3[col] + def time_itertuples_start(self): + self.df4.itertuples() + + def time_itertuples_read_first(self): + next(self.df4.itertuples()) + def time_itertuples(self): - for row in self.df2.itertuples(): + for row in self.df4.itertuples(): + pass + + def time_itertuples_to_list(self): + list(self.df4.itertuples()) + + def mem_itertuples_start(self): + return self.df4.itertuples() + + def peakmem_itertuples_start(self): + self.df4.itertuples() + + def mem_itertuples_read_first(self): + return next(self.df4.itertuples()) + + def peakmem_itertuples(self): + for row in self.df4.itertuples(): + pass + + def mem_itertuples_to_list(self): + return list(self.df4.itertuples()) + + def peakmem_itertuples_to_list(self): + list(self.df4.itertuples()) + + def time_itertuples_raw_start(self): + self.df4.itertuples(index=False, name=None) + + def time_itertuples_raw_read_first(self): + next(self.df4.itertuples(index=False, name=None)) + + def time_itertuples_raw_tuples(self): + for row in self.df4.itertuples(index=False, name=None): pass + def time_itertuples_raw_tuples_to_list(self): + list(self.df4.itertuples(index=False, name=None)) + + def mem_itertuples_raw_start(self): + return self.df4.itertuples(index=False, name=None) + + def peakmem_itertuples_raw_start(self): + self.df4.itertuples(index=False, name=None) + + def peakmem_itertuples_raw_read_first(self): + 
next(self.df4.itertuples(index=False, name=None)) + + def peakmem_itertuples_raw(self): + for row in self.df4.itertuples(index=False, name=None): + pass + + def mem_itertuples_raw_to_list(self): + return list(self.df4.itertuples(index=False, name=None)) + + def peakmem_itertuples_raw_to_list(self): + list(self.df4.itertuples(index=False, name=None)) + def time_iterrows(self): for row in self.df.iterrows(): pass diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index a2abda019812a..7090acdc37382 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1251,6 +1251,8 @@ Performance Improvements - Fixed a performance regression on Windows with Python 3.7 of :func:`read_csv` (:issue:`23516`) - Improved performance of :class:`Categorical` constructor for ``Series`` objects (:issue:`23814`) - Improved performance of :meth:`~DataFrame.where` for Categorical data (:issue:`24077`) +- Improved performance of iterating over a :class:`Series`. Using :meth:`DataFrame.itertuples` now creates iterators + without internally allocating lists of all elements (:issue:`20783`) .. _whatsnew_0240.docs: diff --git a/pandas/core/base.py b/pandas/core/base.py index 4a64ea0e56574..0a4111b51ba4e 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -8,7 +8,7 @@ import pandas._libs.lib as lib import pandas.compat as compat -from pandas.compat import PYPY, OrderedDict, builtins +from pandas.compat import PYPY, OrderedDict, builtins, map, range from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, cache_readonly @@ -1072,7 +1072,13 @@ def __iter__(self): (for str, int, float) or a pandas scalar (for Timestamp/Timedelta/Interval/Period) """ - return iter(self.tolist()) + # We are explicitly making element iterators. 
+ if is_datetimelike(self._values): + return map(com.maybe_box_datetimelike, self._values) + elif is_extension_array_dtype(self._values): + return iter(self._values) + else: + return map(self._values.item, range(self._values.size)) @cache_readonly def hasnans(self): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c4537db254132..c8ef958750379 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -898,10 +898,10 @@ def itertuples(self, index=True, name="Pandas"): Animal(Index='hawk', num_legs=2, num_wings=2) """ arrays = [] - fields = [] + fields = list(self.columns) if index: arrays.append(self.index) - fields.append("Index") + fields.insert(0, "Index") # use integer indexing because of possible duplicate column names arrays.extend(self.iloc[:, k] for k in range(len(self.columns))) @@ -911,10 +911,9 @@ def itertuples(self, index=True, name="Pandas"): if name is not None and len(self.columns) + index < 256: # `rename` is unsupported in Python 2.6 try: - itertuple = collections.namedtuple(name, - fields + list(self.columns), - rename=True) + itertuple = collections.namedtuple(name, fields, rename=True) return map(itertuple._make, zip(*arrays)) + except Exception: pass