Merge branch 'master' of https://github.com/pandas-dev/pandas into less24024b

jbrockmendel · jbrockmendel · commit df39bd783be5 · 2018-12-25T12:53:42.000-08:00
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
@@ -103,6 +103,7 @@ def setup(self):
         self.df2 = DataFrame(np.random.randn(N * 50, 10))
         self.df3 = DataFrame(np.random.randn(N, 5 * N),
                              columns=['C' + str(c) for c in range(N * 5)])
+        self.df4 = DataFrame(np.random.randn(N * 1000, 10))
 
     def time_iteritems(self):
         # (monitor no-copying behaviour)
@@ -119,10 +120,70 @@ def time_iteritems_indexing(self):
         for col in self.df3:
             self.df3[col]
 
+    def time_itertuples_start(self):
+        self.df4.itertuples()
+
+    def time_itertuples_read_first(self):
+        next(self.df4.itertuples())
+
     def time_itertuples(self):
-        for row in self.df2.itertuples():
+        for row in self.df4.itertuples():
+            pass
+
+    def time_itertuples_to_list(self):
+        list(self.df4.itertuples())
+
+    def mem_itertuples_start(self):
+        return self.df4.itertuples()
+
+    def peakmem_itertuples_start(self):
+        self.df4.itertuples()
+
+    def mem_itertuples_read_first(self):
+        return next(self.df4.itertuples())
+
+    def peakmem_itertuples(self):
+        for row in self.df4.itertuples():
+            pass
+
+    def mem_itertuples_to_list(self):
+        return list(self.df4.itertuples())
+
+    def peakmem_itertuples_to_list(self):
+        list(self.df4.itertuples())
+
+    def time_itertuples_raw_start(self):
+        self.df4.itertuples(index=False, name=None)
+
+    def time_itertuples_raw_read_first(self):
+        next(self.df4.itertuples(index=False, name=None))
+
+    def time_itertuples_raw_tuples(self):
+        for row in self.df4.itertuples(index=False, name=None):
             pass
 
+    def time_itertuples_raw_tuples_to_list(self):
+        list(self.df4.itertuples(index=False, name=None))
+
+    def mem_itertuples_raw_start(self):
+        return self.df4.itertuples(index=False, name=None)
+
+    def peakmem_itertuples_raw_start(self):
+        self.df4.itertuples(index=False, name=None)
+
+    def peakmem_itertuples_raw_read_first(self):
+        next(self.df4.itertuples(index=False, name=None))
+
+    def peakmem_itertuples_raw(self):
+        for row in self.df4.itertuples(index=False, name=None):
+            pass
+
+    def mem_itertuples_raw_to_list(self):
+        return list(self.df4.itertuples(index=False, name=None))
+
+    def peakmem_itertuples_raw_to_list(self):
+        list(self.df4.itertuples(index=False, name=None))
+
     def time_iterrows(self):
         for row in self.df.iterrows():
             pass
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -1,5 +1,8 @@
 .. _io:
 
+.. currentmodule:: pandas
+
+
 {{ header }}
 
 .. ipython:: python
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -1251,6 +1251,8 @@ Performance Improvements
 - Fixed a performance regression on Windows with Python 3.7 of :func:`read_csv` (:issue:`23516`)
 - Improved performance of :class:`Categorical` constructor for ``Series`` objects (:issue:`23814`)
 - Improved performance of :meth:`~DataFrame.where` for Categorical data (:issue:`24077`)
+- Improved performance of iterating over a :class:`Series`. Using :meth:`DataFrame.itertuples` now creates iterators
+  without internally allocating lists of all elements (:issue:`20783`)
 
 .. _whatsnew_0240.docs:
 
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -8,7 +8,7 @@
 
 import pandas._libs.lib as lib
 import pandas.compat as compat
-from pandas.compat import PYPY, OrderedDict, builtins
+from pandas.compat import PYPY, OrderedDict, builtins, map, range
 from pandas.compat.numpy import function as nv
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import Appender, Substitution, cache_readonly
@@ -1072,7 +1072,13 @@ def __iter__(self):
         (for str, int, float) or a pandas scalar
         (for Timestamp/Timedelta/Interval/Period)
         """
-        return iter(self.tolist())
+        # We are explicitly making element iterators.
+        if is_datetimelike(self._values):
+            return map(com.maybe_box_datetimelike, self._values)
+        elif is_extension_array_dtype(self._values):
+            return iter(self._values)
+        else:
+            return map(self._values.item, range(self._values.size))
 
     @cache_readonly
     def hasnans(self):
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -898,10 +898,10 @@ def itertuples(self, index=True, name="Pandas"):
         Animal(Index='hawk', num_legs=2, num_wings=2)
         """
         arrays = []
-        fields = []
+        fields = list(self.columns)
         if index:
             arrays.append(self.index)
-            fields.append("Index")
+            fields.insert(0, "Index")
 
         # use integer indexing because of possible duplicate column names
         arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
@@ -911,10 +911,9 @@ def itertuples(self, index=True, name="Pandas"):
         if name is not None and len(self.columns) + index < 256:
             # `rename` is unsupported in Python 2.6
             try:
-                itertuple = collections.namedtuple(name,
-                                                   fields + list(self.columns),
-                                                   rename=True)
+                itertuple = collections.namedtuple(name, fields, rename=True)
                 return map(itertuple._make, zip(*arrays))
+
             except Exception:
                 pass
 
diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py
@@ -69,35 +69,22 @@ def _gotitem(self, key, ndim, subset=None):
 
 # special case to prevent duplicate plots when catching exceptions when
 # forwarding methods from NDFrames
-plotting_methods = frozenset(['plot', 'boxplot', 'hist'])
+plotting_methods = frozenset(['plot', 'hist'])
 
 common_apply_whitelist = frozenset([
-    'last', 'first',
-    'head', 'tail', 'median',
-    'mean', 'sum', 'min', 'max',
-    'cumcount', 'ngroup',
-    'resample',
-    'rank', 'quantile',
-    'fillna',
-    'mad',
-    'any', 'all',
-    'take',
-    'idxmax', 'idxmin',
-    'shift', 'tshift',
-    'ffill', 'bfill',
-    'pct_change', 'skew',
-    'corr', 'cov', 'diff',
+    'quantile', 'fillna', 'mad', 'take',
+    'idxmax', 'idxmin', 'tshift',
+    'skew', 'corr', 'cov', 'diff'
 ]) | plotting_methods
 
 series_apply_whitelist = ((common_apply_whitelist |
                            {'nlargest', 'nsmallest',
                             'is_monotonic_increasing',
-                            'is_monotonic_decreasing'}) -
-                          {'boxplot'}) | frozenset(['dtype', 'unique'])
+                            'is_monotonic_decreasing'})
+                          ) | frozenset(['dtype', 'unique'])
 
 dataframe_apply_whitelist = ((common_apply_whitelist |
-                              frozenset(['dtypes', 'corrwith'])) -
-                             {'boxplot'})
+                              frozenset(['dtypes', 'corrwith'])))
 
 cython_transforms = frozenset(['cumprod', 'cumsum', 'shift',
                                'cummin', 'cummax'])
diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py
@@ -14,35 +14,16 @@
 AGG_FUNCTIONS_WITH_SKIPNA = ['skew', 'mad']
 
 df_whitelist = [
-    'last',
-    'first',
-    'mean',
-    'sum',
-    'min',
-    'max',
-    'head',
-    'tail',
-    'cumcount',
-    'ngroup',
-    'resample',
-    'rank',
     'quantile',
     'fillna',
     'mad',
-    'any',
-    'all',
     'take',
     'idxmax',
     'idxmin',
-    'shift',
     'tshift',
-    'ffill',
-    'bfill',
-    'pct_change',
     'skew',
     'plot',
     'hist',
-    'median',
     'dtypes',
     'corrwith',
     'corr',
@@ -57,35 +38,16 @@ def df_whitelist_fixture(request):
 
 
 s_whitelist = [
-    'last',
-    'first',
-    'mean',
-    'sum',
-    'min',
-    'max',
-    'head',
-    'tail',
-    'cumcount',
-    'ngroup',
-    'resample',
-    'rank',
     'quantile',
     'fillna',
     'mad',
-    'any',
-    'all',
     'take',
     'idxmax',
     'idxmin',
-    'shift',
     'tshift',
-    'ffill',
-    'bfill',
-    'pct_change',
     'skew',
     'plot',
     'hist',
-    'median',
     'dtype',
     'corr',
     'cov',
@@ -150,17 +112,8 @@ def test_groupby_whitelist(df_letters, whitelist):
 def check_whitelist(obj, df, m):
     # check the obj for a particular whitelist m
 
-    # these are aliases so ok to have the alias __name__
-    alias = {'bfill': 'backfill',
-             'ffill': 'pad',
-             'boxplot': None}
-
     gb = obj.groupby(df.letters)
 
-    m = alias.get(m, m)
-    if m is None:
-        return
-
     f = getattr(type(gb), m)
 
     # name