diff --git a/doc/source/basics.rst b/doc/source/basics.rst index e11c612a510db..757cff43f87e7 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1211,9 +1211,10 @@ To iterate over the rows of a DataFrame, you can use the following methods: * :meth:`~DataFrame.iterrows`: Iterate over the rows of a DataFrame as (index, Series) pairs. This converts the rows to Series objects, which can change the dtypes and has some performance implications. -* :meth:`~DataFrame.itertuples`: Iterate over the rows of a DataFrame as tuples of the values. - This is a lot faster as :meth:`~DataFrame.iterrows`, and is in most cases preferable to - use to iterate over the values of a DataFrame. +* :meth:`~DataFrame.itertuples`: Iterate over the rows of a DataFrame + as namedtuples of the values. This is a lot faster than + :meth:`~DataFrame.iterrows`, and is in most cases the preferable way + to iterate over the values of a DataFrame. .. warning:: @@ -1307,7 +1308,7 @@ index value along with a Series containing the data in each row: df_orig['int'].dtype To preserve dtypes while iterating over the rows, it is better - to use :meth:`~DataFrame.itertuples` which returns tuples of the values + to use :meth:`~DataFrame.itertuples` which returns namedtuples of the values and which is generally much faster as ``iterrows``. For instance, a contrived way to transpose the DataFrame would be: @@ -1325,9 +1326,9 @@ itertuples ~~~~~~~~~~ The :meth:`~DataFrame.itertuples` method will return an iterator -yielding a tuple for each row in the DataFrame. The first element -of the tuple will be the row's corresponding index value, -while the remaining values are the row values. +yielding a namedtuple for each row in the DataFrame. The first element +of the tuple will be the row's corresponding index value, while the +remaining values are the row values. 
For instance, @@ -1336,9 +1337,16 @@ For instance, for row in df.itertuples(): print(row) -This method does not convert the row to a Series object but just returns the -values inside a tuple. Therefore, :meth:`~DataFrame.itertuples` preserves the -data type of the values and is generally faster as :meth:`~DataFrame.iterrows`. +This method does not convert the row to a Series object but just +returns the values inside a namedtuple. Therefore, +:meth:`~DataFrame.itertuples` preserves the data type of the values +and is generally faster than :meth:`~DataFrame.iterrows`. + +.. note:: + + The column names will be renamed to positional names if they are + invalid Python identifiers, repeated, or start with an underscore. + With a large number of columns (>255), regular tuples are returned. .. _basics.dt_accessors: diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt index e303adfd356da..84db16e338d87 100755 --- a/doc/source/whatsnew/v0.17.1.txt +++ b/doc/source/whatsnew/v0.17.1.txt @@ -38,6 +38,7 @@ API changes Legacy Python syntax (``set([x, y])``) (:issue:`11215`) - Indexing with a null key will raise a ``TypeError``, instead of a ``ValueError`` (:issue:`11356`) - ``Series.sort_index()`` now correctly handles the ``inplace`` option (:issue:`11402`) +- ``DataFrame.itertuples()`` now returns ``namedtuple`` objects when possible (:issue:`11269`) .. 
_whatsnew_0171.deprecations: @@ -71,7 +72,7 @@ Bug Fixes - Bug in ``HDFStore.append`` with strings whose encoded length exceded the max unencoded length (:issue:`11234`) - Bug in merging ``datetime64[ns, tz]`` dtypes (:issue:`11405`) - Bug in ``HDFStore.select`` when comparing with a numpy scalar in a where clause (:issue:`11283`) -- Bug in using ``DataFrame.ix`` with a multi-index indexer(:issue:`11372`) +- Bug in using ``DataFrame.ix`` with a multi-index indexer(:issue:`11372`) - Bug in tz-conversions with an ambiguous time and ``.dt`` accessors (:issue:`11295`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4774fc4f17a91..b06f1b947bbe7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -584,7 +584,7 @@ def iteritems(self): See also -------- iterrows : Iterate over the rows of a DataFrame as (index, Series) pairs. - itertuples : Iterate over the rows of a DataFrame as tuples of the values. + itertuples : Iterate over the rows of a DataFrame as namedtuples of the values. """ if self.columns.is_unique and hasattr(self, '_item_cache'): @@ -617,7 +617,7 @@ def iterrows(self): int64 To preserve dtypes while iterating over the rows, it is better - to use :meth:`itertuples` which returns tuples of the values + to use :meth:`itertuples` which returns namedtuples of the values and which is generally faster as ``iterrows``. 2. You should **never modify** something you are iterating over. @@ -632,7 +632,7 @@ def iterrows(self): See also -------- - itertuples : Iterate over the rows of a DataFrame as tuples of the values. + itertuples : Iterate over the rows of a DataFrame as namedtuples of the values. iteritems : Iterate over (column name, Series) pairs. 
""" @@ -641,15 +641,23 @@ def iterrows(self): s = Series(v, index=columns, name=k) yield k, s - def itertuples(self, index=True): + def itertuples(self, index=True, name="Pandas"): """ - Iterate over the rows of DataFrame as tuples, with index value + Iterate over the rows of DataFrame as namedtuples, with index value as first element of the tuple. Parameters ---------- index : boolean, default True If True, return the index as the first element of the tuple. + name : string, default "Pandas" + The name of the returned namedtuple. + + Notes + ----- + The columns names will be renamed to positional names if they are + invalid Python identifiers, repeated, or start with an underscore. + With a large number of columns (>255), regular tuples are returned. See also -------- @@ -666,16 +674,32 @@ def itertuples(self, index=True): b 2 0.2 >>> for row in df.itertuples(): ... print(row) - ('a', 1, 0.10000000000000001) - ('b', 2, 0.20000000000000001) + ... + Pandas(Index='a', col1=1, col2=0.10000000000000001) + Pandas(Index='b', col1=2, col2=0.20000000000000001) """ arrays = [] + fields = [] if index: arrays.append(self.index) + fields.append("Index") # use integer indexing because of possible duplicate column names arrays.extend(self.iloc[:, k] for k in range(len(self.columns))) + + # Python 3 supports at most 255 arguments to constructor, and + # things get slow with this many fields in Python 2 + if len(self.columns) + index < 256: + # `rename` is unsupported in Python 2.6 + try: + itertuple = collections.namedtuple( + name, fields+list(self.columns), rename=True) + return (itertuple(*row) for row in zip(*arrays)) + except: + pass + + # fallback to regular tuples return zip(*arrays) if compat.PY3: # pragma: no cover @@ -1213,7 +1237,7 @@ def to_panel(self): def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, - mode='w', encoding=None, compression=None, quoting=None, + mode='w', encoding=None, 
compression=None, quoting=None, quotechar='"', line_terminator='\n', chunksize=None, tupleize_cols=False, date_format=None, doublequote=True, escapechar=None, decimal='.', **kwds): @@ -1251,7 +1275,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, A string representing the encoding to use in the output file, defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. compression : string, optional - a string representing the compression to use in the output file, + a string representing the compression to use in the output file, allowed values are 'gzip', 'bz2', only used when the first argument is a filename line_terminator : string, default '\\n' diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index dfbd21997568d..1b57d53a548f3 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -5545,6 +5545,27 @@ def test_itertuples(self): dfaa = df[['a', 'a']] self.assertEqual(list(dfaa.itertuples()), [(0, 1, 1), (1, 2, 2), (2, 3, 3)]) + tup = next(df.itertuples(name='TestName')) + + # no support for field renaming in Python 2.6, regular tuples are returned + if sys.version >= LooseVersion('2.7'): + self.assertEqual(tup._fields, ('Index', 'a', 'b')) + self.assertEqual((tup.Index, tup.a, tup.b), tup) + self.assertEqual(type(tup).__name__, 'TestName') + + df.columns = ['def', 'return'] + tup2 = next(df.itertuples(name='TestName')) + self.assertEqual(tup2, (0, 1, 4)) + + if sys.version >= LooseVersion('2.7'): + self.assertEqual(tup2._fields, ('Index', '_1', '_2')) + + df3 = DataFrame(dict(('f'+str(i), [i]) for i in range(1024))) + # will raise SyntaxError if trying to create namedtuple + tup3 = next(df3.itertuples()) + self.assertFalse(hasattr(tup3, '_fields')) + self.assertIsInstance(tup3, tuple) + def test_len(self): self.assertEqual(len(self.frame), len(self.frame.index))